RubyGems - attentive - Versions diffs - 0.1.0.beta1 - Mend

attentive 0.1.0.beta1

Files changed (38) hide show

checksums.yaml +7 -0
data/.gitignore +10 -0
data/.travis.yml +4 -0
data/Gemfile +4 -0
data/LICENSE.txt +21 -0
data/README.md +41 -0
data/Rakefile +10 -0
data/attentive.gemspec +31 -0
data/bin/console +14 -0
data/bin/setup +8 -0
data/data/contractions.tsv +134 -0
data/data/slang.tsv +19 -0
data/lib/attentive/composite_entity.rb +33 -0
data/lib/attentive/cursor.rb +28 -0
data/lib/attentive/entities/integer.rb +5 -0
data/lib/attentive/entities/relative_date.rb +44 -0
data/lib/attentive/entity.rb +65 -0
data/lib/attentive/errors.rb +7 -0
data/lib/attentive/listener.rb +55 -0
data/lib/attentive/listener_collection.rb +45 -0
data/lib/attentive/match.rb +22 -0
data/lib/attentive/matcher.rb +82 -0
data/lib/attentive/message.rb +24 -0
data/lib/attentive/phrase.rb +19 -0
data/lib/attentive/text.rb +57 -0
data/lib/attentive/token.rb +58 -0
data/lib/attentive/tokenizer.rb +161 -0
data/lib/attentive/tokens/any_of.rb +23 -0
data/lib/attentive/tokens/emoji.rb +17 -0
data/lib/attentive/tokens/me.rb +17 -0
data/lib/attentive/tokens/punctuation.rb +13 -0
data/lib/attentive/tokens/regexp.rb +27 -0
data/lib/attentive/tokens/whitespace.rb +22 -0
data/lib/attentive/tokens/word.rb +8 -0
data/lib/attentive/tokens.rb +45 -0
data/lib/attentive/version.rb +3 -0
data/lib/attentive.rb +20 -0
metadata +206 -0

data/lib/attentive/matcher.rb ADDED Viewed

@@ -0,0 +1,82 @@
+require "attentive/match"
+module Attentive
+  # Walks the tokens exposed by a cursor and tries to match them, in order,
+  # against the tokens of a phrase, starting at phrase index `pos`.
+  #
+  # The matcher is a small state machine: it starts out as :matching and
+  # becomes :mismatch when a non-skippable token fails to match (or no
+  # possibility of an ambiguous token matches), or :found once at least one
+  # phrase token has matched.
+  class Matcher
+    attr_reader :phrase, :cursor, :pos
+    # phrase - the token sequence to match against (indexed with #[] and #length)
+    # cursor - an object answering #peek, #pop, #offset and #tokens
+    # params - optional Hash; :pos is the phrase index to start at (default 0)
+    def initialize(phrase, cursor, params={})
+      @phrase = phrase
+      @cursor = cursor
+      @pos = params.fetch(:pos, 0)
+      # Never start on a whitespace token of the phrase
+      @pos += 1 while phrase[pos] && phrase[pos].whitespace?
+      @match_data = {}
+      @state = :matching
+    end
+    # True until the first token has matched or a mismatch has been recorded.
+    def matching?
+      @state == :matching
+    end
+    # True once a mismatch has been recorded.
+    def mismatch?
+      @state == :mismatch
+    end
+    # Consumes tokens from the cursor while they agree with the phrase.
+    #
+    # Returns an Attentive::Match as soon as `pos` reaches the end of the
+    # phrase after a successful token match; returns nil otherwise (cursor
+    # exhausted or a mismatch was recorded).
+    def match!
+      while token = cursor.peek
+        if token.ambiguous?
+          # Try each possible expansion of the ambiguous token against the phrase
+          unless match_subphrase!(token.possibilities)
+            @state = :mismatch
+            break
+          end
+          @pos += 1 while phrase[pos] && phrase[pos].whitespace?
+          # NOTE(review): this branch never checks pos == phrase.length, so if a
+          # subphrase consumed the rest of the phrase, the next non-ambiguous
+          # token would reach phrase[pos].matches? with phrase[pos] == nil — confirm
+        elsif match_data = phrase[pos].matches?(cursor)
+          # matches? may return a MatchData (regexp tokens), true, or a
+          # Hash-like object that is merged into the collected match data
+          if match_data.is_a?(MatchData)
+            new_character_index = cursor.offset + match_data.to_s.length
+            @match_data.merge! Hash[match_data.names.zip(match_data.captures)]
+            # Advance the cursor to the first token after the regexp match
+            # (the block parameter below shadows the outer `token` local)
+            cursor_pos = cursor.tokens.index { |token| token.pos >= new_character_index }
+            cursor_pos = cursor.tokens.length unless cursor_pos
+            # NOTE(review): writes the cursor's @pos directly rather than going
+            # through a public method; presumably Cursor offers no seek API
+            cursor.instance_variable_set :@pos, cursor_pos
+            @pos += 1
+          else
+            @match_data.merge!(match_data) unless match_data == true
+            @pos += 1
+          end
+          # Skip whitespace tokens in the phrase before comparing the next token
+          @pos += 1 while phrase[pos] && phrase[pos].whitespace?
+          @state = :found
+          # puts "matched #{phrase.inspect}"
+          return Attentive::Match.new(phrase, match_data: @match_data) if pos == phrase.length
+        elsif !token.skippable?
+          @state = :mismatch
+          break
+        end
+        # Move past the token just handled, then past any whitespace after it
+        # NOTE(review): this pop also runs after the regexp branch has already
+        # repositioned the cursor — confirm that is intended
+        cursor.pop
+        break unless cursor.peek
+        while cursor.peek.whitespace?
+          cursor.pop
+          break unless cursor.peek
+        end
+      end
+      nil
+    end
+    # Runs a nested Matcher over each subphrase, starting at the current
+    # phrase position. On the first one that does not end in a mismatch,
+    # adopts its phrase position and returns true; returns false if every
+    # subphrase mismatches.
+    def match_subphrase!(subphrases)
+      subphrases.each do |subphrase|
+        matcher = Matcher.new(phrase, Cursor.new(subphrase), pos: pos)
+        matcher.match!
+        unless matcher.mismatch?
+          @pos = matcher.pos
+          return true
+        end
+      end
+      false
+    end
+  end
+end

data/lib/attentive/message.rb ADDED Viewed

@@ -0,0 +1,24 @@
+require "set"
+require "attentive/tokenizer"
+module Attentive
+  class Message
+    attr_reader :contexts, :text
+    def initialize(text, params)
+      @text = text
+      @contexts = Set.new(params.fetch(:contexts, []))
+    end
+    def tokens
+      @tokens ||= Attentive::Tokenizer.tokenize(text)
+    end
+    alias :to_s :text
+    def inspect
+      tokens.inspect
+    end
+  end
+end

data/lib/attentive/phrase.rb ADDED Viewed

@@ -0,0 +1,19 @@
+require "delegate"
+module Attentive
+  class Phrase < SimpleDelegator
+    def initialize(tokens)
+      super tokens
+    end
+    def to_s
+      join
+    end
+    def inspect
+      "\"#{to_s}\""
+    end
+  end
+end

data/lib/attentive/text.rb ADDED Viewed

@@ -0,0 +1,57 @@
+module Attentive
+  module Text
+    extend self
+    def normalize(text)
+      straighten_quotes downcase text
+    end
+    def downcase(text)
+      text.downcase
+    end
+    def straighten_quotes(text)
+      text.gsub(/[“”]/, "\"").gsub(/[‘’]/, "'")
+    end
+    DATA_PATH = File.expand_path(File.dirname(__FILE__) + "/../../data").freeze
+    CONTRACTIONS = {}.tap do |contractions|
+      File.open(DATA_PATH + "/contractions.tsv") do |file|
+        file.each do |line|
+          next if line.start_with?("#") # skip comments
+          next if line == "\n" # skip blank lines
+          # the file contains tab-separated values.
+          # the first value is the contraction.
+          # the remaining values are possible phrases that match it
+          phrases = line.chomp.split("\t")
+          raise "#{line.inspect} must have exactly two values" unless phrases.length >= 2
+          contractions[phrases.shift] = phrases
+        end
+      end
+    end.freeze
+    SLANG = {}.tap do |slang|
+      File.open(DATA_PATH + "/slang.tsv") do |file|
+        file.each do |line|
+          next if line.start_with?("#") # skip comments
+          next if line == "\n" # skip blank lines
+          # the file contains tab-separated values.
+          # every line should have exactly two values:
+          #  + the first is the slang word
+          #  + the second is the normal word
+          words = line.chomp.split("\t")
+          raise "#{line.inspect} must have exactly two values" unless words.length == 2
+          slang[words[0]] = words[1]
+        end
+      end
+    end.freeze
+  end
+end

data/lib/attentive/token.rb ADDED Viewed

@@ -0,0 +1,58 @@
+module Attentive
+  class Token
+    attr_reader :pos
+    def initialize(pos)
+      @pos = pos
+    end
+    def ==(other)
+      self.class == other.class
+    end
+    def ambiguous?
+      false
+    end
+    def entity?
+      false
+    end
+    def whitespace?
+      false
+    end
+    def skippable?
+      false
+    end
+    def matches?(cursor)
+      self == cursor.peek
+    end
+  end
+  class StringToken < Token
+    attr_reader :string
+    def initialize(string, pos)
+      @string = string
+      super pos
+    end
+    def to_str
+      to_s
+    end
+    def to_s
+      string
+    end
+    def ==(other)
+      self.class == other.class && self.string == other.string
+    end
+  end
+end

data/lib/attentive/tokenizer.rb ADDED Viewed

@@ -0,0 +1,161 @@
+require "attentive/text"
+require "attentive/tokens"
+require "attentive/phrase"
+require "attentive/errors"
+module Attentive
+  class Tokenizer
+    extend Attentive::Tokens
+    # Splits apart words and punctuation,
+    # treats apostrophes and dashes as a word-characters,
+    # trims each fragment of whitepsace
+    # SPLITTER = /\s*([\w'-]+)\s*/.freeze
+    SPLITTER = /(\n|{{|}}|\s+|\.{2,}|[^\s\w'@-])/.freeze
+    PUNCTUATION = /^\W+$/.freeze
+    WHITESPACE = /^\s+$/.freeze
+    ME = "@me".freeze
+    ENTITY_START = "{{".freeze
+    ENTITY_END = "}}".freeze
+    REGEXP_START = "(".freeze
+    REGEXP_END = ")".freeze
+    REGEXP_ESCAPE = "\\".freeze
+    def self.split(message)
+      Attentive::Text.normalize(message).split(SPLITTER).reject(&:empty?)
+    end
+    def self.tokenize(message, options={})
+      match_entities = options.fetch(:entities, false)
+      match_regexps = options.fetch(:regexps, false)
+      fail_if_ambiguous = !options.fetch(:ambiguous, true)
+      strings = split(message)
+      tokens = []
+      i = 0
+      pos = 0
+      while i < strings.length
+        string = strings[i]
+        case string
+        when ""
+          # do nothing
+        when WHITESPACE
+          tokens << whitespace(string, pos: pos)
+        when ":"
+          if strings[i + 2] == ":"
+            tokens << emoji(strings[i + 1], pos: pos)
+            pos += strings[i + 1].length + 1
+            i += 2
+          else
+            tokens << punctuation(":", pos: pos)
+          end
+        when ENTITY_START
+          if match_entities
+            j = i + 1
+            found_entity = false
+            while j < strings.length
+              if strings[j] == ENTITY_END
+                entity = strings[(i + 1)...j] # e.g. ["variable-name", ":" "entity-type"]
+                tokens << entity(*entity.join.split(":").reverse, pos: pos)
+                i = j + 1
+                pos += entity.join.length + 4
+                found_entity = true
+                break
+              end
+              j += 1
+            end
+            next if found_entity
+          end
+          tokens << punctuation(ENTITY_START, pos: pos)
+        when REGEXP_START
+          if match_regexps && strings[i + 1] == "?"
+            j = i + 2
+            found_regexp = false
+            parens = 1
+            inside_square_bracket = false
+            while j < strings.length
+              if strings[j] == "[" && strings[j - 1] != REGEXP_ESCAPE
+                inside_square_bracket = true
+              elsif strings[j] == "]" && strings[j - 1] != REGEXP_ESCAPE
+                inside_square_bracket = false
+              end
+              unless inside_square_bracket
+                if strings[j] == REGEXP_START && strings[j - 1] != REGEXP_ESCAPE
+                  parens += 1
+                elsif strings[j] == REGEXP_END && strings[j - 1] != REGEXP_ESCAPE
+                  parens -= 1
+                end
+                if parens == 0
+                  tokens << regexp(strings[i..j].join, pos: pos)
+                  pos += strings[i..j].join.length + 2
+                  i = j + 1
+                  found_regexp = true
+                  break
+                end
+              end
+              j += 1
+            end
+            next if found_regexp
+          end
+          tokens << punctuation(REGEXP_START, pos: pos)
+        when PUNCTUATION
+          tokens << punctuation(string, pos: pos)
+        when ME
+          tokens << me(pos: pos)
+        else
+          if replace_with = Attentive::Text::SLANG[string]
+            tokens.concat tokenize(replace_with, options)
+          elsif expands_to = Attentive::Text::CONTRACTIONS[string]
+            possibilities = expands_to.map do |possibility|
+              tokenize(possibility, options)
+            end
+            if possibilities.length == 1
+              tokens.concat possibilities[0]
+            else
+              tokens << any_of(possibilities, pos: pos)
+            end
+          else
+            tokens << word(string, pos: pos)
+          end
+        end
+        i += 1
+        pos += string.length
+      end
+      fail_if_ambiguous!(message, tokens) if fail_if_ambiguous
+      Attentive::Phrase.new(tokens)
+    end
+    def self.fail_if_ambiguous!(phrase, tokens)
+      ambiguous_token = tokens.find(&:ambiguous?)
+      return unless ambiguous_token
+      raise Attentive::AmbiguousPhraseError.new(
+        "The phrase #{phrase.inspect} is ambiguous. " <<
+        "Please use #{ambiguous_token.possibilities.map(&:inspect).join(" or ")}")
+    end
+  end
+end
+# Not the perfect place for these...
+# Attentive::Tokenizer needs to be defined first...
+require "attentive/entity"
+require "attentive/composite_entity"
+require "attentive/entities/integer"
+require "attentive/entities/relative_date"

data/lib/attentive/tokens/any_of.rb ADDED Viewed

@@ -0,0 +1,23 @@
+require "attentive/token"
+module Attentive
+  module Tokens
+    class AnyOf < Token
+      attr_reader :possibilities
+      def initialize(possibilities, pos)
+        @possibilities = possibilities
+        super pos
+      end
+      def ==(other)
+        self.class == other.class && self.possibilities == other.possibilities
+      end
+      def ambiguous?
+        true
+      end
+    end
+  end
+end

data/lib/attentive/tokens/emoji.rb ADDED Viewed

@@ -0,0 +1,17 @@
+require "attentive/token"
+module Attentive
+  module Tokens
+    class Emoji < StringToken
+      def to_s
+        ":#{string}:"
+      end
+      def skippable?
+        true
+      end
+    end
+  end
+end

data/lib/attentive/tokens/me.rb ADDED Viewed

@@ -0,0 +1,17 @@
+require "attentive/token"
+module Attentive
+  module Tokens
+    # Token whose string form is Attentive::Tokenizer::ME.
+    class Me < Token
+      # Renders as the Tokenizer's ME marker.
+      def to_s
+        Attentive::Tokenizer::ME
+      end
+      # This token is flagged as skippable.
+      def skippable?
+        true
+      end
+    end
+  end
+end
+end

data/lib/attentive/tokens/punctuation.rb ADDED Viewed

@@ -0,0 +1,13 @@
+require "attentive/token"
+module Attentive
+  module Tokens
+    # A string token for punctuation.
+    class Punctuation < StringToken
+      # Punctuation is flagged as skippable.
+      def skippable?
+        true
+      end
+    end
+  end
+end
+end

data/lib/attentive/tokens/regexp.rb ADDED Viewed

@@ -0,0 +1,27 @@
+require "attentive/token"
+module Attentive
+  module Tokens
+    class Regexp < Token
+      attr_reader :regexp
+      def initialize(string, pos)
+        @regexp = ::Regexp.compile("^#{string}")
+        super pos
+      end
+      def ==(other)
+        self.class == other.class && self.regexp == other.regexp
+      end
+      def matches?(cursor)
+        regexp.match(cursor.to_s)
+      end
+      def to_s
+        regexp.inspect[1...-1]
+      end
+    end
+  end
+end

data/lib/attentive/tokens/whitespace.rb ADDED Viewed

@@ -0,0 +1,22 @@
+require "attentive/token"
+module Attentive
+  module Tokens
+    class Whitespace < StringToken
+      # All whitespace is equal
+      def ==(other)
+        self.class == other.class
+      end
+      def skippable?
+        true
+      end
+      def whitespace?
+        true
+      end
+    end
+  end
+end

data/lib/attentive/tokens/word.rb ADDED Viewed

@@ -0,0 +1,8 @@
+require "attentive/token"
+module Attentive
+  module Tokens
+    # A string token for a word; adds no behavior of its own to StringToken.
+    class Word < StringToken
+    end
+  end
+end

data/lib/attentive/tokens.rb ADDED Viewed

@@ -0,0 +1,45 @@
+module Attentive
+  module Tokens
+    def any_of(possibilities, pos: nil)
+      Attentive::Tokens::AnyOf.new possibilities, pos
+    end
+    def emoji(string, pos: nil)
+      Attentive::Tokens::Emoji.new string, pos
+    end
+    def entity(entity_name, variable_name=entity_name, pos: nil)
+      Attentive::Entity[entity_name.to_sym].new(variable_name)
+    end
+    def me(pos: nil)
+      Attentive::Tokens::Me.new pos
+    end
+    def punctuation(string, pos: nil)
+      Attentive::Tokens::Punctuation.new string, pos
+    end
+    def regexp(string, pos: nil)
+      Attentive::Tokens::Regexp.new string, pos
+    end
+    def whitespace(string, pos: nil)
+      Attentive::Tokens::Whitespace.new string, pos
+    end
+    def word(string, pos: nil)
+      Attentive::Tokens::Word.new string, pos
+    end
+  end
+end
+require "attentive/tokens/any_of"
+require "attentive/tokens/emoji"
+require "attentive/tokens/me"
+require "attentive/tokens/punctuation"
+require "attentive/tokens/regexp"
+require "attentive/tokens/whitespace"
+require "attentive/tokens/word"

data/lib/attentive/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module Attentive
+  # Version string of this release.
+  VERSION = "0.1.0.beta1"
+end

data/lib/attentive.rb ADDED Viewed

@@ -0,0 +1,20 @@
+require "attentive/version"
+require "attentive/listener_collection"
+require "attentive/message"
+module Attentive
+  def listeners
+    @listeners ||= Attentive::ListenerCollection.new
+  end
+  def listen_for(*args, &block)
+    listeners.listen_for(*args, &block)
+  end
+  def hear(message, params={})
+    message = Attentive::Message.new(message, params) unless message.is_a?(Attentive::Message)
+    listeners.hear message
+  end
+end