attentive 0.1.0.beta1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +41 -0
- data/Rakefile +10 -0
- data/attentive.gemspec +31 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/data/contractions.tsv +134 -0
- data/data/slang.tsv +19 -0
- data/lib/attentive/composite_entity.rb +33 -0
- data/lib/attentive/cursor.rb +28 -0
- data/lib/attentive/entities/integer.rb +5 -0
- data/lib/attentive/entities/relative_date.rb +44 -0
- data/lib/attentive/entity.rb +65 -0
- data/lib/attentive/errors.rb +7 -0
- data/lib/attentive/listener.rb +55 -0
- data/lib/attentive/listener_collection.rb +45 -0
- data/lib/attentive/match.rb +22 -0
- data/lib/attentive/matcher.rb +82 -0
- data/lib/attentive/message.rb +24 -0
- data/lib/attentive/phrase.rb +19 -0
- data/lib/attentive/text.rb +57 -0
- data/lib/attentive/token.rb +58 -0
- data/lib/attentive/tokenizer.rb +161 -0
- data/lib/attentive/tokens/any_of.rb +23 -0
- data/lib/attentive/tokens/emoji.rb +17 -0
- data/lib/attentive/tokens/me.rb +17 -0
- data/lib/attentive/tokens/punctuation.rb +13 -0
- data/lib/attentive/tokens/regexp.rb +27 -0
- data/lib/attentive/tokens/whitespace.rb +22 -0
- data/lib/attentive/tokens/word.rb +8 -0
- data/lib/attentive/tokens.rb +45 -0
- data/lib/attentive/version.rb +3 -0
- data/lib/attentive.rb +20 -0
- metadata +206 -0
@@ -0,0 +1,82 @@
|
|
1
|
+
require "attentive/match"
|
2
|
+
|
3
|
+
module Attentive
  # Steps through the tokens exposed by a cursor and compares them, in order,
  # against the tokens of a phrase. Whitespace tokens in the phrase are skipped
  # automatically; skippable tokens under the cursor are stepped over.
  class Matcher
    attr_reader :phrase, :cursor, :pos

    # phrase - token sequence; must respond to #[] and #length
    # cursor - object responding to #peek and #pop (and, for regexp matches,
    #          #offset and #tokens)
    # params - optional Hash; :pos is the phrase index to start from (default 0)
    def initialize(phrase, cursor, params={})
      @phrase = phrase
      @cursor = cursor
      @pos = params.fetch(:pos, 0)
      skip_phrase_whitespace!
      @match_data = {}
      @state = :matching
    end

    # True until the first token has matched or a mismatch has been recorded.
    def matching?
      @state == :matching
    end

    # True once a token under the cursor could neither be matched nor skipped.
    def mismatch?
      @state == :mismatch
    end

    # Consumes tokens from the cursor until the phrase is fully matched, a
    # mismatch occurs, or the cursor runs out of tokens.
    #
    # Returns an Attentive::Match when the end of the phrase is reached by a
    # non-ambiguous token, otherwise nil.
    def match!
      while (current = cursor.peek)
        if current.ambiguous?
          # An ambiguous token carries several alternative token sequences;
          # any one of them may continue the phrase.
          # NOTE(review): this branch never returns a Match, even when the
          # subphrase consumed the last phrase token, and a later iteration
          # would then call #matches? on phrase[pos] == nil. Confirm intent.
          unless match_subphrase!(current.possibilities)
            @state = :mismatch
            break
          end
          skip_phrase_whitespace!

        elsif (result = phrase[pos].matches?(cursor))
          if result.is_a?(MatchData)
            consume_regexp_match!(result)
          else
            # `true` means "matched, nothing captured"; anything else is
            # merged into the accumulated captures.
            @match_data.merge!(result) unless result == true
          end
          @pos += 1
          skip_phrase_whitespace!
          @state = :found
          return Attentive::Match.new(phrase, match_data: @match_data) if pos == phrase.length

        elsif !current.skippable?
          @state = :mismatch
          break
        end

        # Step past the token just handled and any whitespace that follows it.
        cursor.pop
        cursor.pop while cursor.peek && cursor.peek.whitespace?
      end

      nil
    end

    # Tries each alternative token sequence against the phrase, starting at
    # the current phrase position. On the first alternative that does not end
    # in a mismatch, adopts that sub-matcher's phrase position.
    #
    # Returns true if an alternative was accepted, false otherwise.
    def match_subphrase!(subphrases)
      subphrases.each do |subphrase|
        matcher = Matcher.new(phrase, Cursor.new(subphrase), pos: pos)
        matcher.match!
        next if matcher.mismatch?

        @pos = matcher.pos
        return true
      end

      false
    end

    private

    # Advances the phrase position past consecutive whitespace tokens.
    def skip_phrase_whitespace!
      @pos += 1 while phrase[pos] && phrase[pos].whitespace?
    end

    # Records the named captures of a regexp match and moves the cursor to the
    # first token whose position is at or after the end of the matched text
    # (or past the last token when there is none).
    def consume_regexp_match!(regexp_match)
      end_offset = cursor.offset + regexp_match.to_s.length
      @match_data.merge! Hash[regexp_match.names.zip(regexp_match.captures)]

      next_index = cursor.tokens.index { |candidate| candidate.pos >= end_offset }
      next_index ||= cursor.tokens.length
      # The cursor's position is written directly into its @pos ivar.
      cursor.instance_variable_set :@pos, next_index
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require "set"
|
2
|
+
require "attentive/tokenizer"
|
3
|
+
|
4
|
+
module Attentive
  # A piece of incoming text together with the set of contexts it was heard in.
  class Message
    attr_reader :contexts, :text

    # text   - the raw message text
    # params - optional Hash; :contexts is an enumerable of context
    #          identifiers (defaults to none). The argument itself now
    #          defaults to an empty Hash so a message can be built from text
    #          alone, consistent with the other `params={}` signatures in
    #          this library.
    def initialize(text, params={})
      @text = text
      @contexts = Set.new(params.fetch(:contexts, []))
    end

    # Tokenizes the text on first use and memoizes the result.
    def tokens
      @tokens ||= Attentive::Tokenizer.tokenize(text)
    end

    alias :to_s :text

    # Shows the tokenized form rather than the default object dump.
    def inspect
      tokens.inspect
    end
  end
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
module Attentive
  # Text normalization helpers, plus two lookup tables (CONTRACTIONS and
  # SLANG) that are read from tab-separated data files when this module loads.
  module Text
    extend self

    # Lower-cases the text and replaces curly quotes with straight ones.
    def normalize(text)
      straighten_quotes downcase text
    end

    def downcase(text)
      text.downcase
    end

    # Replaces curly double quotes with `"` and curly single quotes with `'`.
    def straighten_quotes(text)
      text.gsub(/[“”]/, "\"").gsub(/[‘’]/, "'")
    end

    # Directory holding the TSV data files, resolved relative to this file.
    DATA_PATH = File.expand_path(File.dirname(__FILE__) + "/../../data").freeze

    # Yields [raw_line, values] for every data row of a TSV file in DATA_PATH.
    # Comment lines (starting with "#") and blank lines are skipped. A line is
    # blank when it holds only whitespace, so "\r\n" endings and stray spaces
    # do not trip the column-count checks below.
    def self.each_tsv_row(filename)
      File.foreach(File.join(DATA_PATH, filename)) do |line|
        next if line.start_with?("#") # skip comments
        next if line.strip.empty?     # skip blank lines

        yield line, line.chomp.split("\t")
      end
    end
    private_class_method :each_tsv_row

    # Maps a contraction to the list of phrases it may stand for.
    # Each row: the contraction first, then one or more possible expansions.
    CONTRACTIONS = {}.tap do |contractions|
      each_tsv_row("contractions.tsv") do |line, phrases|
        # The message previously said "exactly two" although any number of
        # expansions (>= 1) is accepted.
        raise "#{line.inspect} must have at least two values" unless phrases.length >= 2

        contractions[phrases.shift] = phrases
      end
    end.freeze

    # Maps a slang word to its normal equivalent.
    # Each row has exactly two values: the slang word, then the normal word.
    SLANG = {}.tap do |slang|
      each_tsv_row("slang.tsv") do |line, words|
        raise "#{line.inspect} must have exactly two values" unless words.length == 2

        slang[words[0]] = words[1]
      end
    end.freeze
  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
module Attentive
  # Base class for every token. A token only knows the position it was given;
  # all classification predicates answer false unless a subclass overrides them.
  class Token
    attr_reader :pos

    def initialize(pos)
      @pos = pos
    end

    # Two plain tokens are equal when they are of exactly the same class;
    # the position is not compared.
    def ==(other)
      other.class == self.class
    end

    def ambiguous?;  false; end
    def entity?;     false; end
    def whitespace?; false; end
    def skippable?;  false; end

    # A token matches when it equals the token currently under the cursor.
    def matches?(cursor)
      candidate = cursor.peek
      self == candidate
    end
  end

  # A token backed by a string. Equality additionally compares that string.
  class StringToken < Token
    attr_reader :string

    def initialize(string, pos)
      super(pos)
      @string = string
    end

    def to_s
      string
    end

    # Delegates to #to_s so that an overriding #to_s is honoured here too.
    def to_str
      to_s
    end

    def ==(other)
      return false unless other.class == self.class

      string == other.string
    end
  end
end
|
@@ -0,0 +1,161 @@
|
|
1
|
+
require "attentive/text"
|
2
|
+
require "attentive/tokens"
|
3
|
+
require "attentive/phrase"
|
4
|
+
require "attentive/errors"
|
5
|
+
|
6
|
+
module Attentive
  # Turns text into a sequence of tokens (words, punctuation, whitespace,
  # emoji, "@me", and optionally entities and regexps).
  class Tokenizer
    extend Attentive::Tokens

    # Splits apart words and punctuation,
    # treats apostrophes and dashes as word-characters,
    # and keeps runs of whitespace as fragments of their own.
    # SPLITTER = /\s*([\w'-]+)\s*/.freeze
    SPLITTER = /(\n|{{|}}|\s+|\.{2,}|[^\s\w'@-])/.freeze
    # Anchored to the whole fragment (\A..\z), not to line boundaries.
    PUNCTUATION = /\A\W+\z/.freeze
    WHITESPACE = /\A\s+\z/.freeze
    ME = "@me".freeze
    ENTITY_START = "{{".freeze
    ENTITY_END = "}}".freeze
    REGEXP_START = "(".freeze
    REGEXP_END = ")".freeze
    REGEXP_ESCAPE = "\\".freeze

    # Normalizes the message and cuts it into non-empty string fragments.
    # The delimiters themselves are kept because SPLITTER is a capture group.
    def self.split(message)
      Attentive::Text.normalize(message).split(SPLITTER).reject(&:empty?)
    end

    # Tokenizes a message.
    #
    # options:
    #   :entities  - when true, "{{name:type}}" becomes an entity token (default false)
    #   :regexps   - when true, "(?...)" becomes a regexp token (default false)
    #   :ambiguous - when false, raise if any token is ambiguous (default true)
    #
    # Returns an Attentive::Phrase wrapping the tokens.
    # Raises Attentive::AmbiguousPhraseError when ambiguity is disallowed and found.
    def self.tokenize(message, options={})
      match_entities = options.fetch(:entities, false)
      match_regexps = options.fetch(:regexps, false)
      fail_if_ambiguous = !options.fetch(:ambiguous, true)
      strings = split(message)
      tokens = []
      i = 0   # index into strings
      pos = 0 # character offset of strings[i] within the normalized message
      while i < strings.length
        string = strings[i]
        case string
        when ""
          # do nothing

        when WHITESPACE
          tokens << whitespace(string, pos: pos)

        when ":"
          # ":name:" is an emoji. Only the closing colon is checked.
          # NOTE(review): the fragment between the colons is not validated,
          # so e.g. "10:30:00" yields an emoji token — confirm this is intended.
          if strings[i + 2] == ":"
            tokens << emoji(strings[i + 1], pos: pos)
            # Account for the name and closing colon here; the opening colon
            # is accounted for by the shared increment at the bottom of the loop.
            pos += strings[i + 1].length + 1
            i += 2
          else
            tokens << punctuation(":", pos: pos)
          end

        when ENTITY_START
          if match_entities
            j = i + 1
            found_entity = false
            while j < strings.length
              if strings[j] == ENTITY_END
                entity = strings[(i + 1)...j] # e.g. ["variable-name", ":" "entity-type"]
                tokens << entity(*entity.join.split(":").reverse, pos: pos)
                i = j + 1
                pos += entity.join.length + 4 # contents plus "{{" and "}}"
                found_entity = true
                break
              end
              j += 1
            end
            next if found_entity
          end
          # No closing "}}" (or entities disabled): treat "{{" as punctuation.
          tokens << punctuation(ENTITY_START, pos: pos)

        when REGEXP_START
          if match_regexps && strings[i + 1] == "?"
            j = i + 2
            found_regexp = false
            parens = 1 # nesting depth; the opening "(" is already counted
            inside_square_bracket = false
            while j < strings.length
              if strings[j] == "[" && strings[j - 1] != REGEXP_ESCAPE
                inside_square_bracket = true
              elsif strings[j] == "]" && strings[j - 1] != REGEXP_ESCAPE
                inside_square_bracket = false
              end

              # Parentheses inside a character class are literal and not counted.
              unless inside_square_bracket
                if strings[j] == REGEXP_START && strings[j - 1] != REGEXP_ESCAPE
                  parens += 1
                elsif strings[j] == REGEXP_END && strings[j - 1] != REGEXP_ESCAPE
                  parens -= 1
                end

                if parens == 0
                  regexp_source = strings[i..j].join
                  tokens << regexp(regexp_source, pos: pos)
                  # regexp_source already includes both the opening and the
                  # closing parenthesis, so it is the full consumed length.
                  # (Previously 2 extra characters were added here, shifting
                  # the offsets of every later token.)
                  pos += regexp_source.length
                  i = j + 1
                  found_regexp = true
                  break
                end
              end
              j += 1
            end
            next if found_regexp
          end
          # Unbalanced or not a "(?" group: treat "(" as punctuation.
          tokens << punctuation(REGEXP_START, pos: pos)

        when PUNCTUATION
          tokens << punctuation(string, pos: pos)

        when ME
          tokens << me(pos: pos)

        else
          if replace_with = Attentive::Text::SLANG[string]
            # NOTE(review): tokens from the nested call carry offsets relative
            # to the replacement text, not to this message; this also assumes
            # the returned Phrase is array-like enough for Array#concat.
            tokens.concat tokenize(replace_with, options)

          elsif expands_to = Attentive::Text::CONTRACTIONS[string]
            possibilities = expands_to.map do |possibility|
              tokenize(possibility, options)
            end

            if possibilities.length == 1
              tokens.concat possibilities[0]
            else
              # Several possible expansions: keep them all as one ambiguous token.
              tokens << any_of(possibilities, pos: pos)
            end
          else
            tokens << word(string, pos: pos)
          end
        end

        i += 1
        pos += string.length
      end

      fail_if_ambiguous!(message, tokens) if fail_if_ambiguous

      Attentive::Phrase.new(tokens)
    end

    # Raises Attentive::AmbiguousPhraseError naming the alternatives of the
    # first ambiguous token, if there is one; otherwise returns nil.
    def self.fail_if_ambiguous!(phrase, tokens)
      ambiguous_token = tokens.find(&:ambiguous?)
      return unless ambiguous_token

      raise Attentive::AmbiguousPhraseError.new(
        "The phrase #{phrase.inspect} is ambiguous. " <<
        "Please use #{ambiguous_token.possibilities.map(&:inspect).join(" or ")}")
    end
  end
end
|
154
|
+
|
155
|
+
# Not the perfect place for these...
|
156
|
+
# Attentive::Tokenizer needs to be defined first...
|
157
|
+
require "attentive/entity"
|
158
|
+
require "attentive/composite_entity"
|
159
|
+
|
160
|
+
require "attentive/entities/integer"
|
161
|
+
require "attentive/entities/relative_date"
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require "attentive/token"
|
2
|
+
|
3
|
+
module Attentive
  module Tokens
    # A token standing for several alternative token sequences, any one of
    # which may apply. It is the only token in this file that reports itself
    # as ambiguous.
    class AnyOf < Token
      attr_reader :possibilities

      # possibilities - the alternatives this token may stand for
      # pos           - position handed on to Token
      def initialize(possibilities, pos)
        super(pos)
        @possibilities = possibilities
      end

      # Equal when the other token is also exactly an AnyOf with equal alternatives.
      def ==(other)
        return false unless other.class == self.class

        possibilities == other.possibilities
      end

      def ambiguous?
        true
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require "attentive/token"
|
2
|
+
|
3
|
+
module Attentive
  module Tokens
    # A token that matches the text under the cursor with a regular expression.
    class Regexp < Token
      attr_reader :regexp

      # string - regular expression source
      # pos    - position handed on to Token
      #
      # The pattern is anchored with \A so it can only match at the very start
      # of the text it is given. The previous "^" anchor also matched at the
      # start of any later line, while the matcher treats the matched text as
      # beginning at the cursor's current offset.
      def initialize(string, pos)
        @regexp = ::Regexp.compile("\\A#{string}")
        super pos
      end

      # Equal when the other token is also exactly a Regexp token with an equal pattern.
      def ==(other)
        self.class == other.class && self.regexp == other.regexp
      end

      # Returns a MatchData when the cursor's text starts with the pattern,
      # nil otherwise.
      def matches?(cursor)
        regexp.match(cursor.to_s)
      end

      # The pattern as shown by Regexp#inspect with its first and last
      # character removed (for a flag-less pattern, the surrounding slashes);
      # this includes the leading \A anchor.
      def to_s
        regexp.inspect[1...-1]
      end
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require "attentive/token"
|
2
|
+
|
3
|
+
module Attentive
  module Tokens
    # A run of whitespace. It may be skipped while matching, and its exact
    # content is irrelevant for equality.
    class Whitespace < StringToken
      # All whitespace is equal: only the class is compared, not the string.
      def ==(other)
        other.class == self.class
      end

      def skippable?;  true; end
      def whitespace?; true; end
    end
  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module Attentive
  # Factory methods for building tokens. Each method forwards its arguments,
  # followed by the optional `pos:` keyword, to the matching token class.
  module Tokens
    def any_of(possibilities, pos: nil)
      Attentive::Tokens::AnyOf.new(possibilities, pos)
    end

    def emoji(string, pos: nil)
      Attentive::Tokens::Emoji.new(string, pos)
    end

    # Looks the entity class up by name and instantiates it with the variable
    # name, which defaults to the entity name itself.
    # NOTE(review): `pos` is accepted but not passed on to the entity —
    # confirm whether entities are meant to carry a position.
    def entity(entity_name, variable_name=entity_name, pos: nil)
      entity_class = Attentive::Entity[entity_name.to_sym]
      entity_class.new(variable_name)
    end

    def me(pos: nil)
      Attentive::Tokens::Me.new(pos)
    end

    def punctuation(string, pos: nil)
      Attentive::Tokens::Punctuation.new(string, pos)
    end

    def regexp(string, pos: nil)
      Attentive::Tokens::Regexp.new(string, pos)
    end

    def whitespace(string, pos: nil)
      Attentive::Tokens::Whitespace.new(string, pos)
    end

    def word(string, pos: nil)
      Attentive::Tokens::Word.new(string, pos)
    end
  end
end
|
38
|
+
|
39
|
+
require "attentive/tokens/any_of"
|
40
|
+
require "attentive/tokens/emoji"
|
41
|
+
require "attentive/tokens/me"
|
42
|
+
require "attentive/tokens/punctuation"
|
43
|
+
require "attentive/tokens/regexp"
|
44
|
+
require "attentive/tokens/whitespace"
|
45
|
+
require "attentive/tokens/word"
|
data/lib/attentive.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
require "attentive/version"
|
2
|
+
require "attentive/listener_collection"
|
3
|
+
require "attentive/message"
|
4
|
+
|
5
|
+
module Attentive
  # The listener collection owned by the object this module is mixed into;
  # created on first access and reused afterwards.
  def listeners
    @listeners ||= Attentive::ListenerCollection.new
  end

  # Registers a listener; all arguments and the block are handed to the
  # listener collection unchanged.
  def listen_for(*args, &block)
    listeners.listen_for(*args, &block)
  end

  # Passes a message to the listener collection. Anything that is not already
  # an Attentive::Message is wrapped in one, built with the given params.
  def hear(message, params={})
    wrapped =
      if message.is_a?(Attentive::Message)
        message
      else
        Attentive::Message.new(message, params)
      end

    listeners.hear(wrapped)
  end
end
|