attentive 0.1.0.beta1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +41 -0
- data/Rakefile +10 -0
- data/attentive.gemspec +31 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/data/contractions.tsv +134 -0
- data/data/slang.tsv +19 -0
- data/lib/attentive/composite_entity.rb +33 -0
- data/lib/attentive/cursor.rb +28 -0
- data/lib/attentive/entities/integer.rb +5 -0
- data/lib/attentive/entities/relative_date.rb +44 -0
- data/lib/attentive/entity.rb +65 -0
- data/lib/attentive/errors.rb +7 -0
- data/lib/attentive/listener.rb +55 -0
- data/lib/attentive/listener_collection.rb +45 -0
- data/lib/attentive/match.rb +22 -0
- data/lib/attentive/matcher.rb +82 -0
- data/lib/attentive/message.rb +24 -0
- data/lib/attentive/phrase.rb +19 -0
- data/lib/attentive/text.rb +57 -0
- data/lib/attentive/token.rb +58 -0
- data/lib/attentive/tokenizer.rb +161 -0
- data/lib/attentive/tokens/any_of.rb +23 -0
- data/lib/attentive/tokens/emoji.rb +17 -0
- data/lib/attentive/tokens/me.rb +17 -0
- data/lib/attentive/tokens/punctuation.rb +13 -0
- data/lib/attentive/tokens/regexp.rb +27 -0
- data/lib/attentive/tokens/whitespace.rb +22 -0
- data/lib/attentive/tokens/word.rb +8 -0
- data/lib/attentive/tokens.rb +45 -0
- data/lib/attentive/version.rb +3 -0
- data/lib/attentive.rb +20 -0
- metadata +206 -0
@@ -0,0 +1,82 @@
|
|
1
|
+
require "attentive/match"
|
2
|
+
|
3
|
+
module Attentive
  # Lines up the tokens of a phrase against the tokens a cursor yields from
  # a message, one message token at a time.
  #
  # The matcher tracks its own position in the phrase (`pos`) and a state:
  #   :matching - nothing has been matched yet
  #   :found    - at least one phrase token has been matched
  #   :mismatch - a message token could neither be matched nor skipped
  class Matcher
    attr_reader :phrase, :cursor, :pos

    # phrase - the tokens to look for (indexed with [] and measured with length)
    # cursor - yields message tokens via peek/pop
    # params - :pos is the index in the phrase to start matching from (default 0)
    def initialize(phrase, cursor, params={})
      @phrase = phrase
      @cursor = cursor
      @pos = params.fetch(:pos, 0)
      skip_phrase_whitespace!
      @match_data = {}
      @state = :matching
    end

    def matching?
      @state == :matching
    end

    def mismatch?
      @state == :mismatch
    end

    # Consumes tokens from the cursor until the whole phrase has been
    # matched, a mismatch is found, or the cursor runs out of tokens.
    #
    # Returns an Attentive::Match when the end of the phrase is reached,
    # otherwise nil (check `mismatch?` to tell a mismatch from running out
    # of message tokens).
    def match!
      while token = cursor.peek
        if token.ambiguous?
          unless match_subphrase!(token.possibilities)
            @state = :mismatch
            break
          end
          skip_phrase_whitespace!

          # One of the possibilities may have consumed the rest of the
          # phrase. Report the match here: otherwise a phrase that ends on
          # an ambiguous token is never reported, and the next message token
          # would be compared against phrase[pos] == nil.
          if pos == phrase.length
            @state = :found
            return Attentive::Match.new(phrase, match_data: @match_data)
          end

        elsif phrase[pos] && (match_data = phrase[pos].matches?(cursor))
          if match_data.is_a?(MatchData)
            new_character_index = cursor.offset + match_data.to_s.length
            @match_data.merge! Hash[match_data.names.zip(match_data.captures)]

            # Advance the cursor to the first token after the regexp match
            cursor_pos = cursor.tokens.index { |candidate| candidate.pos >= new_character_index }
            cursor_pos = cursor.tokens.length unless cursor_pos
            cursor.instance_variable_set :@pos, cursor_pos
            @pos += 1
          else
            # `true` means "matched, nothing captured"; anything else is
            # merged into the captured data
            @match_data.merge!(match_data) unless match_data == true
            @pos += 1
          end
          skip_phrase_whitespace!
          @state = :found
          # puts "matched #{phrase.inspect}"
          return Attentive::Match.new(phrase, match_data: @match_data) if pos == phrase.length

        elsif !token.skippable?
          @state = :mismatch
          break
        end

        # Move on to the next message token that isn't whitespace
        cursor.pop
        break unless cursor.peek
        while cursor.peek.whitespace?
          cursor.pop
          break unless cursor.peek
        end
      end

      nil
    end

    # Tries each possible expansion of an ambiguous token against the phrase,
    # starting from the current phrase position. On the first expansion that
    # does not mismatch, adopts the position it reached and returns true.
    # Returns false when every expansion mismatches.
    def match_subphrase!(subphrases)
      subphrases.each do |subphrase|
        matcher = Matcher.new(phrase, Cursor.new(subphrase), pos: pos)
        matcher.match!
        unless matcher.mismatch?
          @pos = matcher.pos
          return true
        end
      end

      false
    end

    private

    # Moves the phrase position past any whitespace tokens
    def skip_phrase_whitespace!
      @pos += 1 while phrase[pos] && phrase[pos].whitespace?
    end

  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require "set"
|
2
|
+
require "attentive/tokenizer"
|
3
|
+
|
4
|
+
module Attentive
  # A piece of text to be matched, together with the set of contexts it was
  # heard in.
  class Message
    attr_reader :contexts, :text

    # text   - the raw text of the message
    # params - :contexts is an enumerable of contexts (default: none);
    #          it is stored as a Set, so duplicates collapse
    #
    # `params` may be omitted, like the options hashes accepted elsewhere
    # in the library.
    def initialize(text, params={})
      @text = text
      @contexts = Set.new(params.fetch(:contexts, []))
    end

    # The tokenized form of the text; computed on first use and memoized
    def tokens
      @tokens ||= Attentive::Tokenizer.tokenize(text)
    end

    alias :to_s :text

    # Shows the message as its tokens rather than as its raw text
    def inspect
      tokens.inspect
    end

  end
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
module Attentive
  # Text normalization helpers plus the contraction and slang lookup tables,
  # which are read from tab-separated data files when this module is loaded.
  module Text
    extend self

    # Downcases the text and replaces curly quotes with straight ones
    def normalize(text)
      straighten_quotes downcase text
    end

    def downcase(text)
      text.downcase
    end

    # Replaces curly double quotes with " and curly single quotes with '
    def straighten_quotes(text)
      text.gsub(/[“”]/, "\"").gsub(/[‘’]/, "'")
    end

    # Yields [line, fields] for every line of the named data file that is
    # neither a comment (starts with "#") nor blank. `fields` are the
    # tab-separated values of the line with the line ending removed.
    def each_data_row(filename)
      File.foreach(File.join(DATA_PATH, filename)) do |line|
        next if line.start_with?("#") # skip comments
        next if line == "\n" # skip blank lines

        yield line, line.chomp.split("\t")
      end
    end
    private :each_data_row



    DATA_PATH = File.expand_path(File.dirname(__FILE__) + "/../../data").freeze

    # Maps a contraction to the list of phrases it may stand for
    CONTRACTIONS = {}.tap do |contractions|
      each_data_row("contractions.tsv") do |line, phrases|
        # the file contains tab-separated values.
        # the first value is the contraction.
        # the remaining values are possible phrases that match it
        raise "#{line.inspect} must have at least two values" unless phrases.length >= 2

        contractions[phrases.shift] = phrases
      end
    end.freeze

    # Maps a slang word to the normal word it stands for
    SLANG = {}.tap do |slang|
      each_data_row("slang.tsv") do |line, words|
        # the file contains tab-separated values.
        # every line should have exactly two values:
        #  + the first is the slang word
        #  + the second is the normal word
        raise "#{line.inspect} must have exactly two values" unless words.length == 2

        slang[words[0]] = words[1]
      end
    end.freeze

  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
module Attentive
  # Base class for everything a message or phrase is broken into.
  # A token only knows the position it was created with; subclasses
  # override the predicates below to opt in to special treatment.
  class Token
    attr_reader :pos

    def initialize(pos)
      @pos = pos
    end

    # Two plain tokens are equal when they are of exactly the same class;
    # position is not taken into account
    def ==(other)
      other.class.equal?(self.class)
    end

    # Predicates: a plain token is none of these

    def ambiguous?
      false
    end

    def entity?
      false
    end

    def whitespace?
      false
    end

    def skippable?
      false
    end

    # True when the token the cursor currently points at equals this one
    def matches?(cursor)
      candidate = cursor.peek
      self == candidate
    end

  end



  # A token that carries the string it was made from
  class StringToken < Token
    attr_reader :string

    def initialize(string, pos)
      @string = string
      super(pos)
    end

    def to_s
      string
    end

    # Lets the token be used where an implicit String conversion is needed
    def to_str
      to_s
    end

    # Equal when of exactly the same class and made from equal strings
    def ==(other)
      return false unless other.class.equal?(self.class)

      string == other.string
    end

  end
end
|
@@ -0,0 +1,161 @@
|
|
1
|
+
require "attentive/text"
|
2
|
+
require "attentive/tokens"
|
3
|
+
require "attentive/phrase"
|
4
|
+
require "attentive/errors"
|
5
|
+
|
6
|
+
module Attentive
  # Breaks text into tokens: words, whitespace, punctuation, emoji, "@me"
  # and, when asked for, entities ({{...}}) and regular expressions ((?...)).
  class Tokenizer
    extend Attentive::Tokens

    # Splits apart words and punctuation,
    # treats apostrophes and dashes as a word-characters,
    # trims each fragment of whitespace
    # SPLITTER = /\s*([\w'-]+)\s*/.freeze
    SPLITTER = /(\n|{{|}}|\s+|\.{2,}|[^\s\w'@-])/.freeze
    PUNCTUATION = /^\W+$/.freeze
    WHITESPACE = /^\s+$/.freeze
    ME = "@me".freeze
    ENTITY_START = "{{".freeze
    ENTITY_END = "}}".freeze
    REGEXP_START = "(".freeze
    REGEXP_END = ")".freeze
    REGEXP_ESCAPE = "\\".freeze


    # Normalizes the message and cuts it into non-empty fragments;
    # the separators themselves are kept as fragments
    def self.split(message)
      Attentive::Text.normalize(message).split(SPLITTER).reject(&:empty?)
    end


    # Turns a message into an Attentive::Phrase of tokens.
    #
    # options:
    #   :entities  - recognize {{name:type}} entities (default false)
    #   :regexps   - recognize (?...) regular expressions (default false)
    #   :ambiguous - when false, raise Attentive::AmbiguousPhraseError if
    #                any token is ambiguous (default true)
    #
    # `pos` is the character offset of each token within the normalized
    # message. Tokens that come from expanding slang or a contraction are
    # produced by a nested call, so their offsets are relative to the
    # replacement text.
    def self.tokenize(message, options={})
      match_entities = options.fetch(:entities, false)
      match_regexps = options.fetch(:regexps, false)
      fail_if_ambiguous = !options.fetch(:ambiguous, true)
      strings = split(message)
      tokens = []
      i = 0
      pos = 0
      while i < strings.length
        string = strings[i]
        case string
        when ""
          # do nothing

        when WHITESPACE
          tokens << whitespace(string, pos: pos)

        when ":"
          # ":name:" is an emoji
          if strings[i + 2] == ":"
            tokens << emoji(strings[i + 1], pos: pos)
            # account for the name and the closing colon here;
            # the opening colon is counted at the bottom of the loop
            pos += strings[i + 1].length + 1
            i += 2
          else
            tokens << punctuation(":", pos: pos)
          end

        when ENTITY_START
          if match_entities
            j = i + 1
            found_entity = false
            while j < strings.length
              if strings[j] == ENTITY_END
                parts = strings[(i + 1)...j] # e.g. ["variable-name", ":" "entity-type"]
                tokens << entity(*parts.join.split(":").reverse, pos: pos)
                i = j + 1
                # the contents plus the two braces on either side
                pos += parts.join.length + 4
                found_entity = true
                break
              end
              j += 1
            end
            next if found_entity
          end
          # no closing braces (or entities are off): plain punctuation
          tokens << punctuation(ENTITY_START, pos: pos)

        when REGEXP_START
          if match_regexps && strings[i + 1] == "?"
            j = i + 2
            found_regexp = false
            parens = 1
            inside_square_bracket = false
            while j < strings.length
              if strings[j] == "[" && strings[j - 1] != REGEXP_ESCAPE
                inside_square_bracket = true
              elsif strings[j] == "]" && strings[j - 1] != REGEXP_ESCAPE
                inside_square_bracket = false
              end

              # parentheses inside a character class don't group anything
              unless inside_square_bracket
                if strings[j] == REGEXP_START && strings[j - 1] != REGEXP_ESCAPE
                  parens += 1
                elsif strings[j] == REGEXP_END && strings[j - 1] != REGEXP_ESCAPE
                  parens -= 1
                end

                if parens == 0
                  source = strings[i..j].join
                  tokens << regexp(source, pos: pos)
                  # strings[i..j] already contains both the opening and the
                  # closing parenthesis, so its length is the whole extent
                  pos += source.length
                  i = j + 1
                  found_regexp = true
                  break
                end
              end
              j += 1
            end
            next if found_regexp
          end
          # unbalanced (or regexps are off): plain punctuation
          tokens << punctuation(REGEXP_START, pos: pos)

        when PUNCTUATION
          tokens << punctuation(string, pos: pos)

        when ME
          tokens << me(pos: pos)

        else
          if replace_with = Attentive::Text::SLANG[string]
            tokens.concat tokenize(replace_with, options)

          elsif expands_to = Attentive::Text::CONTRACTIONS[string]
            possibilities = expands_to.map do |possibility|
              tokenize(possibility, options)
            end

            if possibilities.length == 1
              tokens.concat possibilities[0]
            else
              tokens << any_of(possibilities, pos: pos)
            end
          else
            tokens << word(string, pos: pos)
          end
        end

        i += 1
        pos += string.length
      end

      fail_if_ambiguous!(message, tokens) if fail_if_ambiguous

      Attentive::Phrase.new(tokens)
    end

    # Raises Attentive::AmbiguousPhraseError, listing the alternatives,
    # if any of the tokens is ambiguous; otherwise returns nil
    def self.fail_if_ambiguous!(phrase, tokens)
      ambiguous_token = tokens.find(&:ambiguous?)
      return unless ambiguous_token

      raise Attentive::AmbiguousPhraseError.new(
        "The phrase #{phrase.inspect} is ambiguous. " <<
        "Please use #{ambiguous_token.possibilities.map(&:inspect).join(" or ")}")
    end

  end
end
|
154
|
+
|
155
|
+
# Not the perfect place for these...
|
156
|
+
# Attentive::Tokenizer needs to be defined first...
|
157
|
+
require "attentive/entity"
|
158
|
+
require "attentive/composite_entity"
|
159
|
+
|
160
|
+
require "attentive/entities/integer"
|
161
|
+
require "attentive/entities/relative_date"
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require "attentive/token"
|
2
|
+
|
3
|
+
module Attentive
  module Tokens
    # A token that stands for several alternative readings of the same
    # piece of text; `possibilities` holds one entry per reading.
    class AnyOf < Token
      attr_reader :possibilities

      def initialize(possibilities, pos)
        @possibilities = possibilities
        super(pos)
      end

      # Equal when of exactly the same class and offering equal possibilities
      def ==(other)
        return false unless self.class == other.class

        possibilities == other.possibilities
      end

      # This is the one kind of token that is always ambiguous
      def ambiguous?
        true
      end

    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require "attentive/token"
|
2
|
+
|
3
|
+
module Attentive
  module Tokens
    # A token that matches message text with a regular expression.
    class Regexp < Token
      attr_reader :regexp

      # string - the source of the regular expression
      # pos    - the position of the token
      #
      # The expression is anchored with \A so that it can only match at the
      # very start of the text it is given. (^ would also match at the start
      # of any later line, which would match text the cursor has not
      # reached yet.)
      def initialize(string, pos)
        @regexp = ::Regexp.compile("\\A#{string}")
        super pos
      end

      # Equal when of exactly the same class and holding equal expressions
      def ==(other)
        self.class == other.class && self.regexp == other.regexp
      end

      # Returns the MatchData for a match against the cursor's text,
      # or nil when the text does not start with a match
      def matches?(cursor)
        regexp.match(cursor.to_s)
      end

      # The source of the anchored expression, without the enclosing slashes
      def to_s
        regexp.inspect[1...-1]
      end

    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require "attentive/token"
|
2
|
+
|
3
|
+
module Attentive
  module Tokens
    # A run of whitespace. It may be skipped over while matching, and any
    # whitespace token equals any other regardless of its exact string.
    class Whitespace < StringToken

      # All whitespace is equal
      def ==(other)
        other.class == self.class
      end

      def whitespace?
        true
      end

      def skippable?
        true
      end

    end
  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module Attentive
  # Factory methods, one per kind of token. Each takes the token's
  # position as the optional `pos:` keyword.
  module Tokens

    # A token with several alternative readings
    def any_of(possibilities, pos: nil)
      Attentive::Tokens::AnyOf.new(possibilities, pos)
    end

    def emoji(string, pos: nil)
      Attentive::Tokens::Emoji.new(string, pos)
    end

    # Looks up the entity class registered under `entity_name` and
    # instantiates it for `variable_name` (which defaults to the entity
    # name). `pos` is accepted but not passed on.
    def entity(entity_name, variable_name=entity_name, pos: nil)
      entity_class = Attentive::Entity[entity_name.to_sym]
      entity_class.new(variable_name)
    end

    def me(pos: nil)
      Attentive::Tokens::Me.new(pos)
    end

    def punctuation(string, pos: nil)
      Attentive::Tokens::Punctuation.new(string, pos)
    end

    def regexp(string, pos: nil)
      Attentive::Tokens::Regexp.new(string, pos)
    end

    def whitespace(string, pos: nil)
      Attentive::Tokens::Whitespace.new(string, pos)
    end

    def word(string, pos: nil)
      Attentive::Tokens::Word.new(string, pos)
    end

  end
end
|
38
|
+
|
39
|
+
require "attentive/tokens/any_of"
|
40
|
+
require "attentive/tokens/emoji"
|
41
|
+
require "attentive/tokens/me"
|
42
|
+
require "attentive/tokens/punctuation"
|
43
|
+
require "attentive/tokens/regexp"
|
44
|
+
require "attentive/tokens/whitespace"
|
45
|
+
require "attentive/tokens/word"
|
data/lib/attentive.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
require "attentive/version"
|
2
|
+
require "attentive/listener_collection"
|
3
|
+
require "attentive/message"
|
4
|
+
|
5
|
+
module Attentive

  # The collection of listeners belonging to the object these methods are
  # mixed into; created on first use and reused afterwards
  def listeners
    return @listeners if @listeners

    @listeners = Attentive::ListenerCollection.new
  end

  # Registers a listener; all arguments and the block are handed to the
  # listener collection unchanged
  def listen_for(*args, &block)
    listeners.listen_for(*args, &block)
  end

  # Passes a message to the listeners. Anything that is not already an
  # Attentive::Message is wrapped in one first, using `params`.
  def hear(message, params={})
    heard = message.is_a?(Attentive::Message) ? message : Attentive::Message.new(message, params)
    listeners.hear(heard)
  end

end
|