moo_ebooks 1.0.0
- checksums.yaml +7 -0
- data/Gemfile +6 -0
- data/LICENSE +22 -0
- data/README.md +71 -0
- data/data/stopwords.txt +843 -0
- data/lib/moo_ebooks.rb +14 -0
- data/lib/moo_ebooks/model.rb +270 -0
- data/lib/moo_ebooks/nlp.rb +170 -0
- data/lib/moo_ebooks/suffix.rb +118 -0
- data/lib/moo_ebooks/version.rb +9 -0
- data/spec/data/0xabad1dea.json +1 -0
- data/spec/model_spec.rb +107 -0
- data/spec/spec_helper.rb +108 -0
- metadata +155 -0
data/lib/moo_ebooks.rb
ADDED
@@ -0,0 +1,14 @@
# frozen_string_literal: true

module Ebooks
  GEM_PATH = File.expand_path(File.join(File.dirname(__FILE__), '..'))
  DATA_PATH = File.join(GEM_PATH, 'data')
  SKELETON_PATH = File.join(GEM_PATH, 'skeleton')
  TEST_PATH = File.join(GEM_PATH, 'test')
  TEST_CORPUS_PATH = File.join(TEST_PATH, 'corpus/0xabad1dea.tweets')
  INTERIM = :interim
end

require 'moo_ebooks/nlp'
require 'moo_ebooks/suffix'
require 'moo_ebooks/model'
data/lib/moo_ebooks/model.rb
ADDED
@@ -0,0 +1,270 @@
# frozen_string_literal: true

require 'json'
require 'set'
require 'digest/md5'

module Ebooks
  # Main class for Model management. Models are required for text generation.
  #
  # @note Only the JSON format is supported.
  # @note Corpus files are assumed to have a `statuses` key and a
  #   `mentions` key, which hold the respective statuses.
  #
  # @note Make sure NOT to include reblogs (retweets) in corpus data; they
  #   will negatively impact text generation.
  class Model
    # @return [Array<String>]
    #   An array of unique tokens. This is the main source of actual strings
    #   in the model. Manipulation of a token is done using its index
    #   in this array, which we call a "tiki"
    attr_accessor :tokens

    # @return [Array<Array<Integer>>]
    #   Sentences represented by arrays of tikis
    attr_accessor :sentences

    # @return [Array<Array<Integer>>]
    #   Sentences derived from Twitter mentions
    attr_accessor :mentions

    # @return [Array<String>]
    #   The top 200 most important keywords, in descending order
    attr_accessor :keywords

    def initialize
      @tokens = []
      @sentences = []
      @mentions = []
      @keywords = []

      # Reverse lookup tiki by token, for faster generation
      @tikis = {}
    end

    # Load a saved model
    # @param data [Hash]
    # @return [Ebooks::Model]
    def self.from_hash(data)
      model = Model.new
      model.tokens = data[:tokens]
      model.sentences = data[:sentences]
      model.mentions = data[:mentions]
      model.keywords = data[:keywords]
      model
    end

    # Load a saved model
    # @param data [String]
    # @return [Ebooks::Model]
    def self.from_json(data)
      from_hash(JSON.parse(data, symbolize_names: true))
    end

    # Turn this model into its JSON representation.
    def to_json
      to_hash.to_json
    end

    # Turn this model into its Hash representation
    def to_hash
      { tokens: @tokens, sentences: @sentences, mentions: @mentions,
        keywords: @keywords }
    end
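
Because from_json round-trips the Hash produced by to_hash, a model can be persisted and restored; a brief sketch (file name hypothetical):

model = Ebooks::Model.new
model.consume!(statuses: ['The cow says moo.'])
File.write('cow.model.json', model.to_json)   # serialize to disk

restored = Ebooks::Model.from_json(File.read('cow.model.json'))
restored.tokens == model.tokens               # => true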

    # Create a new model and consume a corpus into it
    # @param content [Hash]
    # @return [Ebooks::Model]
    def consume(content)
      model = Ebooks::Model.new
      model.consume!(content)
      model
    end

    # Consume a corpus into this model
    # @param content [Hash]
    def consume!(content)
      unless content.key?(:statuses) || content.key?(:mentions)
        raise ArgumentError, 'Malformed hash object. At least :statuses and/or'\
                             ' :mentions must be present as a key'
      end
      consume_statuses(content[:statuses]) unless content[:statuses].nil?
      consume_mentions(content[:mentions]) unless content[:mentions].nil?
      nil
    end
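
The corpus shape consume! expects, per the guard above (sample data hypothetical):

corpus = {
  statuses: ['Cows moo.', 'Mooing is how cows talk.'],
  mentions: ['@friend I agree, cows are great.']
}
model.consume!(corpus)   # tokenizes statuses and mentions separately
model.consume!(foo: [])  # raises ArgumentError: neither key is present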

    # Generate some text
    # @param limit [Integer] available characters
    # @param generator [SuffixGenerator, nil]
    # @param retry_limit [Integer] how many times to retry on invalid status
    # @return [String]
    def update(limit = 140, generator = nil, retry_limit = 10)
      tikis = gather_tikis(limit, generator, retry_limit)

      status = NLP.reconstruct(tikis, @tokens)

      fix status
    end

    # Generates a response by looking for related sentences
    # in the corpus and building a smaller generator from these
    # @param input [String]
    # @param limit [Integer] characters available for response
    # @param sentences [Array<Array<Integer>>]
    # @return [String]
    def reply(input, limit = 140, sentences = @mentions)
      # Prefer mentions
      relevant, slightly_relevant = find_relevant(sentences, input)

      if relevant.length >= 3
        generator = SuffixGenerator.build(relevant)
        update(limit, generator)
      elsif slightly_relevant.length >= 5
        generator = SuffixGenerator.build(slightly_relevant)
        update(limit, generator)
      else
        update(limit)
      end
    end
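
Since update accepts a prebuilt SuffixGenerator, generation can be restricted to any subset of tokenized sentences by hand, which is how reply biases its output; a sketch (subset choice hypothetical):

subset = model.sentences.first(500)               # any Array<Array<Integer>>
generator = Ebooks::SuffixGenerator.build(subset)
puts model.update(140, generator)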

    private

    def gather_tikis(limit, generator, retry_limit)
      responding = !generator.nil?
      generator ||= SuffixGenerator.build(@sentences)

      @retries = 0

      tikis = make_bigram_tikis(limit, generator, retry_limit, responding)

      if verbatim?(tikis) && tikis.length > 3
        # We made a verbatim status by accident
        tikis = make_unigram_tikis(limit, generator, retry_limit)
      end
      @retries = nil
      tikis
    end

    def make_unigram_tikis(limit, generator, retry_limit)
      while (tikis = generator.generate(3, :unigrams))
        break if valid_status?(tikis, limit) && !verbatim?(tikis)

        @retries += 1
        break if retry_limit_reached?(retry_limit)
      end
      tikis
    end

    def make_bigram_tikis(limit, generator, retry_limit, responding)
      while (tikis = generator.generate(3, :bigrams))
        break if (tikis.length > 3 || responding) && valid_status?(tikis, limit)

        @retries += 1
        break if retry_limit_reached?(retry_limit)
      end
      tikis
    end

    def retry_limit_reached?(retry_limit)
      @retries >= retry_limit
    end

    # Reverse lookup a token index from a token
    # @param token [String]
    # @return [Integer]
    def tikify(token)
      if @tikis.key?(token)
        @tikis[token]
      else
        @tokens << token
        @tikis[token] = @tokens.length - 1
      end
    end

    # Convert a body of text into arrays of tikis
    # @param text [String]
    # @return [Array<Array<Integer>>]
    def mass_tikify(text)
      sentences = NLP.sentences(text)

      sentences.map do |s|
        tokens = NLP.tokenize(s).reject do |t|
          # Don't include usernames/urls as tokens
          t.include?('@') || t.include?('http')
        end

        tokens.map { |t| tikify(t) }
      end
    end
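
To make the tiki encoding concrete, consuming two short statuses yields roughly this (indices illustrative):

model = Ebooks::Model.new
model.consume!(statuses: ['cows moo', 'cows sleep'])
model.tokens    # => e.g. ['cows', 'moo', 'sleep']
model.sentences # => e.g. [[0, 1], [0, 2]], each sentence as an array of tikis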

    # Test if a sentence has been copied verbatim from the original corpus
    # @param tikis [Array<Integer>]
    # @return [Boolean]
    def verbatim?(tikis)
      @sentences.include?(tikis) || @mentions.include?(tikis)
    end

    # Check if an array of tikis comprises a valid status
    # @param tikis [Array<Integer>]
    # @param limit [Integer] how many chars we have left
    # @return [Boolean]
    def valid_status?(tikis, limit)
      status = NLP.reconstruct(tikis, @tokens)
      status.length <= limit && !NLP.unmatched_enclosers?(status)
    end

    # Consume a sequence of statuses (excluding mentions)
    # @param statuses [Array<String>]
    def consume_statuses(statuses)
      statuses.map! do |status|
        NLP.normalize(status)
      end

      text = statuses.join("\n").encode('UTF-8', invalid: :replace)
      @sentences = mass_tikify(text)
      @keywords = NLP.keywords(text).top(200).map(&:to_s)

      nil
    end

    # Consume a sequence of mentions
    # @param mentions [Array<String>]
    def consume_mentions(mentions)
      mentions.map! do |mention|
        NLP.normalize(mention)
      end

      mention_text = mentions.join("\n").encode('UTF-8', invalid: :replace)
      @mentions = mass_tikify(mention_text)

      nil
    end

    # Correct encoding issues in generated text
    # @param text [String]
    # @return [String]
    def fix(text)
      NLP.htmlentities.decode text
    end

    # Find sentences that are relevant and slightly relevant to the input
    # by comparing non-stopword token overlaps
    # @param sentences [Array<Array<Integer>>]
    # @param input [String]
    # @return [Array(Array<Array<Integer>>, Array<Array<Integer>>)]
    def find_relevant(sentences, input)
      relevant = []
      slightly_relevant = []

      tokenized = NLP.tokenize(input).map(&:downcase)

      sentences.each do |sent|
        tokenized.each do |token|
          if sent.map { |tiki| @tokens[tiki].downcase }.include?(token)
            relevant << sent unless NLP.stopword?(token)
            slightly_relevant << sent
          end
        end
      end

      [relevant, slightly_relevant]
    end
  end
end
data/lib/moo_ebooks/nlp.rb
ADDED
@@ -0,0 +1,170 @@
# frozen_string_literal: true

require 'set'
require 'highscore'
require 'htmlentities'

module Ebooks
  # @private
  module NLP
    # We deliberately limit our punctuation handling to stuff we can do
    # consistently
    # It'll just be a part of another token if we don't split it out, and
    # that's fine
    PUNCTUATION = '.?!,'

    # Lazy-load NLP libraries and resources
    # Some of this stuff is pretty heavy and we don't necessarily need
    # to be using it all of the time

    # Lazily loads an array of stopwords
    # Stopwords are common words that should often be ignored
    # @return [Array<String>]
    def self.stopwords
      @stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
    end

    # Lazily load HTML entity decoder
    # @return [HTMLEntities]
    def self.htmlentities
      @htmlentities ||= HTMLEntities.new
    end

    ### Utility functions

    # Normalize some strange unicode punctuation variants
    # @param text [String]
    # @return [String]
    def self.normalize(text)
      htmlentities.decode(text.tr('“', '"').tr('”', '"').tr('’', "'")
                              .gsub('…', '...'))
    end

    # Split text into sentences
    # We use an ad hoc approach because fancy libraries do not deal
    # especially well with tweet formatting, and we can fake solving
    # the quote problem during generation
    # @param text [String]
    # @return [Array<String>]
    def self.sentences(text)
      text.split(/\n+|(?<=[.?!])\s+/)
    end

    # Split a sentence into word-level tokens
    # As above, this is ad hoc because tokenization libraries
    # do not behave well wrt. things like emoticons and timestamps
    # @param sentence [String]
    # @return [Array<String>]
    def self.tokenize(sentence)
      regex = /\s+|(?<=[#{PUNCTUATION}]\s)(?=[a-zA-Z])|
               (?<=[a-zA-Z])(?=[#{PUNCTUATION}]+\s)/x
      sentence.split(regex)
    end
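
Roughly how the two splitters behave on tweet-like input (outputs worked out from the regexes above):

Ebooks::NLP.sentences("Cows moo. Do they?\nYes!")
# => ["Cows moo.", "Do they?", "Yes!"]

Ebooks::NLP.tokenize('i like cows, really')
# => ["i", "like", "cows", ",", "really"]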

    # Use highscore gem to find interesting keywords in a corpus
    # @param text [String]
    # @return [Highscore::Keywords]
    def self.keywords(text)
      # Preprocess to remove stopwords (highscore's blacklist is v. slow)
      text = NLP.tokenize(text).reject { |t| stopword?(t) }.join(' ')

      text = Highscore::Content.new(text)

      text.configure do
        # set :multiplier, 2
        # set :upper_case, 3
        # set :long_words, 2
        # set :long_words_threshold, 15
        # set :vowels, 1 # => default: 0 = not considered
        # set :consonants, 5 # => default: 0 = not considered
        # set :ignore_case, true # => default: false
        set :word_pattern, /(?<!@)(?<=\s)[\p{Word}']+/ # => default: /\w+/
        # set :stemming, true # => default: false
      end

      text.keywords
    end
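
The returned Highscore::Keywords supports ranked access, which model.rb uses via top(200); for example (ranking illustrative, it depends on highscore's scoring):

kw = Ebooks::NLP.keywords('cows moo in the green pasture near other cows')
kw.top(2).map(&:to_s) # => e.g. ["cows", "pasture"]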

    # Builds a proper sentence from a list of tikis
    # @param tikis [Array<Integer>]
    # @param tokens [Array<String>]
    # @return [String]
    def self.reconstruct(tikis, tokens)
      text = ''
      last_token = nil
      tikis.each do |tiki|
        next if tiki == INTERIM
        token = tokens[tiki]
        text += ' ' if last_token && space_between?(last_token, token)
        text += token
        last_token = token
      end
      text
    end

    # Determine if we need to insert a space between two tokens
    # @param token1 [String]
    # @param token2 [String]
    # @return [Boolean]
    def self.space_between?(token1, token2)
      p1 = punctuation?(token1)
      p2 = punctuation?(token2)
      if (p1 && p2) || (!p1 && p2) # "foo?!" || "foo."
        false
      else # "foo rah" || "foo. rah"
        true
      end
    end
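
Spacing during reconstruction: space_between? never puts a space before a punctuation token, so tokenized sentences reassemble naturally:

tokens = ['Hello', ',', 'world', '!']
Ebooks::NLP.reconstruct([0, 1, 2, 3], tokens) # => "Hello, world!"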

    # Is this token comprised of punctuation?
    # @param token [String]
    # @return [Boolean]
    def self.punctuation?(token)
      (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
    end

    # Is this token a stopword?
    # @param token [String]
    # @return [Boolean]
    def self.stopword?(token)
      @stopword_set ||= stopwords.map(&:downcase).to_set
      @stopword_set.include?(token.downcase)
    end

    # Determine if a sample of text contains unmatched brackets or quotes
    # This is one of the more frequent and noticeable failure modes for
    # the generator; we can just tell it to retry
    # @param text [String]
    # @return [Boolean]
    def self.unmatched_enclosers?(text)
      enclosers = ['**', '""', '()', '[]', '``', "''"]
      enclosers.each do |pair|
        starter = Regexp.new('(\W|^)' + Regexp.escape(pair[0]) + '\S')
        ender = Regexp.new('\S' + Regexp.escape(pair[1]) + '(\W|$)')

        opened = 0

        tokenize(text).each do |token|
          opened += 1 if token.match(starter)
          opened -= 1 if token.match(ender)

          return true if opened.negative? # Too many ends!
        end

        return true if opened != 0 # Mismatch somewhere.
      end

      false
    end
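
This is what valid_status? in model.rb uses to reject malformed generations, for example:

Ebooks::NLP.unmatched_enclosers?('a (lonely paren') # => true
Ebooks::NLP.unmatched_enclosers?('all (good) here') # => false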

    # Determine if ary2 appears as a contiguous subsequence of ary1
    # @param ary1 [Array]
    # @param ary2 [Array]
    # @return [Boolean]
    def self.subseq?(ary1, ary2)
      !ary1.each_index.find do |i|
        ary1[i...i + ary2.length] == ary2
      end.nil?
    end
  end
end
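
A quick illustration of subseq?, which matches contiguous runs only:

Ebooks::NLP.subseq?([5, 1, 2, 9], [1, 2]) # => true
Ebooks::NLP.subseq?([5, 1, 2, 9], [1, 9]) # => false, not contiguous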