moo_ebooks 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +6 -0
- data/LICENSE +22 -0
- data/README.md +71 -0
- data/data/stopwords.txt +843 -0
- data/lib/moo_ebooks.rb +14 -0
- data/lib/moo_ebooks/model.rb +270 -0
- data/lib/moo_ebooks/nlp.rb +170 -0
- data/lib/moo_ebooks/suffix.rb +118 -0
- data/lib/moo_ebooks/version.rb +9 -0
- data/spec/data/0xabad1dea.json +1 -0
- data/spec/model_spec.rb +107 -0
- data/spec/spec_helper.rb +108 -0
- metadata +155 -0
data/lib/moo_ebooks.rb
ADDED
@@ -0,0 +1,14 @@
# frozen_string_literal: true

module Ebooks
  GEM_PATH = File.expand_path(File.join(File.dirname(__FILE__), '..'))
  DATA_PATH = File.join(GEM_PATH, 'data')
  SKELETON_PATH = File.join(GEM_PATH, 'skeleton')
  TEST_PATH = File.join(GEM_PATH, 'test')
  TEST_CORPUS_PATH = File.join(TEST_PATH, 'corpus/0xabad1dea.tweets')
  INTERIM = :interim
end

require 'moo_ebooks/nlp'
require 'moo_ebooks/suffix'
require 'moo_ebooks/model'
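The entry file above only defines the Ebooks namespace, a few path constants pointing at the gem's bundled data, and requires the three implementation files. A quick sanity-check sketch (illustrative only, not part of the package); it should print true if the data files listed above were installed with the gem:

require 'moo_ebooks'

puts Ebooks::GEM_PATH    # gem root, one directory above lib/
puts Ebooks::DATA_PATH   # directory NLP.stopwords reads stopwords.txt from
puts File.exist?(File.join(Ebooks::DATA_PATH, 'stopwords.txt'))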
data/lib/moo_ebooks/model.rb
ADDED
@@ -0,0 +1,270 @@
# frozen_string_literal: true

require 'json'
require 'set'
require 'digest/md5'

module Ebooks
  # Main class for Model management. Models are required for text generation.
  #
  # @notice Only JSON format is supported.
  # @notice For corpus files. These are assumed to have a `statuses` key and a
  #   `mentions` key, which hold the different statuses in them.
  #
  # @notice Make sure NOT to include reblogs (retweets) into corpus data. Those
  #   will negatively impact text creation
  class Model
    # @return [Array<String>]
    #   An array of unique tokens. This is the main source of actual strings
    #   in the model. Manipulation of a token is done using its index
    #   in this array, which we call a "tiki"
    attr_accessor :tokens

    # @return [Array<Array<Integer>>]
    #   Sentences represented by arrays of tikis
    attr_accessor :sentences

    # @return [Array<Array<Integer>>]
    #   Sentences derived from Twitter mentions
    attr_accessor :mentions

    # @return [Array<String>]
    #   The top 200 most important keywords, in descending order
    attr_accessor :keywords

    def initialize
      @tokens = []
      @sentences = []
      @mentions = []
      @keywords = []

      # Reverse lookup tiki by token, for faster generation
      @tikis = {}
    end

    # Load a saved model
    # @param data [Hash]
    # @return [Ebooks::Model]
    def self.from_hash(data)
      model = Model.new
      model.tokens = data[:tokens]
      model.sentences = data[:sentences]
      model.mentions = data[:mentions]
      model.keywords = data[:keywords]
      model
    end

    # Load a saved model
    # @param data [String]
    # @return [Ebooks::Model]
    def self.from_json(data)
      from_hash(JSON.parse(data, symbolize_names: true))
    end

    # Turn this model into its JSON representation.
    def to_json
      to_hash.to_json
    end

    # Turn this model into its Hash representation
    def to_hash
      { tokens: @tokens, sentences: @sentences, mentions: @mentions,
        keywords: @keywords }
    end

    # Consume a corpus into this model
    # @param content [Hash]
    def consume(content)
      model = Ebooks::Model.new
      model.consume!(content)
      model
    end

    # Consume a corpus into this model
    # @param content [Hash]
    def consume!(content)
      unless content.key?(:statuses) || content.key?(:mentions)
        raise ArgumentError, 'Malformed hash object. At least :statuses and/or'\
                             ' :mentions must be present as a key'
      end
      consume_statuses(content[:statuses]) unless content[:statuses].nil?
      consume_mentions(content[:mentions]) unless content[:mentions].nil?
      nil
    end

    # Generate some text
    # @param limit [Integer] available characters
    # @param generator [SuffixGenerator, nil]
    # @param retry_limit [Integer] how many times to retry on invalid status
    # @return [String]
    def update(limit = 140, generator = nil, retry_limit = 10)
      tikis = gather_tikis(limit, generator, retry_limit)

      status = NLP.reconstruct(tikis, @tokens)

      fix status
    end

    # Generates a response by looking for related sentences
    # in the corpus and building a smaller generator from these
    # @param input [String]
    # @param limit [Integer] characters available for response
    # @param sentences [Array<Array<Integer>>]
    # @return [String]
    def reply(input, limit = 140, sentences = @mentions)
      # Prefer mentions
      relevant, slightly_relevant = find_relevant(sentences, input)

      if relevant.length >= 3
        generator = SuffixGenerator.build(relevant)
        update(limit, generator)
      elsif slightly_relevant.length >= 5
        generator = SuffixGenerator.build(slightly_relevant)
        update(limit, generator)
      else
        update(limit)
      end
    end

    private

    def gather_tikis(limit, generator, retry_limit)
      responding = !generator.nil?
      generator ||= SuffixGenerator.build(@sentences)

      @retries = 0

      tikis = make_bigram_tikis(limit, generator, retry_limit, responding)

      if verbatim?(tikis) && tikis.length > 3
        # We made a verbatim status by accident
        tikis = make_unigram_tikis(limit, generator, retry_limit)
      end
      @retries = nil
      tikis
    end

    def make_unigram_tikis(limit, generator, retry_limit)
      while (tikis = generator.generate(3, :unigrams))
        break if valid_status?(tikis, limit) && !verbatim?(tikis)

        @retries += 1
        break if retry_limit_reached?(retry_limit)
      end
      tikis
    end

    def make_bigram_tikis(limit, generator, retry_limit, responding)
      while (tikis = generator.generate(3, :bigrams))
        break if (tikis.length > 3 || responding) && valid_status?(tikis, limit)

        @retries += 1
        break if retry_limit_reached?(retry_limit)
      end
      tikis
    end

    def retry_limit_reached?(retry_limit)
      @retries >= retry_limit
    end

    # Reverse lookup a token index from a token
    # @param token [String]
    # @return [Integer]
    def tikify(token)
      if @tikis.key?(token)
        @tikis[token]
      else
        @tokens << token
        @tikis[token] = @tokens.length - 1
      end
    end

    # Convert a body of text into arrays of tikis
    # @param text [String]
    # @return [Array<Array<Integer>>]
    def mass_tikify(text)
      sentences = NLP.sentences(text)

      sentences.map do |s|
        tokens = NLP.tokenize(s).reject do |t|
          # Don't include usernames/urls as tokens
          t.include?('@') || t.include?('http')
        end

        tokens.map { |t| tikify(t) }
      end
    end

    # Test if a sentence has been copied verbatim from original
    # @param tikis [Array<Integer>]
    # @return [Boolean]
    def verbatim?(tikis)
      @sentences.include?(tikis) || @mentions.include?(tikis)
    end

    # Check if an array of tikis comprises a valid status
    # @param tikis [Array<Integer>]
    # @param limit [Integer] how many chars we have left
    def valid_status?(tikis, limit)
      status = NLP.reconstruct(tikis, @tokens)
      status.length <= limit && !NLP.unmatched_enclosers?(status)
    end

    # Consume a sequence of statuses (excluding mentions)
    # @param statuses [Array<String>]
    def consume_statuses(statuses)
      statuses.map! do |status|
        NLP.normalize(status)
      end

      text = statuses.join("\n").encode('UTF-8', invalid: :replace)
      @sentences = mass_tikify(text)
      @keywords = NLP.keywords(text).top(200).map(&:to_s)

      nil
    end

    # Consume a sequence of mentions
    # @param mentions [Array<String>]
    def consume_mentions(mentions)
      mentions.map! do |mention|
        NLP.normalize(mention)
      end

      mention_text = mentions.join("\n").encode('UTF-8', invalid: :replace)
      @mentions = mass_tikify(mention_text)

      nil
    end

    # Correct encoding issues in generated text
    # @param text [String]
    # @return [String]
    def fix(text)
      NLP.htmlentities.decode text
    end

    # Finds relevant and slightly relevant tokenized sentences to input
    # comparing non-stopword token overlaps
    # @param sentences [Array<Array<Integer>>]
    # @param input [String]
    # @return [Array<Array<Array<Integer>>, Array<Array<Integer>>>]
    def find_relevant(sentences, input)
      relevant = []
      slightly_relevant = []

      tokenized = NLP.tokenize(input).map(&:downcase)

      sentences.each do |sent|
        tokenized.each do |token|
          if sent.map { |tiki| @tokens[tiki].downcase }.include?(token)
            relevant << sent unless NLP.stopword?(token)
            slightly_relevant << sent
          end
        end
      end

      [relevant, slightly_relevant]
    end
  end
end
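The public surface of Model, as added above, is: consume!/consume to ingest a corpus hash keyed by :statuses and/or :mentions, update to generate a status, reply to respond to input, and to_hash/to_json plus from_hash/from_json for persistence. A minimal usage sketch; the corpus strings below are placeholders, and a real corpus should be far larger for sensible output:

require 'moo_ebooks'

# Build a model from a corpus hash with :statuses and/or :mentions keys.
model = Ebooks::Model.new
model.consume!(
  statuses: ['Moved the herd to the north pasture today.',
             'Fresh hay smells better than it has any right to.'],
  mentions: ['@farmhand the gate by the barn is open again']
)

puts model.update(140)                        # a generated status of at most 140 characters
puts model.reply('Is the gate closed?', 140)  # prefers sentences drawn from mentions

# Persist and reload the model as JSON.
json  = model.to_json
again = Ebooks::Model.from_json(json)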
data/lib/moo_ebooks/nlp.rb
ADDED
@@ -0,0 +1,170 @@
# frozen_string_literal: true

require 'highscore'
require 'htmlentities'

module Ebooks
  # @private
  module NLP
    # We deliberately limit our punctuation handling to stuff we can do
    # consistently
    # It'll just be a part of another token if we don't split it out, and
    # that's fine
    PUNCTUATION = '.?!,'

    # Lazy-load NLP libraries and resources
    # Some of this stuff is pretty heavy and we don't necessarily need
    # to be using it all of the time

    # Lazily loads an array of stopwords
    # Stopwords are common words that should often be ignored
    # @return [Array<String>]
    def self.stopwords
      @stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
    end

    # Lazily load HTML entity decoder
    # @return [HTMLEntities]
    def self.htmlentities
      @htmlentities ||= HTMLEntities.new
    end

    ### Utility functions

    # Normalize some strange unicode punctuation variants
    # @param text [String]
    # @return [String]
    def self.normalize(text)
      htmlentities.decode(text.tr('“', '"').tr('”', '"').tr('’', "'")
                              .gsub('…', '...'))
    end

    # Split text into sentences
    # We use ad hoc approach because fancy libraries do not deal
    # especially well with tweet formatting, and we can fake solving
    # the quote problem during generation
    # @param text [String]
    # @return [Array<String>]
    def self.sentences(text)
      text.split(/\n+|(?<=[.?!])\s+/)
    end

    # Split a sentence into word-level tokens
    # As above, this is ad hoc because tokenization libraries
    # do not behave well wrt. things like emoticons and timestamps
    # @param sentence [String]
    # @return [Array<String>]
    def self.tokenize(sentence)
      regex = /\s+|(?<=[#{PUNCTUATION}]\s)(?=[a-zA-Z])|
               (?<=[a-zA-Z])(?=[#{PUNCTUATION}]+\s)/x
      sentence.split(regex)
    end

    # Use highscore gem to find interesting keywords in a corpus
    # @param text [String]
    # @return [Highscore::Keywords]
    def self.keywords(text)
      # Preprocess to remove stopwords (highscore's blacklist is v. slow)
      text = NLP.tokenize(text).reject { |t| stopword?(t) }.join(' ')

      text = Highscore::Content.new(text)

      text.configure do
        # set :multiplier, 2
        # set :upper_case, 3
        # set :long_words, 2
        # set :long_words_threshold, 15
        # set :vowels, 1 # => default: 0 = not considered
        # set :consonants, 5 # => default: 0 = not considered
        # set :ignore_case, true # => default: false
        set :word_pattern, /(?<!@)(?<=\s)[\p{Word}']+/ # => default: /\w+/
        # set :stemming, true # => default: false
      end

      text.keywords
    end

    # Builds a proper sentence from a list of tikis
    # @param tikis [Array<Integer>]
    # @param tokens [Array<String>]
    # @return [String]
    def self.reconstruct(tikis, tokens)
      text = ''
      last_token = nil
      tikis.each do |tiki|
        next if tiki == INTERIM
        token = tokens[tiki]
        text += ' ' if last_token && space_between?(last_token, token)
        text += token
        last_token = token
      end
      text
    end

    # Determine if we need to insert a space between two tokens
    # @param token1 [String]
    # @param token2 [String]
    # @return [Boolean]
    def self.space_between?(token1, token2)
      p1 = punctuation?(token1)
      p2 = punctuation?(token2)
      if (p1 && p2) || (!p1 && p2) # "foo?!" || "foo."
        false
      else # "foo rah" || "foo. rah"
        true
      end
    end

    # Is this token comprised of punctuation?
    # @param token [String]
    # @return [Boolean]
    def self.punctuation?(token)
      (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
    end

    # Is this token a stopword?
    # @param token [String]
    # @return [Boolean]
    def self.stopword?(token)
      @stopword_set ||= stopwords.map(&:downcase).to_set
      @stopword_set.include?(token.downcase)
    end

    # Determine if a sample of text contains unmatched brackets or quotes
    # This is one of the more frequent and noticeable failure modes for
    # the generator; we can just tell it to retry
    # @param text [String]
    # @return [Boolean]
    def self.unmatched_enclosers?(text)
      enclosers = ['**', '""', '()', '[]', '``', "''"]
      enclosers.each do |pair|
        starter = Regexp.new('(\W|^)' + Regexp.escape(pair[0]) + '\S')
        ender = Regexp.new('\S' + Regexp.escape(pair[1]) + '(\W|$)')

        opened = 0

        tokenize(text).each do |token|
          opened += 1 if token.match(starter)
          opened -= 1 if token.match(ender)

          return true if opened.negative? # Too many ends!
        end

        return true if opened != 0 # Mismatch somewhere.
      end

      false
    end

    # Determine if ary2 is a subsequence of ary1
    # @param ary1 [Array]
    # @param ary2 [Array]
    # @return [Boolean]
    def self.subseq?(ary1, ary2)
      !ary1.each_index.find do |i|
        ary1[i...i + ary2.length] == ary2
      end.nil?
    end
  end
end
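NLP is marked @private, but its tokenize/reconstruct pair is the core of how statuses are rebuilt from tikis: text is split on whitespace and around trailing punctuation, and space_between? decides whether to re-insert a space when joining. An illustrative round trip (not part of the package), using an identity tiki mapping for demonstration:

require 'moo_ebooks'

# Tokenize a sentence; trailing punctuation followed by a space becomes its own token.
tokens = Ebooks::NLP.tokenize('Cows are calm today. Rain is coming.')
#=> ["Cows", "are", "calm", "today", ".", "Rain", "is", "coming."]

# Rebuild it; punctuation tokens are re-attached without a leading space.
tikis = (0...tokens.length).to_a
puts Ebooks::NLP.reconstruct(tikis, tokens)
#=> "Cows are calm today. Rain is coming."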