moo_ebooks 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
+ # frozen_string_literal: true
+
+ module Ebooks
+   GEM_PATH = File.expand_path(File.join(File.dirname(__FILE__), '..'))
+   DATA_PATH = File.join(GEM_PATH, 'data')
+   SKELETON_PATH = File.join(GEM_PATH, 'skeleton')
+   TEST_PATH = File.join(GEM_PATH, 'test')
+   TEST_CORPUS_PATH = File.join(TEST_PATH, 'corpus/0xabad1dea.tweets')
+   INTERIM = :interim
+ end
+
+ require 'moo_ebooks/nlp'
+ require 'moo_ebooks/suffix'
+ require 'moo_ebooks/model'
@@ -0,0 +1,270 @@
+ # frozen_string_literal: true
+
+ require 'json'
+ require 'set'
+ require 'digest/md5'
+
+ module Ebooks
+   # Main class for model management. Models are required for text generation.
+   #
+   # @note Only the JSON format is supported.
+   # @note Corpus files are assumed to have a `statuses` key and a
+   #   `mentions` key, which hold the respective statuses.
+   #
+   # @note Make sure NOT to include reblogs (retweets) in corpus data. They
+   #   will negatively impact text generation.
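+   #
+   # @example Building a model from a corpus hash (illustrative sketch)
+   #   # Corpus contents here are made up purely for demonstration.
+   #   corpus = { statuses: ['I like cheese.', 'Cows moo at dawn.'], mentions: [] }
+   #   model = Ebooks::Model.new
+   #   model.consume!(corpus)
+   #   model.update(140) # => a newly generated status string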
+   class Model
+     # @return [Array<String>]
+     #   An array of unique tokens. This is the main source of actual strings
+     #   in the model. Manipulation of a token is done using its index
+     #   in this array, which we call a "tiki".
+     attr_accessor :tokens
+
+     # @return [Array<Array<Integer>>]
+     #   Sentences represented by arrays of tikis
+     attr_accessor :sentences
+
+     # @return [Array<Array<Integer>>]
+     #   Sentences derived from Twitter mentions
+     attr_accessor :mentions
+
+     # @return [Array<String>]
+     #   The top 200 most important keywords, in descending order
+     attr_accessor :keywords
+     def initialize
+       @tokens = []
+       @sentences = []
+       @mentions = []
+       @keywords = []
+
+       # Reverse lookup tiki by token, for faster generation
+       @tikis = {}
+     end
+
+     # Load a saved model
+     # @param data [Hash]
+     # @return [Ebooks::Model]
+     def self.from_hash(data)
+       model = Model.new
+       model.tokens = data[:tokens]
+       model.sentences = data[:sentences]
+       model.mentions = data[:mentions]
+       model.keywords = data[:keywords]
+       model
+     end
+
+     # Load a saved model
+     # @param data [String]
+     # @return [Ebooks::Model]
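+     # @example Restoring a model from JSON (sketch; the file path is hypothetical)
+     #   model = Ebooks::Model.from_json(File.read('model.json'))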
+     def self.from_json(data)
+       from_hash(JSON.parse(data, symbolize_names: true))
+     end
+
+     # Turn this model into its JSON representation.
+     def to_json
+       to_hash.to_json
+     end
+
+     # Turn this model into its Hash representation
+     def to_hash
+       { tokens: @tokens, sentences: @sentences, mentions: @mentions,
+         keywords: @keywords }
+     end
+
+     # Consume a corpus and return a new model built from it
+     # @param content [Hash]
+     # @return [Ebooks::Model]
+     def consume(content)
+       model = Ebooks::Model.new
+       model.consume!(content)
+       model
+     end
+
+     # Consume a corpus into this model
+     # @param content [Hash]
+     def consume!(content)
+       unless content.key?(:statuses) || content.key?(:mentions)
+         raise ArgumentError, 'Malformed hash object. At least :statuses and/or'\
+                              ' :mentions must be present as a key'
+       end
+       consume_statuses(content[:statuses]) unless content[:statuses].nil?
+       consume_mentions(content[:mentions]) unless content[:mentions].nil?
+       nil
+     end
+
+     # Generate some text
+     # @param limit [Integer] available characters
+     # @param generator [SuffixGenerator, nil]
+     # @param retry_limit [Integer] how many times to retry on invalid status
+     # @return [String]
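+     # @example Generating a status (sketch; assumes `model` has already consumed a corpus)
+     #   model.update(140)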
+     def update(limit = 140, generator = nil, retry_limit = 10)
+       tikis = gather_tikis(limit, generator, retry_limit)
+
+       status = NLP.reconstruct(tikis, @tokens)
+
+       fix status
+     end
+
+     # Generates a response by looking for related sentences
+     # in the corpus and building a smaller generator from these
+     # @param input [String]
+     # @param limit [Integer] characters available for response
+     # @param sentences [Array<Array<Integer>>]
+     # @return [String]
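+     # @example Replying to an incoming status (sketch; the input text is made up)
+     #   model.reply('@cow what do you think about cheese?', 140)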
+     def reply(input, limit = 140, sentences = @mentions)
+       # Prefer mentions
+       relevant, slightly_relevant = find_relevant(sentences, input)
+
+       if relevant.length >= 3
+         generator = SuffixGenerator.build(relevant)
+         update(limit, generator)
+       elsif slightly_relevant.length >= 5
+         generator = SuffixGenerator.build(slightly_relevant)
+         update(limit, generator)
+       else
+         update(limit)
+       end
+     end
+
+     private
+
+     def gather_tikis(limit, generator, retry_limit)
+       responding = !generator.nil?
+       generator ||= SuffixGenerator.build(@sentences)
+
+       @retries = 0
+
+       tikis = make_bigram_tikis(limit, generator, retry_limit, responding)
+
+       if verbatim?(tikis) && tikis.length > 3
+         # We made a verbatim status by accident
+         tikis = make_unigram_tikis(limit, generator, retry_limit)
+       end
+       @retries = nil
+       tikis
+     end
+
+     def make_unigram_tikis(limit, generator, retry_limit)
+       while (tikis = generator.generate(3, :unigrams))
+         break if valid_status?(tikis, limit) && !verbatim?(tikis)
+
+         @retries += 1
+         break if retry_limit_reached?(retry_limit)
+       end
+       tikis
+     end
+
+     def make_bigram_tikis(limit, generator, retry_limit, responding)
+       while (tikis = generator.generate(3, :bigrams))
+         break if (tikis.length > 3 || responding) && valid_status?(tikis, limit)
+
+         @retries += 1
+         break if retry_limit_reached?(retry_limit)
+       end
+       tikis
+     end
+
+     def retry_limit_reached?(retry_limit)
+       @retries >= retry_limit
+     end
+
+     # Reverse lookup a token index from a token
+     # @param token [String]
+     # @return [Integer]
+     def tikify(token)
+       if @tikis.key?(token)
+         @tikis[token]
+       else
+         @tokens << token
+         @tikis[token] = @tokens.length - 1
+       end
+     end
+
+     # Convert a body of text into arrays of tikis
+     # @param text [String]
+     # @return [Array<Array<Integer>>]
+     def mass_tikify(text)
+       sentences = NLP.sentences(text)
+
+       sentences.map do |s|
+         tokens = NLP.tokenize(s).reject do |t|
+           # Don't include usernames/urls as tokens
+           t.include?('@') || t.include?('http')
+         end
+
+         tokens.map { |t| tikify(t) }
+       end
+     end
+
+     # Test if a sentence has been copied verbatim from the original corpus
+     # @param tikis [Array<Integer>]
+     # @return [Boolean]
+     def verbatim?(tikis)
+       @sentences.include?(tikis) || @mentions.include?(tikis)
+     end
+
+     # Check if an array of tikis comprises a valid status
+     # @param tikis [Array<Integer>]
+     # @param limit [Integer] how many characters we have left
+     # @return [Boolean]
+     def valid_status?(tikis, limit)
+       status = NLP.reconstruct(tikis, @tokens)
+       status.length <= limit && !NLP.unmatched_enclosers?(status)
+     end
+
+     # Consume a sequence of statuses (excluding mentions)
+     # @param statuses [Array<String>]
+     def consume_statuses(statuses)
+       statuses.map! do |status|
+         NLP.normalize(status)
+       end
+
+       text = statuses.join("\n").encode('UTF-8', invalid: :replace)
+       @sentences = mass_tikify(text)
+       @keywords = NLP.keywords(text).top(200).map(&:to_s)
+
+       nil
+     end
+
+     # Consume a sequence of mentions
+     # @param mentions [Array<String>]
+     def consume_mentions(mentions)
+       mentions.map! do |mention|
+         NLP.normalize(mention)
+       end
+
+       mention_text = mentions.join("\n").encode('UTF-8', invalid: :replace)
+       @mentions = mass_tikify(mention_text)
+
+       nil
+     end
+
+     # Clean up HTML entities in generated text
+     # @param text [String]
+     # @return [String]
+     def fix(text)
+       NLP.htmlentities.decode text
+     end
+
+     # Finds sentences relevant and slightly relevant to the input by
+     # comparing non-stopword token overlap
+     # @param sentences [Array<Array<Integer>>]
+     # @param input [String]
+     # @return [Array(Array<Array<Integer>>, Array<Array<Integer>>)]
+     def find_relevant(sentences, input)
+       relevant = []
+       slightly_relevant = []
+
+       tokenized = NLP.tokenize(input).map(&:downcase)
+
+       sentences.each do |sent|
+         tokenized.each do |token|
+           if sent.map { |tiki| @tokens[tiki].downcase }.include?(token)
+             relevant << sent unless NLP.stopword?(token)
+             slightly_relevant << sent
+           end
+         end
+       end
+
+       [relevant, slightly_relevant]
+     end
+   end
+ end
@@ -0,0 +1,170 @@
+ # frozen_string_literal: true
+
+ require 'highscore'
+ require 'htmlentities'
+
+ module Ebooks
+   # @private
+   module NLP
+     # We deliberately limit our punctuation handling to stuff we can do
+     # consistently.
+     # Anything else will just be part of another token if we don't split it
+     # out, and that's fine.
+     PUNCTUATION = '.?!,'
+
+     # Lazy-load NLP libraries and resources
+     # Some of this stuff is pretty heavy and we don't necessarily need
+     # to be using it all of the time
+
+     # Lazily loads an array of stopwords
+     # Stopwords are common words that should often be ignored
+     # @return [Array<String>]
+     def self.stopwords
+       @stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
+     end
+
+     # Lazily load the HTML entity decoder
+     # @return [HTMLEntities]
+     def self.htmlentities
+       @htmlentities ||= HTMLEntities.new
+     end
+
+     ### Utility functions
+
+     # Normalize some strange Unicode punctuation variants
+     # @param text [String]
+     # @return [String]
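+     # @example Smart quotes, ellipses and HTML entities are normalized (sketch)
+     #   NLP.normalize('“Tom &amp; Jerry…”') # => '"Tom & Jerry..."'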
+     def self.normalize(text)
+       htmlentities.decode(text.tr('“', '"').tr('”', '"').tr('’', "'")
+                               .gsub('…', '...'))
+     end
+
+     # Split text into sentences
+     # We use an ad hoc approach because fancy libraries do not deal
+     # especially well with tweet formatting, and we can fake solving
+     # the quote problem during generation
+     # @param text [String]
+     # @return [Array<String>]
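+     # @example Splitting on sentence-ending punctuation and newlines (sketch)
+     #   NLP.sentences("One. Two!\nThree") # => ["One.", "Two!", "Three"]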
+     def self.sentences(text)
+       text.split(/\n+|(?<=[.?!])\s+/)
+     end
+
+     # Split a sentence into word-level tokens
+     # As above, this is ad hoc because tokenization libraries
+     # do not behave well wrt. things like emoticons and timestamps
+     # @param sentence [String]
+     # @return [Array<String>]
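+     # @example Mid-sentence punctuation becomes its own token (sketch)
+     #   NLP.tokenize('i like cheese, and you') # => ["i", "like", "cheese", ",", "and", "you"]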
58
+ def self.tokenize(sentence)
59
+ regex = /\s+|(?<=[#{PUNCTUATION}]\s)(?=[a-zA-Z])|
60
+ (?<=[a-zA-Z])(?=[#{PUNCTUATION}]+\s)/x
61
+ sentence.split(regex)
62
+ end
63
+
64
+ # Use highscore gem to find interesting keywords in a corpus
65
+ # @param text [String]
66
+ # @return [Highscore::Keywords]
67
+ def self.keywords(text)
68
+ # Preprocess to remove stopwords (highscore's blacklist is v. slow)
69
+ text = NLP.tokenize(text).reject { |t| stopword?(t) }.join(' ')
70
+
71
+ text = Highscore::Content.new(text)
72
+
73
+ text.configure do
74
+ # set :multiplier, 2
75
+ # set :upper_case, 3
76
+ # set :long_words, 2
77
+ # set :long_words_threshold, 15
78
+ # set :vowels, 1 # => default: 0 = not considered
79
+ # set :consonants, 5 # => default: 0 = not considered
80
+ # set :ignore_case, true # => default: false
81
+ set :word_pattern, /(?<!@)(?<=\s)[\p{Word}']+/ # => default: /\w+/
82
+ # set :stemming, true # => default: false
83
+ end
84
+
85
+ text.keywords
86
+ end
87
+
88
+ # Builds a proper sentence from a list of tikis
89
+ # @param tikis [Array<Integer>]
90
+ # @param tokens [Array<String>]
91
+ # @return [String]
92
+ def self.reconstruct(tikis, tokens)
93
+ text = ''
94
+ last_token = nil
95
+ tikis.each do |tiki|
96
+ next if tiki == INTERIM
97
+ token = tokens[tiki]
98
+ text += ' ' if last_token && space_between?(last_token, token)
99
+ text += token
100
+ last_token = token
101
+ end
102
+ text
103
+ end
104
+
105
+ # Determine if we need to insert a space between two tokens
106
+ # @param token1 [String]
107
+ # @param token2 [String]
108
+ # @return [Boolean]
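+     # @example No space is inserted before punctuation (sketch)
+     #   NLP.space_between?('foo', '.')    # => false
+     #   NLP.space_between?('foo.', 'rah') # => true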
+     def self.space_between?(token1, token2)
+       p1 = punctuation?(token1)
+       p2 = punctuation?(token2)
+       if (p1 && p2) || (!p1 && p2) # "foo?!" || "foo."
+         false
+       else # "foo rah" || "foo. rah"
+         true
+       end
+     end
+
+     # Is this token comprised of punctuation?
+     # @param token [String]
+     # @return [Boolean]
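+     # @example (sketch)
+     #   NLP.punctuation?('?!')   # => true
+     #   NLP.punctuation?('foo.') # => false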
+     def self.punctuation?(token)
+       (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
+     end
+
+     # Is this token a stopword?
+     # @param token [String]
+     # @return [Boolean]
+     def self.stopword?(token)
+       @stopword_set ||= stopwords.map(&:downcase).to_set
+       @stopword_set.include?(token.downcase)
+     end
+
+     # Determine if a sample of text contains unmatched brackets or quotes
+     # This is one of the more frequent and noticeable failure modes for
+     # the generator; we can just tell it to retry
+     # @param text [String]
+     # @return [Boolean]
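+     # @example (sketch)
+     #   NLP.unmatched_enclosers?('a (complete) thought')   # => false
+     #   NLP.unmatched_enclosers?('an (incomplete thought') # => true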
+     def self.unmatched_enclosers?(text)
+       enclosers = ['**', '""', '()', '[]', '``', "''"]
+       enclosers.each do |pair|
+         starter = Regexp.new('(\W|^)' + Regexp.escape(pair[0]) + '\S')
+         ender = Regexp.new('\S' + Regexp.escape(pair[1]) + '(\W|$)')
+
+         opened = 0
+
+         tokenize(text).each do |token|
+           opened += 1 if token.match(starter)
+           opened -= 1 if token.match(ender)
+
+           return true if opened.negative? # Too many ends!
+         end
+
+         return true if opened != 0 # Mismatch somewhere.
+       end
+
+       false
+     end
+
+     # Determine if ary2 appears as a contiguous subsequence of ary1
+     # @param ary1 [Array]
+     # @param ary2 [Array]
+     # @return [Boolean]
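+     # @example (sketch)
+     #   NLP.subseq?([1, 2, 3, 4], [2, 3]) # => true
+     #   NLP.subseq?([1, 2, 3, 4], [3, 2]) # => false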
+     def self.subseq?(ary1, ary2)
+       !ary1.each_index.find do |i|
+         ary1[i...i + ary2.length] == ary2
+       end.nil?
+     end
+   end
+ end