foxdear_ebooks 3.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,361 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: utf-8
3
+
4
+ require 'json'
5
+ require 'set'
6
+ require 'digest/md5'
7
+ require 'csv'
8
+
9
+ module Ebooks
10
+ class Model
11
+ # @return [Array<String>]
12
+ # An array of unique tokens. This is the main source of actual strings
13
+ # in the model. Everything else refers to a token by its index
14
+ # in this array, which we call a "tiki" (see #tikify)
15
+ attr_accessor :tokens
16
+
17
+ # @return [Array<Array<Integer>>]
18
+ # Corpus lines without an '@', split into sentences of tikis
19
+ attr_accessor :sentences
20
+
21
+ # @return [Array<Array<Integer>>]
22
+ # Corpus lines containing an '@', split into sentences of tikis
23
+ attr_accessor :mentions
24
+
25
+ # @return [Array<String>]
26
+ # The top 200 keywords reported by NLP.keywords, stored as strings
27
+ attr_accessor :keywords
28
+
29
# Build a fresh model and feed it a single corpus file.
# @param path [String] location of the corpus file
# @return [Ebooks::Model] the value returned by the instance-level #consume
def self.consume(path)
  model = Model.new
  model.consume(path)
end
35
+
36
# Build a fresh model and feed it several corpus files at once.
# @param paths [Array<String>] locations of the corpus files
# @return [Ebooks::Model] the value returned by the instance-level #consume_all
def self.consume_all(paths)
  model = Model.new
  model.consume_all(paths)
end
42
+
43
# Load a model previously written by #save or #append.
#
# SECURITY: Marshal.load can instantiate arbitrary objects, so only load
# model files that come from a trusted source.
#
# @param path [String] path to the marshalled model file
# @return [Ebooks::Model] a model populated from the file
def self.load(path)
  props = Marshal.load(File.binread(path))

  model = Model.new
  model.instance_eval do
    @tokens = props[:tokens]
    @sentences = props[:sentences]
    @mentions = props[:mentions]
    @keywords = props[:keywords]

    # Rebuild the reverse lookup. #initialize leaves it empty, which would
    # make #tikify append duplicates of tokens that are already present.
    @tikis = {}
    (@tokens || []).each_with_index { |token, tiki| @tikis[token] ||= tiki }
  end
  model
end
57
+
58
# Serialise the model's tokens, sentences, mentions and keywords to disk.
# @param path [String] destination file (overwritten if it exists)
# @return [Ebooks::Model] self, to allow chaining
def save(path)
  snapshot = {
    tokens: @tokens,
    sentences: @sentences,
    mentions: @mentions,
    keywords: @keywords
  }
  File.binwrite(path, Marshal.dump(snapshot))
  self
end
71
+
72
# Merge this model into the model file at +path+ and rewrite that file.
#
# Sentences and mentions store indices ("tikis") into the token array, so
# the saved model's tikis are translated into this model's token array
# before merging. Simply concatenating the token arrays would leave every
# previously saved sentence pointing at the wrong tokens.
#
# The in-memory model is updated as well, so it matches what was written.
#
# SECURITY: Marshal.load can instantiate arbitrary objects, so only append
# to model files that come from a trusted source.
#
# @param path [String] path of an existing model file
# @return [Ebooks::Model, nil] self, or nil when no file exists at +path+
def append(path)
  unless File.file?(path)
    log "No existing model found at #{path}"
    return
  end

  # Read in and deserialize the existing model
  props = Marshal.load(File.binread(path))

  # Token => tiki lookup for the tokens this model already knows
  lookup = {}
  @tokens.each_with_index { |token, tiki| lookup[token] ||= tiki }

  # remap[old_tiki] is the tiki of the same token in this model; tokens we
  # have not seen yet are appended to @tokens exactly once
  remap = (props[:tokens] || []).map do |token|
    lookup.fetch(token) do
      @tokens << token
      @tikis[token] = @tokens.length - 1 if @tikis
      lookup[token] = @tokens.length - 1
    end
  end
  translate = lambda do |sentences|
    (sentences || []).map { |sentence| sentence.map { |tiki| remap[tiki] } }
  end

  (@sentences ||= []).concat(translate.call(props[:sentences]))
  (@mentions ||= []).concat(translate.call(props[:mentions]))
  @keywords = ((@keywords || []) + (props[:keywords] || [])).uniq

  # Overwrite the file with the merged model
  File.binwrite(path, Marshal.dump({
    tokens: @tokens,
    sentences: @sentences,
    mentions: @mentions,
    keywords: @keywords
  }))
  self
end
99
+
100
+
101
# Create an empty model and load the default banned-words list when a
# 'banned_words.txt' file exists in the current working directory.
def initialize
  @tokens = []
  @banned_words_file ||= 'banned_words.txt'
  # File.exist? replaces File.exists?, which was removed in Ruby 3.2
  @banned_words ||= File.exist?(@banned_words_file) ? File.read(@banned_words_file).split : []
  # Reverse lookup tiki by token, for faster generation
  @tikis = {}
end
108
+
109
# Return the tiki (index into @tokens) for a token, registering the token
# first when it has not been seen before. Prints a progress line to stdout
# each time the token count reaches a multiple of 1000.
# @param token [String]
# @return [Integer]
def tikify(token)
  @tikis.fetch(token) do
    upcoming_count = @tokens.length + 1
    puts "#{upcoming_count} tokens" if (upcoming_count % 1000).zero?
    @tokens << token
    @tikis[token] = @tokens.length - 1
  end
end
121
+
122
# Replace the banned words list with the whitespace-separated words
# found in the file at +path+. Does nothing when +path+ is already the
# current banned-words file. When the file does not exist an error is
# logged and the previously loaded words are kept.
# @param path [String]
def set_banned_words(path = 'banned_words.txt')
  return if @banned_words_file == path

  @banned_words_file = path
  # File.exist? replaces File.exists?, which was removed in Ruby 3.2
  if File.exist?(@banned_words_file)
    @banned_words = File.read(@banned_words_file).split
    log "Successfully loaded banned words list #{path}"
  else
    log "Error: Banned words list #{path} does not exist"
  end
end
134
+
135
+
136
# Split a body of text into sentences and turn each sentence into an
# array of tikis. Tokens containing '@' or 'http' (usernames and urls)
# are dropped before tikification.
# @param text [String]
# @return [Array<Array<Integer>>]
def mass_tikify(text)
  NLP.sentences(text).map do |sentence|
    words = NLP.tokenize(sentence).select do |word|
      !word.include?('@') && !word.include?('http')
    end
    words.map { |word| tikify(word) }
  end
end
151
+
152
# Consume a corpus into this model.
#
# The format is chosen from the file extension (case-insensitive):
# * .json - an array of tweet objects; 'text' is used, else 'full_text'
# * .csv  - a header row that must contain a 'text' column
# * anything else - plaintext, one line per entry
#
# Entries without any text (nil) are dropped before being handed to
# #consume_lines.
#
# @param path [String]
# @return [Ebooks::Model] the value returned by #consume_lines
# @raise [ArgumentError] when a CSV corpus has no 'text' column
def consume(path)
  content = File.read(path, :encoding => 'utf-8')

  case File.extname(path).downcase
  when '.json'
    log "Reading json corpus from #{path}"
    lines = JSON.parse(content).map do |tweet|
      tweet['text'] || tweet['full_text']
    end
  when '.csv'
    log "Reading CSV corpus from #{path}"
    rows = CSV.parse(content)
    header = rows.shift || []
    text_col = header.index('text')
    raise ArgumentError, "CSV corpus #{path} has no 'text' column" if text_col.nil?
    lines = rows.map { |row| row[text_col] }
  else
    log "Reading plaintext corpus from #{path} (if this is a json or csv file, please rename the file with an extension and reconsume)"
    lines = content.split("\n")
  end

  consume_lines(lines.compact)
end
177
+
178
# Consume a sequence of lines.
#
# Lines are filtered, normalized and split into statements (no '@') and
# mentions (containing '@'); both groups are then tikified and the
# statements are ranked for keywords.
#
# Skipped lines: nil entries, lines starting with '#', and soft retweets,
# i.e. lines containing RT or MT as a standalone word. A plain substring
# test would also discard lines such as "SMART idea".
#
# @param lines [Array<String, nil>]
# @return [Ebooks::Model] self
def consume_lines(lines)
  log "Removing commented lines and sorting mentions"

  statements = []
  mentions = []
  lines.each do |l|
    next if l.nil?

    # Replace invalid byte sequences up front; encoding a UTF-8 string to
    # UTF-8 with :invalid => :replace leaves invalid bytes untouched
    line = l.scrub
    next if line.start_with?('#') # Remove commented lines
    next if line.match?(/\b(?:RT|MT)\b/) # Remove soft retweets

    if line.include?('@')
      mentions << NLP.normalize(line)
    else
      statements << NLP.normalize(line)
    end
  end

  statement_count = statements.length
  mention_count = mentions.length

  text = statements.join("\n").encode('UTF-8', :invalid => :replace)
  mention_text = mentions.join("\n").encode('UTF-8', :invalid => :replace)

  lines = nil; statements = nil; mentions = nil # Allow garbage collection

  log "Tokenizing #{statement_count} statements and #{mention_count} mentions"

  @sentences = mass_tikify(text)
  @mentions = mass_tikify(mention_text)

  log "Ranking keywords"
  @keywords = NLP.keywords(text).top(200).map(&:to_s)
  log "Top keywords: #{@keywords[0]} #{@keywords[1]} #{@keywords[2]}"

  self
end
212
+
213
# Consume multiple corpuses into this model.
#
# Each file's format is chosen from its extension (case-insensitive):
# * .json - an array of tweet objects; 'text' is used, else 'full_text'
# * .csv  - a header row that must contain a 'text' column
# * anything else - plaintext, one line per entry
#
# The lines of all files are combined, entries without any text (nil)
# are dropped, and the result is handed to #consume_lines in one call.
#
# @param paths [Array<String>]
# @return [Ebooks::Model] the value returned by #consume_lines
# @raise [ArgumentError] when a CSV corpus has no 'text' column
def consume_all(paths)
  lines = paths.flat_map do |path|
    content = File.read(path, :encoding => 'utf-8')

    case File.extname(path).downcase
    when '.json'
      log "Reading json corpus from #{path}"
      JSON.parse(content).map do |tweet|
        tweet['text'] || tweet['full_text']
      end
    when '.csv'
      log "Reading CSV corpus from #{path}"
      rows = CSV.parse(content)
      header = rows.shift || []
      text_col = header.index('text')
      raise ArgumentError, "CSV corpus #{path} has no 'text' column" if text_col.nil?
      rows.map { |row| row[text_col] }
    else
      log "Reading plaintext corpus from #{path}"
      content.split("\n")
    end
  end

  consume_lines(lines.compact)
end
243
+
244
# Decode HTML entities in generated text using the shared NLP decoder.
# @param text [String]
# @return [String]
def fix(text)
  decoder = NLP.htmlentities
  decoder.decode(text)
end
250
+
251
# Check if an array of tikis comprises a valid tweet: it must fit in
# +limit+ characters, have balanced enclosers and contain no banned word.
#
# Banned words are matched case-insensitively as whole words. Each word
# is escaped before being put into the pattern, so entries such as
# "c++", "a.b" or "(" are matched literally instead of raising
# RegexpError or matching unrelated text.
#
# @param tikis [Array<Integer>]
# @param limit [Integer] how many chars we have left
# @return [Boolean]
def valid_tweet?(tikis, limit)
  tweet = NLP.reconstruct(tikis, @tokens)
  return false if tweet.length > limit
  return false if NLP.unmatched_enclosers?(tweet)

  @banned_words.none? do |word|
    # Lookarounds behave like \b for ordinary words but also work when
    # the banned word starts or ends with a non-word character
    tweet.match?(/(?<!\p{Word})#{Regexp.escape(word)}(?!\p{Word})/i)
  end
end
262
+
263
# Generate some text.
#
# Bigram generation is tried first. If the accepted result turns out to be
# a verbatim copy of a corpus sentence, unigram generation is tried
# instead. Both phases share one retry budget. When the budget is used up
# the last candidate is sent if it is still valid, otherwise a fixed
# apology message is returned.
#
# @param limit [Integer] available characters
# @param generator [SuffixGenerator, nil] when given, short results are accepted too
# @param retry_limit [Integer] how many times to retry on invalid tweet
# @return [String]
def make_statement(limit=280, generator=nil, retry_limit=10)
  responding = !generator.nil?
  generator ||= SuffixGenerator.build(@sentences)

  attempts = 0

  while (tikis = generator.generate(3, :bigrams))
    acceptable = (tikis.length > 3 || responding) && valid_tweet?(tikis, limit)
    break if acceptable

    attempts += 1
    break if attempts >= retry_limit
  end

  # We made a verbatim tweet by accident; fall back to unigrams
  if verbatim?(tikis) && tikis.length > 3
    while (tikis = generator.generate(3, :unigrams))
      original_enough = valid_tweet?(tikis, limit) && !verbatim?(tikis)
      break if original_enough

      attempts += 1
      break if attempts >= retry_limit
    end
  end

  tweet = NLP.reconstruct(tikis, @tokens)

  if attempts >= retry_limit
    log "Unable to produce valid non-verbatim tweet; result was \"#{tweet}\""
    if valid_tweet?(tikis, limit)
      log "Tweet contains no banned words; sending anyways"
    else
      log "Tweet contains banned words or is invalid; replacing with dummy message"
      tweet = "Sorry, try again."
    end
  end

  fix(tweet)
end
307
+
308
# Tell whether a tiki sequence is an exact copy of a stored sentence or
# a stored mention.
# @param tikis [Array<Integer>]
# @return [Boolean]
def verbatim?(tikis)
  return true if @sentences.include?(tikis)

  @mentions.include?(tikis)
end
314
+
315
# Finds relevant and slightly relevant tokenized sentences to input
# by comparing token overlaps (case-insensitively).
#
# For every input token that occurs in a sentence, the sentence is added
# to the "slightly relevant" list, and also to the "relevant" list when
# the token is not a stopword. A sentence matching several input tokens
# is therefore listed several times.
#
# @param sentences [Array<Array<Integer>>]
# @param input [String]
# @return [Array<Array<Array<Integer>>, Array<Array<Integer>>>]
#   the pair [relevant, slightly_relevant]
def find_relevant(sentences, input)
  relevant = []
  slightly_relevant = []

  tokenized = NLP.tokenize(input).map(&:downcase)
  return [relevant, slightly_relevant] if tokenized.empty?

  sentences.each do |sent|
    # Downcase the sentence once instead of once per input token
    words = sent.map { |tiki| @tokens[tiki].downcase }

    tokenized.each do |token|
      next unless words.include?(token)

      relevant << sent unless NLP.stopword?(token)
      slightly_relevant << sent
    end
  end

  [relevant, slightly_relevant]
end
337
+
338
# Generates a response by looking for sentences related to the input
# and building a smaller generator from them.
#
# Mentions are searched first. When they yield too few matches the
# search is repeated over the regular sentences, and when that fails too
# an unrelated statement is generated.
#
# @param input [String]
# @param limit [Integer] characters available for response
# @param sentences [Array<Array<Integer>>]
# @return [String]
def make_response(input, limit=280, sentences=@mentions)
  relevant, slightly_relevant = find_relevant(sentences, input)

  return make_statement(limit, SuffixGenerator.build(relevant)) if relevant.length >= 3
  return make_statement(limit, SuffixGenerator.build(slightly_relevant)) if slightly_relevant.length >= 5
  # Mentions were not enough; retry with the regular sentences
  return make_response(input, limit, @sentences) if sentences.equal?(@mentions)

  make_statement(limit)
end
360
+ end
361
+ end
@@ -0,0 +1,195 @@
1
+ # encoding: utf-8
2
+ require 'fast-stemmer'
3
+ require 'highscore'
4
+ require 'htmlentities'
5
+
6
+ module Ebooks
7
+ module NLP
8
+ # Characters that tokenize and punctuation? treat as punctuation. Deliberately limited to marks we can handle consistently
9
+ # Any other mark will just be a part of another token if we don't split it out, and that's fine
10
+ PUNCTUATION = ".?!,"
11
+
12
+ # Lazy-load NLP libraries and resources
13
+ # Some of this stuff is pretty heavy and we don't necessarily need
14
+ # to be using it all of the time
15
+
16
# Lazily loads an array of stopwords from 'stopwords.txt' in the current
# working directory; the list is empty when that file does not exist.
# Stopwords are common words that should often be ignored.
# @return [Array<String>]
def self.stopwords
  # File.exist? replaces File.exists?, which was removed in Ruby 3.2
  @stopwords ||= File.exist?('stopwords.txt') ? File.read('stopwords.txt').split : []
end
22
+
23
# Reads the whitespace-separated words of DATA_PATH/nouns.txt on first
# use and returns the cached list afterwards.
# @return [Array<String>]
def self.nouns
  return @nouns if @nouns

  nouns_file = File.join(DATA_PATH, 'nouns.txt')
  @nouns = File.read(nouns_file).split
end
28
+
29
# Reads the whitespace-separated words of DATA_PATH/adjectives.txt on
# first use and returns the cached list afterwards.
# @return [Array<String>]
def self.adjectives
  return @adjectives if @adjectives

  adjectives_file = File.join(DATA_PATH, 'adjectives.txt')
  @adjectives = File.read(adjectives_file).split
end
34
+
35
# Requires the engtagger library on demand and returns a shared
# EngTagger instance, creating it on first use.
# @return [EngTagger]
def self.tagger
  require 'engtagger'
  return @tagger if @tagger

  @tagger = EngTagger.new
end
42
+
43
# Returns a shared HTMLEntities decoder, creating it on first use.
# @return [HTMLEntities]
def self.htmlentities
  return @htmlentities if @htmlentities

  @htmlentities = HTMLEntities.new
end
48
+
49
+ ### Utility functions
50
+
51
# Replace curly quotes, the curly apostrophe and the ellipsis character
# with their plain ASCII equivalents, then decode HTML entities.
# @param text [String]
# @return [String]
def self.normalize(text)
  substitutions = [['“', '"'], ['”', '"'], ['’', "'"], ['…', '...']]
  plain = substitutions.reduce(text) do |current, (fancy, ascii)|
    current.gsub(fancy, ascii)
  end
  htmlentities.decode(plain)
end
57
+
58
# Split text into sentences. A boundary is any run of newlines, or
# whitespace that directly follows '.', '?' or '!'.
# The approach is ad hoc because fancy libraries do not cope well with
# tweet formatting; the quote problem is handled during generation.
# @param text [String]
# @return [Array<String>]
def self.sentences(text)
  boundary = /\n+|(?<=[.?!])\s+/
  text.split(boundary)
end
67
+
68
# Split a sentence into word-level tokens. Splits happen on whitespace
# and between a letter and a run of PUNCTUATION characters that is
# followed by whitespace, so such punctuation becomes its own token.
# This is ad hoc because tokenization libraries do not behave well
# wrt. things like emoticons and timestamps.
# @param sentence [String]
# @return [Array<String>]
def self.tokenize(sentence)
  splitter = /\s+|(?<=[#{PUNCTUATION}]\s)(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+\s)/
  sentence.split(splitter)
end
77
+
78
# Downcase a word and reduce it to its stem, e.g. 'cats' -> 'cat'.
# @param word [String]
# @return [String]
def self.stem(word)
  lowered = word.downcase
  Stemmer.stem_word(lowered)
end
84
+
85
# Use the highscore gem to find interesting keywords in a corpus.
# Stopwords are stripped from the text beforehand because highscore's
# own blacklist is very slow. Only the word pattern is overridden here.
# @param text [String]
# @return [Highscore::Keywords]
def self.keywords(text)
  meaningful = NLP.tokenize(text).reject { |token| stopword?(token) }
  content = Highscore::Content.new(meaningful.join(' '))

  content.configure do
    set :word_pattern, /(?<!@)(?<=\s)[\p{Word}']+/ # => default: /\w+/
  end

  content.keywords
end
108
+
109
# Builds a sentence string from a list of tikis. INTERIM markers are
# skipped; a space is inserted between neighbouring tokens whenever
# space_between? says so.
# @param tikis [Array<Integer>]
# @param tokens [Array<String>]
# @return [String]
def self.reconstruct(tikis, tokens)
  sentence = ''.dup
  previous = nil

  tikis.each do |tiki|
    next if tiki == INTERIM

    current = tokens[tiki]
    sentence << ' ' if previous && space_between?(previous, current)
    sentence << current
    previous = current
  end

  sentence
end
125
+
126
# Determine if we need to insert a space between two tokens.
# No space goes in front of punctuation ("foo." / "foo?!"); every other
# combination ("foo. rah" / "foo rah") gets one.
# @param token1 [String]
# @param token2 [String]
# @return [Boolean]
def self.space_between?(token1, token2)
  left_is_punctuation = self.punctuation?(token1)
  right_is_punctuation = self.punctuation?(token2)

  case [left_is_punctuation, right_is_punctuation]
  when [true, true], [false, true] then false
  else true
  end
end
143
+
144
# Is this token made up solely of PUNCTUATION characters?
# An empty token counts as punctuation.
# @param token [String]
# @return [Boolean]
def self.punctuation?(token)
  token.chars.all? { |char| PUNCTUATION.include?(char) }
end
150
+
151
# Is this token a stopword? The comparison ignores case; the downcased
# stopword set is built on first use and reused afterwards.
# @param token [String]
# @return [Boolean]
def self.stopword?(token)
  @stopword_set = stopwords.map(&:downcase).to_set unless @stopword_set
  @stopword_set.include?(token.downcase)
end
158
+
159
# Determine if a sample of text contains unmatched brackets or quotes.
# This is one of the more frequent and noticeable failure modes for
# the generator; we can just tell it to retry.
#
# For each encloser pair, tokens that start with the opener add one and
# tokens that end with the closer subtract one. The text is unmatched
# when the count ever goes negative or does not end at zero.
#
# @param text [String]
# @return [Boolean]
def self.unmatched_enclosers?(text)
  # Tokenize once instead of once per encloser pair
  tokens = tokenize(text)
  enclosers = ['**', '""', '()', '[]', '``', "''"]

  enclosers.any? do |pair|
    starter = Regexp.new('(\W|^)' + Regexp.escape(pair[0]) + '\S')
    ender = Regexp.new('\S' + Regexp.escape(pair[1]) + '(\W|$)')

    opened = 0
    too_many_ends = tokens.any? do |token|
      opened += 1 if token.match?(starter)
      opened -= 1 if token.match?(ender)
      opened < 0 # Too many ends!
    end

    too_many_ends || opened != 0 # Mismatch somewhere.
  end
end
184
+
185
# Determine if a2 appears inside a1 as a contiguous run of elements.
# An empty a2 is found in any non-empty a1 but not in an empty a1.
# @param a1 [Array]
# @param a2 [Array]
# @return [Boolean]
def self.subseq?(a1, a2)
  window = a2.length
  a1.each_index.any? { |start| a1[start, window] == a2 }
end
194
+ end
195
+ end