bot_twitter_ebooks 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,336 @@
+ #!/usr/bin/env ruby
+ # encoding: utf-8
+
+ require 'json'
+ require 'set'
+ require 'digest/md5'
+ require 'csv'
+
+ module Ebooks
+   class Model
+     # @return [Array<String>]
+     #   An array of unique tokens. This is the main source of actual strings
+     #   in the model. Manipulation of a token is done using its index
+     #   in this array, which we call a "tiki"
+     attr_accessor :tokens
+
+     # @return [Array<Array<Integer>>]
+     #   Sentences represented by arrays of tikis
+     attr_accessor :sentences
+
+     # @return [Array<Array<Integer>>]
+     #   Sentences derived from Twitter mentions
+     attr_accessor :mentions
+
+     # @return [Array<String>]
+     #   The top 200 most important keywords, in descending order
+     attr_accessor :keywords
+
+     # Generate a new model from a corpus file
+     # @param path [String]
+     # @return [Ebooks::Model]
+     def self.consume(path)
+       Model.new.consume(path)
+     end
+
+     # Generate a new model from multiple corpus files
+     # @param paths [Array<String>]
+     # @return [Ebooks::Model]
+     def self.consume_all(paths)
+       Model.new.consume_all(paths)
+     end
+
+     # Load a saved model
+     # @param path [String]
+     # @return [Ebooks::Model]
+     def self.load(path)
+       model = Model.new
+       model.instance_eval do
+         props = Marshal.load(File.open(path, 'rb') { |f| f.read })
+         @tokens = props[:tokens]
+         @sentences = props[:sentences]
+         @mentions = props[:mentions]
+         @keywords = props[:keywords]
+       end
+       model
+     end
+
+     # Save model to a file
+     # @param path [String]
+     def save(path)
+       File.open(path, 'wb') do |f|
+         f.write(Marshal.dump({
+           tokens: @tokens,
+           sentences: @sentences,
+           mentions: @mentions,
+           keywords: @keywords
+         }))
+       end
+       self
+     end
+
+     # Append a generated model to an existing model file instead of overwriting it
+     # @param path [String]
+     def append(path)
+       existing = File.file?(path)
+       if !existing
+         log "No existing model found at #{path}"
+         return
+       else
+         # Read in and deserialize the existing model
+         props = Marshal.load(File.open(path, 'rb') { |old| old.read })
+         old_tokens = props[:tokens]
+         old_sentences = props[:sentences]
+         old_mentions = props[:mentions]
+         old_keywords = props[:keywords]
+
+         # Append the existing properties to the new ones and overwrite with the merged model
+         File.open(path, 'wb') do |f|
+           f.write(Marshal.dump({
+             tokens: @tokens.concat(old_tokens),
+             sentences: @sentences.concat(old_sentences),
+             mentions: @mentions.concat(old_mentions),
+             keywords: @keywords.concat(old_keywords)
+           }))
+         end
+       end
+       self
+     end
+
+     def initialize
+       @tokens = []
+
+       # Reverse lookup tiki by token, for faster generation
+       @tikis = {}
+     end
+
+     # Reverse lookup a token index (tiki) from a token
+     # @param token [String]
+     # @return [Integer]
+     def tikify(token)
+       if @tikis.has_key?(token)
+         @tikis[token]
+       else
+         # Log progress every 1000 newly seen tokens
+         puts "#{@tokens.length + 1} tokens" if (@tokens.length + 1) % 1000 == 0
+         @tokens << token
+         @tikis[token] = @tokens.length - 1
+       end
+     end
+
+     # Convert a body of text into arrays of tikis
+     # @param text [String]
+     # @return [Array<Array<Integer>>]
+     def mass_tikify(text)
+       sentences = NLP.sentences(text)
+
+       sentences.map do |s|
+         tokens = NLP.tokenize(s).reject do |t|
+           # Don't include usernames/urls as tokens
+           t.include?('@') || t.include?('http')
+         end
+
+         tokens.map { |t| tikify(t) }
+       end
+     end
+
+     # Consume a corpus into this model
+     # @param path [String]
+     def consume(path)
+       content = File.read(path, :encoding => 'utf-8')
+
+       if path.split('.')[-1] == "json"
+         log "Reading json corpus from #{path}"
+         lines = JSON.parse(content).map do |tweet|
+           tweet['text']
+         end
+       elsif path.split('.')[-1] == "csv"
+         log "Reading CSV corpus from #{path}"
+         content = CSV.parse(content)
+         header = content.shift
+         text_col = header.index('text')
+         lines = content.map do |tweet|
+           tweet[text_col]
+         end
+       else
+         log "Reading plaintext corpus from #{path} (if this is a json or csv file, please rename it with the appropriate extension and reconsume)"
+         lines = content.split("\n")
+       end
+
+       consume_lines(lines)
+     end
+
+     # Consume a sequence of lines
+     # @param lines [Array<String>]
+     def consume_lines(lines)
+       log "Removing commented lines and sorting mentions"
+
+       statements = []
+       mentions = []
+       lines.each do |l|
+         next if l.start_with?('#') # Remove commented lines
+         next if l.include?('RT') || l.include?('MT') # Remove soft retweets
+
+         if l.include?('@')
+           mentions << NLP.normalize(l)
+         else
+           statements << NLP.normalize(l)
+         end
+       end
+
+       text = statements.join("\n").encode('UTF-8', :invalid => :replace)
+       mention_text = mentions.join("\n").encode('UTF-8', :invalid => :replace)
+
+       lines = nil; statements = nil; mentions = nil # Allow garbage collection
+
+       log "Tokenizing #{text.count("\n")} statements and #{mention_text.count("\n")} mentions"
+
+       @sentences = mass_tikify(text)
+       @mentions = mass_tikify(mention_text)
+
+       log "Ranking keywords"
+       @keywords = NLP.keywords(text).top(200).map(&:to_s)
+       log "Top keywords: #{@keywords[0]} #{@keywords[1]} #{@keywords[2]}"
+
+       self
+     end
+
+     # Consume multiple corpora into this model
+     # @param paths [Array<String>]
+     def consume_all(paths)
+       lines = []
+       paths.each do |path|
+         content = File.read(path, :encoding => 'utf-8')
+
+         if path.split('.')[-1] == "json"
+           log "Reading json corpus from #{path}"
+           l = JSON.parse(content).map do |tweet|
+             tweet['text']
+           end
+           lines.concat(l)
+         elsif path.split('.')[-1] == "csv"
+           log "Reading CSV corpus from #{path}"
+           content = CSV.parse(content)
+           header = content.shift
+           text_col = header.index('text')
+           l = content.map do |tweet|
+             tweet[text_col]
+           end
+           lines.concat(l)
+         else
+           log "Reading plaintext corpus from #{path}"
+           l = content.split("\n")
+           lines.concat(l)
+         end
+       end
+       consume_lines(lines)
+     end
+
+     # Decode HTML entities to correct encoding artifacts in generated text
+     # @param text [String]
+     # @return [String]
+     def fix(text)
+       NLP.htmlentities.decode text
+     end
+
+     # Check if an array of tikis comprises a valid tweet
+     # @param tikis [Array<Integer>]
+     # @param limit [Integer] how many characters we have left
+     # @return [Boolean]
+     def valid_tweet?(tikis, limit)
+       tweet = NLP.reconstruct(tikis, @tokens)
+       tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
+     end
+
+     # Generate some text
+     # @param limit [Integer] available characters
+     # @param generator [SuffixGenerator, nil]
+     # @param retry_limit [Integer] how many times to retry on an invalid tweet
+     # @return [String]
+     def make_statement(limit=140, generator=nil, retry_limit=10)
+       responding = !generator.nil?
+       generator ||= SuffixGenerator.build(@sentences)
+
+       retries = 0
+       tweet = ""
+
+       while (tikis = generator.generate(3, :bigrams)) do
+         #log "Attempting to produce tweet try #{retries+1}/#{retry_limit}"
+         break if (tikis.length > 3 || responding) && valid_tweet?(tikis, limit)
+
+         retries += 1
+         break if retries >= retry_limit
+       end
+
+       if verbatim?(tikis) && tikis.length > 3 # We made a verbatim tweet by accident
+         # Try again with the unigram generator to avoid copying a sentence verbatim
+         #log "Attempting to produce unigram tweet try #{retries+1}/#{retry_limit}"
+         while (tikis = generator.generate(3, :unigrams)) do
+           break if valid_tweet?(tikis, limit) && !verbatim?(tikis)
+
+           retries += 1
+           break if retries >= retry_limit
+         end
+       end
+
+       tweet = NLP.reconstruct(tikis, @tokens)
+
+       if retries >= retry_limit
+         log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
+       end
+
+       fix tweet
+     end
+
+     # Test if a sentence has been copied verbatim from the original corpus
+     # @param tikis [Array<Integer>]
+     # @return [Boolean]
+     def verbatim?(tikis)
+       @sentences.include?(tikis) || @mentions.include?(tikis)
+     end
+
+     # Finds relevant and slightly relevant tokenized sentences for an input
+     # by comparing non-stopword token overlaps
+     # @param sentences [Array<Array<Integer>>]
+     # @param input [String]
+     # @return [Array<Array<Array<Integer>>, Array<Array<Integer>>>]
+     def find_relevant(sentences, input)
+       relevant = []
+       slightly_relevant = []
+
+       tokenized = NLP.tokenize(input).map(&:downcase)
+
+       sentences.each do |sent|
+         tokenized.each do |token|
+           if sent.map { |tiki| @tokens[tiki].downcase }.include?(token)
+             relevant << sent unless NLP.stopword?(token)
+             slightly_relevant << sent
+           end
+         end
+       end
+
+       [relevant, slightly_relevant]
+     end
+
+     # Generates a response by looking for related sentences
+     # in the corpus and building a smaller generator from these
+     # @param input [String]
+     # @param limit [Integer] characters available for response
+     # @param sentences [Array<Array<Integer>>]
+     # @return [String]
+     def make_response(input, limit=140, sentences=@mentions)
+       # Prefer mentions
+       relevant, slightly_relevant = find_relevant(sentences, input)
+
+       if relevant.length >= 3
+         generator = SuffixGenerator.build(relevant)
+         make_statement(limit, generator)
+       elsif slightly_relevant.length >= 5
+         generator = SuffixGenerator.build(slightly_relevant)
+         make_statement(limit, generator)
+       elsif sentences.equal?(@mentions)
+         # No matches among mentions; fall back to the full sentence corpus
+         make_response(input, limit, @sentences)
+       else
+         make_statement(limit)
+       end
+     end
+   end
+ end
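
A minimal usage sketch for Ebooks::Model, assuming the rest of the gem (notably SuffixGenerator and the log helper used above) is loaded; the file names are illustrative:

    # Build and persist a model from a tweet archive, then generate text
    model = Ebooks::Model.consume('corpus.json')
    model.save('model.bin')

    model = Ebooks::Model.load('model.bin')
    puts model.make_statement(140)                       # free-form statement
    puts model.make_response('tell me about cats', 140)  # reply seeded by related sentences
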
@@ -0,0 +1,195 @@
+ # encoding: utf-8
+ require 'fast-stemmer'
+ require 'highscore'
+ require 'htmlentities'
+
+ module Ebooks
+   module NLP
+     # We deliberately limit our punctuation handling to stuff we can do consistently
+     # It'll just be a part of another token if we don't split it out, and that's fine
+     PUNCTUATION = ".?!,"
+
+     # Lazy-load NLP libraries and resources
+     # Some of this stuff is pretty heavy and we don't necessarily need
+     # to be using it all of the time
+
+     # Lazily loads an array of stopwords
+     # Stopwords are common words that should often be ignored
+     # @return [Array<String>]
+     def self.stopwords
+       @stopwords ||= File.exist?('stopwords.txt') ? File.read('stopwords.txt').split : []
+     end
+
+     # Lazily loads an array of known English nouns
+     # @return [Array<String>]
+     def self.nouns
+       @nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
+     end
+
+     # Lazily loads an array of known English adjectives
+     # @return [Array<String>]
+     def self.adjectives
+       @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
+     end
+
+     # Lazily load part-of-speech tagging library
+     # This can determine whether a word is being used as a noun/adjective/verb
+     # @return [EngTagger]
+     def self.tagger
+       require 'engtagger'
+       @tagger ||= EngTagger.new
+     end
+
+     # Lazily load HTML entity decoder
+     # @return [HTMLEntities]
+     def self.htmlentities
+       @htmlentities ||= HTMLEntities.new
+     end
+
+     ### Utility functions
+
+     # Normalize some strange unicode punctuation variants
+     # @param text [String]
+     # @return [String]
+     def self.normalize(text)
+       htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
+     end
+
+     # Split text into sentences
+     # We use an ad hoc approach because fancy libraries do not deal
+     # especially well with tweet formatting, and we can fake solving
+     # the quote problem during generation
+     # @param text [String]
+     # @return [Array<String>]
+     def self.sentences(text)
+       text.split(/\n+|(?<=[.?!])\s+/)
+     end
+
+     # Split a sentence into word-level tokens
+     # As above, this is ad hoc because tokenization libraries
+     # do not behave well with respect to things like emoticons and timestamps
+     # @param sentence [String]
+     # @return [Array<String>]
+     def self.tokenize(sentence)
+       regex = /\s+|(?<=[#{PUNCTUATION}]\s)(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+\s)/
+       sentence.split(regex)
+     end
+
+     # Get the 'stem' form of a word e.g. 'cats' -> 'cat'
+     # @param word [String]
+     # @return [String]
+     def self.stem(word)
+       Stemmer::stem_word(word.downcase)
+     end
+
+     # Use the highscore gem to find interesting keywords in a corpus
+     # @param text [String]
+     # @return [Highscore::Keywords]
+     def self.keywords(text)
+       # Preprocess to remove stopwords (highscore's blacklist is very slow)
+       text = NLP.tokenize(text).reject { |t| stopword?(t) }.join(' ')
+
+       text = Highscore::Content.new(text)
+
+       text.configure do
+         #set :multiplier, 2
+         #set :upper_case, 3
+         #set :long_words, 2
+         #set :long_words_threshold, 15
+         #set :vowels, 1 # => default: 0 = not considered
+         #set :consonants, 5 # => default: 0 = not considered
+         #set :ignore_case, true # => default: false
+         set :word_pattern, /(?<!@)(?<=\s)[\p{Word}']+/ # => default: /\w+/
+         #set :stemming, true # => default: false
+       end
+
+       text.keywords
+     end
+
+     # Builds a proper sentence from a list of tikis
+     # @param tikis [Array<Integer>]
+     # @param tokens [Array<String>]
+     # @return [String]
+     def self.reconstruct(tikis, tokens)
+       text = ""
+       last_token = nil
+       tikis.each do |tiki|
+         next if tiki == INTERIM # Skip the INTERIM sentinel (defined elsewhere in the gem)
+         token = tokens[tiki]
+         text += ' ' if last_token && space_between?(last_token, token)
+         text += token
+         last_token = token
+       end
+       text
+     end
+
+     # Determine if we need to insert a space between two tokens
+     # @param token1 [String]
+     # @param token2 [String]
+     # @return [Boolean]
+     def self.space_between?(token1, token2)
+       p1 = self.punctuation?(token1)
+       p2 = self.punctuation?(token2)
+       if p1 && p2 # "foo?!"
+         false
+       elsif !p1 && p2 # "foo."
+         false
+       elsif p1 && !p2 # "foo. rah"
+         true
+       else # "foo rah"
+         true
+       end
+     end
+
+     # Is this token composed entirely of punctuation?
+     # @param token [String]
+     # @return [Boolean]
+     def self.punctuation?(token)
+       (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
+     end
+
+     # Is this token a stopword?
+     # @param token [String]
+     # @return [Boolean]
+     def self.stopword?(token)
+       @stopword_set ||= stopwords.map(&:downcase).to_set
+       @stopword_set.include?(token.downcase)
+     end
+
+     # Determine if a sample of text contains unmatched brackets or quotes
+     # This is one of the more frequent and noticeable failure modes for
+     # the generator; we can just tell it to retry
+     # @param text [String]
+     # @return [Boolean]
+     def self.unmatched_enclosers?(text)
+       enclosers = ['**', '""', '()', '[]', '``', "''"]
+       enclosers.each do |pair|
+         starter = Regexp.new('(\W|^)' + Regexp.escape(pair[0]) + '\S')
+         ender = Regexp.new('\S' + Regexp.escape(pair[1]) + '(\W|$)')
+
+         opened = 0
+
+         tokenize(text).each do |token|
+           opened += 1 if token.match(starter)
+           opened -= 1 if token.match(ender)
+
+           return true if opened < 0 # Too many ends!
+         end
+
+         return true if opened != 0 # Mismatch somewhere.
+       end
+
+       false
+     end
+
+     # Determine if a2 is a subsequence of a1
+     # @param a1 [Array]
+     # @param a2 [Array]
+     # @return [Boolean]
+     def self.subseq?(a1, a2)
+       !a1.each_index.find do |i|
+         a1[i...i+a2.length] == a2
+       end.nil?
+     end
+   end
+ end
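
A minimal sketch of how the NLP helpers compose, with made-up example strings:

    text   = Ebooks::NLP.normalize('“Cats are great” … right?')  # curly quotes and ellipsis become ASCII
    tokens = Ebooks::NLP.sentences(text).flat_map { |s| Ebooks::NLP.tokenize(s) }
    Ebooks::NLP.space_between?('cats', '.')            # => false (no space before punctuation)
    Ebooks::NLP.unmatched_enclosers?('he said "hi')    # => true (unbalanced quote)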