bot_twitter_ebooks 0.0.0

@@ -0,0 +1,336 @@
+ #!/usr/bin/env ruby
+ # encoding: utf-8
+
+ require 'json'
+ require 'set'
+ require 'digest/md5'
+ require 'csv'
+
+ module Ebooks
+   class Model
+     # @return [Array<String>]
+     # An array of unique tokens. This is the main source of actual strings
+     # in the model. Manipulation of a token is done using its index
+     # in this array, which we call a "tiki"
+     attr_accessor :tokens
+
+     # @return [Array<Array<Integer>>]
+     # Sentences represented by arrays of tikis
+     attr_accessor :sentences
+
+     # @return [Array<Array<Integer>>]
+     # Sentences derived from Twitter mentions
+     attr_accessor :mentions
+
+     # @return [Array<String>]
+     # The top 200 most important keywords, in descending order
+     attr_accessor :keywords
+
+     # Generate a new model from a corpus file
+     # @param path [String]
+     # @return [Ebooks::Model]
+     def self.consume(path)
+       Model.new.consume(path)
+     end
+
+     # Generate a new model from multiple corpus files
+     # @param paths [Array<String>]
+     # @return [Ebooks::Model]
+     def self.consume_all(paths)
+       Model.new.consume_all(paths)
+     end
+
+     # Load a saved model
+     # @param path [String]
+     # @return [Ebooks::Model]
+     def self.load(path)
+       model = Model.new
+       model.instance_eval do
+         props = Marshal.load(File.open(path, 'rb') { |f| f.read })
+         @tokens = props[:tokens]
+         @sentences = props[:sentences]
+         @mentions = props[:mentions]
+         @keywords = props[:keywords]
+       end
+       model
+     end
+
+     # Save model to a file
+     # @param path [String]
+     def save(path)
+       File.open(path, 'wb') do |f|
+         f.write(Marshal.dump({
+           tokens: @tokens,
+           sentences: @sentences,
+           mentions: @mentions,
+           keywords: @keywords
+         }))
+       end
+       self
+     end
+
+     # Append a generated model to an existing model file instead of overwriting it
+     # @param path [String]
+     def append(path)
+       existing = File.file?(path)
+       if !existing
+         log "No existing model found at #{path}"
+         return
+       else
+         # Read in and deserialize the existing model
+         props = Marshal.load(File.open(path, 'rb') { |old| old.read })
+         old_tokens = props[:tokens]
+         old_sentences = props[:sentences]
+         old_mentions = props[:mentions]
+         old_keywords = props[:keywords]
+
+         # Append the existing properties to the new ones and overwrite with the merged model
+         File.open(path, 'wb') do |f|
+           f.write(Marshal.dump({
+             tokens: @tokens.concat(old_tokens),
+             sentences: @sentences.concat(old_sentences),
+             mentions: @mentions.concat(old_mentions),
+             keywords: @keywords.concat(old_keywords)
+           }))
+         end
+       end
+       self
+     end
+
+
+     def initialize
+       @tokens = []
+
+       # Reverse lookup tiki by token, for faster generation
+       @tikis = {}
+     end
+
+     # Reverse lookup a token index from a token
+     # @param token [String]
+     # @return [Integer]
+     def tikify(token)
+       if @tikis.has_key?(token)
+         return @tikis[token]
+       else
+         puts "#{@tokens.length + 1} tokens" if (@tokens.length + 1) % 1000 == 0
+         @tokens << token
+         return @tikis[token] = @tokens.length - 1
+       end
+     end
+
+     # Convert a body of text into arrays of tikis
+     # @param text [String]
+     # @return [Array<Array<Integer>>]
+     def mass_tikify(text)
+       sentences = NLP.sentences(text)
+
+       sentences.map do |s|
+         tokens = NLP.tokenize(s).reject do |t|
+           # Don't include usernames/URLs as tokens
+           t.include?('@') || t.include?('http')
+         end
+
+         tokens.map { |t| tikify(t) }
+       end
+     end
+
+     # Consume a corpus into this model
+     # @param path [String]
+     def consume(path)
+       content = File.read(path, :encoding => 'utf-8')
+
+       if path.split('.')[-1] == "json"
+         log "Reading json corpus from #{path}"
+         lines = JSON.parse(content).map do |tweet|
+           tweet['text']
+         end
+       elsif path.split('.')[-1] == "csv"
+         log "Reading CSV corpus from #{path}"
+         content = CSV.parse(content)
+         header = content.shift
+         text_col = header.index('text')
+         lines = content.map do |tweet|
+           tweet[text_col]
+         end
+       else
+         log "Reading plaintext corpus from #{path} (if this is a json or csv file, please rename the file with an extension and reconsume)"
+         lines = content.split("\n")
+       end
+
+       consume_lines(lines)
+     end
+
+     # Consume a sequence of lines
+     # @param lines [Array<String>]
+     def consume_lines(lines)
+       log "Removing commented lines and sorting mentions"
+
+       statements = []
+       mentions = []
+       lines.each do |l|
+         next if l.start_with?('#') # Remove commented lines
+         next if l.include?('RT') || l.include?('MT') # Remove soft retweets
+
+         if l.include?('@')
+           mentions << NLP.normalize(l)
+         else
+           statements << NLP.normalize(l)
+         end
+       end
+
+       text = statements.join("\n").encode('UTF-8', :invalid => :replace)
+       mention_text = mentions.join("\n").encode('UTF-8', :invalid => :replace)
+
+       lines = nil; statements = nil; mentions = nil # Allow garbage collection
+
+       log "Tokenizing #{text.count("\n")} statements and #{mention_text.count("\n")} mentions"
+
+       @sentences = mass_tikify(text)
+       @mentions = mass_tikify(mention_text)
+
+       log "Ranking keywords"
+       @keywords = NLP.keywords(text).top(200).map(&:to_s)
+       log "Top keywords: #{@keywords[0]} #{@keywords[1]} #{@keywords[2]}"
+
+       self
+     end
+
+     # Consume multiple corpuses into this model
+     # @param paths [Array<String>]
+     def consume_all(paths)
+       lines = []
+       paths.each do |path|
+         content = File.read(path, :encoding => 'utf-8')
+
+         if path.split('.')[-1] == "json"
+           log "Reading json corpus from #{path}"
+           l = JSON.parse(content).map do |tweet|
+             tweet['text']
+           end
+           lines.concat(l)
+         elsif path.split('.')[-1] == "csv"
+           log "Reading CSV corpus from #{path}"
+           content = CSV.parse(content)
+           header = content.shift
+           text_col = header.index('text')
+           l = content.map do |tweet|
+             tweet[text_col]
+           end
+           lines.concat(l)
+         else
+           log "Reading plaintext corpus from #{path}"
+           l = content.split("\n")
+           lines.concat(l)
+         end
+       end
+       consume_lines(lines)
+     end
+
+     # Correct encoding issues in generated text
+     # @param text [String]
+     # @return [String]
+     def fix(text)
+       NLP.htmlentities.decode text
+     end
+
+     # Check if an array of tikis comprises a valid tweet
+     # @param tikis [Array<Integer>]
+     # @param limit [Integer] how many characters we have left
+     def valid_tweet?(tikis, limit)
+       tweet = NLP.reconstruct(tikis, @tokens)
+       tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
+     end
+
+     # Generate some text
+     # @param limit [Integer] available characters
+     # @param generator [SuffixGenerator, nil]
+     # @param retry_limit [Integer] how many times to retry on invalid tweet
+     # @return [String]
+     def make_statement(limit=140, generator=nil, retry_limit=10)
+       responding = !generator.nil?
+       generator ||= SuffixGenerator.build(@sentences)
+
+       retries = 0
+       tweet = ""
+
+       while (tikis = generator.generate(3, :bigrams)) do
+         #log "Attempting to produce tweet try #{retries+1}/#{retry_limit}"
+         break if (tikis.length > 3 || responding) && valid_tweet?(tikis, limit)
+
+         retries += 1
+         break if retries >= retry_limit
+       end
+
+       if verbatim?(tikis) && tikis.length > 3 # We made a verbatim tweet by accident
+         #log "Attempting to produce unigram tweet try #{retries+1}/#{retry_limit}"
+         while (tikis = generator.generate(3, :unigrams)) do
+           break if valid_tweet?(tikis, limit) && !verbatim?(tikis)
+
+           retries += 1
+           break if retries >= retry_limit
+         end
+       end
+
+       tweet = NLP.reconstruct(tikis, @tokens)
+
+       if retries >= retry_limit
+         log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
+       end
+
+       fix tweet
+     end
+
+     # Test if a sentence has been copied verbatim from the original corpus
+     # @param tikis [Array<Integer>]
+     # @return [Boolean]
+     def verbatim?(tikis)
+       @sentences.include?(tikis) || @mentions.include?(tikis)
+     end
+
+     # Finds tokenized sentences that are relevant or slightly relevant to the input,
+     # by comparing non-stopword token overlaps
+     # @param sentences [Array<Array<Integer>>]
+     # @param input [String]
+     # @return [Array<Array<Array<Integer>>, Array<Array<Integer>>>]
+     def find_relevant(sentences, input)
+       relevant = []
+       slightly_relevant = []
+
+       tokenized = NLP.tokenize(input).map(&:downcase)
+
+       sentences.each do |sent|
+         tokenized.each do |token|
+           if sent.map { |tiki| @tokens[tiki].downcase }.include?(token)
+             relevant << sent unless NLP.stopword?(token)
+             slightly_relevant << sent
+           end
+         end
+       end
+
+       [relevant, slightly_relevant]
+     end
+
+     # Generates a response by looking for related sentences
+     # in the corpus and building a smaller generator from these
+     # @param input [String]
+     # @param limit [Integer] characters available for response
+     # @param sentences [Array<Array<Integer>>]
+     # @return [String]
+     def make_response(input, limit=140, sentences=@mentions)
+       # Prefer mentions
+       relevant, slightly_relevant = find_relevant(sentences, input)
+
+       if relevant.length >= 3
+         generator = SuffixGenerator.build(relevant)
+         make_statement(limit, generator)
+       elsif slightly_relevant.length >= 5
+         generator = SuffixGenerator.build(slightly_relevant)
+         make_statement(limit, generator)
+       elsif sentences.equal?(@mentions)
+         make_response(input, limit, @sentences)
+       else
+         make_statement(limit)
+       end
+     end
+   end
+ end
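
For orientation, here is a minimal usage sketch of the Ebooks::Model class added above. It is not part of the gem's own code: the require line, the corpus path 'corpus.txt', and the output path 'corpus.model' are illustrative assumptions, and Model also relies on the gem's other components (log, NLP, SuffixGenerator) being loaded, as they are when the gem itself is required.

# Minimal sketch, assuming the rest of the gem is loaded and a plaintext
# corpus exists at the hypothetical path 'corpus.txt'.
require 'bot_twitter_ebooks'  # assumed entry point; the fork's actual require may differ

model = Ebooks::Model.consume('corpus.txt')    # tokenize the corpus and rank keywords
model.save('corpus.model')                     # Marshal tokens/sentences/mentions/keywords
model = Ebooks::Model.load('corpus.model')     # restore a previously saved model

puts model.make_statement(140)                 # generate a tweet-sized statement
puts model.make_response('hello there', 140)   # reply, preferring mention-derived sentences
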
@@ -0,0 +1,195 @@
+ # encoding: utf-8
+ require 'fast-stemmer'
+ require 'highscore'
+ require 'htmlentities'
+
+ module Ebooks
+   module NLP
+     # We deliberately limit our punctuation handling to stuff we can do consistently
+     # It'll just be a part of another token if we don't split it out, and that's fine
+     PUNCTUATION = ".?!,"
+
+     # Lazy-load NLP libraries and resources
+     # Some of this stuff is pretty heavy and we don't necessarily need
+     # to be using it all of the time
+
+     # Lazily loads an array of stopwords
+     # Stopwords are common words that should often be ignored
+     # @return [Array<String>]
+     def self.stopwords
+       @stopwords ||= File.exist?('stopwords.txt') ? File.read('stopwords.txt').split : []
+     end
+
+     # Lazily loads an array of known English nouns
+     # @return [Array<String>]
+     def self.nouns
+       @nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
+     end
+
+     # Lazily loads an array of known English adjectives
+     # @return [Array<String>]
+     def self.adjectives
+       @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
+     end
+
+     # Lazily load part-of-speech tagging library
+     # This can determine whether a word is being used as a noun/adjective/verb
+     # @return [EngTagger]
+     def self.tagger
+       require 'engtagger'
+       @tagger ||= EngTagger.new
+     end
+
+     # Lazily load HTML entity decoder
+     # @return [HTMLEntities]
+     def self.htmlentities
+       @htmlentities ||= HTMLEntities.new
+     end
+
+     ### Utility functions
+
+     # Normalize some strange unicode punctuation variants
+     # @param text [String]
+     # @return [String]
+     def self.normalize(text)
+       htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
+     end
+
+     # Split text into sentences
+     # We use an ad hoc approach because fancy libraries do not deal
+     # especially well with tweet formatting, and we can fake solving
+     # the quote problem during generation
+     # @param text [String]
+     # @return [Array<String>]
+     def self.sentences(text)
+       text.split(/\n+|(?<=[.?!])\s+/)
+     end
+
+     # Split a sentence into word-level tokens
+     # As above, this is ad hoc because tokenization libraries
+     # do not behave well wrt. things like emoticons and timestamps
+     # @param sentence [String]
+     # @return [Array<String>]
+     def self.tokenize(sentence)
+       regex = /\s+|(?<=[#{PUNCTUATION}]\s)(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+\s)/
+       sentence.split(regex)
+     end
+
+     # Get the 'stem' form of a word e.g. 'cats' -> 'cat'
+     # @param word [String]
+     # @return [String]
+     def self.stem(word)
+       Stemmer::stem_word(word.downcase)
+     end
+
+     # Use the highscore gem to find interesting keywords in a corpus
+     # @param text [String]
+     # @return [Highscore::Keywords]
+     def self.keywords(text)
+       # Preprocess to remove stopwords (highscore's blacklist is very slow)
+       text = NLP.tokenize(text).reject { |t| stopword?(t) }.join(' ')
+
+       text = Highscore::Content.new(text)
+
+       text.configure do
+         #set :multiplier, 2
+         #set :upper_case, 3
+         #set :long_words, 2
+         #set :long_words_threshold, 15
+         #set :vowels, 1 # => default: 0 = not considered
+         #set :consonants, 5 # => default: 0 = not considered
+         #set :ignore_case, true # => default: false
+         set :word_pattern, /(?<!@)(?<=\s)[\p{Word}']+/ # => default: /\w+/
+         #set :stemming, true # => default: false
+       end
+
+       text.keywords
+     end
+
+     # Builds a proper sentence from a list of tikis
+     # @param tikis [Array<Integer>]
+     # @param tokens [Array<String>]
+     # @return [String]
+     def self.reconstruct(tikis, tokens)
+       text = ""
+       last_token = nil
+       tikis.each do |tiki|
+         next if tiki == INTERIM
+         token = tokens[tiki]
+         text += ' ' if last_token && space_between?(last_token, token)
+         text += token
+         last_token = token
+       end
+       text
+     end
+
+     # Determine if we need to insert a space between two tokens
+     # @param token1 [String]
+     # @param token2 [String]
+     # @return [Boolean]
+     def self.space_between?(token1, token2)
+       p1 = self.punctuation?(token1)
+       p2 = self.punctuation?(token2)
+       if p1 && p2 # "foo?!"
+         false
+       elsif !p1 && p2 # "foo."
+         false
+       elsif p1 && !p2 # "foo. rah"
+         true
+       else # "foo rah"
+         true
+       end
+     end
+
+     # Is this token composed entirely of punctuation?
+     # @param token [String]
+     # @return [Boolean]
+     def self.punctuation?(token)
+       (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
+     end
+
+     # Is this token a stopword?
+     # @param token [String]
+     # @return [Boolean]
+     def self.stopword?(token)
+       @stopword_set ||= stopwords.map(&:downcase).to_set
+       @stopword_set.include?(token.downcase)
+     end
+
+     # Determine if a sample of text contains unmatched brackets or quotes
+     # This is one of the more frequent and noticeable failure modes for
+     # the generator; we can just tell it to retry
+     # @param text [String]
+     # @return [Boolean]
+     def self.unmatched_enclosers?(text)
+       enclosers = ['**', '""', '()', '[]', '``', "''"]
+       enclosers.each do |pair|
+         starter = Regexp.new('(\W|^)' + Regexp.escape(pair[0]) + '\S')
+         ender = Regexp.new('\S' + Regexp.escape(pair[1]) + '(\W|$)')
+
+         opened = 0
+
+         tokenize(text).each do |token|
+           opened += 1 if token.match(starter)
+           opened -= 1 if token.match(ender)
+
+           return true if opened < 0 # Too many ends!
+         end
+
+         return true if opened != 0 # Mismatch somewhere.
+       end
+
+       false
+     end
+
+     # Determine if a2 appears as a contiguous subsequence of a1
+     # @param a1 [Array]
+     # @param a2 [Array]
+     # @return [Boolean]
+     def self.subseq?(a1, a2)
+       !a1.each_index.find do |i|
+         a1[i...i+a2.length] == a2
+       end.nil?
+     end
+   end
+ end
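
The NLP helpers in this second file can also be exercised directly. A minimal sketch, assuming the gem is loaded as in the earlier example; the sample string is purely illustrative, and the stopword filtering inside keywords only takes effect if an optional stopwords.txt exists in the working directory.

# Minimal sketch of the Ebooks::NLP utilities, under the same loading assumptions.
text = Ebooks::NLP.normalize('“Cats are great… aren’t they?” I think so.')
sentences = Ebooks::NLP.sentences(text)          # split on newlines and sentence-final punctuation
tokens = Ebooks::NLP.tokenize(sentences.first)   # ad hoc word-level tokens
puts tokens.inspect

puts Ebooks::NLP.stem('cats')                              # => "cat"
puts Ebooks::NLP.keywords(text).top(5).map(&:to_s).inspect # Highscore keyword ranking
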