twitter_ebooks_poll 3.2.0
- checksums.yaml +7 -0
- data/.gitattributes +2 -0
- data/.gitignore +6 -0
- data/.rspec +1 -0
- data/.travis.yml +8 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +167 -0
- data/Rakefile +2 -0
- data/bin/ebooks +449 -0
- data/data/adjectives.txt +1466 -0
- data/data/nouns.txt +2193 -0
- data/lib/twitter_ebooks/archive.rb +116 -0
- data/lib/twitter_ebooks/bot.rb +521 -0
- data/lib/twitter_ebooks/model.rb +336 -0
- data/lib/twitter_ebooks/nlp.rb +195 -0
- data/lib/twitter_ebooks/suffix.rb +104 -0
- data/lib/twitter_ebooks/sync.rb +52 -0
- data/lib/twitter_ebooks/version.rb +3 -0
- data/lib/twitter_ebooks.rb +22 -0
- data/skeleton/Gemfile +4 -0
- data/skeleton/Procfile +1 -0
- data/skeleton/bots.rb +65 -0
- data/skeleton/corpus/.gitignore +0 -0
- data/skeleton/gitignore +1 -0
- data/skeleton/image/.gitignore +0 -0
- data/skeleton/model/.gitignore +0 -0
- data/skeleton/stopwords.txt +843 -0
- data/spec/bot_spec.rb +216 -0
- data/spec/data/0xabad1dea.json +203945 -0
- data/spec/data/0xabad1dea.model +6158 -1
- data/spec/memprof.rb +37 -0
- data/spec/model_spec.rb +88 -0
- data/spec/spec_helper.rb +6 -0
- data/twitter_ebooks.gemspec +37 -0
- metadata +309 -0
data/lib/twitter_ebooks/model.rb
@@ -0,0 +1,336 @@
#!/usr/bin/env ruby
# encoding: utf-8

require 'json'
require 'set'
require 'digest/md5'
require 'csv'

module Ebooks
  class Model
    # @return [Array<String>]
    # An array of unique tokens. This is the main source of actual strings
    # in the model. Manipulation of a token is done using its index
    # in this array, which we call a "tiki"
    attr_accessor :tokens

    # @return [Array<Array<Integer>>]
    # Sentences represented by arrays of tikis
    attr_accessor :sentences

    # @return [Array<Array<Integer>>]
    # Sentences derived from Twitter mentions
    attr_accessor :mentions

    # @return [Array<String>]
    # The top 200 most important keywords, in descending order
    attr_accessor :keywords

    # Generate a new model from a corpus file
    # @param path [String]
    # @return [Ebooks::Model]
    def self.consume(path)
      Model.new.consume(path)
    end

    # Generate a new model from multiple corpus files
    # @param paths [Array<String>]
    # @return [Ebooks::Model]
    def self.consume_all(paths)
      Model.new.consume_all(paths)
    end

    # Load a saved model
    # @param path [String]
    # @return [Ebooks::Model]
    def self.load(path)
      model = Model.new
      model.instance_eval do
        props = Marshal.load(File.open(path, 'rb') { |f| f.read })
        @tokens = props[:tokens]
        @sentences = props[:sentences]
        @mentions = props[:mentions]
        @keywords = props[:keywords]
      end
      model
    end

    # Save model to a file
    # @param path [String]
    def save(path)
      File.open(path, 'wb') do |f|
        f.write(Marshal.dump({
          tokens: @tokens,
          sentences: @sentences,
          mentions: @mentions,
          keywords: @keywords
        }))
      end
      self
    end

    # Append a generated model to existing model file instead of overwriting it
    # @param path [String]
    def append(path)
      existing = File.file?(path)
      if !existing
        log "No existing model found at #{path}"
        return
      else
        # Read in and deserialize existing model
        props = Marshal.load(File.open(path, 'rb') { |old| old.read })
        old_tokens = props[:tokens]
        old_sentences = props[:sentences]
        old_mentions = props[:mentions]
        old_keywords = props[:keywords]

        # Append existing properties to new ones and overwrite with new model
        File.open(path, 'wb') do |f|
          f.write(Marshal.dump({
            tokens: @tokens.concat(old_tokens),
            sentences: @sentences.concat(old_sentences),
            mentions: @mentions.concat(old_mentions),
            keywords: @keywords.concat(old_keywords)
          }))
        end
      end
      self
    end


    def initialize
      @tokens = []

      # Reverse lookup tiki by token, for faster generation
      @tikis = {}
    end

    # Reverse lookup a token index from a token
    # @param token [String]
    # @return [Integer]
    def tikify(token)
      if @tikis.has_key?(token) then
        return @tikis[token]
      else
        (@tokens.length+1)%1000 == 0 and puts "#{@tokens.length+1} tokens"
        @tokens << token
        return @tikis[token] = @tokens.length-1
      end
    end

    # Convert a body of text into arrays of tikis
    # @param text [String]
    # @return [Array<Array<Integer>>]
    def mass_tikify(text)
      sentences = NLP.sentences(text)

      sentences.map do |s|
        tokens = NLP.tokenize(s).reject do |t|
          # Don't include usernames/urls as tokens
          t.include?('@') || t.include?('http')
        end

        tokens.map { |t| tikify(t) }
      end
    end

    # Consume a corpus into this model
    # @param path [String]
    def consume(path)
      content = File.read(path, :encoding => 'utf-8')

      if path.split('.')[-1] == "json"
        log "Reading json corpus from #{path}"
        lines = JSON.parse(content).map do |tweet|
          tweet['text']
        end
      elsif path.split('.')[-1] == "csv"
        log "Reading CSV corpus from #{path}"
        content = CSV.parse(content)
        header = content.shift
        text_col = header.index('text')
        lines = content.map do |tweet|
          tweet[text_col]
        end
      else
        log "Reading plaintext corpus from #{path} (if this is a json or csv file, please rename the file with an extension and reconsume)"
        lines = content.split("\n")
      end

      consume_lines(lines)
    end

    # Consume a sequence of lines
    # @param lines [Array<String>]
    def consume_lines(lines)
      log "Removing commented lines and sorting mentions"

      statements = []
      mentions = []
      lines.each do |l|
        next if l.start_with?('#') # Remove commented lines
        next if l.include?('RT') || l.include?('MT') # Remove soft retweets

        if l.include?('@')
          mentions << NLP.normalize(l)
        else
          statements << NLP.normalize(l)
        end
      end

      text = statements.join("\n").encode('UTF-8', :invalid => :replace)
      mention_text = mentions.join("\n").encode('UTF-8', :invalid => :replace)

      lines = nil; statements = nil; mentions = nil # Allow garbage collection

      log "Tokenizing #{text.count("\n")} statements and #{mention_text.count("\n")} mentions"

      @sentences = mass_tikify(text)
      @mentions = mass_tikify(mention_text)

      log "Ranking keywords"
      @keywords = NLP.keywords(text).top(200).map(&:to_s)
      log "Top keywords: #{@keywords[0]} #{@keywords[1]} #{@keywords[2]}"

      self
    end

    # Consume multiple corpuses into this model
    # @param paths [Array<String>]
    def consume_all(paths)
      lines = []
      paths.each do |path|
        content = File.read(path, :encoding => 'utf-8')

        if path.split('.')[-1] == "json"
          log "Reading json corpus from #{path}"
          l = JSON.parse(content).map do |tweet|
            tweet['text']
          end
          lines.concat(l)
        elsif path.split('.')[-1] == "csv"
          log "Reading CSV corpus from #{path}"
          content = CSV.parse(content)
          header = content.shift
          text_col = header.index('text')
          l = content.map do |tweet|
            tweet[text_col]
          end
          lines.concat(l)
        else
          log "Reading plaintext corpus from #{path}"
          l = content.split("\n")
          lines.concat(l)
        end
      end
      consume_lines(lines)
    end

    # Correct encoding issues in generated text
    # @param text [String]
    # @return [String]
    def fix(text)
      NLP.htmlentities.decode text
    end

    # Check if an array of tikis comprises a valid tweet
    # @param tikis [Array<Integer>]
    # @param limit Integer how many chars we have left
    def valid_tweet?(tikis, limit)
      tweet = NLP.reconstruct(tikis, @tokens)
      tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
    end

    # Generate some text
    # @param limit [Integer] available characters
    # @param generator [SuffixGenerator, nil]
    # @param retry_limit [Integer] how many times to retry on invalid tweet
    # @return [String]
    def make_statement(limit=140, generator=nil, retry_limit=10)
      responding = !generator.nil?
      generator ||= SuffixGenerator.build(@sentences)

      retries = 0
      tweet = ""

      while (tikis = generator.generate(3, :bigrams)) do
        #log "Attempting to produce tweet try #{retries+1}/#{retry_limit}"
        break if (tikis.length > 3 || responding) && valid_tweet?(tikis, limit)

        retries += 1
        break if retries >= retry_limit
      end

      if verbatim?(tikis) && tikis.length > 3 # We made a verbatim tweet by accident
        #log "Attempting to produce unigram tweet try #{retries+1}/#{retry_limit}"
        while (tikis = generator.generate(3, :unigrams)) do
          break if valid_tweet?(tikis, limit) && !verbatim?(tikis)

          retries += 1
          break if retries >= retry_limit
        end
      end

      tweet = NLP.reconstruct(tikis, @tokens)

      if retries >= retry_limit
        log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
      end

      fix tweet
    end

    # Test if a sentence has been copied verbatim from original
    # @param tikis [Array<Integer>]
    # @return [Boolean]
    def verbatim?(tikis)
      @sentences.include?(tikis) || @mentions.include?(tikis)
    end

    # Finds relevant and slightly relevant tokenized sentences to input
    # comparing non-stopword token overlaps
    # @param sentences [Array<Array<Integer>>]
    # @param input [String]
    # @return [Array<Array<Array<Integer>>, Array<Array<Integer>>>]
    def find_relevant(sentences, input)
      relevant = []
      slightly_relevant = []

      tokenized = NLP.tokenize(input).map(&:downcase)

      sentences.each do |sent|
        tokenized.each do |token|
          if sent.map { |tiki| @tokens[tiki].downcase }.include?(token)
            relevant << sent unless NLP.stopword?(token)
            slightly_relevant << sent
          end
        end
      end

      [relevant, slightly_relevant]
    end

    # Generates a response by looking for related sentences
    # in the corpus and building a smaller generator from these
    # @param input [String]
    # @param limit [Integer] characters available for response
    # @param sentences [Array<Array<Integer>>]
    # @return [String]
    def make_response(input, limit=140, sentences=@mentions)
      # Prefer mentions
      relevant, slightly_relevant = find_relevant(sentences, input)

      if relevant.length >= 3
        generator = SuffixGenerator.build(relevant)
        make_statement(limit, generator)
      elsif slightly_relevant.length >= 5
        generator = SuffixGenerator.build(slightly_relevant)
        make_statement(limit, generator)
      elsif sentences.equal?(@mentions)
        make_response(input, limit, @sentences)
      else
        make_statement(limit)
      end
    end
  end
end
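For context, a minimal usage sketch of the Model class above. This is not part of the gem itself; the corpus and model paths are hypothetical, and it assumes the gem (including its top-level `log` helper) is loaded via `require 'twitter_ebooks'`.

    # Sketch: build a model from a tweet-archive JSON corpus, save it, and reuse it.
    require 'twitter_ebooks'

    model = Ebooks::Model.consume("corpus/example.json")   # hypothetical corpus path
    model.save("model/example.model")                       # hypothetical model path

    model = Ebooks::Model.load("model/example.model")
    puts model.make_statement(140)                          # free-form tweet text
    puts model.make_response("how are the cats?", 140)      # reply biased toward mention data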
data/lib/twitter_ebooks/nlp.rb
@@ -0,0 +1,195 @@
# encoding: utf-8
require 'fast-stemmer'
require 'highscore'
require 'htmlentities'

module Ebooks
  module NLP
    # We deliberately limit our punctuation handling to stuff we can do consistently
    # It'll just be a part of another token if we don't split it out, and that's fine
    PUNCTUATION = ".?!,"

    # Lazy-load NLP libraries and resources
    # Some of this stuff is pretty heavy and we don't necessarily need
    # to be using it all of the time

    # Lazily loads an array of stopwords
    # Stopwords are common words that should often be ignored
    # @return [Array<String>]
    def self.stopwords
      @stopwords ||= File.exists?('stopwords.txt') ? File.read('stopwords.txt').split : []
    end

    # Lazily loads an array of known English nouns
    # @return [Array<String>]
    def self.nouns
      @nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
    end

    # Lazily loads an array of known English adjectives
    # @return [Array<String>]
    def self.adjectives
      @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
    end

    # Lazily load part-of-speech tagging library
    # This can determine whether a word is being used as a noun/adjective/verb
    # @return [EngTagger]
    def self.tagger
      require 'engtagger'
      @tagger ||= EngTagger.new
    end

    # Lazily load HTML entity decoder
    # @return [HTMLEntities]
    def self.htmlentities
      @htmlentities ||= HTMLEntities.new
    end

    ### Utility functions

    # Normalize some strange unicode punctuation variants
    # @param text [String]
    # @return [String]
    def self.normalize(text)
      htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
    end

    # Split text into sentences
    # We use ad hoc approach because fancy libraries do not deal
    # especially well with tweet formatting, and we can fake solving
    # the quote problem during generation
    # @param text [String]
    # @return [Array<String>]
    def self.sentences(text)
      text.split(/\n+|(?<=[.?!])\s+/)
    end

    # Split a sentence into word-level tokens
    # As above, this is ad hoc because tokenization libraries
    # do not behave well wrt. things like emoticons and timestamps
    # @param sentence [String]
    # @return [Array<String>]
    def self.tokenize(sentence)
      regex = /\s+|(?<=[#{PUNCTUATION}]\s)(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+\s)/
      sentence.split(regex)
    end

    # Get the 'stem' form of a word e.g. 'cats' -> 'cat'
    # @param word [String]
    # @return [String]
    def self.stem(word)
      Stemmer::stem_word(word.downcase)
    end

    # Use highscore gem to find interesting keywords in a corpus
    # @param text [String]
    # @return [Highscore::Keywords]
    def self.keywords(text)
      # Preprocess to remove stopwords (highscore's blacklist is v. slow)
      text = NLP.tokenize(text).reject { |t| stopword?(t) }.join(' ')

      text = Highscore::Content.new(text)

      text.configure do
        #set :multiplier, 2
        #set :upper_case, 3
        #set :long_words, 2
        #set :long_words_threshold, 15
        #set :vowels, 1 # => default: 0 = not considered
        #set :consonants, 5 # => default: 0 = not considered
        #set :ignore_case, true # => default: false
        set :word_pattern, /(?<!@)(?<=\s)[\p{Word}']+/ # => default: /\w+/
        #set :stemming, true # => default: false
      end

      text.keywords
    end

    # Builds a proper sentence from a list of tikis
    # @param tikis [Array<Integer>]
    # @param tokens [Array<String>]
    # @return [String]
    def self.reconstruct(tikis, tokens)
      text = ""
      last_token = nil
      tikis.each do |tiki|
        next if tiki == INTERIM
        token = tokens[tiki]
        text += ' ' if last_token && space_between?(last_token, token)
        text += token
        last_token = token
      end
      text
    end

    # Determine if we need to insert a space between two tokens
    # @param token1 [String]
    # @param token2 [String]
    # @return [Boolean]
    def self.space_between?(token1, token2)
      p1 = self.punctuation?(token1)
      p2 = self.punctuation?(token2)
      if p1 && p2 # "foo?!"
        false
      elsif !p1 && p2 # "foo."
        false
      elsif p1 && !p2 # "foo. rah"
        true
      else # "foo rah"
        true
      end
    end

    # Is this token comprised of punctuation?
    # @param token [String]
    # @return [Boolean]
    def self.punctuation?(token)
      (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
    end

    # Is this token a stopword?
    # @param token [String]
    # @return [Boolean]
    def self.stopword?(token)
      @stopword_set ||= stopwords.map(&:downcase).to_set
      @stopword_set.include?(token.downcase)
    end

    # Determine if a sample of text contains unmatched brackets or quotes
    # This is one of the more frequent and noticeable failure modes for
    # the generator; we can just tell it to retry
    # @param text [String]
    # @return [Boolean]
    def self.unmatched_enclosers?(text)
      enclosers = ['**', '""', '()', '[]', '``', "''"]
      enclosers.each do |pair|
        starter = Regexp.new('(\W|^)' + Regexp.escape(pair[0]) + '\S')
        ender = Regexp.new('\S' + Regexp.escape(pair[1]) + '(\W|$)')

        opened = 0

        tokenize(text).each do |token|
          opened += 1 if token.match(starter)
          opened -= 1 if token.match(ender)

          return true if opened < 0 # Too many ends!
        end

        return true if opened != 0 # Mismatch somewhere.
      end

      false
    end

    # Determine if a2 is a subsequence of a1
    # @param a1 [Array]
    # @param a2 [Array]
    # @return [Boolean]
    def self.subseq?(a1, a2)
      !a1.each_index.find do |i|
        a1[i...i+a2.length] == a2
      end.nil?
    end
  end
end
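To illustrate how tokenize and reconstruct round-trip a sentence, here is a small hand-built sketch. The tiki array is constructed manually rather than through Model#mass_tikify, and it assumes the gem (which defines the INTERIM sentinel referenced by reconstruct) is loaded.

    # Sketch: tokenize a sentence, index its tokens, and rebuild the text.
    require 'twitter_ebooks'

    tokens = Ebooks::NLP.tokenize("Look, a cat! Wow, okay.")
    vocab  = tokens.uniq
    tikis  = tokens.map { |t| vocab.index(t) }      # crude token -> index mapping

    Ebooks::NLP.reconstruct(tikis, vocab)
    # => "Look, a cat! Wow, okay."  (spaces re-inserted only where space_between? says so)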
data/lib/twitter_ebooks/suffix.rb
@@ -0,0 +1,104 @@
# encoding: utf-8

module Ebooks
  # This generator uses data similar to a Markov model, but
  # instead of making a chain by looking up bigrams it uses the
  # positions to randomly replace token array suffixes in one sentence
  # with matching suffixes in another
  class SuffixGenerator
    # Build a generator from a corpus of tikified sentences
    # "tikis" are token indexes-- a way of representing words
    # and punctuation as their integer position in a big array
    # of such tokens
    # @param sentences [Array<Array<Integer>>]
    # @return [SuffixGenerator]
    def self.build(sentences)
      SuffixGenerator.new(sentences)
    end

    def initialize(sentences)
      @sentences = sentences.reject { |s| s.empty? }
      @unigrams = {}
      @bigrams = {}

      @sentences.each_with_index do |tikis, i|
        if (i % 10000 == 0) then
          log("Building: sentence #{i} of #{sentences.length}")
        end
        last_tiki = INTERIM
        tikis.each_with_index do |tiki, j|
          @unigrams[last_tiki] ||= []
          @unigrams[last_tiki] << [i, j]

          @bigrams[last_tiki] ||= {}
          @bigrams[last_tiki][tiki] ||= []

          if j == tikis.length-1 # Mark sentence endings
            @unigrams[tiki] ||= []
            @unigrams[tiki] << [i, INTERIM]
            @bigrams[last_tiki][tiki] << [i, INTERIM]
          else
            @bigrams[last_tiki][tiki] << [i, j+1]
          end

          last_tiki = tiki
        end
      end

      self
    end

    # Generate a recombined sequence of tikis
    # @param passes [Integer] number of times to recombine
    # @param n [Symbol] :unigrams or :bigrams (affects how conservative the model is)
    # @return [Array<Integer>]
    def generate(passes=5, n=:unigrams)
      index = rand(@sentences.length)
      tikis = @sentences[index]
      used = [index] # Sentences we've already used
      verbatim = [tikis] # Verbatim sentences to avoid reproducing

      0.upto(passes-1) do
        varsites = {} # Map bigram start site => next tiki alternatives

        tikis.each_with_index do |tiki, i|
          next_tiki = tikis[i+1]
          break if next_tiki.nil?

          alternatives = (n == :unigrams) ? @unigrams[next_tiki] : @bigrams[tiki][next_tiki]
          # Filter out suffixes from previous sentences
          alternatives.reject! { |a| a[1] == INTERIM || used.include?(a[0]) }
          varsites[i] = alternatives unless alternatives.empty?
        end

        variant = nil
        varsites.to_a.shuffle.each do |site|
          start = site[0]

          site[1].shuffle.each do |alt|
            verbatim << @sentences[alt[0]]
            suffix = @sentences[alt[0]][alt[1]..-1]
            potential = tikis[0..start+1] + suffix

            # Ensure we're not just rebuilding some segment of another sentence
            unless verbatim.find { |v| NLP.subseq?(v, potential) || NLP.subseq?(potential, v) }
              used << alt[0]
              variant = potential
              break
            end
          end

          break if variant
        end

        # If we failed to produce a variation from any alternative, there
        # is no use running additional passes-- they'll have the same result.
        break if variant.nil?

        tikis = variant
      end

      tikis
    end
  end
end
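A brief sketch of how this generator is driven, mirroring what Model#make_statement above does (the retry and length checks are omitted here; `model` is assumed to be an Ebooks::Model that has already consumed a corpus):

    # Sketch: build a generator from tikified sentences and recombine them.
    generator = Ebooks::SuffixGenerator.build(model.sentences)

    tikis = generator.generate(3, :bigrams)                  # conservative recombination
    tikis = generator.generate(3, :unigrams) if model.verbatim?(tikis)  # loosen if we copied a sentence

    puts Ebooks::NLP.reconstruct(tikis, model.tokens)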
data/lib/twitter_ebooks/sync.rb
@@ -0,0 +1,52 @@
#!/usr/bin/env ruby
# encoding: utf-8

require 'twitter'
require 'json'
require 'mini_magick'
require 'open-uri'
require 'pry'

module Ebooks
  class Sync

    def self.run(botname, username)
      bot = Ebooks::Bot.get(botname)
      bot.configure
      source_user = username
      ebooks_user = bot.username
      user = bot.twitter.user(source_user)
      if user.profile_image_url then
        Ebooks::Sync::get(user.profile_image_url(:original), "image/#{source_user}_avatar")
        avatar = MiniMagick::Image.open("image/#{source_user}_avatar")
        avatar.flip
        avatar.write("image/#{ebooks_user}_avatar")
        avatar64 = Base64.encode64(File.read("image/#{ebooks_user}_avatar"))
        bot.twitter.update_profile_image(avatar64)
        p "Updated profile image for #{ebooks_user} from #{source_user}."
      else
        p "#{source_user} does not have a profile image to clone."
      end
      if user.profile_banner_url then
        Ebooks::Sync::get(user.profile_banner_url, "image/#{source_user}banner")
        banner = MiniMagick::Image.open("image/#{source_user}banner")
        banner.flip
        banner.write("image/#{ebooks_user}_banner")
        banner64 = Base64.encode64(File.read("image/#{ebooks_user}_banner"))
        bot.twitter.update_profile_banner(banner64)
        p "Updated cover image for #{ebooks_user} from #{source_user}."
      else
        p "#{source_user} does not have a cover image to clone."
      end
    end

    def self.get(url, destination)
      File.open(destination, "wb") do |saved_file|
        open(url, "rb") do |read_file|
          saved_file.write(read_file.read)
        end
      end
    end

  end
end
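For completeness, a hypothetical invocation of the sync helper above. The bot name and source account are placeholders; it assumes a bot of that name is registered in the project's bots.rb with valid credentials, and that an image/ directory exists for the downloaded files.

    # Sketch: flip and copy a source account's avatar and banner onto the bot.
    require 'twitter_ebooks'

    Ebooks::Sync.run("my_ebooks_bot", "some_user")   # placeholder bot and source names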