twitter_ebooks 2.3.2 → 3.0.0
This diff shows the content of publicly released package versions as published to their respective registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/.travis.yml +7 -0
- data/README.md +60 -30
- data/bin/ebooks +239 -117
- data/lib/twitter_ebooks.rb +2 -2
- data/lib/twitter_ebooks/archive.rb +12 -9
- data/lib/twitter_ebooks/bot.rb +343 -109
- data/lib/twitter_ebooks/model.rb +104 -22
- data/lib/twitter_ebooks/nlp.rb +46 -13
- data/lib/twitter_ebooks/suffix.rb +9 -1
- data/lib/twitter_ebooks/version.rb +1 -1
- data/skeleton/Gemfile +1 -1
- data/skeleton/Procfile +1 -1
- data/skeleton/bots.rb +35 -22
- data/spec/bot_spec.rb +178 -0
- data/spec/model_spec.rb +18 -2
- data/twitter_ebooks.gemspec +7 -3
- metadata +72 -20
- data/lib/twitter_ebooks/markov.rb +0 -82
- data/skeleton/run.rb +0 -9
- data/test/corpus/0xabad1dea.tweets +0 -14696
- data/test/keywords.rb +0 -18
- data/test/tokenize.rb +0 -18
data/lib/twitter_ebooks/model.rb
CHANGED
@@ -8,12 +8,41 @@ require 'csv'
 
 module Ebooks
   class Model
-
+    # @return [Array<String>]
+    # An array of unique tokens. This is the main source of actual strings
+    # in the model. Manipulation of a token is done using its index
+    # in this array, which we call a "tiki"
+    attr_accessor :tokens
+
+    # @return [Array<Array<Integer>>]
+    # Sentences represented by arrays of tikis
+    attr_accessor :sentences
+
+    # @return [Array<Array<Integer>>]
+    # Sentences derived from Twitter mentions
+    attr_accessor :mentions
+
+    # @return [Array<String>]
+    # The top 200 most important keywords, in descending order
+    attr_accessor :keywords
+
+    # Generate a new model from a corpus file
+    # @param path [String]
+    # @return [Ebooks::Model]
+    def self.consume(path)
+      Model.new.consume(path)
+    end
 
-
-
+    # Generate a new model from multiple corpus files
+    # @param paths [Array<String>]
+    # @return [Ebooks::Model]
+    def self.consume_all(paths)
+      Model.new.consume_all(paths)
     end
 
+    # Load a saved model
+    # @param path [String]
+    # @return [Ebooks::Model]
     def self.load(path)
       model = Model.new
       model.instance_eval do
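The new class-level API above wraps corpus ingestion, multi-corpus merging, and persistence. A minimal usage sketch (the file paths here are hypothetical):

    require 'twitter_ebooks'

    # Build one model from several corpora, then save it for reuse
    model = Ebooks::Model.consume_all(['corpus/tweets.json', 'corpus/archive.csv'])
    model.save('model/combined.model')

    # Reload it later without reparsing the corpora
    model = Ebooks::Model.load('model/combined.model')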
@@ -26,6 +55,8 @@ module Ebooks
       model
     end
 
+    # Save model to a file
+    # @param path [String]
     def save(path)
       File.open(path, 'wb') do |f|
         f.write(Marshal.dump({
@@ -39,19 +70,22 @@ module Ebooks
     end
 
     def initialize
-      # This is the only source of actual strings in the model. It is
-      # an array of unique tokens. Manipulation of a token is mostly done
-      # using its index in this array, which we call a "tiki"
       @tokens = []
 
       # Reverse lookup tiki by token, for faster generation
       @tikis = {}
     end
 
+    # Reverse lookup a token index from a token
+    # @param token [String]
+    # @return [Integer]
     def tikify(token)
       @tikis[token] or (@tokens << token and @tikis[token] = @tokens.length-1)
     end
 
+    # Convert a body of text into arrays of tikis
+    # @param text [String]
+    # @return [Array<Array<Integer>>]
     def mass_tikify(text)
       sentences = NLP.sentences(text)
 
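tikify and mass_tikify implement the model's core compression: each unique token string is stored once in @tokens, and sentences become arrays of integer indices ("tikis"). The same lookup logic as a standalone sketch:

    tokens = []
    tikis  = {}

    tikify = lambda do |token|
      tikis[token] or (tokens << token and tikis[token] = tokens.length - 1)
    end

    tikify.call('hello')  # => 0 (new token appended)
    tikify.call('world')  # => 1
    tikify.call('hello')  # => 0 (reverse lookup hit; nothing appended)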
@@ -65,9 +99,10 @@ module Ebooks
       end
     end
 
+    # Consume a corpus into this model
+    # @param path [String]
     def consume(path)
       content = File.read(path, :encoding => 'utf-8')
-      @hash = Digest::MD5.hexdigest(content)
 
       if path.split('.')[-1] == "json"
         log "Reading json corpus from #{path}"
@@ -87,6 +122,12 @@ module Ebooks
         lines = content.split("\n")
       end
 
+      consume_lines(lines)
+    end
+
+    # Consume a sequence of lines
+    # @param lines [Array<String>]
+    def consume_lines(lines)
       log "Removing commented lines and sorting mentions"
 
       statements = []
@@ -113,30 +154,62 @@ module Ebooks
       @mentions = mass_tikify(mention_text)
 
       log "Ranking keywords"
-      @keywords = NLP.keywords(text)
+      @keywords = NLP.keywords(text).top(200).map(&:to_s)
 
       self
     end
 
-
-
-
-
-
-
-
-
-
-
+    # Consume multiple corpuses into this model
+    # @param paths [Array<String>]
+    def consume_all(paths)
+      lines = []
+      paths.each do |path|
+        content = File.read(path, :encoding => 'utf-8')
+
+        if path.split('.')[-1] == "json"
+          log "Reading json corpus from #{path}"
+          l = JSON.parse(content).map do |tweet|
+            tweet['text']
+          end
+          lines.concat(l)
+        elsif path.split('.')[-1] == "csv"
+          log "Reading CSV corpus from #{path}"
+          content = CSV.parse(content)
+          header = content.shift
+          text_col = header.index('text')
+          l = content.map do |tweet|
+            tweet[text_col]
+          end
+          lines.concat(l)
+        else
+          log "Reading plaintext corpus from #{path}"
+          l = content.split("\n")
+          lines.concat(l)
+        end
+      end
+      consume_lines(lines)
+    end
 
-
+    # Correct encoding issues in generated text
+    # @param text [String]
+    # @return [String]
+    def fix(text)
+      NLP.htmlentities.decode text
     end
 
+    # Check if an array of tikis comprises a valid tweet
+    # @param tikis [Array<Integer>]
+    # @param limit Integer how many chars we have left
     def valid_tweet?(tikis, limit)
       tweet = NLP.reconstruct(tikis, @tokens)
       tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
     end
 
+    # Generate some text
+    # @param limit [Integer] available characters
+    # @param generator [SuffixGenerator, nil]
+    # @param retry_limit [Integer] how many times to retry on duplicates
+    # @return [String]
     def make_statement(limit=140, generator=nil, retry_limit=10)
       responding = !generator.nil?
       generator ||= SuffixGenerator.build(@sentences)
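Keyword extraction now stores plain strings rather than Highscore keyword objects: NLP.keywords returns a Highscore::Keywords collection, and .top(200).map(&:to_s) keeps only the 200 highest-ranked terms. Roughly (corpus text hypothetical):

    text = File.read('corpus/tweets.txt', :encoding => 'utf-8')
    Ebooks::NLP.keywords(text).top(5).map(&:to_s)  # => the five highest-ranked keyword strings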
@@ -171,12 +244,17 @@ module Ebooks
     end
 
     # Test if a sentence has been copied verbatim from original
-
-
+    # @param tikis [Array<Integer>]
+    # @return [Boolean]
+    def verbatim?(tikis)
+      @sentences.include?(tikis) || @mentions.include?(tikis)
     end
 
-    # Finds
+    # Finds relevant and slightly relevant tokenized sentences to input
     # comparing non-stopword token overlaps
+    # @param sentences [Array<Array<Integer>>]
+    # @param input [String]
+    # @return [Array<Array<Array<Integer>>, Array<Array<Integer>>>]
     def find_relevant(sentences, input)
       relevant = []
       slightly_relevant = []
@@ -197,6 +275,10 @@ module Ebooks
 
     # Generates a response by looking for related sentences
     # in the corpus and building a smaller generator from these
+    # @param input [String]
+    # @param limit [Integer] characters available for response
+    # @param sentences [Array<Array<Integer>>]
+    # @return [String]
     def make_response(input, limit=140, sentences=@mentions)
       # Prefer mentions
       relevant, slightly_relevant = find_relevant(sentences, input)
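make_response narrows the corpus to sentences sharing non-stopword tokens with the input and builds a smaller generator from that subset, so replies stay on topic. A usage sketch (model path hypothetical):

    model = Ebooks::Model.load('model/example.model')
    # a limit below 140 leaves room for the "@username " reply prefix
    model.make_response('what do you think of cats?', 126)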
data/lib/twitter_ebooks/nlp.rb
CHANGED
@@ -12,31 +12,35 @@ module Ebooks
     # Some of this stuff is pretty heavy and we don't necessarily need
     # to be using it all of the time
 
+    # Lazily loads an array of stopwords
+    # Stopwords are common English words that should often be ignored
+    # @return [Array<String>]
     def self.stopwords
       @stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
     end
 
+    # Lazily loads an array of known English nouns
+    # @return [Array<String>]
     def self.nouns
       @nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
     end
 
+    # Lazily loads an array of known English adjectives
+    # @return [Array<String>]
     def self.adjectives
       @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
     end
 
-    #
+    # Lazily load part-of-speech tagging library
+    # This can determine whether a word is being used as a noun/adjective/verb
+    # @return [EngTagger]
     def self.tagger
       require 'engtagger'
       @tagger ||= EngTagger.new
     end
 
-    #
-
-      require 'gingerice'
-      Gingerice::Parser.new # No caching for this one
-    end
-
-    # For decoding html entities
+    # Lazily load HTML entity decoder
+    # @return [HTMLEntities]
     def self.htmlentities
       require 'htmlentities'
       @htmlentities ||= HTMLEntities.new
@@ -44,7 +48,9 @@ module Ebooks
 
     ### Utility functions
 
-    #
+    # Normalize some strange unicode punctuation variants
+    # @param text [String]
+    # @return [String]
     def self.normalize(text)
       htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
     end
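A worked example of normalize, following the gsub chain and entity decoding above:

    Ebooks::NLP.normalize('“Hello…” &amp; goodbye')
    # => "\"Hello...\" & goodbye"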
@@ -53,6 +59,8 @@ module Ebooks
     # We use ad hoc approach because fancy libraries do not deal
     # especially well with tweet formatting, and we can fake solving
     # the quote problem during generation
+    # @param text [String]
+    # @return [Array<String>]
     def self.sentences(text)
       text.split(/\n+|(?<=[.?!])\s+/)
     end
@@ -60,15 +68,23 @@ module Ebooks
     # Split a sentence into word-level tokens
     # As above, this is ad hoc because tokenization libraries
     # do not behave well wrt. things like emoticons and timestamps
+    # @param sentence [String]
+    # @return [Array<String>]
     def self.tokenize(sentence)
       regex = /\s+|(?<=[#{PUNCTUATION}]\s)(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+\s)/
       sentence.split(regex)
     end
 
+    # Get the 'stem' form of a word e.g. 'cats' -> 'cat'
+    # @param word [String]
+    # @return [String]
     def self.stem(word)
       Stemmer::stem_word(word.downcase)
     end
 
+    # Use highscore gem to find interesting keywords in a corpus
+    # @param text [String]
+    # @return [Highscore::Keywords]
     def self.keywords(text)
       # Preprocess to remove stopwords (highscore's blacklist is v. slow)
       text = NLP.tokenize(text).reject { |t| stopword?(t) }.join(' ')
@@ -90,7 +106,10 @@ module Ebooks
       text.keywords
     end
 
-    #
+    # Builds a proper sentence from a list of tikis
+    # @param tikis [Array<Integer>]
+    # @param tokens [Array<String>]
+    # @return [String]
     def self.reconstruct(tikis, tokens)
       text = ""
       last_token = nil
@@ -105,6 +124,9 @@ module Ebooks
     end
 
     # Determine if we need to insert a space between two tokens
+    # @param token1 [String]
+    # @param token2 [String]
+    # @return [Boolean]
     def self.space_between?(token1, token2)
       p1 = self.punctuation?(token1)
       p2 = self.punctuation?(token2)
@@ -119,10 +141,16 @@ module Ebooks
       end
     end
 
+    # Is this token comprised of punctuation?
+    # @param token [String]
+    # @return [Boolean]
     def self.punctuation?(token)
       (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
     end
 
+    # Is this token a stopword?
+    # @param token [String]
+    # @return [Boolean]
     def self.stopword?(token)
       @stopword_set ||= stopwords.map(&:downcase).to_set
       @stopword_set.include?(token.downcase)
@@ -130,7 +158,9 @@ module Ebooks
 
     # Determine if a sample of text contains unmatched brackets or quotes
     # This is one of the more frequent and noticeable failure modes for
-    # the
+    # the generator; we can just tell it to retry
+    # @param text [String]
+    # @return [Boolean]
     def self.unmatched_enclosers?(text)
       enclosers = ['**', '""', '()', '[]', '``', "''"]
       enclosers.each do |pair|
@@ -153,10 +183,13 @@ module Ebooks
     end
 
     # Determine if a2 is a subsequence of a1
+    # @param a1 [Array]
+    # @param a2 [Array]
+    # @return [Boolean]
     def self.subseq?(a1, a2)
-      a1.each_index.find do |i|
+      !a1.each_index.find do |i|
        a1[i...i+a2.length] == a2
-      end
+      end.nil?
     end
   end
 end
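The subseq? change fixes its return value: Enumerable#find yields the matching start index (or nil), so the old version returned an Integer rather than a boolean; wrapping it in !(...).nil? makes it a true predicate. Per the new implementation:

    Ebooks::NLP.subseq?([5, 6, 7, 8], [6, 7])  # => true  (window found at index 1)
    Ebooks::NLP.subseq?([5, 6, 7, 8], [7, 6])  # => false (no matching window)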
data/lib/twitter_ebooks/suffix.rb
CHANGED
@@ -1,11 +1,14 @@
 # encoding: utf-8
 
 module Ebooks
-  # This generator uses data identical to
+  # This generator uses data identical to a markov model, but
   # instead of making a chain by looking up bigrams it uses the
   # positions to randomly replace suffixes in one sentence with
   # matching suffixes in another
   class SuffixGenerator
+    # Build a generator from a corpus of tikified sentences
+    # @param sentences [Array<Array<Integer>>]
+    # @return [SuffixGenerator]
     def self.build(sentences)
       SuffixGenerator.new(sentences)
     end
@@ -39,6 +42,11 @@ module Ebooks
       self
     end
 
+
+    # Generate a recombined sequence of tikis
+    # @param passes [Integer] number of times to recombine
+    # @param n [Symbol] :unigrams or :bigrams (affects how conservative the model is)
+    # @return [Array<Integer>]
     def generate(passes=5, n=:unigrams)
       index = rand(@sentences.length)
       tikis = @sentences[index]
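In context, Model#make_statement drives this generator. A minimal sketch of using it directly, assuming a trained model as above:

    generator = Ebooks::SuffixGenerator.build(model.sentences)
    tikis = generator.generate(5, :bigrams)  # :bigrams recombines more conservatively
    Ebooks::NLP.reconstruct(tikis, model.tokens)  # back to a string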
data/skeleton/Gemfile
CHANGED
data/skeleton/Procfile
CHANGED
@@ -1 +1 @@
-worker:
+worker: bundle exec ebooks start
data/skeleton/bots.rb
CHANGED
@@ -1,42 +1,55 @@
-#!/usr/bin/env ruby
-
 require 'twitter_ebooks'
 
 # This is an example bot definition with event handlers commented out
-# You can define as many
+# You can define and instantiate as many bots as you like
+
+class MyBot < Ebooks::Bot
+  # Configuration here applies to all MyBots
+  def configure
+    # Consumer details come from registering an app at https://dev.twitter.com/
+    # Once you have consumer details, use "ebooks auth" for new access tokens
+    self.consumer_key = '' # Your app consumer key
+    self.consumer_secret = '' # Your app consumer secret
+
+    # Users to block instead of interacting with
+    self.blacklist = ['tnietzschequote']
+
+    # Range in seconds to randomize delay when bot.delay is called
+    self.delay_range = 1..6
+  end
 
-
-
-
-
-
-
-
+  def on_startup
+    scheduler.every '24h' do
+      # Tweet something every 24 hours
+      # See https://github.com/jmettraux/rufus-scheduler
+      # bot.tweet("hi")
+      # bot.pictweet("hi", "cuteselfie.jpg")
+    end
+  end
 
-
+  def on_message(dm)
     # Reply to a DM
     # bot.reply(dm, "secret secrets")
   end
 
-
+  def on_follow(user)
     # Follow a user back
     # bot.follow(user[:screen_name])
   end
 
-
+  def on_mention(tweet)
     # Reply to a mention
-    # bot.reply(tweet, meta[:reply_prefix] + "oh hullo")
+    # bot.reply(tweet, meta(tweet)[:reply_prefix] + "oh hullo")
   end
 
-
+  def on_timeline(tweet)
     # Reply to a tweet in the bot's timeline
-    # bot.reply(tweet, meta[:reply_prefix] + "nice tweet")
+    # bot.reply(tweet, meta(tweet)[:reply_prefix] + "nice tweet")
   end
+end
 
-
-
-
-
-    # bot.pictweet("hi", "cuteselfie.jpg", ":possibly_sensitive => true")
-end
+# Make a MyBot and attach it to an account
+MyBot.new("{{BOT_NAME}}") do |bot|
+  bot.access_token = "" # Token connecting the app to this account
+  bot.access_token_secret = "" # Secret connecting the app to this account
 end