twitter_ebooks 2.3.2 → 3.0.0

@@ -8,12 +8,41 @@ require 'csv'
 
 module Ebooks
   class Model
-    attr_accessor :hash, :tokens, :sentences, :mentions, :keywords
+    # @return [Array<String>]
+    # An array of unique tokens. This is the main source of actual strings
+    # in the model. Manipulation of a token is done using its index
+    # in this array, which we call a "tiki"
+    attr_accessor :tokens
+
+    # @return [Array<Array<Integer>>]
+    # Sentences represented by arrays of tikis
+    attr_accessor :sentences
+
+    # @return [Array<Array<Integer>>]
+    # Sentences derived from Twitter mentions
+    attr_accessor :mentions
+
+    # @return [Array<String>]
+    # The top 200 most important keywords, in descending order
+    attr_accessor :keywords
+
+    # Generate a new model from a corpus file
+    # @param path [String]
+    # @return [Ebooks::Model]
+    def self.consume(path)
+      Model.new.consume(path)
+    end
 
-    def self.consume(txtpath)
-      Model.new.consume(txtpath)
+    # Generate a new model from multiple corpus files
+    # @param paths [Array<String>]
+    # @return [Ebooks::Model]
+    def self.consume_all(paths)
+      Model.new.consume_all(paths)
     end
 
+    # Load a saved model
+    # @param path [String]
+    # @return [Ebooks::Model]
     def self.load(path)
       model = Model.new
       model.instance_eval do
@@ -26,6 +55,8 @@ module Ebooks
       model
     end
 
+    # Save model to a file
+    # @param path [String]
    def save(path)
      File.open(path, 'wb') do |f|
        f.write(Marshal.dump({
@@ -39,19 +70,22 @@ module Ebooks
     end
 
     def initialize
-      # This is the only source of actual strings in the model. It is
-      # an array of unique tokens. Manipulation of a token is mostly done
-      # using its index in this array, which we call a "tiki"
       @tokens = []
 
       # Reverse lookup tiki by token, for faster generation
       @tikis = {}
     end
 
+    # Reverse lookup a token index from a token
+    # @param token [String]
+    # @return [Integer]
     def tikify(token)
       @tikis[token] or (@tokens << token and @tikis[token] = @tokens.length-1)
     end
 
+    # Convert a body of text into arrays of tikis
+    # @param text [String]
+    # @return [Array<Array<Integer>>]
    def mass_tikify(text)
      sentences = NLP.sentences(text)
 
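To make the "tiki" indirection concrete, here is a small illustrative sketch (not gem code) of what tikify does: each distinct token string is stored once in the token table, and sentences become arrays of integer indices into it.

    tokens = []
    tikis  = {}

    # Mirrors Model#tikify: return the existing index, or append the
    # new token and index it
    tikify = lambda do |token|
      tikis[token] or (tokens << token and tikis[token] = tokens.length - 1)
    end

    %w[the cat saw the dog].map { |t| tikify.call(t) } # => [0, 1, 2, 0, 3]
    tokens                                             # => ["the", "cat", "saw", "dog"]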
@@ -65,9 +99,10 @@ module Ebooks
       end
     end
 
+    # Consume a corpus into this model
+    # @param path [String]
     def consume(path)
       content = File.read(path, :encoding => 'utf-8')
-      @hash = Digest::MD5.hexdigest(content)
 
       if path.split('.')[-1] == "json"
         log "Reading json corpus from #{path}"
@@ -87,6 +122,12 @@ module Ebooks
         lines = content.split("\n")
       end
 
+      consume_lines(lines)
+    end
+
+    # Consume a sequence of lines
+    # @param lines [Array<String>]
+    def consume_lines(lines)
       log "Removing commented lines and sorting mentions"
 
       statements = []
@@ -113,30 +154,62 @@ module Ebooks
       @mentions = mass_tikify(mention_text)
 
       log "Ranking keywords"
-      @keywords = NLP.keywords(text)
+      @keywords = NLP.keywords(text).top(200).map(&:to_s)
 
       self
     end
 
-    def fix(tweet)
-      # This seems to require an external api call
-      #begin
-      #  fixer = NLP.gingerice.parse(tweet)
-      #  log fixer if fixer['corrections']
-      #  tweet = fixer['result']
-      #rescue Exception => e
-      #  log e.message
-      #  log e.backtrace
-      #end
+    # Consume multiple corpuses into this model
+    # @param paths [Array<String>]
+    def consume_all(paths)
+      lines = []
+      paths.each do |path|
+        content = File.read(path, :encoding => 'utf-8')
+
+        if path.split('.')[-1] == "json"
+          log "Reading json corpus from #{path}"
+          l = JSON.parse(content).map do |tweet|
+            tweet['text']
+          end
+          lines.concat(l)
+        elsif path.split('.')[-1] == "csv"
+          log "Reading CSV corpus from #{path}"
+          content = CSV.parse(content)
+          header = content.shift
+          text_col = header.index('text')
+          l = content.map do |tweet|
+            tweet[text_col]
+          end
+          lines.concat(l)
+        else
+          log "Reading plaintext corpus from #{path}"
+          l = content.split("\n")
+          lines.concat(l)
+        end
+      end
+      consume_lines(lines)
+    end
 
-      NLP.htmlentities.decode tweet
+    # Correct encoding issues in generated text
+    # @param text [String]
+    # @return [String]
+    def fix(text)
+      NLP.htmlentities.decode text
     end
 
+    # Check if an array of tikis comprises a valid tweet
+    # @param tikis [Array<Integer>]
+    # @param limit Integer how many chars we have left
     def valid_tweet?(tikis, limit)
       tweet = NLP.reconstruct(tikis, @tokens)
       tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
     end
 
+    # Generate some text
+    # @param limit [Integer] available characters
+    # @param generator [SuffixGenerator, nil]
+    # @param retry_limit [Integer] how many times to retry on duplicates
+    # @return [String]
     def make_statement(limit=140, generator=nil, retry_limit=10)
       responding = !generator.nil?
       generator ||= SuffixGenerator.build(@sentences)
@@ -171,12 +244,17 @@ module Ebooks
     end
 
     # Test if a sentence has been copied verbatim from original
-    def verbatim?(tokens)
-      @sentences.include?(tokens) || @mentions.include?(tokens)
+    # @param tikis [Array<Integer>]
+    # @return [Boolean]
+    def verbatim?(tikis)
+      @sentences.include?(tikis) || @mentions.include?(tikis)
     end
 
-    # Finds all relevant tokenized sentences to given input by
+    # Finds relevant and slightly relevant tokenized sentences to input
     # comparing non-stopword token overlaps
+    # @param sentences [Array<Array<Integer>>]
+    # @param input [String]
+    # @return [Array<Array<Array<Integer>>, Array<Array<Integer>>>]
     def find_relevant(sentences, input)
       relevant = []
       slightly_relevant = []
@@ -197,6 +275,10 @@ module Ebooks
 
     # Generates a response by looking for related sentences
     # in the corpus and building a smaller generator from these
+    # @param input [String]
+    # @param limit [Integer] characters available for response
+    # @param sentences [Array<Array<Integer>>]
+    # @return [String]
     def make_response(input, limit=140, sentences=@mentions)
       # Prefer mentions
       relevant, slightly_relevant = find_relevant(sentences, input)
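Taken together, the Model hunks above replace the old single consume entry point with a small public API: consume_all for multi-file corpora, consume_lines underneath, keywords trimmed to a plain array of the top 200 strings, and documented make_statement/make_response generators. A minimal usage sketch (file names are placeholders; signatures follow the diff above):

    require 'twitter_ebooks'

    # Build one model from several corpora (.json, .csv and plaintext all handled)
    model = Ebooks::Model.consume_all(['tweets.json', 'archive.csv', 'notes.txt'])
    model.save('corpus.model')

    # Later: load it back and generate
    model = Ebooks::Model.load('corpus.model')
    model.keywords.first(5)                   # Array<String>, at most 200 entries
    puts model.make_statement(140)            # free-standing text under the limit
    puts model.make_response('oh hullo', 125) # built from sentences related to the input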
@@ -12,31 +12,35 @@ module Ebooks
     # Some of this stuff is pretty heavy and we don't necessarily need
     # to be using it all of the time
 
+    # Lazily loads an array of stopwords
+    # Stopwords are common English words that should often be ignored
+    # @return [Array<String>]
     def self.stopwords
       @stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
     end
 
+    # Lazily loads an array of known English nouns
+    # @return [Array<String>]
     def self.nouns
       @nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
     end
 
+    # Lazily loads an array of known English adjectives
+    # @return [Array<String>]
     def self.adjectives
       @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
     end
 
-    # POS tagger
+    # Lazily load part-of-speech tagging library
+    # This can determine whether a word is being used as a noun/adjective/verb
+    # @return [EngTagger]
     def self.tagger
       require 'engtagger'
       @tagger ||= EngTagger.new
     end
 
-    # Gingerice text correction service
-    def self.gingerice
-      require 'gingerice'
-      Gingerice::Parser.new # No caching for this one
-    end
-
-    # For decoding html entities
+    # Lazily load HTML entity decoder
+    # @return [HTMLEntities]
     def self.htmlentities
       require 'htmlentities'
       @htmlentities ||= HTMLEntities.new
@@ -44,7 +48,9 @@ module Ebooks
 
     ### Utility functions
 
-    # We don't really want to deal with all this weird unicode punctuation
+    # Normalize some strange unicode punctuation variants
+    # @param text [String]
+    # @return [String]
     def self.normalize(text)
       htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
     end
@@ -53,6 +59,8 @@ module Ebooks
     # We use ad hoc approach because fancy libraries do not deal
     # especially well with tweet formatting, and we can fake solving
     # the quote problem during generation
+    # @param text [String]
+    # @return [Array<String>]
     def self.sentences(text)
       text.split(/\n+|(?<=[.?!])\s+/)
     end
@@ -60,15 +68,23 @@ module Ebooks
     # Split a sentence into word-level tokens
     # As above, this is ad hoc because tokenization libraries
     # do not behave well wrt. things like emoticons and timestamps
+    # @param sentence [String]
+    # @return [Array<String>]
     def self.tokenize(sentence)
       regex = /\s+|(?<=[#{PUNCTUATION}]\s)(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+\s)/
       sentence.split(regex)
     end
 
+    # Get the 'stem' form of a word e.g. 'cats' -> 'cat'
+    # @param word [String]
+    # @return [String]
     def self.stem(word)
       Stemmer::stem_word(word.downcase)
     end
 
+    # Use highscore gem to find interesting keywords in a corpus
+    # @param text [String]
+    # @return [Highscore::Keywords]
     def self.keywords(text)
       # Preprocess to remove stopwords (highscore's blacklist is v. slow)
       text = NLP.tokenize(text).reject { |t| stopword?(t) }.join(' ')
@@ -90,7 +106,10 @@ module Ebooks
       text.keywords
     end
 
-    # Takes a list of tokens and builds a nice-looking sentence
+    # Builds a proper sentence from a list of tikis
+    # @param tikis [Array<Integer>]
+    # @param tokens [Array<String>]
+    # @return [String]
     def self.reconstruct(tikis, tokens)
       text = ""
       last_token = nil
@@ -105,6 +124,9 @@ module Ebooks
     end
 
     # Determine if we need to insert a space between two tokens
+    # @param token1 [String]
+    # @param token2 [String]
+    # @return [Boolean]
     def self.space_between?(token1, token2)
       p1 = self.punctuation?(token1)
       p2 = self.punctuation?(token2)
@@ -119,10 +141,16 @@ module Ebooks
       end
     end
 
+    # Is this token comprised of punctuation?
+    # @param token [String]
+    # @return [Boolean]
     def self.punctuation?(token)
       (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
     end
 
+    # Is this token a stopword?
+    # @param token [String]
+    # @return [Boolean]
     def self.stopword?(token)
       @stopword_set ||= stopwords.map(&:downcase).to_set
       @stopword_set.include?(token.downcase)
@@ -130,7 +158,9 @@ module Ebooks
 
     # Determine if a sample of text contains unmatched brackets or quotes
     # This is one of the more frequent and noticeable failure modes for
-    # the markov generator; we can just tell it to retry
+    # the generator; we can just tell it to retry
+    # @param text [String]
+    # @return [Boolean]
     def self.unmatched_enclosers?(text)
       enclosers = ['**', '""', '()', '[]', '``', "''"]
       enclosers.each do |pair|
@@ -153,10 +183,13 @@ module Ebooks
     end
 
     # Determine if a2 is a subsequence of a1
+    # @param a1 [Array]
+    # @param a2 [Array]
+    # @return [Boolean]
     def self.subseq?(a1, a2)
-      a1.each_index.find do |i|
+      !a1.each_index.find do |i|
        a1[i...i+a2.length] == a2
-      end
+      end.nil?
     end
   end
 end
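One behavioural fix worth noting in the NLP hunks: subseq? previously returned the result of find, i.e. a match index (Integer) or nil, relying on Ruby truthiness; it now returns a genuine Boolean. Illustrative values (hypothetical arrays):

    Ebooks::NLP.subseq?([1, 2, 3, 4], [2, 3]) # => true  (previously 1, the match index)
    Ebooks::NLP.subseq?([1, 2, 3, 4], [4, 5]) # => false (previously nil)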
@@ -1,11 +1,14 @@
 # encoding: utf-8
 
 module Ebooks
-  # This generator uses data identical to the markov model, but
+  # This generator uses data identical to a markov model, but
   # instead of making a chain by looking up bigrams it uses the
   # positions to randomly replace suffixes in one sentence with
   # matching suffixes in another
   class SuffixGenerator
+    # Build a generator from a corpus of tikified sentences
+    # @param sentences [Array<Array<Integer>>]
+    # @return [SuffixGenerator]
     def self.build(sentences)
       SuffixGenerator.new(sentences)
     end
@@ -39,6 +42,11 @@ module Ebooks
       self
     end
 
+
+    # Generate a recombined sequence of tikis
+    # @param passes [Integer] number of times to recombine
+    # @param n [Symbol] :unigrams or :bigrams (affects how conservative the model is)
+    # @return [Array<Integer>]
     def generate(passes=5, n=:unigrams)
       index = rand(@sentences.length)
       tikis = @sentences[index]
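The generator works purely on tikis, so its output must be turned back into text with NLP.reconstruct and the owning model's token table. A minimal sketch, assuming model is an Ebooks::Model built as in the earlier example:

    generator = Ebooks::SuffixGenerator.build(model.sentences)
    tikis = generator.generate(5, :bigrams) # n is :unigrams or :bigrams, per the doc comment
    puts Ebooks::NLP.reconstruct(tikis, model.tokens)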
@@ -1,3 +1,3 @@
 module Ebooks
-  VERSION = "2.3.2"
+  VERSION = "3.0.0"
 end
@@ -1,4 +1,4 @@
 source 'http://rubygems.org'
-ruby '1.9.3'
+ruby '{{RUBY_VERSION}}'
 
 gem 'twitter_ebooks'
@@ -1 +1 @@
-worker: ruby run.rb start
+worker: bundle exec ebooks start
@@ -1,42 +1,55 @@
-#!/usr/bin/env ruby
-
 require 'twitter_ebooks'
 
 # This is an example bot definition with event handlers commented out
-# You can define as many of these as you like; they will run simultaneously
+# You can define and instantiate as many bots as you like
+
+class MyBot < Ebooks::Bot
+  # Configuration here applies to all MyBots
+  def configure
+    # Consumer details come from registering an app at https://dev.twitter.com/
+    # Once you have consumer details, use "ebooks auth" for new access tokens
+    self.consumer_key = '' # Your app consumer key
+    self.consumer_secret = '' # Your app consumer secret
+
+    # Users to block instead of interacting with
+    self.blacklist = ['tnietzschequote']
+
+    # Range in seconds to randomize delay when bot.delay is called
+    self.delay_range = 1..6
+  end
 
-Ebooks::Bot.new("{{BOT_NAME}}") do |bot|
-  # Consumer details come from registering an app at https://dev.twitter.com/
-  # OAuth details can be fetched with https://github.com/marcel/twurl
-  bot.consumer_key = "" # Your app consumer key
-  bot.consumer_secret = "" # Your app consumer secret
-  bot.oauth_token = "" # Token connecting the app to this account
-  bot.oauth_token_secret = "" # Secret connecting the app to this account
+  def on_startup
+    scheduler.every '24h' do
+      # Tweet something every 24 hours
+      # See https://github.com/jmettraux/rufus-scheduler
+      # bot.tweet("hi")
+      # bot.pictweet("hi", "cuteselfie.jpg")
+    end
+  end
 
-  bot.on_message do |dm|
+  def on_message(dm)
     # Reply to a DM
     # bot.reply(dm, "secret secrets")
   end
 
-  bot.on_follow do |user|
+  def on_follow(user)
     # Follow a user back
     # bot.follow(user[:screen_name])
   end
 
-  bot.on_mention do |tweet, meta|
+  def on_mention(tweet)
     # Reply to a mention
-    # bot.reply(tweet, meta[:reply_prefix] + "oh hullo")
+    # bot.reply(tweet, meta(tweet)[:reply_prefix] + "oh hullo")
   end
 
-  bot.on_timeline do |tweet, meta|
+  def on_timeline(tweet)
     # Reply to a tweet in the bot's timeline
-    # bot.reply(tweet, meta[:reply_prefix] + "nice tweet")
+    # bot.reply(tweet, meta(tweet)[:reply_prefix] + "nice tweet")
   end
+end
 
-  bot.scheduler.every '24h' do
-    # Tweet something every 24 hours
-    # See https://github.com/jmettraux/rufus-scheduler
-    # bot.tweet("hi")
-    # bot.pictweet("hi", "cuteselfie.jpg", ":possibly_sensitive => true")
-  end
+# Make a MyBot and attach it to an account
+MyBot.new("{{BOT_NAME}}") do |bot|
+  bot.access_token = "" # Token connecting the app to this account
+  bot.access_token_secret = "" # Secret connecting the app to this account
 end
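The skeleton now defines bots as subclasses of Ebooks::Bot: app-wide settings (consumer key/secret, blacklist, delay range) live in configure and apply to every instance, while per-account access tokens are attached at instantiation, so one class can drive several accounts. A sketch under that assumption (account names are placeholders):

    MyBot.new("example_ebooks") do |bot|
      bot.access_token = ""        # Token connecting the app to this account
      bot.access_token_secret = "" # Secret connecting the app to this account
    end

    MyBot.new("example_ebooks2") do |bot|
      bot.access_token = ""
      bot.access_token_secret = ""
    end

Per the updated Procfile, the worker is then started with bundle exec ebooks start, after generating access tokens with ebooks auth as the configure comment suggests.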