twitter_ebooks 2.2.9 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 50a5dca9c31287964724b38cb022c6273a242a59
4
- data.tar.gz: 7c2f8e441bc119f1bc29b8e7ece6650812785fa4
3
+ metadata.gz: a33310c52cb154361bfa00ccdf9cba3b9850af3b
4
+ data.tar.gz: c6c29fd59ae7a7926b5e649ca2858eed204f0cdb
5
5
  SHA512:
6
- metadata.gz: fdd9dee8a8f53bb421761a0a485b0c5bcb2677355e6c4186c69c8e60d9f15dbd13a4c1cd77c09243e609bc53b02cecc3d2c6a2f81e2fadc4da0cc8b527558df3
7
- data.tar.gz: 6a0beb91162f03bfd3ddc1edb386940919b066c13ef323d370b3314b9e4cdd3cca4463f22719ce0cecfd5f5f33fe5b5352268d71d99fa2f15f2b8f41749845a7
6
+ metadata.gz: 807bdfd51ac33fdb4ae25687e74ff89da02ca8004a7cae576d1fa159df2e7801d4f98195983ae8f8f3c5bff0136b4e880ee4bd67b1b219ede0e1aaf350b6e627
7
+ data.tar.gz: 082ef0b1c815c30d535174c19a7f21c21d9988486f3c64759d908b04d5b7379a077cf54cfdf39bff2c11346fea7654187a80f097c938d0fc26a3128d49eeb51b
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # twitter\_ebooks 2.2.9
1
+ # twitter\_ebooks 2.3.0
2
2
 
3
3
  Rewrite of my twitter\_ebooks code. While the original was solely a tweeting Markov generator, this framework helps you build any kind of interactive twitterbot which responds to mentions/DMs. See [ebooks\_example](https://github.com/mispy/ebooks_example) for an example of a full bot.
4
4
 
@@ -18,18 +18,31 @@ module Ebooks
18
18
  Marshal.load(File.open(path, 'rb') { |f| f.read })
19
19
  end
20
20
 
21
- def mass_tokenize(text)
21
+ def initialize
22
+ # This is the only source of actual strings in the model. It is
23
+ # an array of unique tokens. Manipulation of a token is mostly done
24
+ # using its index in this array, which we call a "tiki"
25
+ @tokens = []
26
+
27
+ # Reverse lookup tiki by token, for faster generation
28
+ @tikis = {}
29
+ end
30
+
31
+ def tikify(token)
32
+ @tikis[token] or (@tokens << token and @tikis[token] = @tokens.length-1)
33
+ end
34
+
35
+ def mass_tikify(text)
22
36
  sentences = NLP.sentences(text)
23
- tokens = []
24
37
 
25
- sentences.each do |s|
26
- tokens << NLP.tokenize(s).reject do |t|
38
+ sentences.map do |s|
39
+ tokens = NLP.tokenize(s).reject do |t|
27
40
  # Don't include usernames/urls as tokens
28
41
  t.include?('@') || t.include?('http')
29
42
  end
30
- end
31
43
 
32
- tokens
44
+ tokens.map { |t| tikify(t) }
45
+ end
33
46
  end
34
47
 
35
48
  def consume(path)
@@ -63,9 +76,9 @@ module Ebooks
63
76
  next if l.include?('RT') || l.include?('MT') # Remove soft retweets
64
77
 
65
78
  if l.include?('@')
66
- statements << NLP.normalize(l)
67
- else
68
79
  mentions << NLP.normalize(l)
80
+ else
81
+ statements << NLP.normalize(l)
69
82
  end
70
83
  end
71
84
 
@@ -76,11 +89,11 @@ module Ebooks
76
89
 
77
90
  log "Tokenizing #{text.count('\n')} statements and #{mention_text.count('\n')} mentions"
78
91
 
79
- @sentences = mass_tokenize(text)
80
- @mentions = mass_tokenize(mention_text)
92
+ @sentences = mass_tikify(text)
93
+ @mentions = mass_tikify(mention_text)
81
94
 
82
95
  log "Ranking keywords"
83
- @keywords = NLP.keywords(@sentences)
96
+ @keywords = NLP.keywords(text)
84
97
 
85
98
  self
86
99
  end
@@ -106,8 +119,8 @@ module Ebooks
106
119
  NLP.htmlentities.decode tweet
107
120
  end
108
121
 
109
- def valid_tweet?(tokens, limit)
110
- tweet = NLP.reconstruct(tokens)
122
+ def valid_tweet?(tikis, limit)
123
+ tweet = NLP.reconstruct(tikis, @tokens)
111
124
  tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
112
125
  end
113
126
 
@@ -118,24 +131,24 @@ module Ebooks
118
131
  retries = 0
119
132
  tweet = ""
120
133
 
121
- while (tokens = generator.generate(3, :bigrams)) do
122
- next if tokens.length <= 3 && !responding
123
- break if valid_tweet?(tokens, limit)
134
+ while (tikis = generator.generate(3, :bigrams)) do
135
+ next if tikis.length <= 3 && !responding
136
+ break if valid_tweet?(tikis, limit)
124
137
 
125
138
  retries += 1
126
139
  break if retries >= retry_limit
127
140
  end
128
141
 
129
- if verbatim?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
130
- while (tokens = generator.generate(3, :unigrams)) do
131
- break if valid_tweet?(tokens, limit) && !verbatim?(tokens)
142
+ if verbatim?(tikis) && tikis.length > 3 # We made a verbatim tweet by accident
143
+ while (tikis = generator.generate(3, :unigrams)) do
144
+ break if valid_tweet?(tikis, limit) && !verbatim?(tikis)
132
145
 
133
146
  retries += 1
134
147
  break if retries >= retry_limit
135
148
  end
136
149
  end
137
150
 
138
- tweet = NLP.reconstruct(tokens)
151
+ tweet = NLP.reconstruct(tikis, @tokens)
139
152
 
140
153
  if retries >= retry_limit
141
154
  log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
@@ -159,7 +172,7 @@ module Ebooks
159
172
 
160
173
  sentences.each do |sent|
161
174
  tokenized.each do |token|
162
- if sent.map(&:downcase).include?(token)
175
+ if sent.map { |tiki| @tokens[tiki].downcase }.include?(token)
163
176
  relevant << sent unless NLP.stopword?(token)
164
177
  slightly_relevant << sent
165
178
  end
@@ -69,9 +69,9 @@ module Ebooks
69
69
  Stemmer::stem_word(word.downcase)
70
70
  end
71
71
 
72
- def self.keywords(sentences)
72
+ def self.keywords(text)
73
73
  # Preprocess to remove stopwords (highscore's blacklist is v. slow)
74
- text = sentences.flatten.reject { |t| stopword?(t) }.join(' ')
74
+ text = NLP.tokenize(text).reject { |t| stopword?(t) }.join(' ')
75
75
 
76
76
  text = Highscore::Content.new(text)
77
77
 
@@ -91,11 +91,12 @@ module Ebooks
91
91
  end
92
92
 
93
93
  # Takes a list of tokens and builds a nice-looking sentence
94
- def self.reconstruct(tokens)
94
+ def self.reconstruct(tikis, tokens)
95
95
  text = ""
96
96
  last_token = nil
97
- tokens.each do |token|
98
- next if token == INTERIM
97
+ tikis.each do |tiki|
98
+ next if tiki == INTERIM
99
+ token = tokens[tiki]
99
100
  text += ' ' if last_token && space_between?(last_token, token)
100
101
  text += token
101
102
  last_token = token
@@ -15,24 +15,24 @@ module Ebooks
15
15
  @unigrams = {}
16
16
  @bigrams = {}
17
17
 
18
- @sentences.each_with_index do |tokens, i|
19
- last_token = INTERIM
20
- tokens.each_with_index do |token, j|
21
- @unigrams[last_token] ||= []
22
- @unigrams[last_token] << [i, j]
23
-
24
- @bigrams[last_token] ||= {}
25
- @bigrams[last_token][token] ||= []
26
-
27
- if j == tokens.length-1 # Mark sentence endings
28
- @unigrams[token] ||= []
29
- @unigrams[token] << [i, INTERIM]
30
- @bigrams[last_token][token] << [i, INTERIM]
18
+ @sentences.each_with_index do |tikis, i|
19
+ last_tiki = INTERIM
20
+ tikis.each_with_index do |tiki, j|
21
+ @unigrams[last_tiki] ||= []
22
+ @unigrams[last_tiki] << [i, j]
23
+
24
+ @bigrams[last_tiki] ||= {}
25
+ @bigrams[last_tiki][tiki] ||= []
26
+
27
+ if j == tikis.length-1 # Mark sentence endings
28
+ @unigrams[tiki] ||= []
29
+ @unigrams[tiki] << [i, INTERIM]
30
+ @bigrams[last_tiki][tiki] << [i, INTERIM]
31
31
  else
32
- @bigrams[last_token][token] << [i, j+1]
32
+ @bigrams[last_tiki][tiki] << [i, j+1]
33
33
  end
34
34
 
35
- last_token = token
35
+ last_tiki = tiki
36
36
  end
37
37
  end
38
38
 
@@ -41,19 +41,18 @@ module Ebooks
41
41
 
42
42
  def generate(passes=5, n=:unigrams)
43
43
  index = rand(@sentences.length)
44
- tokens = @sentences[index]
44
+ tikis = @sentences[index]
45
45
  used = [index] # Sentences we've already used
46
- verbatim = [tokens] # Verbatim sentences to avoid reproducing
46
+ verbatim = [tikis] # Verbatim sentences to avoid reproducing
47
47
 
48
48
  0.upto(passes-1) do
49
- log NLP.reconstruct(tokens) if $debug
50
- varsites = {} # Map bigram start site => next token alternatives
49
+ varsites = {} # Map bigram start site => next tiki alternatives
51
50
 
52
- tokens.each_with_index do |token, i|
53
- next_token = tokens[i+1]
54
- break if next_token.nil?
51
+ tikis.each_with_index do |tiki, i|
52
+ next_tiki = tikis[i+1]
53
+ break if next_tiki.nil?
55
54
 
56
- alternatives = (n == :unigrams) ? @unigrams[next_token] : @bigrams[token][next_token]
55
+ alternatives = (n == :unigrams) ? @unigrams[next_tiki] : @bigrams[tiki][next_tiki]
57
56
  # Filter out suffixes from previous sentences
58
57
  alternatives.reject! { |a| a[1] == INTERIM || used.include?(a[0]) }
59
58
  varsites[i] = alternatives unless alternatives.empty?
@@ -67,7 +66,7 @@ module Ebooks
67
66
  start, alt = site[0], site[1].sample
68
67
  verbatim << @sentences[alt[0]]
69
68
  suffix = @sentences[alt[0]][alt[1]..-1]
70
- potential = tokens[0..start+1] + suffix
69
+ potential = tikis[0..start+1] + suffix
71
70
 
72
71
  # Ensure we're not just rebuilding some segment of another sentence
73
72
  unless verbatim.find { |v| NLP.subseq?(v, potential) || NLP.subseq?(potential, v) }
@@ -80,10 +79,10 @@ module Ebooks
80
79
  break if variant
81
80
  end
82
81
 
83
- tokens = variant if variant
82
+ tikis = variant if variant
84
83
  end
85
84
 
86
- tokens
85
+ tikis
87
86
  end
88
87
  end
89
88
  end
@@ -1,3 +1,3 @@
1
1
  module Ebooks
2
- VERSION = "2.2.9"
2
+ VERSION = "2.3.0"
3
3
  end
data/spec/model_spec.rb CHANGED
@@ -1,9 +1,27 @@
1
1
  require 'spec_helper'
2
2
  require 'memory_profiler'
3
+ require 'tempfile'
3
4
 
4
5
  def Process.rss; `ps -o rss= -p #{Process.pid}`.chomp.to_i; end
5
6
 
6
7
  describe Ebooks::Model do
8
+ describe 'making tweets' do
9
+ before(:all) { @model = Ebooks::Model.consume(path("data/0xabad1dea.json")) }
10
+
11
+ it "generates a tweet" do
12
+ s = @model.make_statement
13
+ expect(s.length).to be <= 140
14
+ puts s
15
+ end
16
+
17
+ it "generates an appropriate response" do
18
+ s = @model.make_response("hi")
19
+ expect(s.length).to be <= 140
20
+ expect(s.downcase).to include("hi")
21
+ puts s
22
+ end
23
+ end
24
+
7
25
  it "does not use a ridiculous amount of memory" do
8
26
  report = MemoryUsage.report do
9
27
  model = Ebooks::Model.consume(path("data/0xabad1dea.json"))
@@ -11,4 +29,30 @@ describe Ebooks::Model do
11
29
 
12
30
  expect(report.total_memsize).to be < 1000000000
13
31
  end
32
+
33
+ describe '.consume' do
34
+ it 'interprets lines with @ as mentions' do
35
+ file = Tempfile.new('mentions')
36
+ file.write('@m1spy hello!')
37
+ file.close
38
+
39
+ model = Ebooks::Model.consume(file.path)
40
+ expect(model.sentences.count).to eq 0
41
+ expect(model.mentions.count).to eq 1
42
+
43
+ file.unlink
44
+ end
45
+
46
+ it 'interprets lines without @ as statements' do
47
+ file = Tempfile.new('statements')
48
+ file.write('hello!')
49
+ file.close
50
+
51
+ model = Ebooks::Model.consume(file.path)
52
+ expect(model.mentions.count).to eq 0
53
+ expect(model.sentences.count).to eq 1
54
+
55
+ file.unlink
56
+ end
57
+ end
14
58
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twitter_ebooks
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.2.9
4
+ version: 2.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jaiden Mispy
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-10-20 00:00:00.000000000 Z
11
+ date: 2014-10-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec