twitter_ebooks 2.2.9 → 2.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 50a5dca9c31287964724b38cb022c6273a242a59
4
- data.tar.gz: 7c2f8e441bc119f1bc29b8e7ece6650812785fa4
3
+ metadata.gz: a33310c52cb154361bfa00ccdf9cba3b9850af3b
4
+ data.tar.gz: c6c29fd59ae7a7926b5e649ca2858eed204f0cdb
5
5
  SHA512:
6
- metadata.gz: fdd9dee8a8f53bb421761a0a485b0c5bcb2677355e6c4186c69c8e60d9f15dbd13a4c1cd77c09243e609bc53b02cecc3d2c6a2f81e2fadc4da0cc8b527558df3
7
- data.tar.gz: 6a0beb91162f03bfd3ddc1edb386940919b066c13ef323d370b3314b9e4cdd3cca4463f22719ce0cecfd5f5f33fe5b5352268d71d99fa2f15f2b8f41749845a7
6
+ metadata.gz: 807bdfd51ac33fdb4ae25687e74ff89da02ca8004a7cae576d1fa159df2e7801d4f98195983ae8f8f3c5bff0136b4e880ee4bd67b1b219ede0e1aaf350b6e627
7
+ data.tar.gz: 082ef0b1c815c30d535174c19a7f21c21d9988486f3c64759d908b04d5b7379a077cf54cfdf39bff2c11346fea7654187a80f097c938d0fc26a3128d49eeb51b
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # twitter\_ebooks 2.2.9
1
+ # twitter\_ebooks 2.3.0
2
2
 
3
3
  Rewrite of my twitter\_ebooks code. While the original was solely a tweeting Markov generator, this framework helps you build any kind of interactive twitterbot which responds to mentions/DMs. See [ebooks\_example](https://github.com/mispy/ebooks_example) for an example of a full bot.
4
4
 
@@ -18,18 +18,31 @@ module Ebooks
18
18
  Marshal.load(File.open(path, 'rb') { |f| f.read })
19
19
  end
20
20
 
21
- def mass_tokenize(text)
21
+ def initialize
22
+ # This is the only source of actual strings in the model. It is
23
+ # an array of unique tokens. Manipulation of a token is mostly done
24
+ # using its index in this array, which we call a "tiki"
25
+ @tokens = []
26
+
27
+ # Reverse lookup tiki by token, for faster generation
28
+ @tikis = {}
29
+ end
30
+
31
+ def tikify(token)
32
+ @tikis[token] or (@tokens << token and @tikis[token] = @tokens.length-1)
33
+ end
34
+
35
+ def mass_tikify(text)
22
36
  sentences = NLP.sentences(text)
23
- tokens = []
24
37
 
25
- sentences.each do |s|
26
- tokens << NLP.tokenize(s).reject do |t|
38
+ sentences.map do |s|
39
+ tokens = NLP.tokenize(s).reject do |t|
27
40
  # Don't include usernames/urls as tokens
28
41
  t.include?('@') || t.include?('http')
29
42
  end
30
- end
31
43
 
32
- tokens
44
+ tokens.map { |t| tikify(t) }
45
+ end
33
46
  end
34
47
 
35
48
  def consume(path)
@@ -63,9 +76,9 @@ module Ebooks
63
76
  next if l.include?('RT') || l.include?('MT') # Remove soft retweets
64
77
 
65
78
  if l.include?('@')
66
- statements << NLP.normalize(l)
67
- else
68
79
  mentions << NLP.normalize(l)
80
+ else
81
+ statements << NLP.normalize(l)
69
82
  end
70
83
  end
71
84
 
@@ -76,11 +89,11 @@ module Ebooks
76
89
 
77
90
  log "Tokenizing #{text.count('\n')} statements and #{mention_text.count('\n')} mentions"
78
91
 
79
- @sentences = mass_tokenize(text)
80
- @mentions = mass_tokenize(mention_text)
92
+ @sentences = mass_tikify(text)
93
+ @mentions = mass_tikify(mention_text)
81
94
 
82
95
  log "Ranking keywords"
83
- @keywords = NLP.keywords(@sentences)
96
+ @keywords = NLP.keywords(text)
84
97
 
85
98
  self
86
99
  end
@@ -106,8 +119,8 @@ module Ebooks
106
119
  NLP.htmlentities.decode tweet
107
120
  end
108
121
 
109
- def valid_tweet?(tokens, limit)
110
- tweet = NLP.reconstruct(tokens)
122
+ def valid_tweet?(tikis, limit)
123
+ tweet = NLP.reconstruct(tikis, @tokens)
111
124
  tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
112
125
  end
113
126
 
@@ -118,24 +131,24 @@ module Ebooks
118
131
  retries = 0
119
132
  tweet = ""
120
133
 
121
- while (tokens = generator.generate(3, :bigrams)) do
122
- next if tokens.length <= 3 && !responding
123
- break if valid_tweet?(tokens, limit)
134
+ while (tikis = generator.generate(3, :bigrams)) do
135
+ next if tikis.length <= 3 && !responding
136
+ break if valid_tweet?(tikis, limit)
124
137
 
125
138
  retries += 1
126
139
  break if retries >= retry_limit
127
140
  end
128
141
 
129
- if verbatim?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
130
- while (tokens = generator.generate(3, :unigrams)) do
131
- break if valid_tweet?(tokens, limit) && !verbatim?(tokens)
142
+ if verbatim?(tikis) && tikis.length > 3 # We made a verbatim tweet by accident
143
+ while (tikis = generator.generate(3, :unigrams)) do
144
+ break if valid_tweet?(tikis, limit) && !verbatim?(tikis)
132
145
 
133
146
  retries += 1
134
147
  break if retries >= retry_limit
135
148
  end
136
149
  end
137
150
 
138
- tweet = NLP.reconstruct(tokens)
151
+ tweet = NLP.reconstruct(tikis, @tokens)
139
152
 
140
153
  if retries >= retry_limit
141
154
  log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
@@ -159,7 +172,7 @@ module Ebooks
159
172
 
160
173
  sentences.each do |sent|
161
174
  tokenized.each do |token|
162
- if sent.map(&:downcase).include?(token)
175
+ if sent.map { |tiki| @tokens[tiki].downcase }.include?(token)
163
176
  relevant << sent unless NLP.stopword?(token)
164
177
  slightly_relevant << sent
165
178
  end
@@ -69,9 +69,9 @@ module Ebooks
69
69
  Stemmer::stem_word(word.downcase)
70
70
  end
71
71
 
72
- def self.keywords(sentences)
72
+ def self.keywords(text)
73
73
  # Preprocess to remove stopwords (highscore's blacklist is v. slow)
74
- text = sentences.flatten.reject { |t| stopword?(t) }.join(' ')
74
+ text = NLP.tokenize(text).reject { |t| stopword?(t) }.join(' ')
75
75
 
76
76
  text = Highscore::Content.new(text)
77
77
 
@@ -91,11 +91,12 @@ module Ebooks
91
91
  end
92
92
 
93
93
  # Takes a list of tokens and builds a nice-looking sentence
94
- def self.reconstruct(tokens)
94
+ def self.reconstruct(tikis, tokens)
95
95
  text = ""
96
96
  last_token = nil
97
- tokens.each do |token|
98
- next if token == INTERIM
97
+ tikis.each do |tiki|
98
+ next if tiki == INTERIM
99
+ token = tokens[tiki]
99
100
  text += ' ' if last_token && space_between?(last_token, token)
100
101
  text += token
101
102
  last_token = token
@@ -15,24 +15,24 @@ module Ebooks
15
15
  @unigrams = {}
16
16
  @bigrams = {}
17
17
 
18
- @sentences.each_with_index do |tokens, i|
19
- last_token = INTERIM
20
- tokens.each_with_index do |token, j|
21
- @unigrams[last_token] ||= []
22
- @unigrams[last_token] << [i, j]
23
-
24
- @bigrams[last_token] ||= {}
25
- @bigrams[last_token][token] ||= []
26
-
27
- if j == tokens.length-1 # Mark sentence endings
28
- @unigrams[token] ||= []
29
- @unigrams[token] << [i, INTERIM]
30
- @bigrams[last_token][token] << [i, INTERIM]
18
+ @sentences.each_with_index do |tikis, i|
19
+ last_tiki = INTERIM
20
+ tikis.each_with_index do |tiki, j|
21
+ @unigrams[last_tiki] ||= []
22
+ @unigrams[last_tiki] << [i, j]
23
+
24
+ @bigrams[last_tiki] ||= {}
25
+ @bigrams[last_tiki][tiki] ||= []
26
+
27
+ if j == tikis.length-1 # Mark sentence endings
28
+ @unigrams[tiki] ||= []
29
+ @unigrams[tiki] << [i, INTERIM]
30
+ @bigrams[last_tiki][tiki] << [i, INTERIM]
31
31
  else
32
- @bigrams[last_token][token] << [i, j+1]
32
+ @bigrams[last_tiki][tiki] << [i, j+1]
33
33
  end
34
34
 
35
- last_token = token
35
+ last_tiki = tiki
36
36
  end
37
37
  end
38
38
 
@@ -41,19 +41,18 @@ module Ebooks
41
41
 
42
42
  def generate(passes=5, n=:unigrams)
43
43
  index = rand(@sentences.length)
44
- tokens = @sentences[index]
44
+ tikis = @sentences[index]
45
45
  used = [index] # Sentences we've already used
46
- verbatim = [tokens] # Verbatim sentences to avoid reproducing
46
+ verbatim = [tikis] # Verbatim sentences to avoid reproducing
47
47
 
48
48
  0.upto(passes-1) do
49
- log NLP.reconstruct(tokens) if $debug
50
- varsites = {} # Map bigram start site => next token alternatives
49
+ varsites = {} # Map bigram start site => next tiki alternatives
51
50
 
52
- tokens.each_with_index do |token, i|
53
- next_token = tokens[i+1]
54
- break if next_token.nil?
51
+ tikis.each_with_index do |tiki, i|
52
+ next_tiki = tikis[i+1]
53
+ break if next_tiki.nil?
55
54
 
56
- alternatives = (n == :unigrams) ? @unigrams[next_token] : @bigrams[token][next_token]
55
+ alternatives = (n == :unigrams) ? @unigrams[next_tiki] : @bigrams[tiki][next_tiki]
57
56
  # Filter out suffixes from previous sentences
58
57
  alternatives.reject! { |a| a[1] == INTERIM || used.include?(a[0]) }
59
58
  varsites[i] = alternatives unless alternatives.empty?
@@ -67,7 +66,7 @@ module Ebooks
67
66
  start, alt = site[0], site[1].sample
68
67
  verbatim << @sentences[alt[0]]
69
68
  suffix = @sentences[alt[0]][alt[1]..-1]
70
- potential = tokens[0..start+1] + suffix
69
+ potential = tikis[0..start+1] + suffix
71
70
 
72
71
  # Ensure we're not just rebuilding some segment of another sentence
73
72
  unless verbatim.find { |v| NLP.subseq?(v, potential) || NLP.subseq?(potential, v) }
@@ -80,10 +79,10 @@ module Ebooks
80
79
  break if variant
81
80
  end
82
81
 
83
- tokens = variant if variant
82
+ tikis = variant if variant
84
83
  end
85
84
 
86
- tokens
85
+ tikis
87
86
  end
88
87
  end
89
88
  end
@@ -1,3 +1,3 @@
1
1
  module Ebooks
2
- VERSION = "2.2.9"
2
+ VERSION = "2.3.0"
3
3
  end
data/spec/model_spec.rb CHANGED
@@ -1,9 +1,27 @@
1
1
  require 'spec_helper'
2
2
  require 'memory_profiler'
3
+ require 'tempfile'
3
4
 
4
5
  def Process.rss; `ps -o rss= -p #{Process.pid}`.chomp.to_i; end
5
6
 
6
7
  describe Ebooks::Model do
8
+ describe 'making tweets' do
9
+ before(:all) { @model = Ebooks::Model.consume(path("data/0xabad1dea.json")) }
10
+
11
+ it "generates a tweet" do
12
+ s = @model.make_statement
13
+ expect(s.length).to be <= 140
14
+ puts s
15
+ end
16
+
17
+ it "generates an appropriate response" do
18
+ s = @model.make_response("hi")
19
+ expect(s.length).to be <= 140
20
+ expect(s.downcase).to include("hi")
21
+ puts s
22
+ end
23
+ end
24
+
7
25
  it "does not use a ridiculous amount of memory" do
8
26
  report = MemoryUsage.report do
9
27
  model = Ebooks::Model.consume(path("data/0xabad1dea.json"))
@@ -11,4 +29,30 @@ describe Ebooks::Model do
11
29
 
12
30
  expect(report.total_memsize).to be < 1000000000
13
31
  end
32
+
33
+ describe '.consume' do
34
+ it 'interprets lines with @ as mentions' do
35
+ file = Tempfile.new('mentions')
36
+ file.write('@m1spy hello!')
37
+ file.close
38
+
39
+ model = Ebooks::Model.consume(file.path)
40
+ expect(model.sentences.count).to eq 0
41
+ expect(model.mentions.count).to eq 1
42
+
43
+ file.unlink
44
+ end
45
+
46
+ it 'interprets lines without @ as statements' do
47
+ file = Tempfile.new('statements')
48
+ file.write('hello!')
49
+ file.close
50
+
51
+ model = Ebooks::Model.consume(file.path)
52
+ expect(model.mentions.count).to eq 0
53
+ expect(model.sentences.count).to eq 1
54
+
55
+ file.unlink
56
+ end
57
+ end
14
58
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twitter_ebooks
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.2.9
4
+ version: 2.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jaiden Mispy
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-10-20 00:00:00.000000000 Z
11
+ date: 2014-10-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec