twitter_ebooks 2.2.9 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/twitter_ebooks/model.rb +34 -21
- data/lib/twitter_ebooks/nlp.rb +6 -5
- data/lib/twitter_ebooks/suffix.rb +25 -26
- data/lib/twitter_ebooks/version.rb +1 -1
- data/spec/model_spec.rb +44 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a33310c52cb154361bfa00ccdf9cba3b9850af3b
|
4
|
+
data.tar.gz: c6c29fd59ae7a7926b5e649ca2858eed204f0cdb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 807bdfd51ac33fdb4ae25687e74ff89da02ca8004a7cae576d1fa159df2e7801d4f98195983ae8f8f3c5bff0136b4e880ee4bd67b1b219ede0e1aaf350b6e627
|
7
|
+
data.tar.gz: 082ef0b1c815c30d535174c19a7f21c21d9988486f3c64759d908b04d5b7379a077cf54cfdf39bff2c11346fea7654187a80f097c938d0fc26a3128d49eeb51b
|
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# twitter\_ebooks 2.2.9
|
1
|
+
# twitter\_ebooks 2.3.0
|
2
2
|
|
3
3
|
Rewrite of my twitter\_ebooks code. While the original was solely a tweeting Markov generator, this framework helps you build any kind of interactive twitterbot which responds to mentions/DMs. See [ebooks\_example](https://github.com/mispy/ebooks_example) for an example of a full bot.
|
4
4
|
|
data/lib/twitter_ebooks/model.rb
CHANGED
@@ -18,18 +18,31 @@ module Ebooks
|
|
18
18
|
Marshal.load(File.open(path, 'rb') { |f| f.read })
|
19
19
|
end
|
20
20
|
|
21
|
-
def
|
21
|
+
def initialize
|
22
|
+
# This is the only source of actual strings in the model. It is
|
23
|
+
# an array of unique tokens. Manipulation of a token is mostly done
|
24
|
+
# using its index in this array, which we call a "tiki"
|
25
|
+
@tokens = []
|
26
|
+
|
27
|
+
# Reverse lookup tiki by token, for faster generation
|
28
|
+
@tikis = {}
|
29
|
+
end
|
30
|
+
|
31
|
+
def tikify(token)
|
32
|
+
@tikis[token] or (@tokens << token and @tikis[token] = @tokens.length-1)
|
33
|
+
end
|
34
|
+
|
35
|
+
def mass_tikify(text)
|
22
36
|
sentences = NLP.sentences(text)
|
23
|
-
tokens = []
|
24
37
|
|
25
|
-
sentences.
|
26
|
-
tokens
|
38
|
+
sentences.map do |s|
|
39
|
+
tokens = NLP.tokenize(s).reject do |t|
|
27
40
|
# Don't include usernames/urls as tokens
|
28
41
|
t.include?('@') || t.include?('http')
|
29
42
|
end
|
30
|
-
end
|
31
43
|
|
32
|
-
|
44
|
+
tokens.map { |t| tikify(t) }
|
45
|
+
end
|
33
46
|
end
|
34
47
|
|
35
48
|
def consume(path)
|
@@ -63,9 +76,9 @@ module Ebooks
|
|
63
76
|
next if l.include?('RT') || l.include?('MT') # Remove soft retweets
|
64
77
|
|
65
78
|
if l.include?('@')
|
66
|
-
statements << NLP.normalize(l)
|
67
|
-
else
|
68
79
|
mentions << NLP.normalize(l)
|
80
|
+
else
|
81
|
+
statements << NLP.normalize(l)
|
69
82
|
end
|
70
83
|
end
|
71
84
|
|
@@ -76,11 +89,11 @@ module Ebooks
|
|
76
89
|
|
77
90
|
log "Tokenizing #{text.count('\n')} statements and #{mention_text.count('\n')} mentions"
|
78
91
|
|
79
|
-
@sentences =
|
80
|
-
@mentions =
|
92
|
+
@sentences = mass_tikify(text)
|
93
|
+
@mentions = mass_tikify(mention_text)
|
81
94
|
|
82
95
|
log "Ranking keywords"
|
83
|
-
@keywords = NLP.keywords(
|
96
|
+
@keywords = NLP.keywords(text)
|
84
97
|
|
85
98
|
self
|
86
99
|
end
|
@@ -106,8 +119,8 @@ module Ebooks
|
|
106
119
|
NLP.htmlentities.decode tweet
|
107
120
|
end
|
108
121
|
|
109
|
-
def valid_tweet?(
|
110
|
-
tweet = NLP.reconstruct(tokens)
|
122
|
+
def valid_tweet?(tikis, limit)
|
123
|
+
tweet = NLP.reconstruct(tikis, @tokens)
|
111
124
|
tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
|
112
125
|
end
|
113
126
|
|
@@ -118,24 +131,24 @@ module Ebooks
|
|
118
131
|
retries = 0
|
119
132
|
tweet = ""
|
120
133
|
|
121
|
-
while (
|
122
|
-
next if
|
123
|
-
break if valid_tweet?(
|
134
|
+
while (tikis = generator.generate(3, :bigrams)) do
|
135
|
+
next if tikis.length <= 3 && !responding
|
136
|
+
break if valid_tweet?(tikis, limit)
|
124
137
|
|
125
138
|
retries += 1
|
126
139
|
break if retries >= retry_limit
|
127
140
|
end
|
128
141
|
|
129
|
-
if verbatim?(
|
130
|
-
while (
|
131
|
-
break if valid_tweet?(
|
142
|
+
if verbatim?(tikis) && tikis.length > 3 # We made a verbatim tweet by accident
|
143
|
+
while (tikis = generator.generate(3, :unigrams)) do
|
144
|
+
break if valid_tweet?(tikis, limit) && !verbatim?(tikis)
|
132
145
|
|
133
146
|
retries += 1
|
134
147
|
break if retries >= retry_limit
|
135
148
|
end
|
136
149
|
end
|
137
150
|
|
138
|
-
tweet = NLP.reconstruct(tokens)
|
151
|
+
tweet = NLP.reconstruct(tikis, @tokens)
|
139
152
|
|
140
153
|
if retries >= retry_limit
|
141
154
|
log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
|
@@ -159,7 +172,7 @@ module Ebooks
|
|
159
172
|
|
160
173
|
sentences.each do |sent|
|
161
174
|
tokenized.each do |token|
|
162
|
-
if sent.map
|
175
|
+
if sent.map { |tiki| @tokens[tiki].downcase }.include?(token)
|
163
176
|
relevant << sent unless NLP.stopword?(token)
|
164
177
|
slightly_relevant << sent
|
165
178
|
end
|
data/lib/twitter_ebooks/nlp.rb
CHANGED
@@ -69,9 +69,9 @@ module Ebooks
|
|
69
69
|
Stemmer::stem_word(word.downcase)
|
70
70
|
end
|
71
71
|
|
72
|
-
def self.keywords(
|
72
|
+
def self.keywords(text)
|
73
73
|
# Preprocess to remove stopwords (highscore's blacklist is v. slow)
|
74
|
-
text =
|
74
|
+
text = NLP.tokenize(text).reject { |t| stopword?(t) }.join(' ')
|
75
75
|
|
76
76
|
text = Highscore::Content.new(text)
|
77
77
|
|
@@ -91,11 +91,12 @@ module Ebooks
|
|
91
91
|
end
|
92
92
|
|
93
93
|
# Takes a list of tokens and builds a nice-looking sentence
|
94
|
-
def self.reconstruct(tokens)
|
94
|
+
def self.reconstruct(tikis, tokens)
|
95
95
|
text = ""
|
96
96
|
last_token = nil
|
97
|
-
|
98
|
-
next if
|
97
|
+
tikis.each do |tiki|
|
98
|
+
next if tiki == INTERIM
|
99
|
+
token = tokens[tiki]
|
99
100
|
text += ' ' if last_token && space_between?(last_token, token)
|
100
101
|
text += token
|
101
102
|
last_token = token
|
@@ -15,24 +15,24 @@ module Ebooks
|
|
15
15
|
@unigrams = {}
|
16
16
|
@bigrams = {}
|
17
17
|
|
18
|
-
@sentences.each_with_index do |
|
19
|
-
|
20
|
-
|
21
|
-
@unigrams[
|
22
|
-
@unigrams[
|
23
|
-
|
24
|
-
@bigrams[
|
25
|
-
@bigrams[
|
26
|
-
|
27
|
-
if j ==
|
28
|
-
@unigrams[
|
29
|
-
@unigrams[
|
30
|
-
@bigrams[
|
18
|
+
@sentences.each_with_index do |tikis, i|
|
19
|
+
last_tiki = INTERIM
|
20
|
+
tikis.each_with_index do |tiki, j|
|
21
|
+
@unigrams[last_tiki] ||= []
|
22
|
+
@unigrams[last_tiki] << [i, j]
|
23
|
+
|
24
|
+
@bigrams[last_tiki] ||= {}
|
25
|
+
@bigrams[last_tiki][tiki] ||= []
|
26
|
+
|
27
|
+
if j == tikis.length-1 # Mark sentence endings
|
28
|
+
@unigrams[tiki] ||= []
|
29
|
+
@unigrams[tiki] << [i, INTERIM]
|
30
|
+
@bigrams[last_tiki][tiki] << [i, INTERIM]
|
31
31
|
else
|
32
|
-
@bigrams[
|
32
|
+
@bigrams[last_tiki][tiki] << [i, j+1]
|
33
33
|
end
|
34
34
|
|
35
|
-
|
35
|
+
last_tiki = tiki
|
36
36
|
end
|
37
37
|
end
|
38
38
|
|
@@ -41,19 +41,18 @@ module Ebooks
|
|
41
41
|
|
42
42
|
def generate(passes=5, n=:unigrams)
|
43
43
|
index = rand(@sentences.length)
|
44
|
-
|
44
|
+
tikis = @sentences[index]
|
45
45
|
used = [index] # Sentences we've already used
|
46
|
-
verbatim = [
|
46
|
+
verbatim = [tikis] # Verbatim sentences to avoid reproducing
|
47
47
|
|
48
48
|
0.upto(passes-1) do
|
49
|
-
|
50
|
-
varsites = {} # Map bigram start site => next token alternatives
|
49
|
+
varsites = {} # Map bigram start site => next tiki alternatives
|
51
50
|
|
52
|
-
|
53
|
-
|
54
|
-
break if
|
51
|
+
tikis.each_with_index do |tiki, i|
|
52
|
+
next_tiki = tikis[i+1]
|
53
|
+
break if next_tiki.nil?
|
55
54
|
|
56
|
-
alternatives = (n == :unigrams) ? @unigrams[
|
55
|
+
alternatives = (n == :unigrams) ? @unigrams[next_tiki] : @bigrams[tiki][next_tiki]
|
57
56
|
# Filter out suffixes from previous sentences
|
58
57
|
alternatives.reject! { |a| a[1] == INTERIM || used.include?(a[0]) }
|
59
58
|
varsites[i] = alternatives unless alternatives.empty?
|
@@ -67,7 +66,7 @@ module Ebooks
|
|
67
66
|
start, alt = site[0], site[1].sample
|
68
67
|
verbatim << @sentences[alt[0]]
|
69
68
|
suffix = @sentences[alt[0]][alt[1]..-1]
|
70
|
-
potential =
|
69
|
+
potential = tikis[0..start+1] + suffix
|
71
70
|
|
72
71
|
# Ensure we're not just rebuilding some segment of another sentence
|
73
72
|
unless verbatim.find { |v| NLP.subseq?(v, potential) || NLP.subseq?(potential, v) }
|
@@ -80,10 +79,10 @@ module Ebooks
|
|
80
79
|
break if variant
|
81
80
|
end
|
82
81
|
|
83
|
-
|
82
|
+
tikis = variant if variant
|
84
83
|
end
|
85
84
|
|
86
|
-
|
85
|
+
tikis
|
87
86
|
end
|
88
87
|
end
|
89
88
|
end
|
data/spec/model_spec.rb
CHANGED
@@ -1,9 +1,27 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
require 'memory_profiler'
|
3
|
+
require 'tempfile'
|
3
4
|
|
4
5
|
def Process.rss; `ps -o rss= -p #{Process.pid}`.chomp.to_i; end
|
5
6
|
|
6
7
|
describe Ebooks::Model do
|
8
|
+
describe 'making tweets' do
|
9
|
+
before(:all) { @model = Ebooks::Model.consume(path("data/0xabad1dea.json")) }
|
10
|
+
|
11
|
+
it "generates a tweet" do
|
12
|
+
s = @model.make_statement
|
13
|
+
expect(s.length).to be <= 140
|
14
|
+
puts s
|
15
|
+
end
|
16
|
+
|
17
|
+
it "generates an appropriate response" do
|
18
|
+
s = @model.make_response("hi")
|
19
|
+
expect(s.length).to be <= 140
|
20
|
+
expect(s.downcase).to include("hi")
|
21
|
+
puts s
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
7
25
|
it "does not use a ridiculous amount of memory" do
|
8
26
|
report = MemoryUsage.report do
|
9
27
|
model = Ebooks::Model.consume(path("data/0xabad1dea.json"))
|
@@ -11,4 +29,30 @@ describe Ebooks::Model do
|
|
11
29
|
|
12
30
|
expect(report.total_memsize).to be < 1000000000
|
13
31
|
end
|
32
|
+
|
33
|
+
describe '.consume' do
|
34
|
+
it 'interprets lines with @ as mentions' do
|
35
|
+
file = Tempfile.new('mentions')
|
36
|
+
file.write('@m1spy hello!')
|
37
|
+
file.close
|
38
|
+
|
39
|
+
model = Ebooks::Model.consume(file.path)
|
40
|
+
expect(model.sentences.count).to eq 0
|
41
|
+
expect(model.mentions.count).to eq 1
|
42
|
+
|
43
|
+
file.unlink
|
44
|
+
end
|
45
|
+
|
46
|
+
it 'interprets lines without @ as statements' do
|
47
|
+
file = Tempfile.new('statements')
|
48
|
+
file.write('hello!')
|
49
|
+
file.close
|
50
|
+
|
51
|
+
model = Ebooks::Model.consume(file.path)
|
52
|
+
expect(model.mentions.count).to eq 0
|
53
|
+
expect(model.sentences.count).to eq 1
|
54
|
+
|
55
|
+
file.unlink
|
56
|
+
end
|
57
|
+
end
|
14
58
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twitter_ebooks
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.2.9
|
4
|
+
version: 2.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jaiden Mispy
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-10-
|
11
|
+
date: 2014-10-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|