twitter_ebooks 2.2.9 → 2.3.0
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/twitter_ebooks/model.rb +34 -21
- data/lib/twitter_ebooks/nlp.rb +6 -5
- data/lib/twitter_ebooks/suffix.rb +25 -26
- data/lib/twitter_ebooks/version.rb +1 -1
- data/spec/model_spec.rb +44 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a33310c52cb154361bfa00ccdf9cba3b9850af3b
+  data.tar.gz: c6c29fd59ae7a7926b5e649ca2858eed204f0cdb
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 807bdfd51ac33fdb4ae25687e74ff89da02ca8004a7cae576d1fa159df2e7801d4f98195983ae8f8f3c5bff0136b4e880ee4bd67b1b219ede0e1aaf350b6e627
+  data.tar.gz: 082ef0b1c815c30d535174c19a7f21c21d9988486f3c64759d908b04d5b7379a077cf54cfdf39bff2c11346fea7654187a80f097c938d0fc26a3128d49eeb51b
data/README.md
CHANGED
@@ -1,4 +1,4 @@
-# twitter\_ebooks 2.2.9
+# twitter\_ebooks 2.3.0
 
 Rewrite of my twitter\_ebooks code. While the original was solely a tweeting Markov generator, this framework helps you build any kind of interactive twitterbot which responds to mentions/DMs. See [ebooks\_example](https://github.com/mispy/ebooks_example) for an example of a full bot.
 
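For context, the kind of interactive bot the README describes looks roughly like the sketch below. This is a minimal skeleton in the style of ebooks_example; the handler names and the bot.reply call are assumptions based on the 2.x examples, so check ebooks_example for the canonical version. Credentials are placeholders.

require 'twitter_ebooks'

# Minimal bot sketch (assumed 2.x-style DSL; credentials are placeholders)
Ebooks::Bot.new("my_ebooks_bot") do |bot|
  bot.consumer_key = ""        # app consumer key
  bot.consumer_secret = ""     # app consumer secret
  bot.oauth_token = ""         # access token for the bot account
  bot.oauth_token_secret = ""  # access token secret for the bot account

  bot.on_mention do |tweet, meta|
    # Reply when someone mentions the bot
    bot.reply(tweet, meta[:reply_prefix] + "oh hello")
  end

  bot.on_message do |dm|
    # Reply to a direct message
    bot.reply(dm, "hello yourself")
  end
end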
data/lib/twitter_ebooks/model.rb
CHANGED
@@ -18,18 +18,31 @@ module Ebooks
       Marshal.load(File.open(path, 'rb') { |f| f.read })
     end
 
-    def mass_tokenize(text)
+    def initialize
+      # This is the only source of actual strings in the model. It is
+      # an array of unique tokens. Manipulation of a token is mostly done
+      # using its index in this array, which we call a "tiki"
+      @tokens = []
+
+      # Reverse lookup tiki by token, for faster generation
+      @tikis = {}
+    end
+
+    def tikify(token)
+      @tikis[token] or (@tokens << token and @tikis[token] = @tokens.length-1)
+    end
+
+    def mass_tikify(text)
       sentences = NLP.sentences(text)
-      tokens = []
 
-      sentences.each do |s|
-        tokens << NLP.tokenize(s).reject do |t|
+      sentences.map do |s|
+        tokens = NLP.tokenize(s).reject do |t|
          # Don't include usernames/urls as tokens
          t.include?('@') || t.include?('http')
        end
-      end
 
-      tokens
+        tokens.map { |t| tikify(t) }
+      end
     end
 
     def consume(path)
@@ -63,9 +76,9 @@ module Ebooks
        next if l.include?('RT') || l.include?('MT') # Remove soft retweets
 
        if l.include?('@')
-          statements << NLP.normalize(l)
-        else
          mentions << NLP.normalize(l)
+        else
+          statements << NLP.normalize(l)
        end
      end
 
@@ -76,11 +89,11 @@ module Ebooks
 
      log "Tokenizing #{text.count('\n')} statements and #{mention_text.count('\n')} mentions"
 
-      @sentences = mass_tokenize(text)
-      @mentions = mass_tokenize(mention_text)
+      @sentences = mass_tikify(text)
+      @mentions = mass_tikify(mention_text)
 
      log "Ranking keywords"
-      @keywords = NLP.keywords(@sentences)
+      @keywords = NLP.keywords(text)
 
      self
    end
@@ -106,8 +119,8 @@ module Ebooks
      NLP.htmlentities.decode tweet
    end
 
-    def valid_tweet?(tokens, limit)
-      tweet = NLP.reconstruct(tokens)
+    def valid_tweet?(tikis, limit)
+      tweet = NLP.reconstruct(tikis, @tokens)
      tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
    end
 
@@ -118,24 +131,24 @@ module Ebooks
      retries = 0
      tweet = ""
 
-      while (tokens = generator.generate(3, :bigrams)) do
-        next if tokens.length <= 3 && !responding
-        break if valid_tweet?(tokens, limit)
+      while (tikis = generator.generate(3, :bigrams)) do
+        next if tikis.length <= 3 && !responding
+        break if valid_tweet?(tikis, limit)
 
        retries += 1
        break if retries >= retry_limit
      end
 
-      if verbatim?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
-        while (tokens = generator.generate(3, :unigrams)) do
-          break if valid_tweet?(tokens, limit) && !verbatim?(tokens)
+      if verbatim?(tikis) && tikis.length > 3 # We made a verbatim tweet by accident
+        while (tikis = generator.generate(3, :unigrams)) do
+          break if valid_tweet?(tikis, limit) && !verbatim?(tikis)
 
          retries += 1
          break if retries >= retry_limit
        end
      end
 
-      tweet = NLP.reconstruct(tokens)
+      tweet = NLP.reconstruct(tikis, @tokens)
 
      if retries >= retry_limit
        log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
@@ -159,7 +172,7 @@ module Ebooks
 
      sentences.each do |sent|
        tokenized.each do |token|
-          if sent.map(&:downcase).include?(token)
+          if sent.map { |tiki| @tokens[tiki].downcase }.include?(token)
            relevant << sent unless NLP.stopword?(token)
            slightly_relevant << sent
          end
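The change above replaces raw token strings in @sentences with integer indices ("tikis") into a single @tokens table, so each unique string is stored exactly once. The standalone sketch below mirrors the tikify/mass_tikify logic from the diff to show the round trip; it is an illustration, not the gem's API.

# Standalone illustration of the "tiki" scheme: token strings are stored
# once, and sentences become arrays of integer indices.
class TikiDemo
  def initialize
    @tokens = []   # unique token strings, indexed by tiki
    @tikis  = {}   # reverse lookup: token string => tiki
  end

  # Return the index of token, adding it to @tokens on first sight
  def tikify(token)
    @tikis[token] or (@tokens << token and @tikis[token] = @tokens.length - 1)
  end

  # Convert a tokenized sentence to tikis and back again
  def roundtrip(words)
    tikis = words.map { |w| tikify(w) }
    [tikis, tikis.map { |t| @tokens[t] }]
  end
end

demo = TikiDemo.new
p demo.roundtrip(%w[the cat saw the dog])
# => [[0, 1, 2, 0, 3], ["the", "cat", "saw", "the", "dog"]]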
data/lib/twitter_ebooks/nlp.rb
CHANGED
@@ -69,9 +69,9 @@ module Ebooks
      Stemmer::stem_word(word.downcase)
    end
 
-    def self.keywords(sentences)
+    def self.keywords(text)
      # Preprocess to remove stopwords (highscore's blacklist is v. slow)
-      text = sentences.flatten.reject { |t| stopword?(t) }.join(' ')
+      text = NLP.tokenize(text).reject { |t| stopword?(t) }.join(' ')
 
      text = Highscore::Content.new(text)
 
@@ -91,11 +91,12 @@ module Ebooks
    end
 
    # Takes a list of tokens and builds a nice-looking sentence
-    def self.reconstruct(tokens)
+    def self.reconstruct(tikis, tokens)
      text = ""
      last_token = nil
-      tokens.each do |token|
-        next if token == INTERIM
+      tikis.each do |tiki|
+        next if tiki == INTERIM
+        token = tokens[tiki]
        text += ' ' if last_token && space_between?(last_token, token)
        text += token
        last_token = token
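With reconstruct now taking (tikis, tokens), callers pass the index array plus the model's token table and the text is rebuilt from lookups. Below is a simplified standalone sketch of that contract; the real NLP.reconstruct also skips INTERIM markers and uses space_between? for smarter spacing, whereas the spacing rule here is deliberately naive.

# Simplified sketch: rebuild a string from tiki indices and a token table
TOKENS = ['Hello', ',', 'world', '!']

def rebuild(tikis, tokens)
  text = ''
  last = nil
  tikis.each do |tiki|
    token = tokens[tiki]
    # naive spacing: only put a space before tokens that start alphanumeric
    text += ' ' if last && token =~ /\A[[:alnum:]]/
    text += token
    last = token
  end
  text
end

puts rebuild([0, 1, 2, 3], TOKENS)  # => "Hello, world!"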
data/lib/twitter_ebooks/suffix.rb
CHANGED
@@ -15,24 +15,24 @@ module Ebooks
      @unigrams = {}
      @bigrams = {}
 
-      @sentences.each_with_index do |tokens, i|
-        last_token = INTERIM
-        tokens.each_with_index do |token, j|
-          @unigrams[last_token] ||= []
-          @unigrams[last_token] << [i, j]
-
-          @bigrams[last_token] ||= {}
-          @bigrams[last_token][token] ||= []
-
-          if j == tokens.length-1 # Mark sentence endings
-            @unigrams[token] ||= []
-            @unigrams[token] << [i, INTERIM]
-            @bigrams[last_token][token] << [i, INTERIM]
+      @sentences.each_with_index do |tikis, i|
+        last_tiki = INTERIM
+        tikis.each_with_index do |tiki, j|
+          @unigrams[last_tiki] ||= []
+          @unigrams[last_tiki] << [i, j]
+
+          @bigrams[last_tiki] ||= {}
+          @bigrams[last_tiki][tiki] ||= []
+
+          if j == tikis.length-1 # Mark sentence endings
+            @unigrams[tiki] ||= []
+            @unigrams[tiki] << [i, INTERIM]
+            @bigrams[last_tiki][tiki] << [i, INTERIM]
          else
-            @bigrams[last_token][token] << [i, j+1]
+            @bigrams[last_tiki][tiki] << [i, j+1]
          end
 
-          last_token = token
+          last_tiki = tiki
        end
      end
 
@@ -41,19 +41,18 @@ module Ebooks
 
    def generate(passes=5, n=:unigrams)
      index = rand(@sentences.length)
-      tokens = @sentences[index]
+      tikis = @sentences[index]
      used = [index] # Sentences we've already used
-      verbatim = [tokens] # Verbatim sentences to avoid reproducing
+      verbatim = [tikis] # Verbatim sentences to avoid reproducing
 
      0.upto(passes-1) do
-
-        varsites = {} # Map bigram start site => next token alternatives
+        varsites = {} # Map bigram start site => next tiki alternatives
 
-        tokens.each_with_index do |token, i|
-          next_token = tokens[i+1]
-          break if next_token.nil?
+        tikis.each_with_index do |tiki, i|
+          next_tiki = tikis[i+1]
+          break if next_tiki.nil?
 
-          alternatives = (n == :unigrams) ? @unigrams[next_token] : @bigrams[token][next_token]
+          alternatives = (n == :unigrams) ? @unigrams[next_tiki] : @bigrams[tiki][next_tiki]
          # Filter out suffixes from previous sentences
          alternatives.reject! { |a| a[1] == INTERIM || used.include?(a[0]) }
          varsites[i] = alternatives unless alternatives.empty?
@@ -67,7 +66,7 @@ module Ebooks
          start, alt = site[0], site[1].sample
          verbatim << @sentences[alt[0]]
          suffix = @sentences[alt[0]][alt[1]..-1]
-          potential = tokens[0..start+1] + suffix
+          potential = tikis[0..start+1] + suffix
 
          # Ensure we're not just rebuilding some segment of another sentence
          unless verbatim.find { |v| NLP.subseq?(v, potential) || NLP.subseq?(potential, v) }
@@ -80,10 +79,10 @@ module Ebooks
          break if variant
        end
 
-        tokens = variant if variant
+        tikis = variant if variant
      end
 
-      tokens
+      tikis
    end
  end
end
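The SuffixGenerator rename keeps the same structure: for each tiki it records [sentence index, position] suffix sites so generation can splice the continuation of one sentence onto another, with INTERIM marking a sentence boundary. The cut-down sketch below builds just the unigram index over two tiny tiki sentences; it is an illustration only, and the bigram map is omitted.

# Rough sketch of the unigram suffix-site index built above
INTERIM = -1

sentences = [
  [0, 1, 2],   # e.g. "the cat sat"
  [3, 1, 4]    # e.g. "a cat ran"
]

unigrams = {}
sentences.each_with_index do |tikis, i|
  last_tiki = INTERIM
  tikis.each_with_index do |tiki, j|
    # after seeing last_tiki, one possible continuation starts at [i, j]
    (unigrams[last_tiki] ||= []) << [i, j]
    # mark the end of the sentence for the final tiki
    (unigrams[tiki] ||= []) << [i, INTERIM] if j == tikis.length - 1
    last_tiki = tiki
  end
end

p unigrams[1]
# => [[0, 2], [1, 2]] : after tiki 1 ("cat"), generation may continue at
#    sentence 0 position 2 or sentence 1 position 2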
data/spec/model_spec.rb
CHANGED
@@ -1,9 +1,27 @@
 require 'spec_helper'
 require 'memory_profiler'
+require 'tempfile'
 
 def Process.rss; `ps -o rss= -p #{Process.pid}`.chomp.to_i; end
 
 describe Ebooks::Model do
+  describe 'making tweets' do
+    before(:all) { @model = Ebooks::Model.consume(path("data/0xabad1dea.json")) }
+
+    it "generates a tweet" do
+      s = @model.make_statement
+      expect(s.length).to be <= 140
+      puts s
+    end
+
+    it "generates an appropriate response" do
+      s = @model.make_response("hi")
+      expect(s.length).to be <= 140
+      expect(s.downcase).to include("hi")
+      puts s
+    end
+  end
+
   it "does not use a ridiculous amount of memory" do
     report = MemoryUsage.report do
       model = Ebooks::Model.consume(path("data/0xabad1dea.json"))
@@ -11,4 +29,30 @@ describe Ebooks::Model do
 
     expect(report.total_memsize).to be < 1000000000
   end
+
+  describe '.consume' do
+    it 'interprets lines with @ as mentions' do
+      file = Tempfile.new('mentions')
+      file.write('@m1spy hello!')
+      file.close
+
+      model = Ebooks::Model.consume(file.path)
+      expect(model.sentences.count).to eq 0
+      expect(model.mentions.count).to eq 1
+
+      file.unlink
+    end
+
+    it 'interprets lines without @ as statements' do
+      file = Tempfile.new('statements')
+      file.write('hello!')
+      file.close
+
+      model = Ebooks::Model.consume(file.path)
+      expect(model.mentions.count).to eq 0
+      expect(model.sentences.count).to eq 1
+
+      file.unlink
+    end
+  end
 end
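The new specs double as usage documentation: a model is consumed from a corpus file and then asked for a statement and a response. The same flow outside RSpec looks like the sketch below; 'corpus/example.json' is a hypothetical path standing in for the suite's path() fixture helper.

require 'twitter_ebooks'

# Consume a corpus (JSON tweet archive or plain text, one line per tweet)
model = Ebooks::Model.consume('corpus/example.json')

statement = model.make_statement        # should fit in 140 characters
response  = model.make_response('hi')   # should relate to the input

puts statement
puts response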
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: twitter_ebooks
 version: !ruby/object:Gem::Version
-  version: 2.2.9
+  version: 2.3.0
 platform: ruby
 authors:
 - Jaiden Mispy
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-10-
+date: 2014-10-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec