twitter_ebooks 2.0.7 → 2.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +0 -0
- data/Gemfile +0 -0
- data/Gemfile.lock +1 -5
- data/LICENSE +0 -0
- data/NOTES.md +0 -0
- data/README.md +2 -13
- data/Rakefile +0 -0
- data/bin/ebooks +3 -3
- data/data/adjectives.txt +0 -0
- data/data/nouns.txt +0 -0
- data/data/stopwords.txt +0 -0
- data/lib/twitter_ebooks/archiver.rb +0 -0
- data/lib/twitter_ebooks/bot.rb +0 -0
- data/lib/twitter_ebooks/markov.rb +2 -1
- data/lib/twitter_ebooks/model.rb +28 -15
- data/lib/twitter_ebooks/nlp.rb +8 -1
- data/lib/twitter_ebooks/suffix.rb +82 -0
- data/lib/twitter_ebooks/version.rb +1 -1
- data/lib/twitter_ebooks.rb +1 -0
- data/skeleton/.gitignore +0 -0
- data/skeleton/Procfile +0 -0
- data/skeleton/bots.rb +0 -0
- data/test/corpus/0xabad1dea.tweets +0 -0
- data/twitter_ebooks.gemspec +0 -0
- metadata +3 -3
- data/skeleton/corpus/README.md +0 -1
data/.gitignore
CHANGED
File without changes

data/Gemfile
CHANGED
File without changes
data/Gemfile.lock
CHANGED
@@ -1,8 +1,7 @@
 PATH
   remote: .
   specs:
-    twitter_ebooks (2.0.
-      bloomfilter-rb
+    twitter_ebooks (2.0.7)
       engtagger
       fast-stemmer
       gingerice
@@ -19,8 +18,6 @@ GEM
     addressable (2.3.5)
     atomic (1.1.14)
     awesome_print (1.2.0)
-    bloomfilter-rb (2.1.1)
-      redis
     cookiejar (0.3.0)
     daemons (1.1.9)
     em-http-request (1.0.3)
@@ -50,7 +47,6 @@ GEM
     minitest (5.0.8)
     multi_json (1.8.2)
     multipart-post (1.2.0)
-    redis (3.0.5)
     rufus-scheduler (3.0.2)
       tzinfo
     simple_oauth (0.2.0)
data/LICENSE
CHANGED
File without changes

data/NOTES.md
CHANGED
File without changes
data/README.md
CHANGED
@@ -1,20 +1,9 @@
-# twitter\_ebooks 2.0.
+# twitter\_ebooks 2.0.8

-Complete rewrite of twitter\_ebooks. Allows context-sensitive responsive bots via the Twitter streaming API, along with higher-quality
+Complete rewrite of twitter\_ebooks. Allows context-sensitive responsive bots via the Twitter streaming API, along with higher-quality ngram modeling. Still needs a bit of cleaning and documenting.

 ## Installation

 ```bash
 gem install twitter_ebooks
 ```
-
-## Making a bot
-
-twitter\_ebooks uses a Rails-like skeleton app generator. Let's say we want to make a revolutionary Marxist bot based on the writings of Leon Trotsky (who doesn't?):
-
-```bash
-ebooks new trotsky_ebooks
-cd trotsky_ebooks
-```
-
-
data/Rakefile
CHANGED
File without changes
data/bin/ebooks
CHANGED
@@ -46,9 +46,9 @@ module Ebooks
   def self.gen(model_path, input)
     model = Model.load(model_path)
     if input && !input.empty?
-      puts "@cmd " + model.
+      puts "@cmd " + model.make_response(input, 135)
     else
-      puts model.
+      puts model.make_statement
     end
   end

@@ -64,7 +64,7 @@ module Ebooks
   def self.tweet(modelpath, username)
     load File.join(APP_PATH, 'bots.rb')
     model = Model.load(modelpath)
-    statement = model.
+    statement = model.make_statement
     log "@#{username}: #{statement}"
     bot = Bot.get(username)
     bot.configure
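The renamed entry points are easiest to see from plain Ruby. A minimal sketch of what `ebooks gen` now calls, assuming the gem is installed and a consumed model exists at the hypothetical path `model/corpus.model`:

```ruby
require 'twitter_ebooks'

# Hypothetical model path; `ebooks consume` would have produced it.
model = Ebooks::Model.load('model/corpus.model')

# With input, behave like `ebooks gen <model> <input>`:
puts "@cmd " + model.make_response("hello there", 135)

# Without input, fall back to a free-standing statement:
puts model.make_statement
```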
data/data/adjectives.txt
CHANGED
File without changes

data/data/nouns.txt
CHANGED
File without changes

data/data/stopwords.txt
CHANGED
File without changes

data/lib/twitter_ebooks/archiver.rb
CHANGED
File without changes

data/lib/twitter_ebooks/bot.rb
CHANGED
File without changes
data/lib/twitter_ebooks/markov.rb
CHANGED
@@ -54,9 +54,10 @@ module Ebooks

     def chain(tokens)
       if tokens.length == 1
-        matches = @unigrams[tokens[
+        matches = @unigrams[tokens[-1]]
       else
         matches = @bigrams[tokens[-2]][tokens[-1]]
+        matches = @unigrams[tokens[-1]] if matches.length < 2
       end

       if matches.empty?
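The added line is a sparsity fallback: when a bigram context has fewer than two continuations, the chain degrades to unigram matches instead of being locked into the single option. A toy illustration of the same logic (toy hashes, not the gem's actual data layout):

```ruby
unigrams = { "the" => ["cat", "dog", "end"] }
bigrams  = { "at" => { "the" => ["end"] } }

tokens  = ["at", "the"]
matches = bigrams[tokens[-2]][tokens[-1]]             # => ["end"], only one continuation
matches = unigrams[tokens[-1]] if matches.length < 2  # fall back, as in the patch
p matches                                             # => ["cat", "dog", "end"]
```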
data/lib/twitter_ebooks/model.rb
CHANGED
@@ -7,7 +7,7 @@ require 'digest/md5'

 module Ebooks
   class Model
-    attr_accessor :hash, :sentences, :
+    attr_accessor :hash, :sentences, :generator, :keywords

     def self.consume(txtpath)
       Model.new.consume(txtpath)
@@ -67,16 +67,29 @@ module Ebooks
       NLP.htmlentities.decode tweet
     end

-    def
-
+    def valid_tweet?(tokens, limit)
+      tweet = NLP.reconstruct(tokens)
+      tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
+    end
+
+    def make_statement(limit=140, generator=nil)
+      responding = !generator.nil?
+      generator ||= SuffixGenerator.build(@sentences)
       tweet = ""

-      while (
-        next if
-
-
+      while (tokens = generator.generate(3, :bigrams)) do
+        next if tokens.length <= 3 && !responding
+        break if valid_tweet?(tokens, limit)
+      end
+
+      if @sentences.include?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
+        while (tokens = generator.generate(3, :unigrams)) do
+          break if valid_tweet?(tokens, limit) && !@sentences.include?(tokens)
+        end
       end

+      tweet = NLP.reconstruct(tokens)
+
       fix tweet
     end

@@ -101,19 +114,19 @@ module Ebooks
     end

     # Generates a response by looking for related sentences
-    # in the corpus and building a smaller
-    def
+    # in the corpus and building a smaller generator from these
+    def make_response(input, limit=140)
       # First try
       relevant, slightly_relevant = relevant_sentences(input)

       if relevant.length >= 3
-
-
-      elsif slightly_relevant.length
-
-
+        generator = SuffixGenerator.build(relevant)
+        make_statement(limit, generator)
+      elsif slightly_relevant.length >= 5
+        generator = SuffixGenerator.build(slightly_relevant)
+        make_statement(limit, generator)
       else
-
+        make_statement(limit)
       end
     end
   end
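Taken together, the new model surface is small. A usage sketch, assuming an installed gem and a hypothetical consumed model at `model/corpus.model`:

```ruby
require 'twitter_ebooks'

model = Ebooks::Model.load('model/corpus.model') # hypothetical path

# Statement from the whole corpus, capped at 140 characters:
puts model.make_statement(140)

# make_response narrows the corpus first: given enough relevant sentences,
# it builds a SuffixGenerator from just those and reuses make_statement,
# otherwise it falls back to an unconditioned statement.
puts model.make_response("what do you think of markov chains?")
```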
data/lib/twitter_ebooks/nlp.rb
CHANGED
@@ -61,7 +61,7 @@ module Ebooks
     # As above, this is ad hoc because tokenization libraries
     # do not behave well wrt. things like emoticons and timestamps
     def self.tokenize(sentence)
-      regex = /\s+|(?<=[#{PUNCTUATION}])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]
+      regex = /\s+|(?<=[#{PUNCTUATION}]\s)(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+\s)/
       sentence.split(regex)
     end

@@ -150,5 +150,12 @@ module Ebooks

       false
     end
+
+    # Determine if a2 is a subsequence of a1
+    def self.subseq?(a1, a2)
+      a1.each_index.find do |i|
+        a1[i...i+a2.length] == a2
+      end
+    end
   end
 end
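`NLP.subseq?` checks for a contiguous run of `a2` inside `a1`, returning the starting index when found and `nil` otherwise, so callers treat it as truthy/falsy rather than strictly boolean. A standalone copy, for illustration only:

```ruby
# Standalone copy of the new helper, mirroring the diff above.
def subseq?(a1, a2)
  a1.each_index.find do |i|
    a1[i...i+a2.length] == a2
  end
end

p subseq?(%w[a b c d], %w[b c]) # => 1 (index where the run starts)
p subseq?(%w[a b c d], %w[c b]) # => nil (no contiguous match)
```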
data/lib/twitter_ebooks/suffix.rb
ADDED
@@ -0,0 +1,82 @@
+module Ebooks
+  class SuffixGenerator
+    def self.build(sentences)
+      SuffixGenerator.new(sentences)
+    end
+
+    def initialize(sentences)
+      @sentences = sentences.reject { |s| s.length < 2 }
+      @unigrams = {}
+      @bigrams = {}
+
+      @sentences.each_with_index do |tokens, i|
+        last_token = INTERIM
+        tokens.each_with_index do |token, j|
+          @unigrams[last_token] ||= []
+          @unigrams[last_token] << [i, j]
+
+          @bigrams[last_token] ||= {}
+          @bigrams[last_token][token] ||= []
+
+          if j == tokens.length-1 # Mark sentence endings
+            @unigrams[token] ||= []
+            @unigrams[token] << [i, INTERIM]
+            @bigrams[last_token][token] << [i, INTERIM]
+          else
+            @bigrams[last_token][token] << [i, j+1]
+          end
+
+          last_token = token
+        end
+      end
+
+      self
+    end
+
+    def generate(passes=5, n=:unigrams)
+      index = rand(@sentences.length)
+      tokens = @sentences[index]
+      used = [index] # Sentences we've already used
+      verbatim = [tokens] # Verbatim sentences to avoid reproducing
+
+      0.upto(passes-1) do
+        puts NLP.reconstruct(tokens)
+        varsites = {} # Map bigram start site => next token alternatives
+
+        tokens.each_with_index do |token, i|
+          next_token = tokens[i+1]
+          break if next_token.nil?
+
+          alternatives = (n == :unigrams) ? @unigrams[next_token] : @bigrams[token][next_token]
+          alternatives.reject! { |a| a[1] == INTERIM || used.include?(a[0]) }
+          varsites[i] = alternatives unless alternatives.empty?
+        end
+
+        variant = nil
+        varsites.to_a.shuffle.each do |site|
+          start = site[0]
+
+          site[1].shuffle.each do |alt|
+            start, alt = site[0], site[1].sample
+            verbatim << @sentences[alt[0]]
+            suffix = @sentences[alt[0]][alt[1]..-1]
+            potential = tokens[0..start+1] + suffix
+
+            unless verbatim.find { |v| NLP.subseq?(v, potential) || NLP.subseq?(potential, v) }
+              used << alt[0]
+              variant = potential
+              break
+            end
+          end
+
+          break if variant
+        end
+
+        tokens = variant if variant
+      end
+
+
+      tokens
+    end
+  end
+end
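The generator can also be driven directly, which is what `make_statement` does internally. A sketch, assuming a loaded model whose `sentences` are token arrays as the new file expects:

```ruby
require 'twitter_ebooks'

model = Ebooks::Model.load('model/corpus.model') # hypothetical path

generator = Ebooks::SuffixGenerator.build(model.sentences)
tokens = generator.generate(3, :bigrams) # 3 mutation passes over bigram sites
puts Ebooks::NLP.reconstruct(tokens)
```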
data/lib/twitter_ebooks.rb
CHANGED
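The hunk for this file is not shown, but the summary above lists it as +1 -0 alongside the new suffix.rb, so the single added line is presumably the require for it. An assumption, not visible in this diff:

```ruby
require 'twitter_ebooks/suffix' # assumed addition; hunk not shown above
```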
data/skeleton/.gitignore
CHANGED
File without changes

data/skeleton/Procfile
CHANGED
File without changes

data/skeleton/bots.rb
CHANGED
File without changes

data/test/corpus/0xabad1dea.tweets
CHANGED
File without changes
data/twitter_ebooks.gemspec
CHANGED
File without changes
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: twitter_ebooks
 version: !ruby/object:Gem::Version
-  version: 2.0.7
+  version: 2.0.8
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-11-
+date: 2013-11-14 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: minitest
@@ -180,12 +180,12 @@ files:
 - lib/twitter_ebooks/markov.rb
 - lib/twitter_ebooks/model.rb
 - lib/twitter_ebooks/nlp.rb
+- lib/twitter_ebooks/suffix.rb
 - lib/twitter_ebooks/version.rb
 - script/process_anc_data.rb
 - skeleton/.gitignore
 - skeleton/Procfile
 - skeleton/bots.rb
-- skeleton/corpus/README.md
 - skeleton/run.rb
 - test/corpus/0xabad1dea.tweets
 - test/keywords.rb
data/skeleton/corpus/README.md
DELETED
@@ -1 +0,0 @@
-Put any raw text files in here to be processed.