twitter_ebooks 2.0.7 → 2.0.8
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +0 -0
- data/Gemfile +0 -0
- data/Gemfile.lock +1 -5
- data/LICENSE +0 -0
- data/NOTES.md +0 -0
- data/README.md +2 -13
- data/Rakefile +0 -0
- data/bin/ebooks +3 -3
- data/data/adjectives.txt +0 -0
- data/data/nouns.txt +0 -0
- data/data/stopwords.txt +0 -0
- data/lib/twitter_ebooks/archiver.rb +0 -0
- data/lib/twitter_ebooks/bot.rb +0 -0
- data/lib/twitter_ebooks/markov.rb +2 -1
- data/lib/twitter_ebooks/model.rb +28 -15
- data/lib/twitter_ebooks/nlp.rb +8 -1
- data/lib/twitter_ebooks/suffix.rb +82 -0
- data/lib/twitter_ebooks/version.rb +1 -1
- data/lib/twitter_ebooks.rb +1 -0
- data/skeleton/.gitignore +0 -0
- data/skeleton/Procfile +0 -0
- data/skeleton/bots.rb +0 -0
- data/test/corpus/0xabad1dea.tweets +0 -0
- data/twitter_ebooks.gemspec +0 -0
- metadata +3 -3
- data/skeleton/corpus/README.md +0 -1
data/.gitignore
CHANGED
File without changes
|
data/Gemfile
CHANGED
File without changes
|
data/Gemfile.lock
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
twitter_ebooks (2.0.
|
5
|
-
bloomfilter-rb
|
4
|
+
twitter_ebooks (2.0.7)
|
6
5
|
engtagger
|
7
6
|
fast-stemmer
|
8
7
|
gingerice
|
@@ -19,8 +18,6 @@ GEM
|
|
19
18
|
addressable (2.3.5)
|
20
19
|
atomic (1.1.14)
|
21
20
|
awesome_print (1.2.0)
|
22
|
-
bloomfilter-rb (2.1.1)
|
23
|
-
redis
|
24
21
|
cookiejar (0.3.0)
|
25
22
|
daemons (1.1.9)
|
26
23
|
em-http-request (1.0.3)
|
@@ -50,7 +47,6 @@ GEM
|
|
50
47
|
minitest (5.0.8)
|
51
48
|
multi_json (1.8.2)
|
52
49
|
multipart-post (1.2.0)
|
53
|
-
redis (3.0.5)
|
54
50
|
rufus-scheduler (3.0.2)
|
55
51
|
tzinfo
|
56
52
|
simple_oauth (0.2.0)
|
data/LICENSE
CHANGED
File without changes
|
data/NOTES.md
CHANGED
File without changes
|
data/README.md
CHANGED
@@ -1,20 +1,9 @@
|
|
1
|
-
# twitter\_ebooks 2.0.
|
1
|
+
# twitter\_ebooks 2.0.8
|
2
2
|
|
3
|
-
Complete rewrite of twitter\_ebooks. Allows context-sensitive responsive bots via the Twitter streaming API, along with higher-quality
|
3
|
+
Complete rewrite of twitter\_ebooks. Allows context-sensitive responsive bots via the Twitter streaming API, along with higher-quality ngram modeling. Still needs a bit of cleaning and documenting.
|
4
4
|
|
5
5
|
## Installation
|
6
6
|
|
7
7
|
```bash
|
8
8
|
gem install twitter_ebooks
|
9
9
|
```
|
10
|
-
|
11
|
-
## Making a bot
|
12
|
-
|
13
|
-
twitter\_ebooks uses a Rails-like skeleton app generator. Let's say we want to make a revolutionary Marxist bot based on the writings of Leon Trotsky (who doesn't?):
|
14
|
-
|
15
|
-
```bash
|
16
|
-
ebooks new trotsky_ebooks
|
17
|
-
cd trotsky_ebooks
|
18
|
-
```
|
19
|
-
|
20
|
-
|
data/Rakefile
CHANGED
File without changes
|
data/bin/ebooks
CHANGED
@@ -46,9 +46,9 @@ module Ebooks
|
|
46
46
|
# Loads the model stored at +model_path+ and prints one generated line.
#
# When +input+ is present and non-empty, the line is a reply to it (capped at
# 135 characters by the call below) prefixed with "@cmd "; otherwise it is a
# free-standing statement.
def self.gen(model_path, input)
  model = Model.load(model_path)
  replying = input && !input.empty?

  line = replying ? "@cmd " + model.make_response(input, 135) : model.make_statement
  puts line
end
|
54
54
|
|
@@ -64,7 +64,7 @@ module Ebooks
|
|
64
64
|
def self.tweet(modelpath, username)
|
65
65
|
load File.join(APP_PATH, 'bots.rb')
|
66
66
|
model = Model.load(modelpath)
|
67
|
-
statement = model.
|
67
|
+
statement = model.make_statement
|
68
68
|
log "@#{username}: #{statement}"
|
69
69
|
bot = Bot.get(username)
|
70
70
|
bot.configure
|
data/data/adjectives.txt
CHANGED
File without changes
|
data/data/nouns.txt
CHANGED
File without changes
|
data/data/stopwords.txt
CHANGED
File without changes
|
File without changes
|
data/lib/twitter_ebooks/bot.rb
CHANGED
File without changes
|
@@ -54,9 +54,10 @@ module Ebooks
|
|
54
54
|
|
55
55
|
def chain(tokens)
|
56
56
|
if tokens.length == 1
|
57
|
-
matches = @unigrams[tokens[
|
57
|
+
matches = @unigrams[tokens[-1]]
|
58
58
|
else
|
59
59
|
matches = @bigrams[tokens[-2]][tokens[-1]]
|
60
|
+
matches = @unigrams[tokens[-1]] if matches.length < 2
|
60
61
|
end
|
61
62
|
|
62
63
|
if matches.empty?
|
data/lib/twitter_ebooks/model.rb
CHANGED
@@ -7,7 +7,7 @@ require 'digest/md5'
|
|
7
7
|
|
8
8
|
module Ebooks
|
9
9
|
class Model
|
10
|
-
attr_accessor :hash, :sentences, :
|
10
|
+
attr_accessor :hash, :sentences, :generator, :keywords
|
11
11
|
|
12
12
|
def self.consume(txtpath)
|
13
13
|
Model.new.consume(txtpath)
|
@@ -67,16 +67,29 @@ module Ebooks
|
|
67
67
|
NLP.htmlentities.decode tweet
|
68
68
|
end
|
69
69
|
|
70
|
-
def
|
71
|
-
|
70
|
+
# Checks whether +tokens+, once joined back into text by NLP.reconstruct,
# fits within +limit+ characters and passes the NLP.unmatched_enclosers? check.
def valid_tweet?(tokens, limit)
  text = NLP.reconstruct(tokens)
  return false if text.length > limit

  !NLP.unmatched_enclosers?(text)
end
|
74
|
+
|
75
|
+
# Generates a single statement of at most +limit+ characters.
#
# limit     - maximum length of the reconstructed text (default 140).
# generator - optional generator object responding to #generate(passes, mode).
#             When given, we are "responding": very short outputs (three
#             tokens or fewer) are accepted. When omitted, a generator is
#             built over the whole corpus in @sentences.
#
# Returns whatever +fix+ returns for the reconstructed text.
def make_statement(limit=140, generator=nil)
  responding = !generator.nil?
  # Only fall back to the full corpus when the caller did not supply a
  # generator; overwriting it unconditionally would throw away the
  # relevance-filtered generator that make_response builds.
  generator ||= SuffixGenerator.build(@sentences)

  tokens = nil
  while (tokens = generator.generate(3, :bigrams))
    next if tokens.length <= 3 && !responding
    break if valid_tweet?(tokens, limit)
  end

  # We made a verbatim tweet by accident: retry with the looser unigram
  # mode until the result is valid and is not a corpus sentence.
  if @sentences.include?(tokens) && tokens.length > 3
    while (tokens = generator.generate(3, :unigrams))
      break if valid_tweet?(tokens, limit) && !@sentences.include?(tokens)
    end
  end

  fix NLP.reconstruct(tokens)
end
|
82
95
|
|
@@ -101,19 +114,19 @@ module Ebooks
|
|
101
114
|
end
|
102
115
|
|
103
116
|
# Generates a response by looking for related sentences in the corpus and
# building a smaller generator from these.
#
# input - the text being responded to; passed to relevant_sentences.
# limit - maximum length handed on to make_statement (default 140).
#
# Preference order: at least 3 relevant sentences, then at least 5 slightly
# relevant ones, otherwise an unrestricted statement.
def make_response(input, limit=140)
  relevant, slightly_relevant = relevant_sentences(input)

  pool =
    if relevant.length >= 3
      relevant
    elsif slightly_relevant.length >= 5
      slightly_relevant
    end

  return make_statement(limit) if pool.nil?

  make_statement(limit, SuffixGenerator.build(pool))
end
|
119
132
|
end
|
data/lib/twitter_ebooks/nlp.rb
CHANGED
@@ -61,7 +61,7 @@ module Ebooks
|
|
61
61
|
# Splits +sentence+ into tokens. This is ad hoc because tokenization
# libraries do not behave well wrt. things like emoticons and timestamps.
#
# Split points are: runs of whitespace; the gap between "punctuation followed
# by whitespace" and a letter; and the gap between a letter and a run of
# punctuation that is itself followed by whitespace. The punctuation set comes
# from the PUNCTUATION constant interpolated below.
def self.tokenize(sentence)
  boundary = /\s+|(?<=[#{PUNCTUATION}]\s)(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+\s)/
  sentence.split(boundary)
end
|
67
67
|
|
@@ -150,5 +150,12 @@ module Ebooks
|
|
150
150
|
|
151
151
|
false
|
152
152
|
end
|
153
|
+
|
154
|
+
# Determine if a2 occurs inside a1 as a contiguous run of elements.
#
# a1 - the array to search in.
# a2 - the array to look for.
#
# Returns true or false. An empty a2 is contained in every a1, including an
# empty one.
def self.subseq?(a1, a2)
  return true if a2.empty?

  # each_cons yields nothing when a2 is longer than a1, giving false.
  a1.each_cons(a2.length).any? { |window| window == a2 }
end
|
153
160
|
end
|
154
161
|
end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
module Ebooks
  # Builds new token sequences by starting from a random corpus sentence and
  # repeatedly replacing its tail with the tail of another sentence that
  # shares a token (unigram mode) or a pair of tokens (bigram mode) at the
  # splice point.
  class SuffixGenerator
    # Convenience constructor; same as SuffixGenerator.new(sentences).
    def self.build(sentences)
      SuffixGenerator.new(sentences)
    end

    # sentences - Array of token Arrays. Sentences shorter than two tokens
    #             are dropped.
    #
    # Index layout (positions are [sentence_index, token_index]):
    #   @unigrams[t]     => positions of the token following each occurrence of t
    #   @bigrams[t1][t2] => positions of the token following each "t1 t2" pair
    # A token_index of INTERIM marks "nothing follows" (sentence ending), and
    # INTERIM as a key stands for the start of a sentence.
    def initialize(sentences)
      @sentences = sentences.reject { |s| s.length < 2 }
      @unigrams = {}
      @bigrams = {}

      @sentences.each_with_index do |tokens, i|
        last_token = INTERIM
        tokens.each_with_index do |token, j|
          (@unigrams[last_token] ||= []) << [i, j]
          sites = ((@bigrams[last_token] ||= {})[token] ||= [])

          if j == tokens.length - 1 # Mark sentence endings
            (@unigrams[token] ||= []) << [i, INTERIM]
            sites << [i, INTERIM]
          else
            sites << [i, j + 1]
          end

          last_token = token
        end
      end
    end

    # Produces one token sequence.
    #
    # passes - how many splice attempts to make (default 5).
    # n      - :unigrams to splice on a single shared token, anything else
    #          (callers pass :bigrams) to splice on a shared token pair.
    #
    # Returns an Array of tokens, or nil when there are no usable sentences.
    # Does not write to stdout and does not modify the index.
    def generate(passes=5, n=:unigrams)
      # rand(0) would yield a Float and index into nothing; bail out early.
      return nil if @sentences.empty?

      index = rand(@sentences.length)
      tokens = @sentences[index]
      used = [index] # Sentences we've already used
      verbatim = [tokens] # Verbatim sentences to avoid reproducing

      passes.times do
        varsites = {} # Map splice start site => next token alternatives

        tokens.each_with_index do |token, i|
          next_token = tokens[i + 1]
          break if next_token.nil?

          candidates = n == :unigrams ? @unigrams[next_token] : @bigrams[token][next_token]
          # Non-destructive filter: the index is shared across calls, so
          # sentence endings and already-used sentences are only hidden for
          # this pass, never deleted.
          alternatives = candidates.reject { |a| a[1] == INTERIM || used.include?(a[0]) }
          varsites[i] = alternatives unless alternatives.empty?
        end

        variant = nil
        varsites.to_a.shuffle.each do |start, alternatives|
          # Try every alternative at this site once, in random order.
          alternatives.shuffle.each do |alt|
            verbatim << @sentences[alt[0]]
            suffix = @sentences[alt[0]][alt[1]..-1]
            potential = tokens[0..start + 1] + suffix

            # Skip splices that merely rebuild (part of) a source sentence.
            next if verbatim.find { |v| NLP.subseq?(v, potential) || NLP.subseq?(potential, v) }

            used << alt[0]
            variant = potential
            break
          end

          break if variant
        end

        tokens = variant if variant
      end

      tokens
    end
  end
end
|
data/lib/twitter_ebooks.rb
CHANGED
data/skeleton/.gitignore
CHANGED
File without changes
|
data/skeleton/Procfile
CHANGED
File without changes
|
data/skeleton/bots.rb
CHANGED
File without changes
|
File without changes
|
data/twitter_ebooks.gemspec
CHANGED
File without changes
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twitter_ebooks
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.
|
4
|
+
version: 2.0.8
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-11-
|
12
|
+
date: 2013-11-14 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: minitest
|
@@ -180,12 +180,12 @@ files:
|
|
180
180
|
- lib/twitter_ebooks/markov.rb
|
181
181
|
- lib/twitter_ebooks/model.rb
|
182
182
|
- lib/twitter_ebooks/nlp.rb
|
183
|
+
- lib/twitter_ebooks/suffix.rb
|
183
184
|
- lib/twitter_ebooks/version.rb
|
184
185
|
- script/process_anc_data.rb
|
185
186
|
- skeleton/.gitignore
|
186
187
|
- skeleton/Procfile
|
187
188
|
- skeleton/bots.rb
|
188
|
-
- skeleton/corpus/README.md
|
189
189
|
- skeleton/run.rb
|
190
190
|
- test/corpus/0xabad1dea.tweets
|
191
191
|
- test/keywords.rb
|
data/skeleton/corpus/README.md
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
Put any raw text files in here to be processed.
|