twitter_ebooks 2.1.0 → 2.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile.lock +1 -1
- data/README.md +1 -1
- data/bin/ebooks +2 -0
- data/lib/twitter_ebooks.rb +2 -0
- data/lib/twitter_ebooks/model.rb +59 -19
- data/lib/twitter_ebooks/suffix.rb +1 -1
- data/lib/twitter_ebooks/version.rb +1 -1
- metadata +2 -2
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
data/bin/ebooks
CHANGED
data/lib/twitter_ebooks.rb
CHANGED
data/lib/twitter_ebooks/model.rb
CHANGED
@@ -7,7 +7,7 @@ require 'digest/md5'
|
|
7
7
|
|
8
8
|
module Ebooks
|
9
9
|
class Model
|
10
|
-
attr_accessor :hash, :sentences, :keywords
|
10
|
+
attr_accessor :hash, :sentences, :mentions, :keywords
|
11
11
|
|
12
12
|
def self.consume(txtpath)
|
13
13
|
Model.new.consume(txtpath)
|
@@ -22,23 +22,44 @@ module Ebooks
|
|
22
22
|
@hash = Digest::MD5.hexdigest(File.read(txtpath))
|
23
23
|
|
24
24
|
text = File.read(txtpath)
|
25
|
-
log "Removing commented lines and
|
25
|
+
log "Removing commented lines and sorting mentions"
|
26
26
|
|
27
27
|
lines = text.split("\n")
|
28
28
|
keeping = []
|
29
|
+
mentions = []
|
29
30
|
lines.each do |l|
|
30
|
-
next if l.start_with?('#')
|
31
|
-
|
32
|
-
|
31
|
+
next if l.start_with?('#') # Remove commented lines
|
32
|
+
next if l.include?('RT') || l.include?('MT') # Remove soft retweets
|
33
|
+
|
34
|
+
if l.include?('@')
|
35
|
+
mentions << l
|
36
|
+
else
|
37
|
+
keeping << l
|
38
|
+
end
|
33
39
|
end
|
34
|
-
text = NLP.normalize(keeping.join("\n"))
|
40
|
+
text = NLP.normalize(keeping.join("\n")) # Normalize weird characters
|
41
|
+
mention_text = NLP.normalize(mentions.join("\n"))
|
35
42
|
|
36
43
|
log "Segmenting text into sentences"
|
37
44
|
|
38
|
-
|
45
|
+
statements = NLP.sentences(text)
|
46
|
+
mentions = NLP.sentences(mention_text)
|
47
|
+
|
48
|
+
log "Tokenizing #{statements.length} statements and #{mentions.length} mentions"
|
49
|
+
@sentences = []
|
50
|
+
@mentions = []
|
39
51
|
|
40
|
-
|
41
|
-
|
52
|
+
statements.each do |s|
|
53
|
+
@sentences << NLP.tokenize(s).reject do |t|
|
54
|
+
t.start_with?('@') || t.start_with?('http')
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
mentions.each do |s|
|
59
|
+
@mentions << NLP.tokenize(s).reject do |t|
|
60
|
+
t.start_with?('@') || t.start_with?('http')
|
61
|
+
end
|
62
|
+
end
|
42
63
|
|
43
64
|
log "Ranking keywords"
|
44
65
|
@keywords = NLP.keywords(@sentences)
|
@@ -72,38 +93,55 @@ module Ebooks
|
|
72
93
|
tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
|
73
94
|
end
|
74
95
|
|
75
|
-
def make_statement(limit=140, generator=nil)
|
96
|
+
def make_statement(limit=140, generator=nil, retry_limit=10)
|
76
97
|
responding = !generator.nil?
|
77
98
|
generator ||= SuffixGenerator.build(@sentences)
|
99
|
+
|
100
|
+
retries = 0
|
78
101
|
tweet = ""
|
79
102
|
|
80
103
|
while (tokens = generator.generate(3, :bigrams)) do
|
81
104
|
next if tokens.length <= 3 && !responding
|
82
105
|
break if valid_tweet?(tokens, limit)
|
106
|
+
|
107
|
+
retries += 1
|
108
|
+
break if retries >= retry_limit
|
83
109
|
end
|
84
110
|
|
85
|
-
if
|
111
|
+
if verbatim?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
|
86
112
|
while (tokens = generator.generate(3, :unigrams)) do
|
87
|
-
break if valid_tweet?(tokens, limit) &&
|
113
|
+
break if valid_tweet?(tokens, limit) && !verbatim?(tokens)
|
114
|
+
|
115
|
+
retries += 1
|
116
|
+
break if retries >= retry_limit
|
88
117
|
end
|
89
118
|
end
|
90
119
|
|
91
120
|
tweet = NLP.reconstruct(tokens)
|
92
121
|
|
122
|
+
if retries >= retry_limit
|
123
|
+
log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
|
124
|
+
end
|
125
|
+
|
93
126
|
fix tweet
|
94
127
|
end
|
95
128
|
|
129
|
+
# Test if a sentence has been copied verbatim from original
|
130
|
+
def verbatim?(tokens)
|
131
|
+
@sentences.include?(tokens) || @mentions.include?(tokens)
|
132
|
+
end
|
133
|
+
|
96
134
|
# Finds all relevant tokenized sentences to given input by
|
97
135
|
# comparing non-stopword token overlaps
|
98
|
-
def
|
136
|
+
def find_relevant(sentences, input)
|
99
137
|
relevant = []
|
100
138
|
slightly_relevant = []
|
101
139
|
|
102
|
-
tokenized = NLP.tokenize(input)
|
140
|
+
tokenized = NLP.tokenize(input).map(&:downcase)
|
103
141
|
|
104
|
-
|
142
|
+
sentences.each do |sent|
|
105
143
|
tokenized.each do |token|
|
106
|
-
if sent.include?(token)
|
144
|
+
if sent.map(&:downcase).include?(token)
|
107
145
|
relevant << sent unless NLP.stopword?(token)
|
108
146
|
slightly_relevant << sent
|
109
147
|
end
|
@@ -115,9 +153,9 @@ module Ebooks
|
|
115
153
|
|
116
154
|
# Generates a response by looking for related sentences
|
117
155
|
# in the corpus and building a smaller generator from these
|
118
|
-
def make_response(input, limit=140)
|
119
|
-
#
|
120
|
-
relevant, slightly_relevant =
|
156
|
+
def make_response(input, limit=140, sentences=@mentions)
|
157
|
+
# Prefer mentions
|
158
|
+
relevant, slightly_relevant = find_relevant(sentences, input)
|
121
159
|
|
122
160
|
if relevant.length >= 3
|
123
161
|
generator = SuffixGenerator.build(relevant)
|
@@ -125,6 +163,8 @@ module Ebooks
|
|
125
163
|
elsif slightly_relevant.length >= 5
|
126
164
|
generator = SuffixGenerator.build(slightly_relevant)
|
127
165
|
make_statement(limit, generator)
|
166
|
+
elsif sentences.equal?(@mentions)
|
167
|
+
make_response(input, limit, @sentences)
|
128
168
|
else
|
129
169
|
make_statement(limit)
|
130
170
|
end
|
@@ -44,7 +44,7 @@ module Ebooks
|
|
44
44
|
verbatim = [tokens] # Verbatim sentences to avoid reproducing
|
45
45
|
|
46
46
|
0.upto(passes-1) do
|
47
|
-
|
47
|
+
log NLP.reconstruct(tokens) if $debug
|
48
48
|
varsites = {} # Map bigram start site => next token alternatives
|
49
49
|
|
50
50
|
tokens.each_with_index do |token, i|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twitter_ebooks
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.1.0
|
4
|
+
version: 2.1.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-11-
|
12
|
+
date: 2013-11-21 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: minitest
|