twitter_ebooks 2.1.0 → 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile.lock +1 -1
- data/README.md +1 -1
- data/bin/ebooks +2 -0
- data/lib/twitter_ebooks.rb +2 -0
- data/lib/twitter_ebooks/model.rb +59 -19
- data/lib/twitter_ebooks/suffix.rb +1 -1
- data/lib/twitter_ebooks/version.rb +1 -1
- metadata +2 -2
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
data/bin/ebooks
CHANGED
data/lib/twitter_ebooks.rb
CHANGED
data/lib/twitter_ebooks/model.rb
CHANGED
@@ -7,7 +7,7 @@ require 'digest/md5'
 
 module Ebooks
   class Model
-    attr_accessor :hash, :sentences, :keywords
+    attr_accessor :hash, :sentences, :mentions, :keywords
 
     def self.consume(txtpath)
       Model.new.consume(txtpath)
@@ -22,23 +22,44 @@ module Ebooks
       @hash = Digest::MD5.hexdigest(File.read(txtpath))
 
       text = File.read(txtpath)
-      log "Removing commented lines and mentions"
+      log "Removing commented lines and sorting mentions"
 
       lines = text.split("\n")
       keeping = []
+      mentions = []
       lines.each do |l|
-        next if l.start_with?('#')
-        next if l.include?('@')
-        keeping << l
+        next if l.start_with?('#') # Remove commented lines
+        next if l.include?('RT') || l.include?('MT') # Remove soft retweets
+
+        if l.include?('@')
+          mentions << l
+        else
+          keeping << l
+        end
       end
-      text = NLP.normalize(keeping.join("\n"))
+      text = NLP.normalize(keeping.join("\n")) # Normalize weird characters
+      mention_text = NLP.normalize(mentions.join("\n"))
 
       log "Segmenting text into sentences"
 
-      sentences = NLP.sentences(text)
+      statements = NLP.sentences(text)
+      mentions = NLP.sentences(mention_text)
+
+      log "Tokenizing #{statements.length} statements and #{mentions.length} mentions"
+      @sentences = []
+      @mentions = []
 
-      log "Tokenizing #{sentences.length} sentences"
-      @sentences = sentences.map { |sent| NLP.tokenize(sent) }
+      statements.each do |s|
+        @sentences << NLP.tokenize(s).reject do |t|
+          t.start_with?('@') || t.start_with?('http')
+        end
+      end
+
+      mentions.each do |s|
+        @mentions << NLP.tokenize(s).reject do |t|
+          t.start_with?('@') || t.start_with?('http')
+        end
+      end
 
       log "Ranking keywords"
       @keywords = NLP.keywords(@sentences)
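For context, the rewritten consume step now splits the corpus into two pools before tokenizing, instead of discarding mentions outright. A minimal standalone sketch of that partitioning rule (the corpus lines and variable names here are illustrative, not taken from the gem):

```ruby
# Partition corpus lines the way the new consume does: drop comments
# and soft retweets, then route lines containing '@' to a mention pool.
lines = [
  "# commented line, dropped",
  "RT @someone: soft retweet, dropped",
  "@friend hello there",
  "just a regular tweet"
]

keeping  = []
mentions = []
lines.each do |l|
  next if l.start_with?('#')                   # remove commented lines
  next if l.include?('RT') || l.include?('MT') # remove soft retweets
  (l.include?('@') ? mentions : keeping) << l
end

p keeping  # => ["just a regular tweet"]
p mentions # => ["@friend hello there"]
```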
@@ -72,38 +93,55 @@ module Ebooks
       tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
     end
 
-    def make_statement(limit=140, generator=nil)
+    def make_statement(limit=140, generator=nil, retry_limit=10)
       responding = !generator.nil?
       generator ||= SuffixGenerator.build(@sentences)
+
+      retries = 0
       tweet = ""
 
       while (tokens = generator.generate(3, :bigrams)) do
         next if tokens.length <= 3 && !responding
         break if valid_tweet?(tokens, limit)
+
+        retries += 1
+        break if retries >= retry_limit
       end
 
-      if @sentences.include?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
+      if verbatim?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
         while (tokens = generator.generate(3, :unigrams)) do
-          break if valid_tweet?(tokens, limit) && !@sentences.include?(tokens)
+          break if valid_tweet?(tokens, limit) && !verbatim?(tokens)
+
+          retries += 1
+          break if retries >= retry_limit
         end
       end
 
       tweet = NLP.reconstruct(tokens)
 
+      if retries >= retry_limit
+        log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
+      end
+
       fix tweet
     end
 
+    # Test if a sentence has been copied verbatim from original
+    def verbatim?(tokens)
+      @sentences.include?(tokens) || @mentions.include?(tokens)
+    end
+
     # Finds all relevant tokenized sentences to given input by
     # comparing non-stopword token overlaps
-    def find_relevant(input)
+    def find_relevant(sentences, input)
       relevant = []
       slightly_relevant = []
 
-      tokenized = NLP.tokenize(input)
+      tokenized = NLP.tokenize(input).map(&:downcase)
 
-      @sentences.each do |sent|
+      sentences.each do |sent|
         tokenized.each do |token|
-          if sent.include?(token)
+          if sent.map(&:downcase).include?(token)
             relevant << sent unless NLP.stopword?(token)
             slightly_relevant << sent
           end
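The new retry_limit parameter bounds both generation loops with a shared counter, so make_statement settles for its last candidate instead of spinning forever on a corpus that cannot produce a valid non-verbatim tweet. A rough standalone sketch of that loop shape (generate and acceptable? are stand-ins for generator.generate and the valid_tweet?/verbatim? checks):

```ruby
# Bounded generate-and-test loop, mirroring the shape of the change.
RETRY_LIMIT = 10

def generate             # stand-in for generator.generate(3, :bigrams)
  rand(100)
end

def acceptable?(tokens)  # stand-in for valid_tweet? && !verbatim?
  tokens < 10
end

retries = 0
tokens  = nil
while (tokens = generate)
  break if acceptable?(tokens)
  retries += 1
  break if retries >= RETRY_LIMIT
end

if retries >= RETRY_LIMIT
  puts "giving up after #{retries} tries; using #{tokens}"
else
  puts "accepted #{tokens}"
end
```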
@@ -115,9 +153,9 @@ module Ebooks
 
     # Generates a response by looking for related sentences
     # in the corpus and building a smaller generator from these
-    def make_response(input, limit=140)
-      #
-      relevant, slightly_relevant = find_relevant(input)
+    def make_response(input, limit=140, sentences=@mentions)
+      # Prefer mentions
+      relevant, slightly_relevant = find_relevant(sentences, input)
 
       if relevant.length >= 3
         generator = SuffixGenerator.build(relevant)
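find_relevant also now compares tokens case-insensitively by downcasing both the input tokens and each sentence's tokens. A trimmed-down stand-in for that matching rule (stopword handling and the two-tier relevant/slightly_relevant split are omitted):

```ruby
# Case-insensitive token-overlap matching, as in the updated find_relevant.
sentences = [
  %w[Ruby is great],
  %w[I like coffee]
]
input_tokens = ["RUBY"].map(&:downcase)

relevant = sentences.select do |sent|
  input_tokens.any? { |token| sent.map(&:downcase).include?(token) }
end

p relevant # => [["Ruby", "is", "great"]]
```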
@@ -125,6 +163,8 @@ module Ebooks
       elsif slightly_relevant.length >= 5
         generator = SuffixGenerator.build(slightly_relevant)
         make_statement(limit, generator)
+      elsif sentences.equal?(@mentions)
+        make_response(input, limit, @sentences)
       else
         make_statement(limit)
       end
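Taken together, make_response now searches in stages: relevant mentions, then slightly relevant mentions, then the full @sentences corpus via the new elsif (sentences.equal?(@mentions) is an identity check, so the recursion happens at most once), and finally a plain statement. A hypothetical call, assuming a corpus has already been consumed (the path and input text are placeholders):

```ruby
# Hypothetical usage of the staged fallback; "corpus.txt" is a placeholder.
model = Ebooks::Model.consume("corpus.txt")

# Tries @mentions first, then falls back to @sentences,
# then to an unconditioned make_statement.
puts model.make_response("what do you think about ruby?", 140)
```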
data/lib/twitter_ebooks/suffix.rb
CHANGED
@@ -44,7 +44,7 @@ module Ebooks
       verbatim = [tokens] # Verbatim sentences to avoid reproducing
 
       0.upto(passes-1) do
-
+        log NLP.reconstruct(tokens) if $debug
         varsites = {} # Map bigram start site => next token alternatives
 
         tokens.each_with_index do |token, i|
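The one-line suffix.rb change gates a per-pass trace behind the global $debug flag, so the generator's mutation passes can be watched during generation. Hypothetical usage (again with a placeholder corpus path):

```ruby
# Hypothetical: enable the per-pass logging added in this release.
$debug = true

model     = Ebooks::Model.consume("corpus.txt")     # placeholder path
generator = Ebooks::SuffixGenerator.build(model.sentences)
puts model.make_statement(140, generator)           # logs each pass
```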
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: twitter_ebooks
 version: !ruby/object:Gem::Version
-  version: 2.1.0
+  version: 2.1.1
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-11-
+date: 2013-11-21 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: minitest