twitter_ebooks 2.0.4 → 2.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/twitter_ebooks/model.rb +3 -13
- data/lib/twitter_ebooks/nlp.rb +0 -23
- data/lib/twitter_ebooks/version.rb +1 -1
- metadata +1 -3
- data/data/ANC-all-count.txt +0 -297241
- data/data/wordfreq.json +0 -1
data/lib/twitter_ebooks/model.rb
CHANGED
@@ -40,15 +40,8 @@ module Ebooks
|
|
40
40
|
log "Tokenizing #{sentences.length} sentences"
|
41
41
|
@sentences = sentences.map { |sent| NLP.tokenize(sent) }
|
42
42
|
|
43
|
-
log "Building markov model"
|
44
|
-
@markov = MarkovModel.build(@sentences)
|
45
|
-
|
46
43
|
log "Ranking keywords"
|
47
|
-
|
48
|
-
puts Benchmark.measure {
|
49
|
-
@keywords = NLP.keywords(@sentences)
|
50
|
-
p @keywords.top(100)
|
51
|
-
}
|
44
|
+
@keywords = NLP.keywords(@sentences)
|
52
45
|
|
53
46
|
self
|
54
47
|
end
|
@@ -75,10 +68,10 @@ module Ebooks
|
|
75
68
|
end
|
76
69
|
|
77
70
|
def markov_statement(limit=140, markov=nil)
|
78
|
-
markov ||= @markov
|
71
|
+
markov ||= MarkovModel.build(@sentences)
|
79
72
|
tweet = ""
|
80
73
|
|
81
|
-
while (tweet = markov.generate) do
|
74
|
+
while (tweet = markov.generate(@sentences)) do
|
82
75
|
next if tweet.length > limit
|
83
76
|
next if NLP.unmatched_enclosers?(tweet)
|
84
77
|
break if tweet.length > limit*0.4 || rand > 0.8
|
@@ -113,9 +106,6 @@ module Ebooks
|
|
113
106
|
# First try
|
114
107
|
relevant, slightly_relevant = relevant_sentences(input)
|
115
108
|
|
116
|
-
p relevant
|
117
|
-
p slightly_relevant.length
|
118
|
-
|
119
109
|
if relevant.length >= 3
|
120
110
|
markov = MarkovModel.new.consume(relevant)
|
121
111
|
markov_statement(limit, markov)
|
data/lib/twitter_ebooks/nlp.rb
CHANGED
@@ -23,10 +23,6 @@ module Ebooks
|
|
23
23
|
def self.adjectives
|
24
24
|
@adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
|
25
25
|
end
|
26
|
-
|
27
|
-
def self.wordfreq
|
28
|
-
@wordfreq ||= JSON.load(File.read(File.join(DATA_PATH, 'wordfreq.json')))
|
29
|
-
end
|
30
26
|
|
31
27
|
# POS tagger
|
32
28
|
def self.tagger
|
@@ -94,25 +90,6 @@ module Ebooks
|
|
94
90
|
text.keywords
|
95
91
|
end
|
96
92
|
|
97
|
-
def self.stemset(sentence)
|
98
|
-
tokens = sentence.is_a?(Array) ? sentence : tokenize(sentence)
|
99
|
-
tokens.map(&:downcase)
|
100
|
-
.reject { |token| stopwords.include?(token) }
|
101
|
-
.map { |t| stemmer.stem(t) }
|
102
|
-
.to_set
|
103
|
-
end
|
104
|
-
|
105
|
-
# Builds a token stem frequency map
|
106
|
-
def self.stemfreq(sentences)
|
107
|
-
freqmap = {}
|
108
|
-
sentences.flatten.each do |token|
|
109
|
-
stem = NLP.stem(token)
|
110
|
-
freqmap[stem] ||= 0
|
111
|
-
freqmap[stem] += 1
|
112
|
-
end
|
113
|
-
freqmap
|
114
|
-
end
|
115
|
-
|
116
93
|
# Takes a list of tokens and builds a nice-looking sentence
|
117
94
|
def self.reconstruct(tokens)
|
118
95
|
text = ""
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twitter_ebooks
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.4
|
4
|
+
version: 2.0.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -171,11 +171,9 @@ files:
|
|
171
171
|
- README.md
|
172
172
|
- Rakefile
|
173
173
|
- bin/ebooks
|
174
|
-
- data/ANC-all-count.txt
|
175
174
|
- data/adjectives.txt
|
176
175
|
- data/nouns.txt
|
177
176
|
- data/stopwords.txt
|
178
|
-
- data/wordfreq.json
|
179
177
|
- lib/twitter_ebooks.rb
|
180
178
|
- lib/twitter_ebooks/archiver.rb
|
181
179
|
- lib/twitter_ebooks/bot.rb
|