twitter_ebooks 2.0.4 → 2.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/twitter_ebooks/model.rb +3 -13
- data/lib/twitter_ebooks/nlp.rb +0 -23
- data/lib/twitter_ebooks/version.rb +1 -1
- metadata +1 -3
- data/data/ANC-all-count.txt +0 -297241
- data/data/wordfreq.json +0 -1
data/lib/twitter_ebooks/model.rb
CHANGED
@@ -40,15 +40,8 @@ module Ebooks
|
|
40
40
|
log "Tokenizing #{sentences.length} sentences"
|
41
41
|
@sentences = sentences.map { |sent| NLP.tokenize(sent) }
|
42
42
|
|
43
|
-
log "Building markov model"
|
44
|
-
@markov = MarkovModel.build(@sentences)
|
45
|
-
|
46
43
|
log "Ranking keywords"
|
47
|
-
|
48
|
-
puts Benchmark.measure {
|
49
|
-
@keywords = NLP.keywords(@sentences)
|
50
|
-
p @keywords.top(100)
|
51
|
-
}
|
44
|
+
@keywords = NLP.keywords(@sentences)
|
52
45
|
|
53
46
|
self
|
54
47
|
end
|
@@ -75,10 +68,10 @@ module Ebooks
|
|
75
68
|
end
|
76
69
|
|
77
70
|
def markov_statement(limit=140, markov=nil)
|
78
|
-
markov ||= @markov
|
71
|
+
markov ||= MarkovModel.build(@sentences)
|
79
72
|
tweet = ""
|
80
73
|
|
81
|
-
while (tweet = markov.generate) do
|
74
|
+
while (tweet = markov.generate(@sentences)) do
|
82
75
|
next if tweet.length > limit
|
83
76
|
next if NLP.unmatched_enclosers?(tweet)
|
84
77
|
break if tweet.length > limit*0.4 || rand > 0.8
|
@@ -113,9 +106,6 @@ module Ebooks
|
|
113
106
|
# First try
|
114
107
|
relevant, slightly_relevant = relevant_sentences(input)
|
115
108
|
|
116
|
-
p relevant
|
117
|
-
p slightly_relevant.length
|
118
|
-
|
119
109
|
if relevant.length >= 3
|
120
110
|
markov = MarkovModel.new.consume(relevant)
|
121
111
|
markov_statement(limit, markov)
|
data/lib/twitter_ebooks/nlp.rb
CHANGED
@@ -23,10 +23,6 @@ module Ebooks
|
|
23
23
|
def self.adjectives
|
24
24
|
@adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
|
25
25
|
end
|
26
|
-
|
27
|
-
def self.wordfreq
|
28
|
-
@wordfreq ||= JSON.load(File.read(File.join(DATA_PATH, 'wordfreq.json')))
|
29
|
-
end
|
30
26
|
|
31
27
|
# POS tagger
|
32
28
|
def self.tagger
|
@@ -94,25 +90,6 @@ module Ebooks
|
|
94
90
|
text.keywords
|
95
91
|
end
|
96
92
|
|
97
|
-
def self.stemset(sentence)
|
98
|
-
tokens = sentence.is_a?(Array) ? sentence : tokenize(sentence)
|
99
|
-
tokens.map(&:downcase)
|
100
|
-
.reject { |token| stopwords.include?(token) }
|
101
|
-
.map { |t| stemmer.stem(t) }
|
102
|
-
.to_set
|
103
|
-
end
|
104
|
-
|
105
|
-
# Builds a token stem frequency map
|
106
|
-
def self.stemfreq(sentences)
|
107
|
-
freqmap = {}
|
108
|
-
sentences.flatten.each do |token|
|
109
|
-
stem = NLP.stem(token)
|
110
|
-
freqmap[stem] ||= 0
|
111
|
-
freqmap[stem] += 1
|
112
|
-
end
|
113
|
-
freqmap
|
114
|
-
end
|
115
|
-
|
116
93
|
# Takes a list of tokens and builds a nice-looking sentence
|
117
94
|
def self.reconstruct(tokens)
|
118
95
|
text = ""
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twitter_ebooks
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.4
|
4
|
+
version: 2.0.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -171,11 +171,9 @@ files:
|
|
171
171
|
- README.md
|
172
172
|
- Rakefile
|
173
173
|
- bin/ebooks
|
174
|
-
- data/ANC-all-count.txt
|
175
174
|
- data/adjectives.txt
|
176
175
|
- data/nouns.txt
|
177
176
|
- data/stopwords.txt
|
178
|
-
- data/wordfreq.json
|
179
177
|
- lib/twitter_ebooks.rb
|
180
178
|
- lib/twitter_ebooks/archiver.rb
|
181
179
|
- lib/twitter_ebooks/bot.rb
|