twitter_ebooks 2.0.3 → 2.0.4
This diff shows the changes between publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
- data/Gemfile.lock +12 -12
- data/bin/ebooks +21 -6
- data/data/ANC-all-count.txt +297241 -0
- data/data/stopwords.txt +204 -0
- data/data/wordfreq.json +1 -0
- data/lib/twitter_ebooks/bot.rb +25 -7
- data/lib/twitter_ebooks/markov.rb +55 -63
- data/lib/twitter_ebooks/model.rb +57 -74
- data/lib/twitter_ebooks/nlp.rb +90 -55
- data/lib/twitter_ebooks/version.rb +1 -1
- data/script/process_anc_data.rb +19 -0
- data/skeleton/Procfile +1 -1
- data/skeleton/bots.rb +0 -6
- data/skeleton/corpus/README.md +1 -1
- data/skeleton/run.rb +9 -0
- data/test/keywords.rb +18 -0
- data/twitter_ebooks.gemspec +3 -5
- metadata +13 -40
- data/skeleton/model/README.md +0 -1
data/lib/twitter_ebooks/bot.rb
CHANGED
@@ -15,6 +15,10 @@ module Ebooks
     @@all = [] # List of all defined bots
     def self.all; @@all; end
 
+    def self.get(name)
+      all.find { |bot| bot.username == name }
+    end
+
     def initialize(username, &b)
       # Set defaults
       @username = username
@@ -30,8 +34,7 @@ module Ebooks
       STDERR.flush
     end
 
-
-    def start
+    def configure
       TweetStream.configure do |config|
         config.consumer_key = @consumer_key
         config.consumer_secret = @consumer_secret
@@ -48,6 +51,13 @@ module Ebooks
 
       @twitter = Twitter::Client.new
       @stream = TweetStream::Client.new
+    end
+
+    # Connects to tweetstream and opens event handlers for this bot
+    def start
+      configure
+
+      @on_startup.call if @on_startup
 
       @stream.on_error do |msg|
         log "ERROR: #{msg}"
@@ -77,13 +87,20 @@ module Ebooks
       mentions = ev.attrs[:entities][:user_mentions].map { |x| x[:screen_name] }
 
       reply_mentions = mentions.reject { |m| m.downcase == @username.downcase }
-      reply_mentions
+      reply_mentions = [ev[:user][:screen_name]] + reply_mentions
 
       meta[:reply_prefix] = reply_mentions.uniq.map { |m| '@'+m }.join(' ') + ' '
+      meta[:limit] = 140 - meta[:reply_prefix].length
 
       mless = ev[:text]
-
-
+      begin
+        ev.attrs[:entities][:user_mentions].reverse.each do |entity|
+          mless = mless[0...entity[:indices][0]] + mless[entity[:indices][1]+1..-1]
+        end
+      rescue Exception
+        p ev.attrs[:entities][:user_mentions]
+        p ev[:text]
+        raise
       end
       meta[:mentionless] = mless
 
@@ -92,7 +109,7 @@ module Ebooks
       # - The tweet is not being retweeted by somebody else
       # - Or soft-retweeted by somebody else
       if mentions.map(&:downcase).include?(@username.downcase) && !ev[:retweeted_status] && !ev[:text].start_with?('RT ')
-        log "Mention from
+        log "Mention from @#{ev[:user][:screen_name]}: #{ev[:text]}"
         @on_mention.call(ev, meta)
       else
         @on_timeline.call(ev, meta)
@@ -117,7 +134,7 @@ module Ebooks
         log "Sending DM to @#{ev[:sender][:screen_name]}: #{text}"
         @twitter.direct_message_create(ev[:sender][:screen_name], text, opts)
       elsif ev.is_a? Twitter::Tweet
-        log "Replying to @#{ev[:user][:screen_name]}: #{text}"
+        log "Replying to @#{ev[:user][:screen_name]} with: #{text}"
         @twitter.update(text, in_reply_to_status_id: ev[:id])
       else
         raise Exception("Don't know how to reply to a #{ev.class}")
@@ -138,6 +155,7 @@ module Ebooks
       @twitter.update(*args)
     end
 
+    def on_startup(&b); @on_startup = b; end
     def on_follow(&b); @on_follow = b; end
     def on_mention(&b); @on_mention = b; end
     def on_timeline(&b); @on_timeline = b; end
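For bot authors, the new hooks surface roughly as in the sketch below. This is a hypothetical bot definition patterned on the gem's skeleton; the credential accessors (consumer_key, oauth_token, and friends), the username, and the reply text are assumptions, not part of this diff.

Ebooks::Bot.new("example_ebooks") do |bot|
  bot.consumer_key = "..."           # placeholder app credentials (assumed accessors)
  bot.consumer_secret = "..."
  bot.oauth_token = "..."
  bot.oauth_token_secret = "..."

  bot.on_startup do
    bot.log "Connected"              # new hook: runs once after configure, before stream events
  end

  bot.on_mention do |tweet, meta|
    # meta[:limit] now reflects 140 minus the generated reply prefix
    bot.reply(tweet, "hello there"[0...meta[:limit]])
  end
end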
data/lib/twitter_ebooks/markov.rb
CHANGED
@@ -1,69 +1,73 @@
 module Ebooks
+  # Special INTERIM token represents sentence boundaries
+  # This is so we can include start and end of statements in model
+  # Due to the way the sentence tokenizer works, can correspond
+  # to multiple actual parts of text (such as ^, $, \n and .?!)
+  INTERIM = :interim
+
+  # This is an ngram-based Markov model optimized to build from a
+  # tokenized sentence list without requiring too much transformation
   class MarkovModel
-
-
-    attr_accessor :tokens
-    attr_reader :depth
-
-    def represent(token)
-      if token.nil? || token == "\n" || token.empty?
-        INTERIM
-      else
-        token
-      end
+    def self.build(sentences)
+      MarkovModel.new.consume(sentences)
     end
 
-    def consume(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def consume(sentences)
+      # These models are of the form ngram => [[sentence_pos, token_pos] || INTERIM, ...]
+      # We map by both bigrams and unigrams so we can fall back to the latter in
+      # cases where an input bigram is unavailable, such as starting a sentence
+      @sentences = sentences
+      @unigrams = {}
+      @bigrams = {}
+
+      sentences.each_with_index do |tokens, i|
+        last_token = INTERIM
+        tokens.each_with_index do |token, j|
+          @unigrams[last_token] ||= []
+          @unigrams[last_token] << [i, j]
+
+          @bigrams[last_token] ||= {}
+          @bigrams[last_token][token] ||= []
+
+          if j == tokens.length-1 # Mark sentence endings
+            @unigrams[token] ||= []
+            @unigrams[token] << INTERIM
+            @bigrams[last_token][token] << INTERIM
+          else
+            @bigrams[last_token][token] << [i, j+1]
           end
-          prev_tokens << prev
-        end
 
-
-        break if j > prev_tokens.length
-        ngram = prev_tokens.last(j)
-
-        unless ngram == INTERIM && prev_tokens[-1] == INTERIM
-          @model[ngram] ||= []
-          @model[ngram] << represent(token)
-        end
+          last_token = token
         end
       end
 
       self
     end
 
+    def find_token(index)
+      if index == INTERIM
+        INTERIM
+      else
+        @sentences[index[0]][index[1]]
+      end
+    end
+
     def chain(tokens)
-
-
-
-      matches = @
-      if matches
-        #p tokens.last(i)
-        #puts "=> #{matches.inspect}"
-        next_token = matches.sample
-        break
-      end
+      if tokens.length == 1
+        matches = @unigrams[tokens[0]]
+      else
+        matches = @bigrams[tokens[-2]][tokens[-1]]
       end
 
-
+      if matches.empty?
+        # This should never happen unless a strange token is
+        # supplied from outside the dataset
+        raise ArgumentError, "Unable to continue chain for: #{tokens.inspect}"
+      end
 
-
+      next_token = find_token(matches.sample)
+
+      if next_token == INTERIM # We chose to end the sentence
        return tokens
      else
        return chain(tokens + [next_token])
@@ -71,19 +75,7 @@ module Ebooks
     end
 
     def generate
-
-      NLP.reconstruct(tokens)
-    end
-
-    def serialize
-      { 'model' => @model,
-        'depth' => @depth }
-    end
-
-    def deserialize(data)
-      @model = data['model']
-      @depth = data['depth']
-      self
+      NLP.reconstruct(chain([INTERIM]))
     end
   end
 end
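A toy illustration of the rewritten MarkovModel on a hand-tokenized corpus; the sentences below are invented, and in the gem the input comes from NLP.tokenize via Model#consume.

require 'twitter_ebooks'

sentences = [
  %w(the cat sat on the mat .),
  %w(the dog slept on the rug .)
]

# build is shorthand for MarkovModel.new.consume(sentences)
model = Ebooks::MarkovModel.build(sentences)

# generate starts a chain at INTERIM, samples bigram continuations until it
# draws a sentence-ending INTERIM, then reconstructs the tokens into text
puts model.generate  # e.g. "the cat slept on the rug."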
data/lib/twitter_ebooks/model.rb
CHANGED
@@ -7,15 +7,14 @@ require 'digest/md5'
 
 module Ebooks
   class Model
-    attr_accessor :hash, :sentences, :
+    attr_accessor :hash, :sentences, :markov, :keywords
 
     def self.consume(txtpath)
       Model.new.consume(txtpath)
     end
 
     def self.load(path)
-
-      Model.new.deserialize(data)
+      Marshal.load(File.read(path))
     end
 
     def consume(txtpath)
@@ -23,7 +22,7 @@ module Ebooks
       @hash = Digest::MD5.hexdigest(File.read(txtpath))
 
       text = File.read(txtpath)
-      log "Removing commented lines and
+      log "Removing commented lines and mention tokens"
 
       lines = text.split("\n")
       keeping = []
@@ -34,70 +33,43 @@ module Ebooks
       end
       text = NLP.normalize(keeping.join("\n"))
 
-      log "Segmenting text into sentences
-      @sentences = NLP.sentences(text).reject do |s|
-        s.length > 140 || s.count('"')%2 != 0
-      end
+      log "Segmenting text into sentences"
 
-
-      @tokenized = @sentences.map { |sent| NLP.tokenize(sent) }
-      @tokensets = @tokenized.map { |tokens| NLP.tokenset(tokens) }
+      sentences = NLP.sentences(text)
 
-      log "
-      @
+      log "Tokenizing #{sentences.length} sentences"
+      @sentences = sentences.map { |sent| NLP.tokenize(sent) }
 
-
-
+      log "Building markov model"
+      @markov = MarkovModel.build(@sentences)
 
-
-
-
-
-
-
-
+      log "Ranking keywords"
+      require 'benchmark'
+      puts Benchmark.measure {
+        @keywords = NLP.keywords(@sentences)
+        p @keywords.top(100)
+      }
+
+      self
     end
 
     def save(path)
-      data = self.serialize
       File.open(path, 'w') do |f|
-        f.write(Marshal.dump(
+        f.write(Marshal.dump(self))
       end
       self
     end
 
-    def deserialize(data)
-      @hash = data['hash']
-      @tokenized = data['tokenized']
-      @tokensets = data['tokensets']
-      @markov = MarkovModel.new.deserialize(data['markov'])
-      self
-    end
-
-    def replace_noun(sent)
-      tagged = NLP.tagger.add_tags(sent)
-
-      nouns = tagged.scan(/<nn>([^<]+)<\/nn>/).flatten
-      to_replace = nouns.reject { |n| ['much'].include?(n) }.sample
-      return sent if to_replace.nil?
-      replacement = NLP.nouns.sample
-      if to_replace.en.plural.length <= to_replace.length
-        replacement = replacement.en.plural(1)
-      end
-      sent = sent.gsub(/(?<=\W)#{to_replace}(?=\W)/, replacement)
-      sent.gsub(/(?<=\W)(a|an) #{replacement}(?=\W)/, replacement.en.a)
-    end
-
     def fix(tweet)
       # This seems to require an external api call
-      begin
-
-
-
-      rescue Exception => e
-
-
-      end
+      #begin
+      #  fixer = NLP.gingerice.parse(tweet)
+      #  log fixer if fixer['corrections']
+      #  tweet = fixer['result']
+      #rescue Exception => e
+      #  log e.message
+      #  log e.backtrace
+      #end
 
       NLP.htmlentities.decode tweet
     end
@@ -115,33 +87,44 @@ module Ebooks
       fix tweet
     end
 
-    #
-    #
-    def
-
-
+    # Finds all relevant tokenized sentences to given input by
+    # comparing non-stopword token overlaps
+    def relevant_sentences(input)
+      relevant = []
+      slightly_relevant = []
 
-
-        # Very uninteresting input; no relevant response possible
-        return markov_statement(limit)
-      end
+      tokenized = NLP.tokenize(input)
 
-
-
-
-
-
+      @sentences.each do |sent|
+        tokenized.each do |token|
+          if sent.include?(token)
+            relevant << sent unless NLP.stopword?(token)
+            slightly_relevant << sent
+          end
         end
       end
 
-
+      [relevant, slightly_relevant]
+    end
 
-
-
+    # Generates a response by looking for related sentences
+    # in the corpus and building a smaller markov model from these
+    def markov_response(input, limit=140)
+      # First try
+      relevant, slightly_relevant = relevant_sentences(input)
+
+      p relevant
+      p slightly_relevant.length
+
+      if relevant.length >= 3
+        markov = MarkovModel.new.consume(relevant)
+        markov_statement(limit, markov)
+      elsif slightly_relevant.length > 5
+        markov = MarkovModel.new.consume(slightly_relevant)
+        markov_statement(limit, markov)
+      else
+        markov_statement(limit)
       end
-
-      markov = MarkovModel.new.consume(relevant.sample(100))
-      markov_statement(limit, markov)
     end
   end
 end
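The end-to-end Model workflow implied by these changes, as a sketch; the corpus path, model path, and seed text are placeholders.

require 'twitter_ebooks'

model = Ebooks::Model.consume("corpus/example.txt")   # tokenize, build markov model, rank keywords
model.save("model/example.model")                     # Marshal.dump of the whole object

model = Ebooks::Model.load("model/example.model")
puts model.markov_statement(140)                           # unseeded statement
puts model.markov_response("cats and markov chains", 140)  # seeded by non-stopword token overlap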
data/lib/twitter_ebooks/nlp.rb
CHANGED
@@ -1,12 +1,16 @@
 # encoding: utf-8
-
-require '
-Linguistics.use(:en, classes: [String])
+require 'fast-stemmer'
+require 'highscore'
 
 module Ebooks
   module NLP
-    # We
-    #
+    # We deliberately limit our punctuation handling to stuff we can do consistently
+    # It'll just be a part of another token if we don't split it out, and that's fine
+    PUNCTUATION = ".?!,"
+
+    # Lazy-load NLP libraries and resources
+    # Some of this stuff is pretty heavy and we don't necessarily need
+    # to be using it all of the time
 
     def self.stopwords
       @stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
@@ -19,89 +23,102 @@ module Ebooks
     def self.adjectives
       @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
     end
-
-    def self.
-
-      # It's too slow for finding sentences in paragraphs, hence tactful
-      require 'tokenizer'
-      @tokenizer ||= Tokenizer::Tokenizer.new(:en)
-    end
-
-    def self.tactful
-      require 'tactful_tokenizer'
-      @tactful ||= TactfulTokenizer::Model.new
+
+    def self.wordfreq
+      @wordfreq ||= JSON.load(File.read(File.join(DATA_PATH, 'wordfreq.json')))
     end
 
+    # POS tagger
     def self.tagger
       require 'engtagger'
       @tagger ||= EngTagger.new
     end
 
-
-      require 'lingua/stemmer'
-      @stemmer ||= Lingua::Stemmer.new
-    end
-
+    # Gingerice text correction service
     def self.gingerice
       require 'gingerice'
       Gingerice::Parser.new # No caching for this one
     end
 
+    # For decoding html entities
     def self.htmlentities
       require 'htmlentities'
       @htmlentities ||= HTMLEntities.new
     end
 
-  ### Utility functions
+    ### Utility functions
 
-
-      tactful.tokenize_text(text)
-    end
-
+    # We don't really want to deal with all this weird unicode punctuation
     def self.normalize(text)
       htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
     end
 
+    # Split text into sentences
+    # We use ad hoc approach because fancy libraries do not deal
+    # especially well with tweet formatting, and we can fake solving
+    # the quote problem during generation
+    def self.sentences(text)
+      text.split(/\n+|(?<=[.?!])\s+/)
+    end
+
+    # Split a sentence into word-level tokens
+    # As above, this is ad hoc because tokenization libraries
+    # do not behave well wrt. things like emoticons and timestamps
     def self.tokenize(sentence)
-
-
-      # that are hard to correct.
-      sentence.split(/\s/).map do |token|
-        exceptions = [/^\w\)$/, /^@/, /^#/, /^:\w$/, /^:\w$/, /^http/]
-        if exceptions.find { |r| r.match(token) }
-          token
-        else
-          token.split(/(?<=^[#{PUNCTUATION}])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+$)/)
-        end
-      end.flatten
+      regex = /\s+|(?<=[#{PUNCTUATION}])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+)/
+      sentence.split(regex)
     end
 
-    def self.
+    def self.stem(word)
+      Stemmer::stem_word(word.downcase)
+    end
+
+    def self.keywords(sentences)
+      # Preprocess to remove stopwords (highscore's blacklist is v. slow)
+      text = sentences.flatten.reject { |t| stopword?(t) }.join(' ')
+
+      text = Highscore::Content.new(text)
+
+      text.configure do
+        #set :multiplier, 2
+        #set :upper_case, 3
+        #set :long_words, 2
+        #set :long_words_threshold, 15
+        #set :vowels, 1 # => default: 0 = not considered
+        #set :consonants, 5 # => default: 0 = not considered
+        #set :ignore_case, true # => default: false
+        set :word_pattern, /(?<!@)(?<=\s)[\w']+/ # => default: /\w+/
+        #set :stemming, true # => default: false
+      end
+
+      text.keywords
+    end
+
+    def self.stemset(sentence)
       tokens = sentence.is_a?(Array) ? sentence : tokenize(sentence)
       tokens.map(&:downcase)
             .reject { |token| stopwords.include?(token) }
+            .map { |t| stemmer.stem(t) }
             .to_set
     end
 
-
-
-
-
-
-
-
-        elsif p1 && !p2 # "foo. rah"
-          true
-        else # "foo rah"
-          true
+    # Builds a token stem frequency map
+    def self.stemfreq(sentences)
+      freqmap = {}
+      sentences.flatten.each do |token|
+        stem = NLP.stem(token)
+        freqmap[stem] ||= 0
+        freqmap[stem] += 1
       end
+      freqmap
     end
 
+    # Takes a list of tokens and builds a nice-looking sentence
     def self.reconstruct(tokens)
-      # Put tokens back together into a nice looking sentence
       text = ""
       last_token = nil
       tokens.each do |token|
+        next if token == INTERIM
         text += ' ' if last_token && space_between?(last_token, token)
         text += token
         last_token = token
@@ -109,17 +126,35 @@ module Ebooks
       text
     end
 
-    #
-
-
+    # Determine if we need to insert a space between two tokens
+    def self.space_between?(token1, token2)
+      p1 = self.punctuation?(token1)
+      p2 = self.punctuation?(token2)
+      if p1 && p2 # "foo?!"
+        false
+      elsif !p1 && p2 # "foo."
+        false
+      elsif p1 && !p2 # "foo. rah"
+        true
+      else # "foo rah"
+        true
+      end
+    end
 
     def self.punctuation?(token)
       (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
     end
 
+    def self.stopword?(token)
+      @stopword_set ||= stopwords.map(&:downcase).to_set
+      @stopword_set.include?(token.downcase)
+    end
+
+    # Determine if a sample of text contains unmatched brackets or quotes
+    # This is one of the more frequent and noticeable failure modes for
+    # the markov generator; we can just tell it to retry
     def self.unmatched_enclosers?(text)
-
-      enclosers = ['**', '""', '()', '[]', '``']
+      enclosers = ['**', '""', '()', '[]', '``', "''"]
       enclosers.each do |pair|
         starter = Regexp.new('(\W|^)' + Regexp.escape(pair[0]) + '\S')
         ender = Regexp.new('\S' + Regexp.escape(pair[1]) + '(\W|$)')
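A quick look at what the new ad hoc splitters do; the sample text is invented and the expected outputs follow from the regexes above.

require 'twitter_ebooks'

text = "This is great. Isn't it? @someone thinks so!"

p Ebooks::NLP.sentences(text)
# => ["This is great.", "Isn't it?", "@someone thinks so!"]

p Ebooks::NLP.tokenize("Isn't it?")
# => ["Isn't", "it", "?"]

p Ebooks::NLP.reconstruct(["Isn't", "it", "?"])
# => "Isn't it?"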
data/script/process_anc_data.rb
ADDED
@@ -0,0 +1,19 @@
+#!/usr/bin/env ruby
+# encoding: utf-8
+
+require 'json'
+
+freqmap = {}
+
+data = File.read("data/ANC-all-count.txt")
+data = data.unpack("C*").pack("U*")
+
+data.lines.each do |l|
+  vals = l.split("\t")
+
+  freqmap[vals[0]] = vals[-1].to_i
+end
+
+File.open("data/wordfreq.json", 'w') do |f|
+  f.write(JSON.dump(freqmap))
+end
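The script above converts the bundled ANC frequency list into data/wordfreq.json, which the new NLP.wordfreq helper lazy-loads. A rough sanity check after running it from the repository root might look like the snippet below; the exact keys and counts depend on the ANC data file.

require 'json'

freq = JSON.load(File.read("data/wordfreq.json"))
p freq.length       # number of distinct words in the map
p freq.values.max   # count of the most frequent word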
data/skeleton/Procfile
CHANGED
@@ -1 +1 @@
-worker: ruby
+worker: ruby run.rb start
data/skeleton/bots.rb
CHANGED
data/skeleton/corpus/README.md
CHANGED
@@ -1 +1 @@
-Put raw text files in here
+Put any raw text files in here to be processed.