twitter_ebooks 2.0.3 → 2.0.4
- data/Gemfile.lock +12 -12
- data/bin/ebooks +21 -6
- data/data/ANC-all-count.txt +297241 -0
- data/data/stopwords.txt +204 -0
- data/data/wordfreq.json +1 -0
- data/lib/twitter_ebooks/bot.rb +25 -7
- data/lib/twitter_ebooks/markov.rb +55 -63
- data/lib/twitter_ebooks/model.rb +57 -74
- data/lib/twitter_ebooks/nlp.rb +90 -55
- data/lib/twitter_ebooks/version.rb +1 -1
- data/script/process_anc_data.rb +19 -0
- data/skeleton/Procfile +1 -1
- data/skeleton/bots.rb +0 -6
- data/skeleton/corpus/README.md +1 -1
- data/skeleton/run.rb +9 -0
- data/test/keywords.rb +18 -0
- data/twitter_ebooks.gemspec +3 -5
- metadata +13 -40
- data/skeleton/model/README.md +0 -1
data/lib/twitter_ebooks/bot.rb
CHANGED

@@ -15,6 +15,10 @@ module Ebooks
     @@all = [] # List of all defined bots
     def self.all; @@all; end

+    def self.get(name)
+      all.find { |bot| bot.username == name }
+    end
+
     def initialize(username, &b)
       # Set defaults
       @username = username
@@ -30,8 +34,7 @@ module Ebooks
       STDERR.flush
     end

-
-    def start
+    def configure
       TweetStream.configure do |config|
         config.consumer_key = @consumer_key
         config.consumer_secret = @consumer_secret
@@ -48,6 +51,13 @@ module Ebooks

       @twitter = Twitter::Client.new
       @stream = TweetStream::Client.new
+    end
+
+    # Connects to tweetstream and opens event handlers for this bot
+    def start
+      configure
+
+      @on_startup.call if @on_startup

       @stream.on_error do |msg|
         log "ERROR: #{msg}"
@@ -77,13 +87,20 @@ module Ebooks
       mentions = ev.attrs[:entities][:user_mentions].map { |x| x[:screen_name] }

       reply_mentions = mentions.reject { |m| m.downcase == @username.downcase }
-      reply_mentions …
+      reply_mentions = [ev[:user][:screen_name]] + reply_mentions

       meta[:reply_prefix] = reply_mentions.uniq.map { |m| '@'+m }.join(' ') + ' '
+      meta[:limit] = 140 - meta[:reply_prefix].length

       mless = ev[:text]
-      … (truncated in this view)
+      begin
+        ev.attrs[:entities][:user_mentions].reverse.each do |entity|
+          mless = mless[0...entity[:indices][0]] + mless[entity[:indices][1]+1..-1]
+        end
+      rescue Exception
+        p ev.attrs[:entities][:user_mentions]
+        p ev[:text]
+        raise
       end
       meta[:mentionless] = mless

@@ -92,7 +109,7 @@ module Ebooks
       # - The tweet is not being retweeted by somebody else
       # - Or soft-retweeted by somebody else
       if mentions.map(&:downcase).include?(@username.downcase) && !ev[:retweeted_status] && !ev[:text].start_with?('RT ')
-        log "Mention from …
+        log "Mention from @#{ev[:user][:screen_name]}: #{ev[:text]}"
         @on_mention.call(ev, meta)
       else
         @on_timeline.call(ev, meta)
@@ -117,7 +134,7 @@ module Ebooks
         log "Sending DM to @#{ev[:sender][:screen_name]}: #{text}"
         @twitter.direct_message_create(ev[:sender][:screen_name], text, opts)
       elsif ev.is_a? Twitter::Tweet
-        log "Replying to @#{ev[:user][:screen_name]}: #{text}"
+        log "Replying to @#{ev[:user][:screen_name]} with: #{text}"
         @twitter.update(text, in_reply_to_status_id: ev[:id])
       else
         raise Exception("Don't know how to reply to a #{ev.class}")
@@ -138,6 +155,7 @@ module Ebooks
       @twitter.update(*args)
     end

+    def on_startup(&b); @on_startup = b; end
     def on_follow(&b); @on_follow = b; end
     def on_mention(&b); @on_mention = b; end
     def on_timeline(&b); @on_timeline = b; end
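Taken together, these changes split credential setup (`configure`) out of `start`, add an `on_startup` hook that fires once the stream is configured, and add `Bot.get` to look a bot up by username. A minimal sketch of a bot definition against the new hooks; the username, keys, and reply text are placeholders, not part of this diff:

    # Hypothetical bot definition using the 2.0.4 lifecycle
    Ebooks::Bot.new("example_ebooks") do |bot|
      bot.consumer_key = "..."     # placeholder credentials
      bot.consumer_secret = "..."

      bot.on_startup do
        bot.log "Stream configured, handlers attached"
      end

      bot.on_mention do |tweet, meta|
        # meta[:limit] now reflects 140 minus the @-mention prefix
        bot.reply(tweet, meta[:reply_prefix] + "hello!")
      end
    end

    Ebooks::Bot.get("example_ebooks") # => the bot defined above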
data/lib/twitter_ebooks/markov.rb
CHANGED

@@ -1,69 +1,73 @@
 module Ebooks
+  # Special INTERIM token represents sentence boundaries
+  # This is so we can include start and end of statements in model
+  # Due to the way the sentence tokenizer works, can correspond
+  # to multiple actual parts of text (such as ^, $, \n and .?!)
+  INTERIM = :interim
+
+  # This is an ngram-based Markov model optimized to build from a
+  # tokenized sentence list without requiring too much transformation
   class MarkovModel
-
-
-    attr_accessor :tokens
-    attr_reader :depth
-
-    def represent(token)
-      if token.nil? || token == "\n" || token.empty?
-        INTERIM
-      else
-        token
-      end
+    def self.build(sentences)
+      MarkovModel.new.consume(sentences)
     end

-    def consume( …
-      … (old implementation truncated in this view)
+    def consume(sentences)
+      # These models are of the form ngram => [[sentence_pos, token_pos] || INTERIM, ...]
+      # We map by both bigrams and unigrams so we can fall back to the latter in
+      # cases where an input bigram is unavailable, such as starting a sentence
+      @sentences = sentences
+      @unigrams = {}
+      @bigrams = {}
+
+      sentences.each_with_index do |tokens, i|
+        last_token = INTERIM
+        tokens.each_with_index do |token, j|
+          @unigrams[last_token] ||= []
+          @unigrams[last_token] << [i, j]
+
+          @bigrams[last_token] ||= {}
+          @bigrams[last_token][token] ||= []
+
+          if j == tokens.length-1 # Mark sentence endings
+            @unigrams[token] ||= []
+            @unigrams[token] << INTERIM
+            @bigrams[last_token][token] << INTERIM
+          else
+            @bigrams[last_token][token] << [i, j+1]
           end
-          prev_tokens << prev
-        end
-
-          break if j > prev_tokens.length
-          ngram = prev_tokens.last(j)
-
-          unless ngram == INTERIM && prev_tokens[-1] == INTERIM
-            @model[ngram] ||= []
-            @model[ngram] << represent(token)
-          end
+          last_token = token
         end
       end

       self
     end

+    def find_token(index)
+      if index == INTERIM
+        INTERIM
+      else
+        @sentences[index[0]][index[1]]
+      end
+    end
+
     def chain(tokens)
-      … (old lookup loop truncated in this view)
-      matches = @ …
-      if matches
-        #p tokens.last(i)
-        #puts "=> #{matches.inspect}"
-        next_token = matches.sample
-        break
-      end
+      if tokens.length == 1
+        matches = @unigrams[tokens[0]]
+      else
+        matches = @bigrams[tokens[-2]][tokens[-1]]
       end

-
+      if matches.empty?
+        # This should never happen unless a strange token is
+        # supplied from outside the dataset
+        raise ArgumentError, "Unable to continue chain for: #{tokens.inspect}"
+      end

-
+      next_token = find_token(matches.sample)
+
+      if next_token == INTERIM # We chose to end the sentence
         return tokens
       else
         return chain(tokens + [next_token])
@@ -71,19 +75,7 @@ module Ebooks
     end

     def generate
-      …
-      NLP.reconstruct(tokens)
-    end
-
-    def serialize
-      { 'model' => @model,
-        'depth' => @depth }
-    end
-
-    def deserialize(data)
-      @model = data['model']
-      @depth = data['depth']
-      self
+      NLP.reconstruct(chain([INTERIM]))
     end
   end
 end
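The rewrite stores positions instead of copying tokens: `@unigrams` and `@bigrams` map a preceding token (or pair) to `[sentence, position]` pointers into `@sentences`, and `chain` walks those pointers until it samples an `INTERIM` boundary. A sketch of driving the new API directly; the two-sentence corpus is invented for illustration:

    # Illustrative only; corpus is made up
    sentences = [
      %w[the cat sat .],
      %w[the dog sat down .]
    ]

    model = Ebooks::MarkovModel.build(sentences)

    model.generate
    # => "the cat sat." or "the dog sat down."
    # (with two sentences every bigram context is unique, so only the
    #  originals are reachable; a larger corpus lets walks splice)

    model.chain([Ebooks::INTERIM])
    # => e.g. [:interim, "the", "dog", "sat", "down", "."]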
data/lib/twitter_ebooks/model.rb
CHANGED

@@ -7,15 +7,14 @@ require 'digest/md5'

 module Ebooks
   class Model
-    attr_accessor :hash, :sentences, :…
+    attr_accessor :hash, :sentences, :markov, :keywords

     def self.consume(txtpath)
       Model.new.consume(txtpath)
     end

     def self.load(path)
-      …
-      Model.new.deserialize(data)
+      Marshal.load(File.read(path))
     end

     def consume(txtpath)
@@ -23,7 +22,7 @@ module Ebooks
       @hash = Digest::MD5.hexdigest(File.read(txtpath))

       text = File.read(txtpath)
-      log "Removing commented lines and …
+      log "Removing commented lines and mention tokens"

       lines = text.split("\n")
       keeping = []
@@ -34,70 +33,43 @@ module Ebooks
       end
       text = NLP.normalize(keeping.join("\n"))

-      log "Segmenting text into sentences …
-      @sentences = NLP.sentences(text).reject do |s|
-        s.length > 140 || s.count('"')%2 != 0
-      end
+      log "Segmenting text into sentences"

-
-      @tokenized = @sentences.map { |sent| NLP.tokenize(sent) }
-      @tokensets = @tokenized.map { |tokens| NLP.tokenset(tokens) }
+      sentences = NLP.sentences(text)

-      log " …
-      @ …
+      log "Tokenizing #{sentences.length} sentences"
+      @sentences = sentences.map { |sent| NLP.tokenize(sent) }

-
-
+      log "Building markov model"
+      @markov = MarkovModel.build(@sentences)

-      … (truncated in this view)
+      log "Ranking keywords"
+      require 'benchmark'
+      puts Benchmark.measure {
+        @keywords = NLP.keywords(@sentences)
+        p @keywords.top(100)
+      }
+
+      self
     end

     def save(path)
-      data = self.serialize
       File.open(path, 'w') do |f|
-        f.write(Marshal.dump( …
+        f.write(Marshal.dump(self))
       end
       self
     end

-    def deserialize(data)
-      @hash = data['hash']
-      @tokenized = data['tokenized']
-      @tokensets = data['tokensets']
-      @markov = MarkovModel.new.deserialize(data['markov'])
-      self
-    end
-
-    def replace_noun(sent)
-      tagged = NLP.tagger.add_tags(sent)
-
-      nouns = tagged.scan(/<nn>([^<]+)<\/nn>/).flatten
-      to_replace = nouns.reject { |n| ['much'].include?(n) }.sample
-      return sent if to_replace.nil?
-      replacement = NLP.nouns.sample
-      if to_replace.en.plural.length <= to_replace.length
-        replacement = replacement.en.plural(1)
-      end
-      sent = sent.gsub(/(?<=\W)#{to_replace}(?=\W)/, replacement)
-      sent.gsub(/(?<=\W)(a|an) #{replacement}(?=\W)/, replacement.en.a)
-    end
-
     def fix(tweet)
       # This seems to require an external api call
-      begin
-        fixer = NLP.gingerice.parse(tweet)
-        log fixer if fixer['corrections']
-        tweet = fixer['result']
-      rescue Exception => e
-        log e.message
-        log e.backtrace
-      end
+      #begin
+      #  fixer = NLP.gingerice.parse(tweet)
+      #  log fixer if fixer['corrections']
+      #  tweet = fixer['result']
+      #rescue Exception => e
+      #  log e.message
+      #  log e.backtrace
+      #end

       NLP.htmlentities.decode tweet
     end
@@ -115,33 +87,44 @@ module Ebooks
       fix tweet
     end

-    # …
-    # …
-    def …
-
+    # Finds all relevant tokenized sentences to given input by
+    # comparing non-stopword token overlaps
+    def relevant_sentences(input)
+      relevant = []
+      slightly_relevant = []

-
-        # Very uninteresting input; no relevant response possible
-        return markov_statement(limit)
-      end
+      tokenized = NLP.tokenize(input)

-      … (truncated in this view)
+      @sentences.each do |sent|
+        tokenized.each do |token|
+          if sent.include?(token)
+            relevant << sent unless NLP.stopword?(token)
+            slightly_relevant << sent
+          end
         end
       end

-
+      [relevant, slightly_relevant]
+    end

-
-
+    # Generates a response by looking for related sentences
+    # in the corpus and building a smaller markov model from these
+    def markov_response(input, limit=140)
+      # First try
+      relevant, slightly_relevant = relevant_sentences(input)
+
+      p relevant
+      p slightly_relevant.length
+
+      if relevant.length >= 3
+        markov = MarkovModel.new.consume(relevant)
+        markov_statement(limit, markov)
+      elsif slightly_relevant.length > 5
+        markov = MarkovModel.new.consume(slightly_relevant)
+        markov_statement(limit, markov)
+      else
+        markov_statement(limit)
       end
-
-      markov = MarkovModel.new.consume(relevant.sample(100))
-      markov_statement(limit, markov)
     end
   end
 end
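With `serialize`/`deserialize` gone, persistence is now a plain Marshal round-trip of the whole Model object, and replies come from `markov_response`, which falls back from strongly relevant sentences to slightly relevant ones to the full model. A sketch of the end-to-end flow; the file paths and input string are placeholders:

    # Hypothetical usage; paths are placeholders
    model = Ebooks::Model.consume("corpus/example.txt")
    model.save("model/example.model")

    model = Ebooks::Model.load("model/example.model")
    model.markov_response("tell me about cats", 140)
    # => tweet-length text built from sentences sharing non-stopword
    #    tokens with the input, or from the full model as a fallback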
data/lib/twitter_ebooks/nlp.rb
CHANGED

@@ -1,12 +1,16 @@
 # encoding: utf-8
-
-require ' …
-Linguistics.use(:en, classes: [String])
+require 'fast-stemmer'
+require 'highscore'

 module Ebooks
   module NLP
-    # We …
-    # …
+    # We deliberately limit our punctuation handling to stuff we can do consistently
+    # It'll just be a part of another token if we don't split it out, and that's fine
+    PUNCTUATION = ".?!,"
+
+    # Lazy-load NLP libraries and resources
+    # Some of this stuff is pretty heavy and we don't necessarily need
+    # to be using it all of the time

     def self.stopwords
       @stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
@@ -19,89 +23,102 @@ module Ebooks
     def self.adjectives
       @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
     end
-
-    def self.…
-      …
-      # It's too slow for finding sentences in paragraphs, hence tactful
-      require 'tokenizer'
-      @tokenizer ||= Tokenizer::Tokenizer.new(:en)
-    end
-
-    def self.tactful
-      require 'tactful_tokenizer'
-      @tactful ||= TactfulTokenizer::Model.new
+
+    def self.wordfreq
+      @wordfreq ||= JSON.load(File.read(File.join(DATA_PATH, 'wordfreq.json')))
     end

+    # POS tagger
     def self.tagger
       require 'engtagger'
       @tagger ||= EngTagger.new
     end

-    def self.…
-      require 'lingua/stemmer'
-      @stemmer ||= Lingua::Stemmer.new
-    end
-
+    # Gingerice text correction service
     def self.gingerice
       require 'gingerice'
       Gingerice::Parser.new # No caching for this one
     end

+    # For decoding html entities
     def self.htmlentities
       require 'htmlentities'
       @htmlentities ||= HTMLEntities.new
     end

-    ### Utility functions
+    ### Utility functions

-    def self.…
-      tactful.tokenize_text(text)
-    end
-
+    # We don't really want to deal with all this weird unicode punctuation
     def self.normalize(text)
       htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
     end

+    # Split text into sentences
+    # We use ad hoc approach because fancy libraries do not deal
+    # especially well with tweet formatting, and we can fake solving
+    # the quote problem during generation
+    def self.sentences(text)
+      text.split(/\n+|(?<=[.?!])\s+/)
+    end
+
+    # Split a sentence into word-level tokens
+    # As above, this is ad hoc because tokenization libraries
+    # do not behave well wrt. things like emoticons and timestamps
     def self.tokenize(sentence)
-      …
-      # that are hard to correct.
-      sentence.split(/\s/).map do |token|
-        exceptions = [/^\w\)$/, /^@/, /^#/, /^:\w$/, /^:\w$/, /^http/]
-        if exceptions.find { |r| r.match(token) }
-          token
-        else
-          token.split(/(?<=^[#{PUNCTUATION}])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+$)/)
-        end
-      end.flatten
+      regex = /\s+|(?<=[#{PUNCTUATION}])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+)/
+      sentence.split(regex)
     end

-    def self.…
+    def self.stem(word)
+      Stemmer::stem_word(word.downcase)
+    end
+
+    def self.keywords(sentences)
+      # Preprocess to remove stopwords (highscore's blacklist is v. slow)
+      text = sentences.flatten.reject { |t| stopword?(t) }.join(' ')
+
+      text = Highscore::Content.new(text)
+
+      text.configure do
+        #set :multiplier, 2
+        #set :upper_case, 3
+        #set :long_words, 2
+        #set :long_words_threshold, 15
+        #set :vowels, 1 # => default: 0 = not considered
+        #set :consonants, 5 # => default: 0 = not considered
+        #set :ignore_case, true # => default: false
+        set :word_pattern, /(?<!@)(?<=\s)[\w']+/ # => default: /\w+/
+        #set :stemming, true # => default: false
+      end
+
+      text.keywords
+    end
+
+    def self.stemset(sentence)
       tokens = sentence.is_a?(Array) ? sentence : tokenize(sentence)
       tokens.map(&:downcase)
            .reject { |token| stopwords.include?(token) }
+           .map { |t| stemmer.stem(t) }
            .to_set
     end

-    def self.space_between?(token1, token2)
-      p1 = self.punctuation?(token1)
-      p2 = self.punctuation?(token2)
-      if p1 && p2 # "foo?!"
-        false
-      elsif !p1 && p2 # "foo."
-        false
-      elsif p1 && !p2 # "foo. rah"
-        true
-      else # "foo rah"
-        true
+    # Builds a token stem frequency map
+    def self.stemfreq(sentences)
+      freqmap = {}
+      sentences.flatten.each do |token|
+        stem = NLP.stem(token)
+        freqmap[stem] ||= 0
+        freqmap[stem] += 1
       end
+      freqmap
     end

+    # Takes a list of tokens and builds a nice-looking sentence
     def self.reconstruct(tokens)
-      # Put tokens back together into a nice looking sentence
       text = ""
       last_token = nil
       tokens.each do |token|
+        next if token == INTERIM
         text += ' ' if last_token && space_between?(last_token, token)
         text += token
         last_token = token
@@ -109,17 +126,35 @@ module Ebooks
       text
     end

-    # …
-    …
+    # Determine if we need to insert a space between two tokens
+    def self.space_between?(token1, token2)
+      p1 = self.punctuation?(token1)
+      p2 = self.punctuation?(token2)
+      if p1 && p2 # "foo?!"
+        false
+      elsif !p1 && p2 # "foo."
+        false
+      elsif p1 && !p2 # "foo. rah"
+        true
+      else # "foo rah"
+        true
+      end
+    end

     def self.punctuation?(token)
       (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
     end

+    def self.stopword?(token)
+      @stopword_set ||= stopwords.map(&:downcase).to_set
+      @stopword_set.include?(token.downcase)
+    end
+
+    # Determine if a sample of text contains unmatched brackets or quotes
+    # This is one of the more frequent and noticeable failure modes for
+    # the markov generator; we can just tell it to retry
     def self.unmatched_enclosers?(text)
-      …
-      enclosers = ['**', '""', '()', '[]', '``']
+      enclosers = ['**', '""', '()', '[]', '``', "''"]
       enclosers.each do |pair|
         starter = Regexp.new('(\W|^)' + Regexp.escape(pair[0]) + '\S')
         ender = Regexp.new('\S' + Regexp.escape(pair[1]) + '(\W|$)')
data/script/process_anc_data.rb
ADDED

@@ -0,0 +1,19 @@
+#!/usr/bin/env ruby
+# encoding: utf-8
+
+require 'json'
+
+freqmap = {}
+
+data = File.read("data/ANC-all-count.txt")
+data = data.unpack("C*").pack("U*")
+
+data.lines.each do |l|
+  vals = l.split("\t")
+
+  freqmap[vals[0]] = vals[-1].to_i
+end
+
+File.open("data/wordfreq.json", 'w') do |f|
+  f.write(JSON.dump(freqmap))
+end
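This script is where the new data files come from: it transcodes the tab-separated ANC counts byte-by-byte to UTF-8 (the `unpack("C*").pack("U*")` step) and emits the `wordfreq.json` that `NLP.wordfreq` lazy-loads. A sketch of consuming the result; the example words are illustrative:

    # Hypothetical lookups against the generated wordfreq.json
    freq = Ebooks::NLP.wordfreq
    freq["the"]     # => a large count for a very common word
    freq["zyzzyva"] # => nil when the word is absent from the ANC data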
data/skeleton/Procfile
CHANGED

@@ -1 +1 @@
-worker: ruby …
+worker: ruby run.rb start
data/skeleton/bots.rb
CHANGED
data/skeleton/corpus/README.md
CHANGED

@@ -1 +1 @@
-Put raw text files in here
+Put any raw text files in here to be processed.