moo_ebooks 1.0.0

@@ -0,0 +1,14 @@
+ # frozen_string_literal: true
+
+ module Ebooks
+   GEM_PATH = File.expand_path(File.join(File.dirname(__FILE__), '..'))
+   DATA_PATH = File.join(GEM_PATH, 'data')
+   SKELETON_PATH = File.join(GEM_PATH, 'skeleton')
+   TEST_PATH = File.join(GEM_PATH, 'test')
+   TEST_CORPUS_PATH = File.join(TEST_PATH, 'corpus/0xabad1dea.tweets')
+   INTERIM = :interim
+ end
+
+ require 'moo_ebooks/nlp'
+ require 'moo_ebooks/suffix'
+ require 'moo_ebooks/model'
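
This entry file wires the gem together: it defines path constants and the INTERIM marker, then requires the NLP, suffix, and model files. A minimal sketch of loading the gem and inspecting those constants (assuming the entry point is required as moo_ebooks):

    require 'moo_ebooks'

    puts Ebooks::GEM_PATH   # root of the installed gem
    puts Ebooks::DATA_PATH  # where NLP.stopwords expects data/stopwords.txt
    puts Ebooks::INTERIM    # => interim, a marker tiki that NLP.reconstruct skips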
@@ -0,0 +1,270 @@
+ # frozen_string_literal: true
+
+ require 'json'
+ require 'set'
+ require 'digest/md5'
+
+ module Ebooks
+   # Main class for Model management. Models are required for text generation.
+   #
+   # @notice Only the JSON format is supported for corpus files.
+   # @notice Corpus files are assumed to have a `statuses` key and a
+   #   `mentions` key, each holding an array of statuses.
+   #
+   # @notice Make sure NOT to include reblogs (retweets) in corpus data; they
+   #   will negatively affect text generation.
+   class Model
+     # @return [Array<String>]
+     #   An array of unique tokens. This is the main source of actual strings
+     #   in the model. Manipulation of a token is done using its index
+     #   in this array, which we call a "tiki"
+     attr_accessor :tokens
+
+     # @return [Array<Array<Integer>>]
+     #   Sentences represented by arrays of tikis
+     attr_accessor :sentences
+
+     # @return [Array<Array<Integer>>]
+     #   Sentences derived from Twitter mentions
+     attr_accessor :mentions
+
+     # @return [Array<String>]
+     #   The top 200 most important keywords, in descending order
+     attr_accessor :keywords
+     def initialize
+       @tokens = []
+       @sentences = []
+       @mentions = []
+       @keywords = []
+
+       # Reverse lookup tiki by token, for faster generation
+       @tikis = {}
+     end
+
+     # Load a saved model
+     # @param data [Hash]
+     # @return [Ebooks::Model]
+     def self.from_hash(data)
+       model = Model.new
+       model.tokens = data[:tokens]
+       model.sentences = data[:sentences]
+       model.mentions = data[:mentions]
+       model.keywords = data[:keywords]
+       model
+     end
+
+     # Load a saved model
+     # @param data [String]
+     # @return [Ebooks::Model]
+     def self.from_json(data)
+       from_hash(JSON.parse(data, symbolize_names: true))
+     end
+
+     # Turn this model into its JSON representation.
+     def to_json
+       to_hash.to_json
+     end
+
+     # Turn this model into its Hash representation
+     def to_hash
+       { tokens: @tokens, sentences: @sentences, mentions: @mentions,
+         keywords: @keywords }
+     end
+
+     # Consume a corpus and return a new model built from it
+     # @param content [Hash]
+     def consume(content)
+       model = Ebooks::Model.new
+       model.consume!(content)
+       model
+     end
+
+     # Consume a corpus into this model
+     # @param content [Hash]
+     def consume!(content)
+       unless content.key?(:statuses) || content.key?(:mentions)
+         raise ArgumentError, 'Malformed hash object. At least :statuses and/or'\
+                              ' :mentions must be present as a key'
+       end
+       consume_statuses(content[:statuses]) unless content[:statuses].nil?
+       consume_mentions(content[:mentions]) unless content[:mentions].nil?
+       nil
+     end
+
+     # Generate some text
+     # @param limit [Integer] available characters
+     # @param generator [SuffixGenerator, nil]
+     # @param retry_limit [Integer] how many times to retry on invalid status
+     # @return [String]
+     def update(limit = 140, generator = nil, retry_limit = 10)
+       tikis = gather_tikis(limit, generator, retry_limit)
+
+       status = NLP.reconstruct(tikis, @tokens)
+
+       fix status
+     end
+
+     # Generates a response by looking for related sentences
+     # in the corpus and building a smaller generator from these
+     # @param input [String]
+     # @param limit [Integer] characters available for response
+     # @param sentences [Array<Array<Integer>>]
+     # @return [String]
+     def reply(input, limit = 140, sentences = @mentions)
+       # Prefer mentions
+       relevant, slightly_relevant = find_relevant(sentences, input)
+
+       if relevant.length >= 3
+         generator = SuffixGenerator.build(relevant)
+         update(limit, generator)
+       elsif slightly_relevant.length >= 5
+         generator = SuffixGenerator.build(slightly_relevant)
+         update(limit, generator)
+       else
+         update(limit)
+       end
+     end
+
+     private
+
+     def gather_tikis(limit, generator, retry_limit)
+       responding = !generator.nil?
+       generator ||= SuffixGenerator.build(@sentences)
+
+       @retries = 0
+
+       tikis = make_bigram_tikis(limit, generator, retry_limit, responding)
+
+       if verbatim?(tikis) && tikis.length > 3
+         # We made a verbatim status by accident
+         tikis = make_unigram_tikis(limit, generator, retry_limit)
+       end
+       @retries = nil
+       tikis
+     end
+
+     def make_unigram_tikis(limit, generator, retry_limit)
+       while (tikis = generator.generate(3, :unigrams))
+         break if valid_status?(tikis, limit) && !verbatim?(tikis)
+
+         @retries += 1
+         break if retry_limit_reached?(retry_limit)
+       end
+       tikis
+     end
+
+     def make_bigram_tikis(limit, generator, retry_limit, responding)
+       while (tikis = generator.generate(3, :bigrams))
+         break if (tikis.length > 3 || responding) && valid_status?(tikis, limit)
+
+         @retries += 1
+         break if retry_limit_reached?(retry_limit)
+       end
+       tikis
+     end
+
+     def retry_limit_reached?(retry_limit)
+       @retries >= retry_limit
+     end
+
+     # Reverse lookup a token index from a token
+     # @param token [String]
+     # @return [Integer]
+     def tikify(token)
+       if @tikis.key?(token)
+         @tikis[token]
+       else
+         @tokens << token
+         @tikis[token] = @tokens.length - 1
+       end
+     end
+
+     # Convert a body of text into arrays of tikis
+     # @param text [String]
+     # @return [Array<Array<Integer>>]
+     def mass_tikify(text)
+       sentences = NLP.sentences(text)
+
+       sentences.map do |s|
+         tokens = NLP.tokenize(s).reject do |t|
+           # Don't include usernames/urls as tokens
+           t.include?('@') || t.include?('http')
+         end
+
+         tokens.map { |t| tikify(t) }
+       end
+     end
+
+     # Test if a sentence has been copied verbatim from original
+     # @param tikis [Array<Integer>]
+     # @return [Boolean]
+     def verbatim?(tikis)
+       @sentences.include?(tikis) || @mentions.include?(tikis)
+     end
+
+     # Check if an array of tikis comprises a valid status
+     # @param tikis [Array<Integer>]
+     # @param limit [Integer] how many chars we have left
+     def valid_status?(tikis, limit)
+       status = NLP.reconstruct(tikis, @tokens)
+       status.length <= limit && !NLP.unmatched_enclosers?(status)
+     end
+
+     # Consume a sequence of statuses (excluding mentions)
+     # @param statuses [Array<String>]
+     def consume_statuses(statuses)
+       statuses.map! do |status|
+         NLP.normalize(status)
+       end
+
+       text = statuses.join("\n").encode('UTF-8', invalid: :replace)
+       @sentences = mass_tikify(text)
+       @keywords = NLP.keywords(text).top(200).map(&:to_s)
+
+       nil
+     end
+
+     # Consume a sequence of mentions
+     # @param mentions [Array<String>]
+     def consume_mentions(mentions)
+       mentions.map! do |mention|
+         NLP.normalize(mention)
+       end
+
+       mention_text = mentions.join("\n").encode('UTF-8', invalid: :replace)
+       @mentions = mass_tikify(mention_text)
+
+       nil
+     end
+
+     # Correct encoding issues in generated text
+     # @param text [String]
+     # @return [String]
+     def fix(text)
+       NLP.htmlentities.decode text
+     end
+
+     # Finds tokenized sentences that are relevant (and slightly relevant)
+     # to the input, by comparing non-stopword token overlaps
+     # @param sentences [Array<Array<Integer>>]
+     # @param input [String]
+     # @return [Array<Array<Array<Integer>>, Array<Array<Integer>>>]
+     def find_relevant(sentences, input)
+       relevant = []
+       slightly_relevant = []
+
+       tokenized = NLP.tokenize(input).map(&:downcase)
+
+       sentences.each do |sent|
+         tokenized.each do |token|
+           if sent.map { |tiki| @tokens[tiki].downcase }.include?(token)
+             relevant << sent unless NLP.stopword?(token)
+             slightly_relevant << sent
+           end
+         end
+       end
+
+       [relevant, slightly_relevant]
+     end
+   end
+ end
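
The Model class above is the public surface for corpus ingestion and text generation. A rough usage sketch, assuming a corpus hash of plain status strings (the tiny corpus here is purely illustrative; SuffixGenerator comes from the suffix file, which is not shown in this diff):

    require 'moo_ebooks'

    corpus = {
      statuses: ['The cows were unusually talkative this morning.',
                 'Fresh hay has arrived, and morale is high.'],
      mentions: ['@farmhand thanks for the extra hay!']
    }

    model = Ebooks::Model.new
    model.consume!(corpus)                  # raises ArgumentError without :statuses or :mentions

    puts model.update(140)                  # a generated status, aiming for the 140-character limit
    puts model.reply('Any hay left?', 140)  # biased toward sentences sharing non-stopword tokens

    # Models round-trip through JSON via to_json / from_json
    File.write('model.json', model.to_json)
    model = Ebooks::Model.from_json(File.read('model.json'))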
@@ -0,0 +1,170 @@
+
+ # frozen_string_literal: true
+
+ require 'highscore'
+ require 'htmlentities'
+
+ module Ebooks
+   # @private
+   module NLP
+     # We deliberately limit our punctuation handling to stuff we can do
+     # consistently.
+     # It'll just be a part of another token if we don't split it out, and
+     # that's fine.
+     PUNCTUATION = '.?!,'
+
+     # Lazy-load NLP libraries and resources
+     # Some of this stuff is pretty heavy and we don't necessarily need
+     # to be using it all of the time
+
+     # Lazily loads an array of stopwords
+     # Stopwords are common words that should often be ignored
+     # @return [Array<String>]
+     def self.stopwords
+       @stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
+     end
+
+     # Lazily load HTML entity decoder
+     # @return [HTMLEntities]
+     def self.htmlentities
+       @htmlentities ||= HTMLEntities.new
+     end
+
+     ### Utility functions
+
+     # Normalize some strange unicode punctuation variants
+     # @param text [String]
+     # @return [String]
+     def self.normalize(text)
+       htmlentities.decode(text.tr('“', '"').tr('”', '"').tr('’', "'")
+                               .gsub('…', '...'))
+     end
+
+     # Split text into sentences
+     # We use an ad hoc approach because fancy libraries do not deal
+     # especially well with tweet formatting, and we can fake solving
+     # the quote problem during generation
+     # @param text [String]
+     # @return [Array<String>]
+     def self.sentences(text)
+       text.split(/\n+|(?<=[.?!])\s+/)
+     end
+
+     # Split a sentence into word-level tokens
+     # As above, this is ad hoc because tokenization libraries
+     # do not behave well wrt. things like emoticons and timestamps
+     # @param sentence [String]
+     # @return [Array<String>]
+     def self.tokenize(sentence)
+       regex = /\s+|(?<=[#{PUNCTUATION}]\s)(?=[a-zA-Z])|
+                (?<=[a-zA-Z])(?=[#{PUNCTUATION}]+\s)/x
+       sentence.split(regex)
+     end
+
+     # Use highscore gem to find interesting keywords in a corpus
+     # @param text [String]
+     # @return [Highscore::Keywords]
+     def self.keywords(text)
+       # Preprocess to remove stopwords (highscore's blacklist is v. slow)
+       text = NLP.tokenize(text).reject { |t| stopword?(t) }.join(' ')
+
+       text = Highscore::Content.new(text)
+
+       text.configure do
+         # set :multiplier, 2
+         # set :upper_case, 3
+         # set :long_words, 2
+         # set :long_words_threshold, 15
+         # set :vowels, 1 # => default: 0 = not considered
+         # set :consonants, 5 # => default: 0 = not considered
+         # set :ignore_case, true # => default: false
+         set :word_pattern, /(?<!@)(?<=\s)[\p{Word}']+/ # => default: /\w+/
+         # set :stemming, true # => default: false
+       end
+
+       text.keywords
+     end
+
+     # Builds a proper sentence from a list of tikis
+     # @param tikis [Array<Integer>]
+     # @param tokens [Array<String>]
+     # @return [String]
+     def self.reconstruct(tikis, tokens)
+       text = ''
+       last_token = nil
+       tikis.each do |tiki|
+         next if tiki == INTERIM
+         token = tokens[tiki]
+         text += ' ' if last_token && space_between?(last_token, token)
+         text += token
+         last_token = token
+       end
+       text
+     end
+
+     # Determine if we need to insert a space between two tokens
+     # @param token1 [String]
+     # @param token2 [String]
+     # @return [Boolean]
+     def self.space_between?(token1, token2)
+       p1 = punctuation?(token1)
+       p2 = punctuation?(token2)
+       if (p1 && p2) || (!p1 && p2) # "foo?!" || "foo."
+         false
+       else # "foo rah" || "foo. rah"
+         true
+       end
+     end
+
+     # Is this token composed entirely of punctuation?
+     # @param token [String]
+     # @return [Boolean]
+     def self.punctuation?(token)
+       (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
+     end
+
+     # Is this token a stopword?
+     # @param token [String]
+     # @return [Boolean]
+     def self.stopword?(token)
+       @stopword_set ||= stopwords.map(&:downcase).to_set
+       @stopword_set.include?(token.downcase)
+     end
+
+     # Determine if a sample of text contains unmatched brackets or quotes
+     # This is one of the more frequent and noticeable failure modes for
+     # the generator; we can just tell it to retry
+     # @param text [String]
+     # @return [Boolean]
+     def self.unmatched_enclosers?(text)
+       enclosers = ['**', '""', '()', '[]', '``', "''"]
+       enclosers.each do |pair|
+         starter = Regexp.new('(\W|^)' + Regexp.escape(pair[0]) + '\S')
+         ender = Regexp.new('\S' + Regexp.escape(pair[1]) + '(\W|$)')
+
+         opened = 0
+
+         tokenize(text).each do |token|
+           opened += 1 if token.match(starter)
+           opened -= 1 if token.match(ender)
+
+           return true if opened.negative? # Too many ends!
+         end
+
+         return true if opened != 0 # Mismatch somewhere.
+       end
+
+       false
+     end
+
+     # Determine if ary2 appears as a contiguous subsequence of ary1
+     # @param ary1 [Array]
+     # @param ary2 [Array]
+     # @return [Boolean]
+     def self.subseq?(ary1, ary2)
+       !ary1.each_index.find do |i|
+         ary1[i...i + ary2.length] == ary2
+       end.nil?
+     end
+   end
+ end
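
The NLP helpers are stateless module functions apart from the lazily memoized stopword list and HTML entity decoder. A small sketch of what the text pipeline does to a raw status (the input string is chosen for illustration; the exact splits follow the regexes above):

    require 'moo_ebooks'

    raw  = '“Moo…” said the cow. It was loud!'
    text = Ebooks::NLP.normalize(raw)        # curly quotes and ellipsis become ASCII
    sentences = Ebooks::NLP.sentences(text)  # => ['"Moo..." said the cow.', 'It was loud!']
    tokens    = Ebooks::NLP.tokenize(sentences.first)

    Ebooks::NLP.punctuation?('?!')                            # => true
    Ebooks::NLP.unmatched_enclosers?('An (unclosed bracket')  # => true, so the generator retries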