nddrylliog_pismo 0.7.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. data/.document +5 -0
  2. data/.gitignore +29 -0
  3. data/Gemfile +4 -0
  4. data/LICENSE +23 -0
  5. data/NOTICE +4 -0
  6. data/README.markdown +131 -0
  7. data/Rakefile +72 -0
  8. data/bin/pismo +45 -0
  9. data/lib/pismo.rb +82 -0
  10. data/lib/pismo/document.rb +67 -0
  11. data/lib/pismo/external_attributes.rb +14 -0
  12. data/lib/pismo/internal_attributes.rb +316 -0
  13. data/lib/pismo/reader.rb +19 -0
  14. data/lib/pismo/reader/base.rb +259 -0
  15. data/lib/pismo/reader/cluster.rb +171 -0
  16. data/lib/pismo/reader/tree.rb +154 -0
  17. data/lib/pismo/stopwords.txt +1002 -0
  18. data/lib/pismo/version.rb +3 -0
  19. data/pismo.gemspec +30 -0
  20. data/test/corpus/bbcnews.html +2131 -0
  21. data/test/corpus/bbcnews2.html +1575 -0
  22. data/test/corpus/briancray.html +269 -0
  23. data/test/corpus/cant_read.html +426 -0
  24. data/test/corpus/factor.html +1362 -0
  25. data/test/corpus/gmane.html +138 -0
  26. data/test/corpus/huffington.html +2932 -0
  27. data/test/corpus/metadata_expected.yaml +72 -0
  28. data/test/corpus/metadata_expected.yaml.old +122 -0
  29. data/test/corpus/queness.html +919 -0
  30. data/test/corpus/reader_expected.yaml +39 -0
  31. data/test/corpus/readers/cluster_expected.yaml +45 -0
  32. data/test/corpus/readers/tree_expected.yaml +55 -0
  33. data/test/corpus/rubyinside.html +318 -0
  34. data/test/corpus/rww.html +1351 -0
  35. data/test/corpus/spolsky.html +298 -0
  36. data/test/corpus/techcrunch.html +1285 -0
  37. data/test/corpus/tweet.html +360 -0
  38. data/test/corpus/youtube.html +2348 -0
  39. data/test/corpus/zefrank.html +535 -0
  40. data/test/helper.rb +15 -0
  41. data/test/test_corpus.rb +54 -0
  42. data/test/test_pismo_document.rb +34 -0
  43. metadata +156 -0
@@ -0,0 +1,171 @@
1
+ # encoding: utf-8
2
+
3
+ module Pismo
4
+ module Reader
5
+ class Cluster < Base
6
+
7
+ # Adapted from : http://rubyforge.org/projects/extractcontent/
8
+ #
9
+ # Portions of this code are :
10
+ # Copyright (c) 2007/2008 Nakatani Shuyo / Cybozu Labs Inc. All rights reserved.
11
+ #
12
+ # Permission is hereby granted, free of charge, to any person obtaining
13
+ # a copy of this software and associated documentation files (the
14
+ # "Software"), to deal in the Software without restriction, including
15
+ # without limitation the rights to use, copy, modify, merge, publish,
16
+ # distribute, sublicense, and/or sell copies of the Software, and to
17
+ # permit persons to whom the Software is furnished to do so, subject to
18
+ # the following conditions:
19
+ #
20
+ # The above copyright notice and this permission notice shall be
21
+ # included in all copies or substantial portions of the Software.
22
+ #
23
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
27
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
28
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
29
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
30
+
31
# Default tuning parameters for the clustering heuristic.
# #analyze overlays @options on top of these values, so every key can be
# overridden per reader instance.
DEFAULTS = {
  # Score a block (or cluster of blocks) must exceed to count as content
  :threshold          => 100,
  # Blocks whose cleaned text is shorter than this are not evaluated
  :min_length         => 80,
  # Multiplier applied to the position factor after each evaluated block
  :decay_factor       => 0.73,
  # Continuity factor for block score (the larger, the harder to continue)
  :continuous_factor  => 1.62,
  # Multiplier applied once per waste expression found in a block
  :no_body_factor     => 0.72,
  # Score added per punctuation match
  :punctuation_weight => 10,
  # What counts as punctuation when scoring
  :punctuations       => /(\.[^A-Za-z0-9]|,[^0-9]|!|\?)/,
  # Keywords characteristic of footers and similar non-content blocks
  :waste_expressions  => /Copyright|All Rights Reserved/i,
  # When true, block information is printed to stdout
  :debug              => false
}
43
+
44
# Analyze the structure of the HTML document and score content blocks for
# likelihood of containing useful content.
#
# Reads @raw_content (the HTML string) and @options (overrides for DEFAULTS).
# Fills @sections with { :body => html, :score => number } hashes and
# @content_candidates with one Nokogiri document per section, ordered from
# highest to lowest score.
def analyze
  # Non-mutating merge, so DEFAULTS itself is never modified
  opt = DEFAULTS.merge(@options)

  @sections = []
  factor = continuous = 1.0
  body = ''
  score = 0

  # The content is split into blocks at every opening or closing div tag
  @raw_content.split(/<\/?(?:div)[^>]*>/).each do |block|
    block.gsub!(/\n/, '')

    # Ignore blocks that have no text
    next if has_only_tags?(block)

    # Each new block iterated over makes it less likely for it to belong
    # to the existing cluster
    continuous /= opt[:continuous_factor] unless body.empty?

    # Clean up and strip block of html tags for scoring
    clean = clean_block(block)
    next if clean.length < opt[:min_length]

    # c represents how probable it is for this block to be a content block:
    # text length plus a bonus per punctuation match, scaled by position
    punctuation_count = clean.scan(opt[:punctuations]).length
    c = (clean.length + punctuation_count * opt[:punctuation_weight]) * factor

    # The further down the document we go (i.e. the more blocks we see),
    # the less likely they are to be valid content blocks
    factor *= opt[:decay_factor]

    # The not body rate represents how likely this is to be a junk block;
    # the block score is reduced once per waste expression found
    not_body_rate = block.scan(opt[:waste_expressions]).length
    c *= (opt[:no_body_factor] ** not_body_rate) if not_body_rate > 0

    # c1 represents how probable it is for this block to belong to the
    # existing cluster rather than starting a new one
    c1 = c * continuous

    puts "----- #{c}*#{continuous}=#{c1} #{clean.length} \n\n" if opt[:debug]

    if c1 > opt[:threshold]
      # Treat continuous blocks as one cluster
      body += block + "\n"
      score += c1
      continuous = opt[:continuous_factor]
    elsif c > opt[:threshold]
      # The current cluster ends here and this block starts a new one
      @sections << { :body => body, :score => score }
      body = block + "\n"
      score = c
      continuous = opt[:continuous_factor]
    end
    # Blocks that reach neither threshold are dropped
  end

  # Record the cluster still being built. body is always a String (possibly
  # empty), so there is always at least one section and one candidate.
  @sections << { :body => body, :score => score }

  # Sort the sections by score
  sorted_sections = @sections.sort_by { |section| section[:score] }

  # Convert to nokogiri representation for compatibility with the content method
  @content_candidates = sorted_sections.reverse.map { |section| Nokogiri::HTML(section[:body], nil, 'utf-8') }
end
116
+
117
# Looks up one entry of @content_candidates by position.
# #analyze stores the candidates ordered from highest to lowest score.
def content_at(index)
  candidates = @content_candidates
  candidates[index]
end
120
+
121
+ protected
122
+
123
# Checks if the given block has only tags without text.
#
# Returns true when nothing but whitespace remains after every <...> tag
# has been removed from +block+, false otherwise.
def has_only_tags?(block)
  # [^>] already spans newlines and has no letters, so no regexp flags are needed
  block.gsub(/<[^>]*>/, '').strip.empty?
end
127
+
128
# Produces the tag-free text used for scoring a block.
#
# Returns "" when the block is a link list (per is_link_list?) or when,
# after removing anchors and forms, fewer than 20 characters remain per
# removed anchor. Otherwise returns strip_tags applied to the remainder.
def clean_block(block)
  # Lists of links carry no article text
  return "" if is_link_list?(block)

  # Drop every <a ...>...</a>, counting how many were removed
  anchors_removed = 0
  without_anchors = block.gsub(/<a\s[^>]*>.*?<\/a\s*>/im) do
    anchors_removed += 1
    ''
  end

  # Forms are discarded as well
  remainder = without_anchors.gsub(/<form\s[^>]*>.*?<\/form\s*>/im, '')

  # Link heavy: too little left over compared to the number of anchors
  return "" if remainder.length < 20 * anchors_removed

  strip_tags(remainder)
end
141
+
142
# Determines whether a block is a link list or not.
#
# The first <ul>/<dl>/<ol> found in +st+ is split into <li> items and rated
# by evaluate_list. The block counts as a link list when the text left
# outside all lists is short compared to the block's total length; the more
# link-heavy the list items are, the more outside text is tolerated.
#
# Returns true or false when +st+ contains a list, nil when it does not.
def is_link_list?(st)
  # One pattern for both detection and removal, so that ordered lists are
  # excluded from the "outside" text just like <ul> and <dl>
  list_pattern = /<(?:ul|dl|ol)(.+?)<\/(?:ul|dl|ol)>/im

  listpart = st[list_pattern, 1]
  return unless listpart

  # Text that remains once every list and every other tag is removed
  outside = st.gsub(list_pattern, '').gsub(/<.+?>/m, '').gsub(/\s+/, ' ')

  items = listpart.split(/<li[^>]*>/)
  items.shift # whatever precedes the first <li> is not an item

  rate = evaluate_list(items)
  outside.length <= st.length / (45 / rate)
end
153
+
154
# Estimates to what degree a list is a list of links.
#
# list - Array of HTML strings, one per list item.
#
# Returns 1 for an empty list. Otherwise returns a Float from 1.0 (no item
# contains an <a href=...>) to 10.0 (every item does), growing with the
# square of the fraction of items that contain a link.
def evaluate_list(list)
  return 1 if list.empty?

  linked = list.count { |item| item =~ /<a\s+href=(['"]?)([^"'\s]+)\1/im }
  9 * (linked.to_f / list.length) ** 2 + 1
end
163
+
164
# Reduces html to text: Sanitize.clean is called with empty element and
# attribute whitelists, and its result is passed through #strip.
# NOTE(review): #strip is defined outside this class (presumably inherited
# from Base) - confirm what normalisation it applies.
def strip_tags(html)
  sanitized = Sanitize.clean(html, :elements => [], :attributes => [])
  strip(sanitized)
end
168
+
169
+ end
170
+ end
171
+ end
@@ -0,0 +1,154 @@
1
+ module Pismo
2
+ module Reader
3
+ class Tree < Base
4
+
5
# Analyze the structure of the HTML document and score branches for likelihood of containing useful content
#
# Every element matched by the COULD_CONTAIN_FULL_CONTENT selectors that has
# at least 80 characters of text becomes a "branch". A branch is scored from
# the id/class words on it and its ancestors (BAD_WORDS / GOOD_WORDS), the
# words found in the text nodes below it, and the id/class words of its
# descendants. Results are stored in @tree (path => branch hash) and the
# five best [path, branch] pairs, best first, in @content_candidates.
def analyze
  @tree = {}
  # Per-node measurements keyed by node path, shared between branches so a
  # node nested in several candidates is only measured once
  subels = {}

  # Text directly inside links and headings does not count towards word_count
  link_and_heading_tags = %w[a h1 h2 h3 h4 h5 h6]

  @doc.css(COULD_CONTAIN_FULL_CONTENT.join(", ")).each do |el|
    # Assume that no content we'll want comes in a total package of fewer than 80 characters!
    next unless el.text.to_s.strip.length >= 80

    # Depth is counted in path name segments, skipping the first two
    path_segments = el.path.scan(/[a-z]+/)[2..-1] || []
    depth = path_segments.length

    local_ids = (el['id'].to_s + ' ' + el['class'].to_s).downcase.strip.scan(/[a-z]+/)
    ids = local_ids

    # Add the id/class words of the (depth - 1) nearest ancestors
    cp = el.parent
    (depth - 1).times do
      ids += (cp['id'].to_s + ' ' + cp['class'].to_s).downcase.strip.scan(/[a-z]+/)
      cp = cp.parent
    end

    branch = {}
    branch[:ids] = ids
    branch[:local_ids] = local_ids
    branch[:score] = -(BAD_WORDS & ids).size
    branch[:score] += ((GOOD_WORDS & ids).size * 2)
    next if branch[:score] < -5

    # Elements that have an ID or class are more likely to be our winners
    branch[:score] += 2 unless local_ids.empty?

    branch[:name] = el.name
    branch[:depth] = depth
    branch[:path] = el.path

    branch[:raw_word_count] = 0
    branch[:word_count] = 0
    branch[:child_count] = 0
    branch[:bad_child_count] = 0
    branch[:score_steps] = []

    el.traverse do |subel|
      path = subel.path
      info = (subels[path] ||= {})
      info[:path_segments] ||= (path.scan(/[a-z]+/)[2..-1] || [])
      info[:is_text] ||= subel.text?

      if info[:is_text]
        info[:text] ||= subel.text.downcase.scan(/[a-z]+/)
        next if info[:text].empty?

        info[:raw_word_count] ||= info[:text].size
        # Only words longer than three letters count, and none at all when
        # the text sits directly inside a link or heading
        info[:word_count] ||= (link_and_heading_tags.include?(subel.parent.name) ? 0 : info[:text].count { |word| word.length > 3 })
        info[:meta_matches] ||= (info[:text] & META_WORDS).size

        branch[:raw_word_count] += info[:raw_word_count]
        branch[:word_count] += info[:word_count] - info[:meta_matches]
      end

      info[:ids] ||= (subel['id'].to_s + ' ' + subel['class'].to_s).gsub(/[^a-z]/, ' ').downcase.strip.split(/\s+/)
      # Bad words count against the branch, good words cancel them out
      info[:bad_child_count_inc] = (BAD_WORDS & info[:ids]).size - (GOOD_WORDS & info[:ids]).size
      info[:child_count_inc] = info[:ids].empty? ? 0 : 1

      branch[:bad_child_count] += info[:bad_child_count_inc]
      branch[:child_count] += info[:child_count_inc]
    end

    comma_count = el.text.scan(/\,\s/).size

    branch[:score] += 2 if branch[:name] == "div"
    branch[:score] += 4 if comma_count > 10
    branch[:score_steps] << "lots of commas!" if comma_count > 5
    branch[:score] *= 3

    branch[:score] *= 0.7 if el.children && el.children.size < 3
    branch[:score] *= 1.25 if branch[:raw_word_count] > 10
    next if branch[:raw_word_count] < 10
    branch[:score] += [branch[:word_count], 1].max ** 0.5

    word_child_count_ratio = branch[:word_count].to_f / [branch[:child_count], 1].max
    branch[:word_child_count_ratio] = word_child_count_ratio

    if branch[:raw_word_count] > 100
      good_word_ratio = branch[:word_count].to_f / branch[:raw_word_count]
      branch[:score] += good_word_ratio * 12

      if word_child_count_ratio > 50
        branch[:score] *= 1.5
      elsif word_child_count_ratio > 30
        branch[:score] *= 1.2
      elsif word_child_count_ratio > 15
        branch[:score] *= 1.1
      elsif word_child_count_ratio < 4
        branch[:score] *= 0.9
      end
    end

    branch[:score_steps] << "s1: #{branch[:score]}"

    # The ratio is negative when good words outnumber bad ones below the branch
    bad_child_ratio = branch[:bad_child_count].to_f / [branch[:child_count], 1].max
    branch[:bad_child_ratio] = bad_child_ratio
    branch[:score] += 3 if bad_child_ratio < 0.0
    # The penalties below are cumulative
    branch[:score] -= 3 if bad_child_ratio > 0.15
    branch[:score] -= 2 if bad_child_ratio > 0.25
    branch[:score] -= 2 if bad_child_ratio > 0.4
    branch[:score] -= 4 if bad_child_ratio > 0.5
    branch[:score] -= 5 if bad_child_ratio > 0.7
    branch[:score] -= 5 if branch[:bad_child_count] > 20

    branch[:score] += depth
    branch[:score] *= 0.8 if ids.length > 10

    @tree[el.path] = branch
  end

  sorted_tree = @tree.sort_by { |k, v| v[:score] }

  # Keep the (up to) five best branches, highest score first
  @content_candidates = sorted_tree.reverse.first(5)
end
147
+
148
# Returns the document node for the candidate at the given position.
#
# @content_candidates holds [path, branch] pairs as built by #analyze; the
# path is looked up in @doc. Returns nil when there is no candidate at
# +index+ instead of failing on a nil entry.
def content_at(index)
  candidate = @content_candidates[index]
  return nil unless candidate

  @doc.at(candidate.first)
end
151
+
152
+ end
153
+ end
154
+ end
@@ -0,0 +1,1002 @@
1
+ a
2
+ a's
3
+ Aaliyah
4
+ Aaron
5
+ Abigail
6
+ ability
7
+ able
8
+ about
9
+ above
10
+ according
11
+ accordingly
12
+ across
13
+ actually
14
+ Adam
15
+ Addison
16
+ Adrian
17
+ after
18
+ afterwards
19
+ again
20
+ against
21
+ ago
22
+ Aidan
23
+ Aiden
24
+ ain't
25
+ al
26
+ Alejandro
27
+ Alex
28
+ Alexa
29
+ Alexander
30
+ Alexandra
31
+ Alexis
32
+ all
33
+ Allison
34
+ allow
35
+ allowed
36
+ allowing
37
+ allows
38
+ almost
39
+ alone
40
+ along
41
+ alongside
42
+ already
43
+ also
44
+ although
45
+ always
46
+ Alyssa
47
+ am
48
+ Amanda
49
+ Amber
50
+ among
51
+ amongst
52
+ an
53
+ and
54
+ Andrea
55
+ Andrew
56
+ Angel
57
+ Angelina
58
+ Anna
59
+ annual
60
+ another
61
+ Anthony
62
+ Antonio
63
+ anybody
64
+ anyhow
65
+ anyone
66
+ anything
67
+ anyway
68
+ anyways
69
+ anywhere
70
+ apart
71
+ appear
72
+ appreciate
73
+ appropriate
74
+ approximate
75
+ approximately
76
+ apr
77
+ april
78
+ are
79
+ aren't
80
+ Ariana
81
+ Arianna
82
+ around
83
+ articles
84
+ as
85
+ Ashley
86
+ Ashton
87
+ aside
88
+ ask
89
+ asking
90
+ asshole
91
+ associated
92
+ at
93
+ Audrey
94
+ aug
95
+ august
96
+ Austin
97
+ Autumn
98
+ Ava
99
+ available
100
+ Avery
101
+ away
102
+ awesome
103
+ awfully
104
+ Bailey
105
+ based
106
+ basically
107
+ be
108
+ became
109
+ because
110
+ become
111
+ becomes
112
+ becoming
113
+ been
114
+ beforehand
115
+ behind
116
+ being
117
+ believe
118
+ below
119
+ benefit
120
+ Benjamin
121
+ beside
122
+ besides
123
+ best
124
+ better
125
+ beyond
126
+ big
127
+ biggest
128
+ Blake
129
+ both
130
+ bother
131
+ Brady
132
+ Brandon
133
+ Brayden
134
+ Brian
135
+ Brianna
136
+ brief
137
+ bring
138
+ brings
139
+ Brooke
140
+ Brooklyn
141
+ Bryan
142
+ Bryce
143
+ but
144
+ by
145
+ c'mon
146
+ c's
147
+ Caden
148
+ Caleb
149
+ called
150
+ came
151
+ Cameron
152
+ can
153
+ can't
154
+ cancel
155
+ cannot
156
+ cant
157
+ carefully
158
+ Carlos
159
+ Caroline
160
+ Carson
161
+ Carter
162
+ casually
163
+ cause
164
+ causes
165
+ certain
166
+ certainly
167
+ changes
168
+ Charles
169
+ Chase
170
+ check
171
+ Chloe
172
+ Christian
173
+ Christopher
174
+ Claire
175
+ clearly
176
+ co
177
+ Cody
178
+ Cole
179
+ Colin
180
+ Colton
181
+ come
182
+ comes
183
+ coming
184
+ comment
185
+ company
186
+ compelling
187
+ concerning
188
+ congratulations
189
+ Connor
190
+ consequently
191
+ consider
192
+ considering
193
+ contain
194
+ containing
195
+ contains
196
+ continued
197
+ Cooper
198
+ corresponding
199
+ could
200
+ couldn't
201
+ country
202
+ course
203
+ covered
204
+ covering
205
+ cunt
206
+ currently
207
+ customizable
208
+ damn
209
+ Daniel
210
+ Danielle
211
+ dave
212
+ David
213
+ david
214
+ de
215
+ dead
216
+ dec
217
+ decade
218
+ december
219
+ definitely
220
+ definitive
221
+ described
222
+ despite
223
+ Destiny
224
+ Devin
225
+ did
226
+ didn't
227
+ Diego
228
+ different
229
+ direct
230
+ discuss
231
+ do
232
+ does
233
+ doesn
234
+ doesn't
235
+ doing
236
+ Dominic
237
+ don't
238
+ done
239
+ down
240
+ downwards
241
+ driven
242
+ drove
243
+ during
244
+ Dylan
245
+ e
246
+ each
247
+ easier
248
+ edu
249
+ Eduardo
250
+ Edward
251
+ eg
252
+ eight
253
+ either
254
+ Elijah
255
+ Elizabeth
256
+ Ella
257
+ else
258
+ elsewhere
259
+ Emily
260
+ Emma
261
+ end
262
+ english
263
+ enough
264
+ entirely
265
+ Eric
266
+ Erin
267
+ es
268
+ especially
269
+ et
270
+ etc
271
+ Ethan
272
+ Evan
273
+ Evelyn
274
+ even
275
+ eventually
276
+ ever
277
+ every
278
+ everybody
279
+ everyone
280
+ everything
281
+ everywhere
282
+ ex
283
+ exactly
284
+ example
285
+ except
286
+ existing
287
+ extensive
288
+ extra
289
+ extremely
290
+ f
291
+ Faith
292
+ false
293
+ fame
294
+ far
295
+ feb
296
+ february
297
+ feel
298
+ feeling
299
+ few
300
+ fifth
301
+ finally
302
+ fine
303
+ first
304
+ five
305
+ followed
306
+ following
307
+ follows
308
+ for
309
+ former
310
+ formerly
311
+ forth
312
+ found
313
+ four
314
+ from
315
+ fuck
316
+ full
317
+ further
318
+ furthermore
319
+ g
320
+ Gabriel
321
+ Gabriella
322
+ Gabrielle
323
+ Garrett
324
+ gave
325
+ Gavin
326
+ generally
327
+ get
328
+ gets
329
+ getting
330
+ give
331
+ given
332
+ gives
333
+ glory
334
+ goal
335
+ goes
336
+ going
337
+ gone
338
+ good
339
+ got
340
+ gotten
341
+ Grace
342
+ great
343
+ greetings
344
+ h
345
+ had
346
+ hadn't
347
+ Hailey
348
+ Haley
349
+ Hannah
350
+ happens
351
+ hardly
352
+ has
353
+ hasn't
354
+ have
355
+ haven't
356
+ having
357
+ Hayden
358
+ he
359
+ he's
360
+ hello
361
+ help
362
+ hence
363
+ Henry
364
+ her
365
+ here
366
+ here's
367
+ hereafter
368
+ hereby
369
+ herein
370
+ hereupon
371
+ hers
372
+ herself
373
+ hi
374
+ high
375
+ highly
376
+ him
377
+ himself
378
+ hire
379
+ his
380
+ hither
381
+ hopefully
382
+ how
383
+ howbeit
384
+ however
385
+ huge
386
+ Hunter
387
+ i
388
+ i'd
389
+ i'll
390
+ i'm
391
+ i've
392
+ Ian
393
+ ie
394
+ if
395
+ ignored
396
+ imagine
397
+ immediate
398
+ implement
399
+ important
400
+ impromptu
401
+ in
402
+ inasmuch
403
+ inc
404
+ indeed
405
+ indicate
406
+ indicated
407
+ indicates
408
+ informative
409
+ inhibits
410
+ inner
411
+ insofar
412
+ instead
413
+ interest
414
+ interesting
415
+ into
416
+ inward
417
+ is
418
+ Isaac
419
+ Isabel
420
+ Isabella
421
+ Isaiah
422
+ isn
423
+ isn't
424
+ it
425
+ it'd
426
+ it'll
427
+ it's
428
+ its
429
+ itself
430
+ Ivan
431
+ j
432
+ Jack
433
+ Jackson
434
+ Jacob
435
+ Jada
436
+ Jaden
437
+ Jake
438
+ James
439
+ jan
440
+ january
441
+ Jared
442
+ Jasmine
443
+ Jason
444
+ Jayden
445
+ Jenna
446
+ Jennifer
447
+ Jeremiah
448
+ Jeremy
449
+ Jesse
450
+ Jessica
451
+ Jesus
452
+ jim
453
+ jimmy
454
+ jnr
455
+ Jocelyn
456
+ Joel
457
+ John
458
+ Jonathan
459
+ Jordan
460
+ Jorge
461
+ Jose
462
+ Joseph
463
+ Joshua
464
+ Josiah
465
+ jr
466
+ Juan
467
+ jul
468
+ Julia
469
+ Julian
470
+ july
471
+ jun
472
+ june
473
+ just
474
+ Justin
475
+ k
476
+ Kaden
477
+ Kaitlyn
478
+ Kaleb
479
+ Katelyn
480
+ Katherine
481
+ Kayla
482
+ Kaylee
483
+ keep
484
+ keeps
485
+ Kenneth
486
+ kept
487
+ Kevin
488
+ key
489
+ kid
490
+ Kimberly
491
+ know
492
+ known
493
+ knows
494
+ Kyle
495
+ Kylie
496
+ l
497
+ la
498
+ Landon
499
+ last
500
+ lately
501
+ later
502
+ latter
503
+ latterly
504
+ Lauren
505
+ le
506
+ Leah
507
+ least
508
+ les
509
+ less
510
+ lest
511
+ let
512
+ let's
513
+ levels
514
+ Liam
515
+ like
516
+ liked
517
+ likely
518
+ Lillian
519
+ Lily
520
+ line
521
+ listing
522
+ listings
523
+ little
524
+ Logan
525
+ look
526
+ looking
527
+ looks
528
+ lot
529
+ lots
530
+ love
531
+ low
532
+ ltd
533
+ Lucas
534
+ Luis
535
+ Luke
536
+ m
537
+ Mackenzie
538
+ Madeline
539
+ Madison
540
+ mainly
541
+ Makayla
542
+ many
543
+ mar
544
+ march
545
+ Marcus
546
+ Maria
547
+ Mariah
548
+ Marissa
549
+ Mark
550
+ Mary
551
+ Mason
552
+ Matthew
553
+ maturity
554
+ may
555
+ Maya
556
+ maybe
557
+ me
558
+ mean
559
+ means
560
+ meant
561
+ meanwhile
562
+ Megan
563
+ Melanie
564
+ member
565
+ mentioned
566
+ merely
567
+ Mia
568
+ Michael
569
+ Michelle
570
+ might
571
+ Miguel
572
+ mile
573
+ more
574
+ moreover
575
+ Morgan
576
+ most
577
+ mostly
578
+ moving
579
+ much
580
+ must
581
+ my
582
+ myself
583
+ n
584
+ name
585
+ namely
586
+ Natalie
587
+ Nathan
588
+ Nathaniel
589
+ naturally
590
+ nd
591
+ near
592
+ nearly
593
+ necessary
594
+ need
595
+ needed
596
+ needs
597
+ neither
598
+ Nevaeh
599
+ never
600
+ nevertheless
601
+ new
602
+ next
603
+ Nicholas
604
+ Nicole
605
+ nine
606
+ no
607
+ Noah
608
+ nobody
609
+ non
610
+ none
611
+ noone
612
+ nor
613
+ normally
614
+ not
615
+ notably
616
+ nothing
617
+ nov
618
+ novel
619
+ november
620
+ now
621
+ nowhere
622
+ o
623
+ Obie
624
+ obviously
625
+ oct
626
+ october
627
+ of
628
+ off
629
+ official
630
+ often
631
+ oh
632
+ ok
633
+ okay
634
+ old
635
+ Olivia
636
+ on
637
+ once
638
+ one
639
+ ones
640
+ online
641
+ only
642
+ onto
643
+ open
644
+ or
645
+ org
646
+ oriented
647
+ Oscar
648
+ others
649
+ otherwise
650
+ ought
651
+ our
652
+ ours
653
+ ourselves
654
+ out
655
+ overall
656
+ Owen
657
+ own
658
+ p
659
+ Paige
660
+ par
661
+ Parker
662
+ part
663
+ particular
664
+ particularly
665
+ Patrick
666
+ Paul
667
+ peasy
668
+ per
669
+ perhaps
670
+ piece
671
+ placed
672
+ play
673
+ please
674
+ plus
675
+ possible
676
+ posts
677
+ pre
678
+ preferences
679
+ presumably
680
+ pretty
681
+ probably
682
+ product
683
+ products
684
+ proud
685
+ provide
686
+ provides
687
+ put
688
+ q
689
+ que
690
+ quite
691
+ qv
692
+ r
693
+ Rachel
694
+ rather
695
+ rd
696
+ re
697
+ reached
698
+ read
699
+ real
700
+ really
701
+ reasonably
702
+ Rebecca
703
+ recently
704
+ regarding
705
+ regardless
706
+ regards
707
+ related
708
+ relatively
709
+ replaced
710
+ requirements
711
+ respectively
712
+ Richard
713
+ right
714
+ Riley
715
+ Robert
716
+ run
717
+ Ryan
718
+ s
719
+ safest
720
+ said
721
+ Samantha
722
+ same
723
+ Samuel
724
+ Sara
725
+ Sarah
726
+ Savannah
727
+ saw
728
+ say
729
+ saying
730
+ says
731
+ Sean
732
+ Sebastian
733
+ second
734
+ secondly
735
+ seconds
736
+ see
737
+ seeing
738
+ seem
739
+ seemed
740
+ seeming
741
+ seems
742
+ seen
743
+ self
744
+ selves
745
+ sensible
746
+ sent
747
+ sep
748
+ september
749
+ serious
750
+ seriously
751
+ set
752
+ settings
753
+ seven
754
+ several
755
+ shall
756
+ she
757
+ shit
758
+ shot
759
+ should
760
+ shouldn't
761
+ Sierra
762
+ simpler
763
+ simply
764
+ since
765
+ site
766
+ six
767
+ size
768
+ so
769
+ Sofia
770
+ solid
771
+ some
772
+ somebody
773
+ somehow
774
+ someone
775
+ something
776
+ sometime
777
+ sometimes
778
+ somewhat
779
+ somewhere
780
+ soon
781
+ Sophia
782
+ sorry
783
+ sounding
784
+ specified
785
+ specify
786
+ specifying
787
+ spoke
788
+ spread
789
+ sr
790
+ stand
791
+ started
792
+ step
793
+ Stephanie
794
+ Steven
795
+ still
796
+ stuff
797
+ sub
798
+ subscribe
799
+ such
800
+ suck
801
+ suite
802
+ sup
803
+ sur
804
+ sure
805
+ Sydney
806
+ t
807
+ t's
808
+ take
809
+ taken
810
+ Tanner
811
+ tat
812
+ Taylor
813
+ team
814
+ tedious
815
+ tell
816
+ tends
817
+ th
818
+ than
819
+ thank
820
+ thanks
821
+ thanx
822
+ that
823
+ that's
824
+ thats
825
+ the
826
+ their
827
+ theirs
828
+ them
829
+ themselves
830
+ then
831
+ thence
832
+ there
833
+ there's
834
+ thereafter
835
+ thereby
836
+ therefore
837
+ therein
838
+ theres
839
+ thereupon
840
+ these
841
+ they
842
+ they'd
843
+ they'll
844
+ they're
845
+ they've
846
+ thing
847
+ things
848
+ think
849
+ third
850
+ this
851
+ Thomas
852
+ thomas
853
+ thorough
854
+ thoroughly
855
+ those
856
+ though
857
+ three
858
+ through
859
+ throughout
860
+ thru
861
+ thus
862
+ Timothy
863
+ tit
864
+ to
865
+ today
866
+ together
867
+ told
868
+ too
869
+ took
870
+ toward
871
+ towards
872
+ Trevor
873
+ tried
874
+ tries
875
+ Trinity
876
+ Tristan
877
+ truly
878
+ try
879
+ trying
880
+ turn
881
+ turns
882
+ twice
883
+ two
884
+ Tyler
885
+ typically
886
+ u
887
+ ultra
888
+ un
889
+ unfortunately
890
+ unlikely
891
+ unsurprisingly
892
+ until
893
+ unto
894
+ up
895
+ upon
896
+ us
897
+ use
898
+ used
899
+ useful
900
+ uses
901
+ using
902
+ usually
903
+ uucp
904
+ v
905
+ value
906
+ Vanessa
907
+ various
908
+ very
909
+ via
910
+ Victor
911
+ Victoria
912
+ Vincent
913
+ viz
914
+ vs
915
+ w
916
+ walks
917
+ want
918
+ wants
919
+ was
920
+ wasn't
921
+ way
922
+ we
923
+ we'd
924
+ we'll
925
+ we're
926
+ we've
927
+ week
928
+ weekly
929
+ welcome
930
+ well
931
+ went
932
+ were
933
+ weren't
934
+ what
935
+ what's
936
+ whatever
937
+ when
938
+ whence
939
+ whenever
940
+ where
941
+ where's
942
+ whereafter
943
+ whereas
944
+ whereby
945
+ wherein
946
+ whereupon
947
+ wherever
948
+ whether
949
+ which
950
+ while
951
+ whither
952
+ who
953
+ who's
954
+ whoever
955
+ whole
956
+ whom
957
+ whose
958
+ why
959
+ will
960
+ William
961
+ willing
962
+ win
963
+ wish
964
+ with
965
+ within
966
+ without
967
+ won't
968
+ wonder
969
+ works
970
+ world
971
+ would
972
+ wouldn't
973
+ wrapped
974
+ Wyatt
975
+ Xavier
976
+ y
977
+ yeah
978
+ yes
979
+ yet
980
+ you
981
+ you'd
982
+ you'll
983
+ you're
984
+ you've
985
+ your
986
+ yours
987
+ yourself
988
+ yourselves
989
+ z
990
+ Zachary
991
+ zero
992
+ Zoe
993
+ 0
994
+ 1
995
+ 2
996
+ 3
997
+ 4
998
+ 5
999
+ 6
1000
+ 7
1001
+ 8
1002
+ 9