nddrylliog_pismo 0.7.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43):
  1. data/.document +5 -0
  2. data/.gitignore +29 -0
  3. data/Gemfile +4 -0
  4. data/LICENSE +23 -0
  5. data/NOTICE +4 -0
  6. data/README.markdown +131 -0
  7. data/Rakefile +72 -0
  8. data/bin/pismo +45 -0
  9. data/lib/pismo.rb +82 -0
  10. data/lib/pismo/document.rb +67 -0
  11. data/lib/pismo/external_attributes.rb +14 -0
  12. data/lib/pismo/internal_attributes.rb +316 -0
  13. data/lib/pismo/reader.rb +19 -0
  14. data/lib/pismo/reader/base.rb +259 -0
  15. data/lib/pismo/reader/cluster.rb +171 -0
  16. data/lib/pismo/reader/tree.rb +154 -0
  17. data/lib/pismo/stopwords.txt +1002 -0
  18. data/lib/pismo/version.rb +3 -0
  19. data/pismo.gemspec +30 -0
  20. data/test/corpus/bbcnews.html +2131 -0
  21. data/test/corpus/bbcnews2.html +1575 -0
  22. data/test/corpus/briancray.html +269 -0
  23. data/test/corpus/cant_read.html +426 -0
  24. data/test/corpus/factor.html +1362 -0
  25. data/test/corpus/gmane.html +138 -0
  26. data/test/corpus/huffington.html +2932 -0
  27. data/test/corpus/metadata_expected.yaml +72 -0
  28. data/test/corpus/metadata_expected.yaml.old +122 -0
  29. data/test/corpus/queness.html +919 -0
  30. data/test/corpus/reader_expected.yaml +39 -0
  31. data/test/corpus/readers/cluster_expected.yaml +45 -0
  32. data/test/corpus/readers/tree_expected.yaml +55 -0
  33. data/test/corpus/rubyinside.html +318 -0
  34. data/test/corpus/rww.html +1351 -0
  35. data/test/corpus/spolsky.html +298 -0
  36. data/test/corpus/techcrunch.html +1285 -0
  37. data/test/corpus/tweet.html +360 -0
  38. data/test/corpus/youtube.html +2348 -0
  39. data/test/corpus/zefrank.html +535 -0
  40. data/test/helper.rb +15 -0
  41. data/test/test_corpus.rb +54 -0
  42. data/test/test_pismo_document.rb +34 -0
  43. metadata +156 -0
@@ -0,0 +1,171 @@
1
+ # encoding: utf-8
2
+
3
module Pismo
  module Reader
    # Reader that cuts the raw HTML into blocks at <div> boundaries, scores
    # every block, and groups consecutive well-scoring blocks into sections.
    # The sections, best score first, become the content candidates.
    class Cluster < Base

      # Adapted from : http://rubyforge.org/projects/extractcontent/
      #
      # Portions of this code are :
      # Copyright (c) 2007/2008 Nakatani Shuyo / Cybozu Labs Inc. All rights reserved.
      #
      # Permission is hereby granted, free of charge, to any person obtaining
      # a copy of this software and associated documentation files (the
      # "Software"), to deal in the Software without restriction, including
      # without limitation the rights to use, copy, modify, merge, publish,
      # distribute, sublicense, and/or sell copies of the Software, and to
      # permit persons to whom the Software is furnished to do so, subject to
      # the following conditions:
      #
      # The above copyright notice and this permission notice shall be
      # included in all copies or substantial portions of the Software.
      #
      # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
      # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
      # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
      # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
      # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
      # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
      # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

      # Default option parameters
      DEFAULTS = {
        :threshold => 100,                                      # threshold for score of the text
        :min_length => 80,                                      # minimum length of evaluated blocks
        :decay_factor => 0.73,                                  # decay factor for block score
        :continuous_factor => 1.62,                             # continuous factor for block score ( the larger, the harder to continue )
        :no_body_factor => 0.72,                                # no body factor that reduces block score if waste expressions are present
        :punctuation_weight => 10,                              # score weight for punctuation
        :punctuations => /(\.[^A-Za-z0-9]|,[^0-9]|!|\?)/,       # punctuation characters
        :waste_expressions => /Copyright|All Rights Reserved/i, # characteristic keywords including footer
        :debug => false,                                        # if true, output block information to stdout
      }

      # Analyze the structure of the HTML document and score content blocks
      # for likelihood of containing useful content.
      #
      # Reads @raw_content and @options; fills @sections (hashes with :body
      # and :score) and @content_candidates (one Nokogiri document per
      # section, highest score first).
      def analyze
        # Per-instance options override the defaults; DEFAULTS itself is not modified
        opt = DEFAULTS.merge(@options)

        @sections = []
        factor = continuous = 1.0
        body = ''
        score = 0

        # The content is split into blocks of divs
        list = @raw_content.split(/<\/?(?:div)[^>]*>/)
        list.each do |block|
          next unless block
          block.gsub!(/\n/, '')

          # Ignore blocks that have no text
          next if has_only_tags?(block)

          # Each new block iterated over makes it less likely for it to belong
          # to the existing block
          continuous /= opt[:continuous_factor] if body.length > 0

          # Clean up and strip block of html tags for scoring
          clean = clean_block(block)
          next if clean.length < opt[:min_length]

          # Calculate scores for clustering of blocks

          # c represents how probable it is for this block to be a content block
          c = (clean.length + clean.scan(opt[:punctuations]).length * opt[:punctuation_weight]) * factor

          # The further down the document we go (i.e. the more blocks we see),
          # the less likely they are to be valid content blocks
          factor *= opt[:decay_factor]

          # The not body rate represents how likely this is to be a junk block
          not_body_rate = block.scan(opt[:waste_expressions]).length

          # The block score is reduced if there is a not_body_rate
          c *= (opt[:no_body_factor] ** not_body_rate) if not_body_rate > 0

          # c1 represents how probable it is for this block to belong to the
          # existing block or if it is a new one
          c1 = c * continuous

          puts "----- #{c}*#{continuous}=#{c1} #{clean.length} \n\n" if opt[:debug]

          if c1 > opt[:threshold]
            # Treat continuous blocks as cluster
            body += block + "\n"
            score += c1
            continuous = opt[:continuous_factor]
          elsif c > opt[:threshold]
            # Continuous block end: store the finished section and start a new one
            @sections << { :body => body, :score => score }
            body = block + "\n"
            score = c
            continuous = opt[:continuous_factor]
          else
            # We drop blocks that don't have a high enough c score
          end
        end

        # Add the last section as we've finished iterating. body is always a
        # String here, so an empty, zero-score section is recorded when no
        # block qualified.
        @sections << { :body => body, :score => score }

        # Sort the sections by score
        sorted_sections = @sections.sort_by { |section| section[:score] }

        # Convert to nokogiri representation for compatibility with the content method
        @content_candidates = sorted_sections.reverse.map { |section| Nokogiri::HTML(section[:body], nil, 'utf-8') }
      end

      # Returns the content candidate at the given rank (0 is the best
      # scoring section), or nil when there is no candidate at that index.
      def content_at(index)
        @content_candidates[index]
      end

      protected

      # Checks if the given block has only tags without text.
      def has_only_tags?(block)
        block.gsub(/<[^>]*>/im, '').strip.length == 0
      end

      # Eliminates link heavy blocks and blocks that are lists of links and
      # then returns block stripped of tags. Rejected blocks yield "".
      def clean_block(block)
        # Return empty block if it is a list of links
        return "" if is_link_list?(block)

        # Remove anchors (counting them) and forms
        count = 0
        no_links = block.gsub(/<a\s[^>]*>.*?<\/a\s*>/im) { count += 1; '' }.gsub(/<form\s[^>]*>.*?<\/form\s*>/im, '')

        # Return empty block if it is a very link heavy block: fewer than 20
        # remaining characters per removed link
        return "" if no_links.length < 20 * count

        strip_tags(no_links)
      end

      # Determines whether a block is link list or not.
      #
      # Returns nil when the block contains no ul/dl/ol list at all;
      # otherwise true when the text outside the lists is short relative to
      # the whole block, with the allowance growing as more list items are links.
      def is_link_list?(st)
        list_pattern = /<(?:ul|dl|ol)(.+?)<\/(?:ul|dl|ol)>/im

        if st =~ list_pattern
          listpart = Regexp.last_match(1)

          # Text left once every list (ordered lists included) and all tags are removed
          outside = st.gsub(list_pattern, '').gsub(/<.+?>/m, '').gsub(/\s+/, ' ')

          list = listpart.split(/<li[^>]*>/)
          list.shift # drop whatever precedes the first <li>
          rate = evaluate_list(list)
          outside.length <= st.length / (45 / rate)
        end
      end

      # Estimates how much degree of link list.
      #
      # Returns 1 for an empty list, otherwise a value between 1 and 10 that
      # grows with the square of the share of items containing an <a href>.
      def evaluate_list(list)
        return 1 if list.length == 0

        hit = 0
        list.each do |line|
          hit += 1 if line =~ /<a\s+href=(['"]?)([^"'\s]+)\1/im
        end
        9 * (1.0 * hit / list.length) ** 2 + 1
      end

      # Removes all html tags and attributes from html.
      # NOTE(review): `strip` is not defined in this class - presumably
      # inherited from Base; confirm.
      def strip_tags(html)
        strip(Sanitize.clean(html, :elements => [], :attributes => []))
      end

    end
  end
end
@@ -0,0 +1,154 @@
1
module Pismo
  module Reader
    # Reader that scores candidate container elements ("branches") of the
    # parsed document and keeps the best-scoring ones as content candidates.
    #
    # NOTE(review): COULD_CONTAIN_FULL_CONTENT, BAD_WORDS, GOOD_WORDS and
    # META_WORDS are not defined in this file - presumably on Base; confirm.
    class Tree < Base

      # Text directly inside these parent tags (links and headings) does not
      # count towards a branch's word_count.
      ZERO_WEIGHT_PARENT_TAGS = %w[a h1 h2 h3 h4 h5 h6].freeze

      # Analyze the structure of the HTML document and score branches for
      # likelihood of containing useful content.
      #
      # Reads @doc; fills @tree (element path => branch hash) and
      # @content_candidates (up to five [path, branch] pairs, best score first).
      def analyze
        @tree = {}
        subels = {} # per-path cache of data derived from each visited node

        @doc.css(COULD_CONTAIN_FULL_CONTENT.join(", ")).each do |el|
          # Assume that no content we'll want comes in a total package of fewer than 80 characters!
          next unless el.text.to_s.strip.length >= 80

          # Depth is the number of path segments after the first two
          path_segments = el.path.scan(/[a-z]+/)[2..-1] || []
          depth = path_segments.length

          local_ids = (el['id'].to_s + ' ' + el['class'].to_s).downcase.strip.scan(/[a-z]+/)
          ids = local_ids

          # Collect the id/class words of the ancestors as well
          # (+= builds a new array, so local_ids stays untouched)
          cp = el.parent
          (depth - 1).times do
            ids += (cp['id'].to_s + ' ' + cp['class'].to_s).downcase.strip.scan(/[a-z]+/)
            cp = cp.parent
          end if depth > 1

          branch = {}
          branch[:ids] = ids
          branch[:local_ids] = local_ids
          branch[:score] = -(BAD_WORDS & ids).size
          branch[:score] += ((GOOD_WORDS & ids).size * 2)
          next if branch[:score] < -5

          # Elements that have an ID or class are more likely to be our winners
          branch[:score] += 2 unless local_ids.empty?

          branch[:name] = el.name
          branch[:depth] = depth
          branch[:path] = el.path

          branch[:raw_word_count] = 0
          branch[:word_count] = 0
          branch[:child_count] = 0
          branch[:bad_child_count] = 0
          branch[:score_steps] = []

          el.traverse do |subel|
            path = subel.path
            subels[path] ||= {}
            subels[path][:path_segments] ||= (path.scan(/[a-z]+/)[2..-1] || [])
            subels[path][:is_text] ||= subel.text?

            if subels[path][:is_text]
              subels[path][:text] ||= subel.text.downcase.scan(/[a-z]+/)
              # Text nodes without any words contribute nothing at all
              next if subels[path][:text].empty?

              subels[path][:raw_word_count] ||= subels[path][:text].size
              # Only words longer than three letters count, and none at all
              # inside links or headings
              subels[path][:word_count] ||= (ZERO_WEIGHT_PARENT_TAGS.include?(subel.parent.name) ? 0 : subels[path][:text].select { |word| word.length > 3 }.size)
              subels[path][:meta_matches] ||= (subels[path][:text] & META_WORDS).size

              branch[:raw_word_count] += subels[path][:raw_word_count]
              branch[:word_count] += subels[path][:word_count] - subels[path][:meta_matches]
            end

            # Downcase before extracting words so capitalised ids/classes are
            # tokenised the same way as for the candidate element itself
            subels[path][:ids] ||= (subel['id'].to_s + ' ' + subel['class'].to_s).downcase.scan(/[a-z]+/)
            subels[path][:bad_child_count_inc] = (BAD_WORDS & subels[path][:ids]).size - (GOOD_WORDS & subels[path][:ids]).size
            subels[path][:child_count_inc] = subels[path][:ids].empty? ? 0 : 1

            branch[:bad_child_count] += subels[path][:bad_child_count_inc]
            branch[:child_count] += subels[path][:child_count_inc]
          end

          comma_count = el.text.scan(/\,\s/).size

          branch[:score] += 2 if branch[:name] == "div"
          branch[:score] += 4 if comma_count > 10
          branch[:score_steps] << "lots of commas!" if comma_count > 5
          branch[:score] *= 3

          branch[:score] *= 0.7 if el.children && el.children.size < 3
          branch[:score] *= 1.25 if branch[:raw_word_count] > 10
          next if branch[:raw_word_count] < 10
          branch[:score] += [branch[:word_count], 1].max ** 0.5

          word_child_count_ratio = branch[:word_count].to_f / [branch[:child_count], 1].max
          branch[:word_child_count_ratio] = word_child_count_ratio

          if branch[:raw_word_count] > 100
            good_word_ratio = branch[:word_count].to_f / branch[:raw_word_count]
            branch[:score] += good_word_ratio * 12

            if word_child_count_ratio > 50
              branch[:score] *= 1.5
            elsif word_child_count_ratio > 30
              branch[:score] *= 1.2
            elsif word_child_count_ratio > 15
              branch[:score] *= 1.1
            elsif word_child_count_ratio < 4
              branch[:score] *= 0.9
            end
          end

          branch[:score_steps] << "s1: #{branch[:score]}"

          # The penalties below are cumulative: a high ratio triggers every
          # lower threshold as well
          bad_child_ratio = branch[:bad_child_count].to_f / [branch[:child_count], 1].max
          branch[:bad_child_ratio] = bad_child_ratio
          branch[:score] += 3 if bad_child_ratio < 0.0
          branch[:score] -= 3 if bad_child_ratio > 0.15
          branch[:score] -= 2 if bad_child_ratio > 0.25
          branch[:score] -= 2 if bad_child_ratio > 0.4
          branch[:score] -= 4 if bad_child_ratio > 0.5
          branch[:score] -= 5 if bad_child_ratio > 0.7
          branch[:score] -= 5 if branch[:bad_child_count] > 20

          branch[:score] += depth
          branch[:score] *= 0.8 if ids.length > 10

          @tree[el.path] = branch
        end

        sorted_tree = @tree.sort_by { |k, v| v[:score] }

        # Sort the branches by their score in reverse order and keep the top five
        @content_candidates = sorted_tree.reverse.first(5)
      end

      # Returns the document node for the candidate at the given rank (0 is
      # the best scoring branch), or nil when there is no such candidate.
      def content_at(index)
        candidate = @content_candidates[index]
        candidate && @doc.at(candidate.first)
      end

    end
  end
end
@@ -0,0 +1,1002 @@
1
+ a
2
+ a's
3
+ Aaliyah
4
+ Aaron
5
+ Abigail
6
+ ability
7
+ able
8
+ about
9
+ above
10
+ according
11
+ accordingly
12
+ across
13
+ actually
14
+ Adam
15
+ Addison
16
+ Adrian
17
+ after
18
+ afterwards
19
+ again
20
+ against
21
+ ago
22
+ Aidan
23
+ Aiden
24
+ ain't
25
+ al
26
+ Alejandro
27
+ Alex
28
+ Alexa
29
+ Alexander
30
+ Alexandra
31
+ Alexis
32
+ all
33
+ Allison
34
+ allow
35
+ allowed
36
+ allowing
37
+ allows
38
+ almost
39
+ alone
40
+ along
41
+ alongside
42
+ already
43
+ also
44
+ although
45
+ always
46
+ Alyssa
47
+ am
48
+ Amanda
49
+ Amber
50
+ among
51
+ amongst
52
+ an
53
+ and
54
+ Andrea
55
+ Andrew
56
+ Angel
57
+ Angelina
58
+ Anna
59
+ annual
60
+ another
61
+ Anthony
62
+ Antonio
63
+ anybody
64
+ anyhow
65
+ anyone
66
+ anything
67
+ anyway
68
+ anyways
69
+ anywhere
70
+ apart
71
+ appear
72
+ appreciate
73
+ appropriate
74
+ approximate
75
+ approximately
76
+ apr
77
+ april
78
+ are
79
+ aren't
80
+ Ariana
81
+ Arianna
82
+ around
83
+ articles
84
+ as
85
+ Ashley
86
+ Ashton
87
+ aside
88
+ ask
89
+ asking
90
+ asshole
91
+ associated
92
+ at
93
+ Audrey
94
+ aug
95
+ august
96
+ Austin
97
+ Autumn
98
+ Ava
99
+ available
100
+ Avery
101
+ away
102
+ awesome
103
+ awfully
104
+ Bailey
105
+ based
106
+ basically
107
+ be
108
+ became
109
+ because
110
+ become
111
+ becomes
112
+ becoming
113
+ been
114
+ beforehand
115
+ behind
116
+ being
117
+ believe
118
+ below
119
+ benefit
120
+ Benjamin
121
+ beside
122
+ besides
123
+ best
124
+ better
125
+ beyond
126
+ big
127
+ biggest
128
+ Blake
129
+ both
130
+ bother
131
+ Brady
132
+ Brandon
133
+ Brayden
134
+ Brian
135
+ Brianna
136
+ brief
137
+ bring
138
+ brings
139
+ Brooke
140
+ Brooklyn
141
+ Bryan
142
+ Bryce
143
+ but
144
+ by
145
+ c'mon
146
+ c's
147
+ Caden
148
+ Caleb
149
+ called
150
+ came
151
+ Cameron
152
+ can
153
+ can't
154
+ cancel
155
+ cannot
156
+ cant
157
+ carefully
158
+ Carlos
159
+ Caroline
160
+ Carson
161
+ Carter
162
+ casually
163
+ cause
164
+ causes
165
+ certain
166
+ certainly
167
+ changes
168
+ Charles
169
+ Chase
170
+ check
171
+ Chloe
172
+ Christian
173
+ Christopher
174
+ Claire
175
+ clearly
176
+ co
177
+ Cody
178
+ Cole
179
+ Colin
180
+ Colton
181
+ come
182
+ comes
183
+ coming
184
+ comment
185
+ company
186
+ compelling
187
+ concerning
188
+ congratulations
189
+ Connor
190
+ consequently
191
+ consider
192
+ considering
193
+ contain
194
+ containing
195
+ contains
196
+ continued
197
+ Cooper
198
+ corresponding
199
+ could
200
+ couldn't
201
+ country
202
+ course
203
+ covered
204
+ covering
205
+ cunt
206
+ currently
207
+ customizable
208
+ damn
209
+ Daniel
210
+ Danielle
211
+ dave
212
+ David
213
+ david
214
+ de
215
+ dead
216
+ dec
217
+ decade
218
+ december
219
+ definitely
220
+ definitive
221
+ described
222
+ despite
223
+ Destiny
224
+ Devin
225
+ did
226
+ didn't
227
+ Diego
228
+ different
229
+ direct
230
+ discuss
231
+ do
232
+ does
233
+ doesn
234
+ doesn't
235
+ doing
236
+ Dominic
237
+ don't
238
+ done
239
+ down
240
+ downwards
241
+ driven
242
+ drove
243
+ during
244
+ Dylan
245
+ e
246
+ each
247
+ easier
248
+ edu
249
+ Eduardo
250
+ Edward
251
+ eg
252
+ eight
253
+ either
254
+ Elijah
255
+ Elizabeth
256
+ Ella
257
+ else
258
+ elsewhere
259
+ Emily
260
+ Emma
261
+ end
262
+ english
263
+ enough
264
+ entirely
265
+ Eric
266
+ Erin
267
+ es
268
+ especially
269
+ et
270
+ etc
271
+ Ethan
272
+ Evan
273
+ Evelyn
274
+ even
275
+ eventually
276
+ ever
277
+ every
278
+ everybody
279
+ everyone
280
+ everything
281
+ everywhere
282
+ ex
283
+ exactly
284
+ example
285
+ except
286
+ existing
287
+ extensive
288
+ extra
289
+ extremely
290
+ f
291
+ Faith
292
+ false
293
+ fame
294
+ far
295
+ feb
296
+ february
297
+ feel
298
+ feeling
299
+ few
300
+ fifth
301
+ finally
302
+ fine
303
+ first
304
+ five
305
+ followed
306
+ following
307
+ follows
308
+ for
309
+ former
310
+ formerly
311
+ forth
312
+ found
313
+ four
314
+ from
315
+ fuck
316
+ full
317
+ further
318
+ furthermore
319
+ g
320
+ Gabriel
321
+ Gabriella
322
+ Gabrielle
323
+ Garrett
324
+ gave
325
+ Gavin
326
+ generally
327
+ get
328
+ gets
329
+ getting
330
+ give
331
+ given
332
+ gives
333
+ glory
334
+ goal
335
+ goes
336
+ going
337
+ gone
338
+ good
339
+ got
340
+ gotten
341
+ Grace
342
+ great
343
+ greetings
344
+ h
345
+ had
346
+ hadn't
347
+ Hailey
348
+ Haley
349
+ Hannah
350
+ happens
351
+ hardly
352
+ has
353
+ hasn't
354
+ have
355
+ haven't
356
+ having
357
+ Hayden
358
+ he
359
+ he's
360
+ hello
361
+ help
362
+ hence
363
+ Henry
364
+ her
365
+ here
366
+ here's
367
+ hereafter
368
+ hereby
369
+ herein
370
+ hereupon
371
+ hers
372
+ herself
373
+ hi
374
+ high
375
+ highly
376
+ him
377
+ himself
378
+ hire
379
+ his
380
+ hither
381
+ hopefully
382
+ how
383
+ howbeit
384
+ however
385
+ huge
386
+ Hunter
387
+ i
388
+ i'd
389
+ i'll
390
+ i'm
391
+ i've
392
+ Ian
393
+ ie
394
+ if
395
+ ignored
396
+ imagine
397
+ immediate
398
+ implement
399
+ important
400
+ impromptu
401
+ in
402
+ inasmuch
403
+ inc
404
+ indeed
405
+ indicate
406
+ indicated
407
+ indicates
408
+ informative
409
+ inhibits
410
+ inner
411
+ insofar
412
+ instead
413
+ interest
414
+ interesting
415
+ into
416
+ inward
417
+ is
418
+ Isaac
419
+ Isabel
420
+ Isabella
421
+ Isaiah
422
+ isn
423
+ isn't
424
+ it
425
+ it'd
426
+ it'll
427
+ it's
428
+ its
429
+ itself
430
+ Ivan
431
+ j
432
+ Jack
433
+ Jackson
434
+ Jacob
435
+ Jada
436
+ Jaden
437
+ Jake
438
+ James
439
+ jan
440
+ january
441
+ Jared
442
+ Jasmine
443
+ Jason
444
+ Jayden
445
+ Jenna
446
+ Jennifer
447
+ Jeremiah
448
+ Jeremy
449
+ Jesse
450
+ Jessica
451
+ Jesus
452
+ jim
453
+ jimmy
454
+ jnr
455
+ Jocelyn
456
+ Joel
457
+ John
458
+ Jonathan
459
+ Jordan
460
+ Jorge
461
+ Jose
462
+ Joseph
463
+ Joshua
464
+ Josiah
465
+ jr
466
+ Juan
467
+ jul
468
+ Julia
469
+ Julian
470
+ july
471
+ jun
472
+ june
473
+ just
474
+ Justin
475
+ k
476
+ Kaden
477
+ Kaitlyn
478
+ Kaleb
479
+ Katelyn
480
+ Katherine
481
+ Kayla
482
+ Kaylee
483
+ keep
484
+ keeps
485
+ Kenneth
486
+ kept
487
+ Kevin
488
+ key
489
+ kid
490
+ Kimberly
491
+ know
492
+ known
493
+ knows
494
+ Kyle
495
+ Kylie
496
+ l
497
+ la
498
+ Landon
499
+ last
500
+ lately
501
+ later
502
+ latter
503
+ latterly
504
+ Lauren
505
+ le
506
+ Leah
507
+ least
508
+ les
509
+ less
510
+ lest
511
+ let
512
+ let's
513
+ levels
514
+ Liam
515
+ like
516
+ liked
517
+ likely
518
+ Lillian
519
+ Lily
520
+ line
521
+ listing
522
+ listings
523
+ little
524
+ Logan
525
+ look
526
+ looking
527
+ looks
528
+ lot
529
+ lots
530
+ love
531
+ low
532
+ ltd
533
+ Lucas
534
+ Luis
535
+ Luke
536
+ m
537
+ Mackenzie
538
+ Madeline
539
+ Madison
540
+ mainly
541
+ Makayla
542
+ many
543
+ mar
544
+ march
545
+ Marcus
546
+ Maria
547
+ Mariah
548
+ Marissa
549
+ Mark
550
+ Mary
551
+ Mason
552
+ Matthew
553
+ maturity
554
+ may
555
+ Maya
556
+ maybe
557
+ me
558
+ mean
559
+ means
560
+ meant
561
+ meanwhile
562
+ Megan
563
+ Melanie
564
+ member
565
+ mentioned
566
+ merely
567
+ Mia
568
+ Michael
569
+ Michelle
570
+ might
571
+ Miguel
572
+ mile
573
+ more
574
+ moreover
575
+ Morgan
576
+ most
577
+ mostly
578
+ moving
579
+ much
580
+ must
581
+ my
582
+ myself
583
+ n
584
+ name
585
+ namely
586
+ Natalie
587
+ Nathan
588
+ Nathaniel
589
+ naturally
590
+ nd
591
+ near
592
+ nearly
593
+ necessary
594
+ need
595
+ needed
596
+ needs
597
+ neither
598
+ Nevaeh
599
+ never
600
+ nevertheless
601
+ new
602
+ next
603
+ Nicholas
604
+ Nicole
605
+ nine
606
+ no
607
+ Noah
608
+ nobody
609
+ non
610
+ none
611
+ noone
612
+ nor
613
+ normally
614
+ not
615
+ notably
616
+ nothing
617
+ nov
618
+ novel
619
+ november
620
+ now
621
+ nowhere
622
+ o
623
+ Obie
624
+ obviously
625
+ oct
626
+ october
627
+ of
628
+ off
629
+ official
630
+ often
631
+ oh
632
+ ok
633
+ okay
634
+ old
635
+ Olivia
636
+ on
637
+ once
638
+ one
639
+ ones
640
+ online
641
+ only
642
+ onto
643
+ open
644
+ or
645
+ org
646
+ oriented
647
+ Oscar
648
+ others
649
+ otherwise
650
+ ought
651
+ our
652
+ ours
653
+ ourselves
654
+ out
655
+ overall
656
+ Owen
657
+ own
658
+ p
659
+ Paige
660
+ par
661
+ Parker
662
+ part
663
+ particular
664
+ particularly
665
+ Patrick
666
+ Paul
667
+ peasy
668
+ per
669
+ perhaps
670
+ piece
671
+ placed
672
+ play
673
+ please
674
+ plus
675
+ possible
676
+ posts
677
+ pre
678
+ preferences
679
+ presumably
680
+ pretty
681
+ probably
682
+ product
683
+ products
684
+ proud
685
+ provide
686
+ provides
687
+ put
688
+ q
689
+ que
690
+ quite
691
+ qv
692
+ r
693
+ Rachel
694
+ rather
695
+ rd
696
+ re
697
+ reached
698
+ read
699
+ real
700
+ really
701
+ reasonably
702
+ Rebecca
703
+ recently
704
+ regarding
705
+ regardless
706
+ regards
707
+ related
708
+ relatively
709
+ replaced
710
+ requirements
711
+ respectively
712
+ Richard
713
+ right
714
+ Riley
715
+ Robert
716
+ run
717
+ Ryan
718
+ s
719
+ safest
720
+ said
721
+ Samantha
722
+ same
723
+ Samuel
724
+ Sara
725
+ Sarah
726
+ Savannah
727
+ saw
728
+ say
729
+ saying
730
+ says
731
+ Sean
732
+ Sebastian
733
+ second
734
+ secondly
735
+ seconds
736
+ see
737
+ seeing
738
+ seem
739
+ seemed
740
+ seeming
741
+ seems
742
+ seen
743
+ self
744
+ selves
745
+ sensible
746
+ sent
747
+ sep
748
+ september
749
+ serious
750
+ seriously
751
+ set
752
+ settings
753
+ seven
754
+ several
755
+ shall
756
+ she
757
+ shit
758
+ shot
759
+ should
760
+ shouldn't
761
+ Sierra
762
+ simpler
763
+ simply
764
+ since
765
+ site
766
+ six
767
+ size
768
+ so
769
+ Sofia
770
+ solid
771
+ some
772
+ somebody
773
+ somehow
774
+ someone
775
+ something
776
+ sometime
777
+ sometimes
778
+ somewhat
779
+ somewhere
780
+ soon
781
+ Sophia
782
+ sorry
783
+ sounding
784
+ specified
785
+ specify
786
+ specifying
787
+ spoke
788
+ spread
789
+ sr
790
+ stand
791
+ started
792
+ step
793
+ Stephanie
794
+ Steven
795
+ still
796
+ stuff
797
+ sub
798
+ subscribe
799
+ such
800
+ suck
801
+ suite
802
+ sup
803
+ sur
804
+ sure
805
+ Sydney
806
+ t
807
+ t's
808
+ take
809
+ taken
810
+ Tanner
811
+ tat
812
+ Taylor
813
+ team
814
+ tedious
815
+ tell
816
+ tends
817
+ th
818
+ than
819
+ thank
820
+ thanks
821
+ thanx
822
+ that
823
+ that's
824
+ thats
825
+ the
826
+ their
827
+ theirs
828
+ them
829
+ themselves
830
+ then
831
+ thence
832
+ there
833
+ there's
834
+ thereafter
835
+ thereby
836
+ therefore
837
+ therein
838
+ theres
839
+ thereupon
840
+ these
841
+ they
842
+ they'd
843
+ they'll
844
+ they're
845
+ they've
846
+ thing
847
+ things
848
+ think
849
+ third
850
+ this
851
+ Thomas
852
+ thomas
853
+ thorough
854
+ thoroughly
855
+ those
856
+ though
857
+ three
858
+ through
859
+ throughout
860
+ thru
861
+ thus
862
+ Timothy
863
+ tit
864
+ to
865
+ today
866
+ together
867
+ told
868
+ too
869
+ took
870
+ toward
871
+ towards
872
+ Trevor
873
+ tried
874
+ tries
875
+ Trinity
876
+ Tristan
877
+ truly
878
+ try
879
+ trying
880
+ turn
881
+ turns
882
+ twice
883
+ two
884
+ Tyler
885
+ typically
886
+ u
887
+ ultra
888
+ un
889
+ unfortunately
890
+ unlikely
891
+ unsurprisingly
892
+ until
893
+ unto
894
+ up
895
+ upon
896
+ us
897
+ use
898
+ used
899
+ useful
900
+ uses
901
+ using
902
+ usually
903
+ uucp
904
+ v
905
+ value
906
+ Vanessa
907
+ various
908
+ very
909
+ via
910
+ Victor
911
+ Victoria
912
+ Vincent
913
+ viz
914
+ vs
915
+ w
916
+ walks
917
+ want
918
+ wants
919
+ was
920
+ wasn't
921
+ way
922
+ we
923
+ we'd
924
+ we'll
925
+ we're
926
+ we've
927
+ week
928
+ weekly
929
+ welcome
930
+ well
931
+ went
932
+ were
933
+ weren't
934
+ what
935
+ what's
936
+ whatever
937
+ when
938
+ whence
939
+ whenever
940
+ where
941
+ where's
942
+ whereafter
943
+ whereas
944
+ whereby
945
+ wherein
946
+ whereupon
947
+ wherever
948
+ whether
949
+ which
950
+ while
951
+ whither
952
+ who
953
+ who's
954
+ whoever
955
+ whole
956
+ whom
957
+ whose
958
+ why
959
+ will
960
+ William
961
+ willing
962
+ win
963
+ wish
964
+ with
965
+ within
966
+ without
967
+ won't
968
+ wonder
969
+ works
970
+ world
971
+ would
972
+ wouldn't
973
+ wrapped
974
+ Wyatt
975
+ Xavier
976
+ y
977
+ yeah
978
+ yes
979
+ yet
980
+ you
981
+ you'd
982
+ you'll
983
+ you're
984
+ you've
985
+ your
986
+ yours
987
+ yourself
988
+ yourselves
989
+ z
990
+ Zachary
991
+ zero
992
+ Zoe
993
+ 0
994
+ 1
995
+ 2
996
+ 3
997
+ 4
998
+ 5
999
+ 6
1000
+ 7
1001
+ 8
1002
+ 9