nddrylliog_pismo 0.7.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +29 -0
- data/Gemfile +4 -0
- data/LICENSE +23 -0
- data/NOTICE +4 -0
- data/README.markdown +131 -0
- data/Rakefile +72 -0
- data/bin/pismo +45 -0
- data/lib/pismo.rb +82 -0
- data/lib/pismo/document.rb +67 -0
- data/lib/pismo/external_attributes.rb +14 -0
- data/lib/pismo/internal_attributes.rb +316 -0
- data/lib/pismo/reader.rb +19 -0
- data/lib/pismo/reader/base.rb +259 -0
- data/lib/pismo/reader/cluster.rb +171 -0
- data/lib/pismo/reader/tree.rb +154 -0
- data/lib/pismo/stopwords.txt +1002 -0
- data/lib/pismo/version.rb +3 -0
- data/pismo.gemspec +30 -0
- data/test/corpus/bbcnews.html +2131 -0
- data/test/corpus/bbcnews2.html +1575 -0
- data/test/corpus/briancray.html +269 -0
- data/test/corpus/cant_read.html +426 -0
- data/test/corpus/factor.html +1362 -0
- data/test/corpus/gmane.html +138 -0
- data/test/corpus/huffington.html +2932 -0
- data/test/corpus/metadata_expected.yaml +72 -0
- data/test/corpus/metadata_expected.yaml.old +122 -0
- data/test/corpus/queness.html +919 -0
- data/test/corpus/reader_expected.yaml +39 -0
- data/test/corpus/readers/cluster_expected.yaml +45 -0
- data/test/corpus/readers/tree_expected.yaml +55 -0
- data/test/corpus/rubyinside.html +318 -0
- data/test/corpus/rww.html +1351 -0
- data/test/corpus/spolsky.html +298 -0
- data/test/corpus/techcrunch.html +1285 -0
- data/test/corpus/tweet.html +360 -0
- data/test/corpus/youtube.html +2348 -0
- data/test/corpus/zefrank.html +535 -0
- data/test/helper.rb +15 -0
- data/test/test_corpus.rb +54 -0
- data/test/test_pismo_document.rb +34 -0
- metadata +156 -0
@@ -0,0 +1,171 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Pismo
|
4
|
+
module Reader
|
5
|
+
class Cluster < Base
|
6
|
+
|
7
|
+
# Adapted from : http://rubyforge.org/projects/extractcontent/
|
8
|
+
#
|
9
|
+
# Portions of this code are :
|
10
|
+
# Copyright (c) 2007/2008 Nakatani Shuyo / Cybozu Labs Inc. All rights reserved.
|
11
|
+
#
|
12
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
13
|
+
# a copy of this software and associated documentation files (the
|
14
|
+
# "Software"), to deal in the Software without restriction, including
|
15
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
16
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
17
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
18
|
+
# the following conditions:
|
19
|
+
#
|
20
|
+
# The above copyright notice and this permission notice shall be
|
21
|
+
# included in all copies or substantial portions of the Software.
|
22
|
+
#
|
23
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
24
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
25
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
26
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
27
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
28
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
29
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
30
|
+
|
31
|
+
# Default option parameters
|
32
|
+
# Baseline scoring parameters for the cluster reader. `analyze` copies this
# hash and merges the reader's @options over it, so any key can be overridden.
DEFAULTS = {
  # Minimum score a block needs in order to be kept
  :threshold => 100,
  # Blocks whose cleaned text is shorter than this are not evaluated
  :min_length => 80,
  # Multiplier applied to the position factor after each scored block
  :decay_factor => 0.73,
  # Continuity factor for block score (the larger, the harder to continue)
  :continuous_factor => 1.62,
  # Penalty multiplier applied once per waste expression found in a block
  :no_body_factor => 0.72,
  # Score weight given to each punctuation match
  :punctuation_weight => 10,
  # What counts as punctuation when scoring
  :punctuations => /(\.[^A-Za-z0-9]|,[^0-9]|!|\?)/,
  # Keywords characteristic of footers and other non-content blocks
  :waste_expressions => /Copyright|All Rights Reserved/i,
  # When true, block scoring information is printed to stdout
  :debug => false
}
|
43
|
+
|
44
|
+
# Analyze the structure of the HTML document and score content blocks for likelihood of containing useful content
|
45
|
+
# Scores the document's <div>-delimited blocks and groups consecutive
# high-scoring blocks into candidate content sections.
#
# Reads @raw_content (the HTML string) and @options (merged over DEFAULTS).
# Sets @sections to an array of { :body => String, :score => Numeric } hashes
# and @content_candidates to Nokogiri documents built from those sections,
# highest score first. When no block scores above the threshold both end up
# empty (previously a single empty, zero-score section was always appended
# because the `if body` guard is always true for a String).
def analyze
  # Non-mutating merge: DEFAULTS itself is never modified
  opt = DEFAULTS.merge(@options)

  @sections = []
  factor = continuous = 1.0
  body = ''
  score = 0

  # The content is split into blocks of divs
  list = @raw_content.split(/<\/?(?:div)[^>]*>/)
  list.each do |block|
    next unless block
    block.gsub!(/\n/, '')

    # Ignore blocks that have no text
    next if has_only_tags?(block)

    # Each new block iterated over makes it less likely for it to belong
    # to the existing block
    continuous /= opt[:continuous_factor] if body.length > 0

    # Clean up and strip block of html tags for scoring
    clean = clean_block(block)
    next if clean.length < opt[:min_length]

    # c represents how probable it is for this block to be a content block
    c = (clean.length + clean.scan(opt[:punctuations]).length * opt[:punctuation_weight]) * factor

    # The further down the document we go (i.e. the more blocks we see),
    # the less likely they are to be valid content blocks
    factor *= opt[:decay_factor]

    # The not body rate represents how likely this is to be a junk block
    not_body_rate = block.scan(opt[:waste_expressions]).length

    # The block score is reduced once per waste expression found
    c *= (opt[:no_body_factor] ** not_body_rate) if not_body_rate > 0

    # c1 represents how probable it is for this block to belong to the
    # existing section rather than start a new one
    c1 = c * continuous

    puts "----- #{c}*#{continuous}=#{c1} #{clean.length} \n\n" if opt[:debug]

    if c1 > opt[:threshold]
      # Treat continuous blocks as one cluster
      body += block + "\n"
      score += c1
      continuous = opt[:continuous_factor]
    elsif c > opt[:threshold]
      # The current cluster ends here; this block starts a new one
      @sections << { :body => body, :score => score }
      body = block + "\n"
      score = c
      continuous = opt[:continuous_factor]
    end
    # Blocks that reach neither threshold are dropped
  end

  # Add the section still being accumulated, if anything was collected
  @sections << { :body => body, :score => score } unless body.empty?

  # Sort the sections by score
  sorted_sections = @sections.sort_by { |section| section[:score] }

  # Convert to nokogiri representation for compatibility with the content method
  @content_candidates = sorted_sections.reverse.map { |section| Nokogiri::HTML(section[:body], nil, 'utf-8') }
end
|
116
|
+
|
117
|
+
# Looks up one of the candidates stored in @content_candidates.
#
# index - position in the candidate list.
#
# Returns whatever is stored at that position (nil when out of range).
def content_at(index)
  candidates = @content_candidates
  candidates[index]
end
|
120
|
+
|
121
|
+
protected
|
122
|
+
|
123
|
+
# Checks if the given block has only tags without text.
|
124
|
+
# Tells whether an HTML fragment is markup only.
#
# block - String of HTML.
#
# Returns true when nothing but whitespace remains after every <...> tag
# has been removed.
def has_only_tags?(block)
  without_tags = block.gsub(/<[^>]*>/im, '')
  without_tags.strip.length == 0
end
|
127
|
+
|
128
|
+
# Eliminates link heavy blocks and blocks that are lists of links and
|
129
|
+
# then returns block stripped of tags
|
130
|
+
# Produces the tag-free text of a block, or an empty string when the block
# is judged to be navigation rather than content.
#
# block - String of HTML.
#
# Returns "" when is_link_list? accepts the block, or when the text left after
# removing <a>...</a> and <form>...</form> elements is shorter than 20
# characters per removed link; otherwise the remaining markup run through
# strip_tags.
def clean_block(block)
  return "" if is_link_list?(block)

  # Remove every anchor element, counting them as we go
  link_total = 0
  without_anchors = block.gsub(/<a\s[^>]*>.*?<\/a\s*>/im) do
    link_total += 1
    ''
  end
  remainder = without_anchors.gsub(/<form\s[^>]*>.*?<\/form\s*>/im, '')

  # Very link heavy block: too little text left per link
  if remainder.length < 20 * link_total
    ""
  else
    strip_tags(remainder)
  end
end
|
141
|
+
|
142
|
+
# Determines whether a block is a link list or not
|
143
|
+
# Decides whether an HTML fragment is essentially a list of links.
#
# Finds the first <ul>/<dl>/<ol> element, measures how much plain text lies
# outside list markup, and compares that with the fragment length scaled by
# how link-heavy the <li> items are (see evaluate_list).
#
# st - String of HTML.
#
# Returns true when the fragment is dominated by a link list, false otherwise
# (including when it contains no list at all).
def is_link_list?(st)
  list_pattern = /<(?:ul|dl|ol)(.+?)<\/(?:ul|dl|ol)>/im
  listpart = st[list_pattern, 1]
  return false unless listpart

  # Text remaining once list elements and all other tags are removed. The same
  # pattern is used for detection and removal so <ol> lists are excluded too;
  # previously only <ul>/<dl> were stripped here and an <ol>'s own text was
  # counted as "outside" text.
  outside = st.gsub(list_pattern, '').gsub(/<.+?>/m, '').gsub(/\s+/, ' ')

  items = listpart.split(/<li[^>]*>/)
  items.shift # discard whatever precedes the first <li>
  rate = evaluate_list(items)

  # rate is between 1 and 10; a higher rate tolerates more outside text
  outside.length <= st.length / (45 / rate)
end

# Estimates how link-heavy a list is.
#
# list - Array of Strings, one per list item.
#
# Returns 1 for an empty list, otherwise a Float from 1.0 (no item contains
# an <a href=...>) to 10.0 (every item does), growing with the square of the
# linked-item ratio.
def evaluate_list(list)
  return 1 if list.empty?

  hits = list.count { |line| line =~ /<a\s+href=(['"]?)([^"'\s]+)\1/im }
  9 * (hits.to_f / list.length)**2 + 1
end
|
163
|
+
|
164
|
+
# Removes all html tags and attributes from html
|
165
|
+
# Reduces HTML to text by running it through Sanitize with no elements and no
# attributes allowed, then handing the result to the `strip` helper
# (defined outside this block; presumably whitespace clean-up).
#
# html - String of HTML.
#
# Returns whatever `strip` returns for the sanitized text.
def strip_tags(html)
  sanitized = Sanitize.clean(html, :elements => [], :attributes => [])
  strip(sanitized)
end
|
168
|
+
|
169
|
+
end
|
170
|
+
end
|
171
|
+
end
|
@@ -0,0 +1,154 @@
|
|
1
|
+
module Pismo
|
2
|
+
module Reader
|
3
|
+
class Tree < Base
|
4
|
+
|
5
|
+
# Analyze the structure of the HTML document and score branches for likelihood of containing useful content
|
6
|
+
# Scores every element that could hold the full content and keeps the best.
#
# For each element matched by COULD_CONTAIN_FULL_CONTENT with at least 80
# characters of text, builds a "branch" hash (ids, word counts, child counts,
# score, ...) and stores it in @tree keyed by the element's path. The score
# combines GOOD_WORDS/BAD_WORDS hits in the id/class names of the element and
# its ancestors, word statistics of its text nodes, comma count, depth and the
# share of descendants with "bad" id/class names.
#
# Sets @tree and @content_candidates (up to five [path, branch] pairs, highest
# score first).
def analyze
  @tree = {}
  subels = {} # per-node measurements, keyed by node path
  unweighted_parents = %w[a h1 h2 h3 h4 h5 h6] # text under these adds no weighted words

  @doc.css(COULD_CONTAIN_FULL_CONTENT.join(", ")).each do |el|
    # Assume that no content we'll want comes in a total package of fewer than 80 characters!
    next unless el.text.to_s.strip.length >= 80

    # Path segments below the first two; their count is used as the depth
    path_segments = el.path.scan(/[a-z]+/)[2..-1] || []
    depth = path_segments.length

    local_ids = (el['id'].to_s + ' ' + el['class'].to_s).downcase.strip.scan(/[a-z]+/)
    ids = local_ids

    # Collect the id/class words of the ancestors as well (depth - 1 levels up).
    # `+=` builds a new array, so local_ids is left untouched.
    cp = el.parent
    (depth - 1).times do
      ids += (cp['id'].to_s + ' ' + cp['class'].to_s).downcase.strip.scan(/[a-z]+/)
      cp = cp.parent
    end

    branch = {}
    branch[:ids] = ids
    branch[:local_ids] = local_ids
    branch[:score] = -(BAD_WORDS & ids).size
    branch[:score] += ((GOOD_WORDS & ids).size * 2)
    next if branch[:score] < -5

    # Elements that have an ID or class are more likely to be our winners
    branch[:score] += 2 unless local_ids.empty?

    branch[:name] = el.name
    branch[:depth] = depth
    branch[:path] = el.path

    branch[:raw_word_count] = 0
    branch[:word_count] = 0
    branch[:child_count] = 0
    branch[:bad_child_count] = 0
    branch[:score_steps] = []

    el.traverse do |subel|
      path = subel.path
      info = (subels[path] ||= {})
      info[:path_segments] ||= (path.scan(/[a-z]+/)[2..-1] || [])
      info[:is_text] ||= subel.text?

      if info[:is_text]
        info[:text] ||= subel.text.downcase.scan(/[a-z]+/)
        next if info[:text].empty?

        info[:raw_word_count] ||= info[:text].size
        # Only words longer than three letters count, and none at all when the
        # text sits directly inside a link or heading
        info[:word_count] ||= (unweighted_parents.include?(subel.parent.name) ? 0 : info[:text].select { |word| word.length > 3 }.size)
        info[:meta_matches] ||= (info[:text] & META_WORDS).size

        branch[:raw_word_count] += info[:raw_word_count]
        branch[:word_count] += info[:word_count] - info[:meta_matches]
      end

      # NOTE(review): gsub runs before downcase here, so uppercase letters are
      # turned into spaces rather than lowercased (unlike local_ids above).
      # Left as is because changing it would alter scoring; confirm intent.
      info[:ids] ||= (subel['id'].to_s + ' ' + subel['class'].to_s).gsub(/[^a-z]/, ' ').downcase.strip.split(/\s+/)
      info[:bad_child_count_inc] = (BAD_WORDS & info[:ids]).size - (GOOD_WORDS & info[:ids]).size
      info[:child_count_inc] = info[:ids].empty? ? 0 : 1

      branch[:bad_child_count] += info[:bad_child_count_inc]
      branch[:child_count] += info[:child_count_inc]
    end

    comma_count = el.text.scan(/\,\s/).size

    branch[:score] += 2 if branch[:name] == "div"
    branch[:score] += 4 if comma_count > 10
    branch[:score_steps] << "lots of commas!" if comma_count > 5
    branch[:score] *= 3

    branch[:score] *= 0.7 if el.children && el.children.size < 3
    branch[:score] *= 1.25 if branch[:raw_word_count] > 10
    next if branch[:raw_word_count] < 10
    branch[:score] += [branch[:word_count], 1].max ** 0.5

    word_child_count_ratio = branch[:word_count].to_f / [branch[:child_count], 1].max
    branch[:word_child_count_ratio] = word_child_count_ratio

    if branch[:raw_word_count] > 100
      good_word_ratio = branch[:word_count].to_f / branch[:raw_word_count]
      branch[:score] += good_word_ratio * 12

      if word_child_count_ratio > 50
        branch[:score] *= 1.5
      elsif word_child_count_ratio > 30
        branch[:score] *= 1.2
      elsif word_child_count_ratio > 15
        branch[:score] *= 1.1
      elsif word_child_count_ratio < 4
        branch[:score] *= 0.9
      end
    end

    branch[:score_steps] << "s1: #{branch[:score]}"

    # The penalties below are cumulative: a ratio above 0.7 triggers all five
    bad_child_ratio = branch[:bad_child_count].to_f / [branch[:child_count], 1].max
    branch[:bad_child_ratio] = bad_child_ratio
    branch[:score] += 3 if bad_child_ratio < 0.0
    branch[:score] -= 3 if bad_child_ratio > 0.15
    branch[:score] -= 2 if bad_child_ratio > 0.25
    branch[:score] -= 2 if bad_child_ratio > 0.4
    branch[:score] -= 4 if bad_child_ratio > 0.5
    branch[:score] -= 5 if bad_child_ratio > 0.7
    branch[:score] -= 5 if branch[:bad_child_count] > 20

    branch[:score] += depth
    branch[:score] *= 0.8 if ids.length > 10

    @tree[el.path] = branch
  end

  sorted_tree = @tree.sort_by { |_path, branch| branch[:score] }

  # Keep the five best branches, highest score first (first(5) already copes
  # with fewer than five entries)
  @content_candidates = sorted_tree.reverse.first(5)
end
|
147
|
+
|
148
|
+
# Fetches the document node for one of the scored candidates.
#
# @content_candidates holds [path, branch] pairs (see analyze); the path of
# the requested pair is looked up in @doc.
#
# index - position in the candidate list.
#
# Returns the result of @doc.at for that path, or nil when there is no
# candidate at that position (previously this raised NoMethodError on nil).
def content_at(index)
  candidate = @content_candidates[index]
  return nil unless candidate

  @doc.at(candidate.first)
end
|
151
|
+
|
152
|
+
end
|
153
|
+
end
|
154
|
+
end
|
@@ -0,0 +1,1002 @@
|
|
1
|
+
a
|
2
|
+
a's
|
3
|
+
Aaliyah
|
4
|
+
Aaron
|
5
|
+
Abigail
|
6
|
+
ability
|
7
|
+
able
|
8
|
+
about
|
9
|
+
above
|
10
|
+
according
|
11
|
+
accordingly
|
12
|
+
across
|
13
|
+
actually
|
14
|
+
Adam
|
15
|
+
Addison
|
16
|
+
Adrian
|
17
|
+
after
|
18
|
+
afterwards
|
19
|
+
again
|
20
|
+
against
|
21
|
+
ago
|
22
|
+
Aidan
|
23
|
+
Aiden
|
24
|
+
ain't
|
25
|
+
al
|
26
|
+
Alejandro
|
27
|
+
Alex
|
28
|
+
Alexa
|
29
|
+
Alexander
|
30
|
+
Alexandra
|
31
|
+
Alexis
|
32
|
+
all
|
33
|
+
Allison
|
34
|
+
allow
|
35
|
+
allowed
|
36
|
+
allowing
|
37
|
+
allows
|
38
|
+
almost
|
39
|
+
alone
|
40
|
+
along
|
41
|
+
alongside
|
42
|
+
already
|
43
|
+
also
|
44
|
+
although
|
45
|
+
always
|
46
|
+
Alyssa
|
47
|
+
am
|
48
|
+
Amanda
|
49
|
+
Amber
|
50
|
+
among
|
51
|
+
amongst
|
52
|
+
an
|
53
|
+
and
|
54
|
+
Andrea
|
55
|
+
Andrew
|
56
|
+
Angel
|
57
|
+
Angelina
|
58
|
+
Anna
|
59
|
+
annual
|
60
|
+
another
|
61
|
+
Anthony
|
62
|
+
Antonio
|
63
|
+
anybody
|
64
|
+
anyhow
|
65
|
+
anyone
|
66
|
+
anything
|
67
|
+
anyway
|
68
|
+
anyways
|
69
|
+
anywhere
|
70
|
+
apart
|
71
|
+
appear
|
72
|
+
appreciate
|
73
|
+
appropriate
|
74
|
+
approximate
|
75
|
+
approximately
|
76
|
+
apr
|
77
|
+
april
|
78
|
+
are
|
79
|
+
aren't
|
80
|
+
Ariana
|
81
|
+
Arianna
|
82
|
+
around
|
83
|
+
articles
|
84
|
+
as
|
85
|
+
Ashley
|
86
|
+
Ashton
|
87
|
+
aside
|
88
|
+
ask
|
89
|
+
asking
|
90
|
+
asshole
|
91
|
+
associated
|
92
|
+
at
|
93
|
+
Audrey
|
94
|
+
aug
|
95
|
+
august
|
96
|
+
Austin
|
97
|
+
Autumn
|
98
|
+
Ava
|
99
|
+
available
|
100
|
+
Avery
|
101
|
+
away
|
102
|
+
awesome
|
103
|
+
awfully
|
104
|
+
Bailey
|
105
|
+
based
|
106
|
+
basically
|
107
|
+
be
|
108
|
+
became
|
109
|
+
because
|
110
|
+
become
|
111
|
+
becomes
|
112
|
+
becoming
|
113
|
+
been
|
114
|
+
beforehand
|
115
|
+
behind
|
116
|
+
being
|
117
|
+
believe
|
118
|
+
below
|
119
|
+
benefit
|
120
|
+
Benjamin
|
121
|
+
beside
|
122
|
+
besides
|
123
|
+
best
|
124
|
+
better
|
125
|
+
beyond
|
126
|
+
big
|
127
|
+
biggest
|
128
|
+
Blake
|
129
|
+
both
|
130
|
+
bother
|
131
|
+
Brady
|
132
|
+
Brandon
|
133
|
+
Brayden
|
134
|
+
Brian
|
135
|
+
Brianna
|
136
|
+
brief
|
137
|
+
bring
|
138
|
+
brings
|
139
|
+
Brooke
|
140
|
+
Brooklyn
|
141
|
+
Bryan
|
142
|
+
Bryce
|
143
|
+
but
|
144
|
+
by
|
145
|
+
c'mon
|
146
|
+
c's
|
147
|
+
Caden
|
148
|
+
Caleb
|
149
|
+
called
|
150
|
+
came
|
151
|
+
Cameron
|
152
|
+
can
|
153
|
+
can't
|
154
|
+
cancel
|
155
|
+
cannot
|
156
|
+
cant
|
157
|
+
carefully
|
158
|
+
Carlos
|
159
|
+
Caroline
|
160
|
+
Carson
|
161
|
+
Carter
|
162
|
+
casually
|
163
|
+
cause
|
164
|
+
causes
|
165
|
+
certain
|
166
|
+
certainly
|
167
|
+
changes
|
168
|
+
Charles
|
169
|
+
Chase
|
170
|
+
check
|
171
|
+
Chloe
|
172
|
+
Christian
|
173
|
+
Christopher
|
174
|
+
Claire
|
175
|
+
clearly
|
176
|
+
co
|
177
|
+
Cody
|
178
|
+
Cole
|
179
|
+
Colin
|
180
|
+
Colton
|
181
|
+
come
|
182
|
+
comes
|
183
|
+
coming
|
184
|
+
comment
|
185
|
+
company
|
186
|
+
compelling
|
187
|
+
concerning
|
188
|
+
congratulations
|
189
|
+
Connor
|
190
|
+
consequently
|
191
|
+
consider
|
192
|
+
considering
|
193
|
+
contain
|
194
|
+
containing
|
195
|
+
contains
|
196
|
+
continued
|
197
|
+
Cooper
|
198
|
+
corresponding
|
199
|
+
could
|
200
|
+
couldn't
|
201
|
+
country
|
202
|
+
course
|
203
|
+
covered
|
204
|
+
covering
|
205
|
+
cunt
|
206
|
+
currently
|
207
|
+
customizable
|
208
|
+
damn
|
209
|
+
Daniel
|
210
|
+
Danielle
|
211
|
+
dave
|
212
|
+
David
|
213
|
+
david
|
214
|
+
de
|
215
|
+
dead
|
216
|
+
dec
|
217
|
+
decade
|
218
|
+
december
|
219
|
+
definitely
|
220
|
+
definitive
|
221
|
+
described
|
222
|
+
despite
|
223
|
+
Destiny
|
224
|
+
Devin
|
225
|
+
did
|
226
|
+
didn't
|
227
|
+
Diego
|
228
|
+
different
|
229
|
+
direct
|
230
|
+
discuss
|
231
|
+
do
|
232
|
+
does
|
233
|
+
doesn
|
234
|
+
doesn't
|
235
|
+
doing
|
236
|
+
Dominic
|
237
|
+
don't
|
238
|
+
done
|
239
|
+
down
|
240
|
+
downwards
|
241
|
+
driven
|
242
|
+
drove
|
243
|
+
during
|
244
|
+
Dylan
|
245
|
+
e
|
246
|
+
each
|
247
|
+
easier
|
248
|
+
edu
|
249
|
+
Eduardo
|
250
|
+
Edward
|
251
|
+
eg
|
252
|
+
eight
|
253
|
+
either
|
254
|
+
Elijah
|
255
|
+
Elizabeth
|
256
|
+
Ella
|
257
|
+
else
|
258
|
+
elsewhere
|
259
|
+
Emily
|
260
|
+
Emma
|
261
|
+
end
|
262
|
+
english
|
263
|
+
enough
|
264
|
+
entirely
|
265
|
+
Eric
|
266
|
+
Erin
|
267
|
+
es
|
268
|
+
especially
|
269
|
+
et
|
270
|
+
etc
|
271
|
+
Ethan
|
272
|
+
Evan
|
273
|
+
Evelyn
|
274
|
+
even
|
275
|
+
eventually
|
276
|
+
ever
|
277
|
+
every
|
278
|
+
everybody
|
279
|
+
everyone
|
280
|
+
everything
|
281
|
+
everywhere
|
282
|
+
ex
|
283
|
+
exactly
|
284
|
+
example
|
285
|
+
except
|
286
|
+
existing
|
287
|
+
extensive
|
288
|
+
extra
|
289
|
+
extremely
|
290
|
+
f
|
291
|
+
Faith
|
292
|
+
false
|
293
|
+
fame
|
294
|
+
far
|
295
|
+
feb
|
296
|
+
february
|
297
|
+
feel
|
298
|
+
feeling
|
299
|
+
few
|
300
|
+
fifth
|
301
|
+
finally
|
302
|
+
fine
|
303
|
+
first
|
304
|
+
five
|
305
|
+
followed
|
306
|
+
following
|
307
|
+
follows
|
308
|
+
for
|
309
|
+
former
|
310
|
+
formerly
|
311
|
+
forth
|
312
|
+
found
|
313
|
+
four
|
314
|
+
from
|
315
|
+
fuck
|
316
|
+
full
|
317
|
+
further
|
318
|
+
furthermore
|
319
|
+
g
|
320
|
+
Gabriel
|
321
|
+
Gabriella
|
322
|
+
Gabrielle
|
323
|
+
Garrett
|
324
|
+
gave
|
325
|
+
Gavin
|
326
|
+
generally
|
327
|
+
get
|
328
|
+
gets
|
329
|
+
getting
|
330
|
+
give
|
331
|
+
given
|
332
|
+
gives
|
333
|
+
glory
|
334
|
+
goal
|
335
|
+
goes
|
336
|
+
going
|
337
|
+
gone
|
338
|
+
good
|
339
|
+
got
|
340
|
+
gotten
|
341
|
+
Grace
|
342
|
+
great
|
343
|
+
greetings
|
344
|
+
h
|
345
|
+
had
|
346
|
+
hadn't
|
347
|
+
Hailey
|
348
|
+
Haley
|
349
|
+
Hannah
|
350
|
+
happens
|
351
|
+
hardly
|
352
|
+
has
|
353
|
+
hasn't
|
354
|
+
have
|
355
|
+
haven't
|
356
|
+
having
|
357
|
+
Hayden
|
358
|
+
he
|
359
|
+
he's
|
360
|
+
hello
|
361
|
+
help
|
362
|
+
hence
|
363
|
+
Henry
|
364
|
+
her
|
365
|
+
here
|
366
|
+
here's
|
367
|
+
hereafter
|
368
|
+
hereby
|
369
|
+
herein
|
370
|
+
hereupon
|
371
|
+
hers
|
372
|
+
herself
|
373
|
+
hi
|
374
|
+
high
|
375
|
+
highly
|
376
|
+
him
|
377
|
+
himself
|
378
|
+
hire
|
379
|
+
his
|
380
|
+
hither
|
381
|
+
hopefully
|
382
|
+
how
|
383
|
+
howbeit
|
384
|
+
however
|
385
|
+
huge
|
386
|
+
Hunter
|
387
|
+
i
|
388
|
+
i'd
|
389
|
+
i'll
|
390
|
+
i'm
|
391
|
+
i've
|
392
|
+
Ian
|
393
|
+
ie
|
394
|
+
if
|
395
|
+
ignored
|
396
|
+
imagine
|
397
|
+
immediate
|
398
|
+
implement
|
399
|
+
important
|
400
|
+
impromptu
|
401
|
+
in
|
402
|
+
inasmuch
|
403
|
+
inc
|
404
|
+
indeed
|
405
|
+
indicate
|
406
|
+
indicated
|
407
|
+
indicates
|
408
|
+
informative
|
409
|
+
inhibits
|
410
|
+
inner
|
411
|
+
insofar
|
412
|
+
instead
|
413
|
+
interest
|
414
|
+
interesting
|
415
|
+
into
|
416
|
+
inward
|
417
|
+
is
|
418
|
+
Isaac
|
419
|
+
Isabel
|
420
|
+
Isabella
|
421
|
+
Isaiah
|
422
|
+
isn
|
423
|
+
isn't
|
424
|
+
it
|
425
|
+
it'd
|
426
|
+
it'll
|
427
|
+
it's
|
428
|
+
its
|
429
|
+
itself
|
430
|
+
Ivan
|
431
|
+
j
|
432
|
+
Jack
|
433
|
+
Jackson
|
434
|
+
Jacob
|
435
|
+
Jada
|
436
|
+
Jaden
|
437
|
+
Jake
|
438
|
+
James
|
439
|
+
jan
|
440
|
+
january
|
441
|
+
Jared
|
442
|
+
Jasmine
|
443
|
+
Jason
|
444
|
+
Jayden
|
445
|
+
Jenna
|
446
|
+
Jennifer
|
447
|
+
Jeremiah
|
448
|
+
Jeremy
|
449
|
+
Jesse
|
450
|
+
Jessica
|
451
|
+
Jesus
|
452
|
+
jim
|
453
|
+
jimmy
|
454
|
+
jnr
|
455
|
+
Jocelyn
|
456
|
+
Joel
|
457
|
+
John
|
458
|
+
Jonathan
|
459
|
+
Jordan
|
460
|
+
Jorge
|
461
|
+
Jose
|
462
|
+
Joseph
|
463
|
+
Joshua
|
464
|
+
Josiah
|
465
|
+
jr
|
466
|
+
Juan
|
467
|
+
jul
|
468
|
+
Julia
|
469
|
+
Julian
|
470
|
+
july
|
471
|
+
jun
|
472
|
+
june
|
473
|
+
just
|
474
|
+
Justin
|
475
|
+
k
|
476
|
+
Kaden
|
477
|
+
Kaitlyn
|
478
|
+
Kaleb
|
479
|
+
Katelyn
|
480
|
+
Katherine
|
481
|
+
Kayla
|
482
|
+
Kaylee
|
483
|
+
keep
|
484
|
+
keeps
|
485
|
+
Kenneth
|
486
|
+
kept
|
487
|
+
Kevin
|
488
|
+
key
|
489
|
+
kid
|
490
|
+
Kimberly
|
491
|
+
know
|
492
|
+
known
|
493
|
+
knows
|
494
|
+
Kyle
|
495
|
+
Kylie
|
496
|
+
l
|
497
|
+
la
|
498
|
+
Landon
|
499
|
+
last
|
500
|
+
lately
|
501
|
+
later
|
502
|
+
latter
|
503
|
+
latterly
|
504
|
+
Lauren
|
505
|
+
le
|
506
|
+
Leah
|
507
|
+
least
|
508
|
+
les
|
509
|
+
less
|
510
|
+
lest
|
511
|
+
let
|
512
|
+
let's
|
513
|
+
levels
|
514
|
+
Liam
|
515
|
+
like
|
516
|
+
liked
|
517
|
+
likely
|
518
|
+
Lillian
|
519
|
+
Lily
|
520
|
+
line
|
521
|
+
listing
|
522
|
+
listings
|
523
|
+
little
|
524
|
+
Logan
|
525
|
+
look
|
526
|
+
looking
|
527
|
+
looks
|
528
|
+
lot
|
529
|
+
lots
|
530
|
+
love
|
531
|
+
low
|
532
|
+
ltd
|
533
|
+
Lucas
|
534
|
+
Luis
|
535
|
+
Luke
|
536
|
+
m
|
537
|
+
Mackenzie
|
538
|
+
Madeline
|
539
|
+
Madison
|
540
|
+
mainly
|
541
|
+
Makayla
|
542
|
+
many
|
543
|
+
mar
|
544
|
+
march
|
545
|
+
Marcus
|
546
|
+
Maria
|
547
|
+
Mariah
|
548
|
+
Marissa
|
549
|
+
Mark
|
550
|
+
Mary
|
551
|
+
Mason
|
552
|
+
Matthew
|
553
|
+
maturity
|
554
|
+
may
|
555
|
+
Maya
|
556
|
+
maybe
|
557
|
+
me
|
558
|
+
mean
|
559
|
+
means
|
560
|
+
meant
|
561
|
+
meanwhile
|
562
|
+
Megan
|
563
|
+
Melanie
|
564
|
+
member
|
565
|
+
mentioned
|
566
|
+
merely
|
567
|
+
Mia
|
568
|
+
Michael
|
569
|
+
Michelle
|
570
|
+
might
|
571
|
+
Miguel
|
572
|
+
mile
|
573
|
+
more
|
574
|
+
moreover
|
575
|
+
Morgan
|
576
|
+
most
|
577
|
+
mostly
|
578
|
+
moving
|
579
|
+
much
|
580
|
+
must
|
581
|
+
my
|
582
|
+
myself
|
583
|
+
n
|
584
|
+
name
|
585
|
+
namely
|
586
|
+
Natalie
|
587
|
+
Nathan
|
588
|
+
Nathaniel
|
589
|
+
naturally
|
590
|
+
nd
|
591
|
+
near
|
592
|
+
nearly
|
593
|
+
necessary
|
594
|
+
need
|
595
|
+
needed
|
596
|
+
needs
|
597
|
+
neither
|
598
|
+
Nevaeh
|
599
|
+
never
|
600
|
+
nevertheless
|
601
|
+
new
|
602
|
+
next
|
603
|
+
Nicholas
|
604
|
+
Nicole
|
605
|
+
nine
|
606
|
+
no
|
607
|
+
Noah
|
608
|
+
nobody
|
609
|
+
non
|
610
|
+
none
|
611
|
+
noone
|
612
|
+
nor
|
613
|
+
normally
|
614
|
+
not
|
615
|
+
notably
|
616
|
+
nothing
|
617
|
+
nov
|
618
|
+
novel
|
619
|
+
november
|
620
|
+
now
|
621
|
+
nowhere
|
622
|
+
o
|
623
|
+
Obie
|
624
|
+
obviously
|
625
|
+
oct
|
626
|
+
october
|
627
|
+
of
|
628
|
+
off
|
629
|
+
official
|
630
|
+
often
|
631
|
+
oh
|
632
|
+
ok
|
633
|
+
okay
|
634
|
+
old
|
635
|
+
Olivia
|
636
|
+
on
|
637
|
+
once
|
638
|
+
one
|
639
|
+
ones
|
640
|
+
online
|
641
|
+
only
|
642
|
+
onto
|
643
|
+
open
|
644
|
+
or
|
645
|
+
org
|
646
|
+
oriented
|
647
|
+
Oscar
|
648
|
+
others
|
649
|
+
otherwise
|
650
|
+
ought
|
651
|
+
our
|
652
|
+
ours
|
653
|
+
ourselves
|
654
|
+
out
|
655
|
+
overall
|
656
|
+
Owen
|
657
|
+
own
|
658
|
+
p
|
659
|
+
Paige
|
660
|
+
par
|
661
|
+
Parker
|
662
|
+
part
|
663
|
+
particular
|
664
|
+
particularly
|
665
|
+
Patrick
|
666
|
+
Paul
|
667
|
+
peasy
|
668
|
+
per
|
669
|
+
perhaps
|
670
|
+
piece
|
671
|
+
placed
|
672
|
+
play
|
673
|
+
please
|
674
|
+
plus
|
675
|
+
possible
|
676
|
+
posts
|
677
|
+
pre
|
678
|
+
preferences
|
679
|
+
presumably
|
680
|
+
pretty
|
681
|
+
probably
|
682
|
+
product
|
683
|
+
products
|
684
|
+
proud
|
685
|
+
provide
|
686
|
+
provides
|
687
|
+
put
|
688
|
+
q
|
689
|
+
que
|
690
|
+
quite
|
691
|
+
qv
|
692
|
+
r
|
693
|
+
Rachel
|
694
|
+
rather
|
695
|
+
rd
|
696
|
+
re
|
697
|
+
reached
|
698
|
+
read
|
699
|
+
real
|
700
|
+
really
|
701
|
+
reasonably
|
702
|
+
Rebecca
|
703
|
+
recently
|
704
|
+
regarding
|
705
|
+
regardless
|
706
|
+
regards
|
707
|
+
related
|
708
|
+
relatively
|
709
|
+
replaced
|
710
|
+
requirements
|
711
|
+
respectively
|
712
|
+
Richard
|
713
|
+
right
|
714
|
+
Riley
|
715
|
+
Robert
|
716
|
+
run
|
717
|
+
Ryan
|
718
|
+
s
|
719
|
+
safest
|
720
|
+
said
|
721
|
+
Samantha
|
722
|
+
same
|
723
|
+
Samuel
|
724
|
+
Sara
|
725
|
+
Sarah
|
726
|
+
Savannah
|
727
|
+
saw
|
728
|
+
say
|
729
|
+
saying
|
730
|
+
says
|
731
|
+
Sean
|
732
|
+
Sebastian
|
733
|
+
second
|
734
|
+
secondly
|
735
|
+
seconds
|
736
|
+
see
|
737
|
+
seeing
|
738
|
+
seem
|
739
|
+
seemed
|
740
|
+
seeming
|
741
|
+
seems
|
742
|
+
seen
|
743
|
+
self
|
744
|
+
selves
|
745
|
+
sensible
|
746
|
+
sent
|
747
|
+
sep
|
748
|
+
september
|
749
|
+
serious
|
750
|
+
seriously
|
751
|
+
set
|
752
|
+
settings
|
753
|
+
seven
|
754
|
+
several
|
755
|
+
shall
|
756
|
+
she
|
757
|
+
shit
|
758
|
+
shot
|
759
|
+
should
|
760
|
+
shouldn't
|
761
|
+
Sierra
|
762
|
+
simpler
|
763
|
+
simply
|
764
|
+
since
|
765
|
+
site
|
766
|
+
six
|
767
|
+
size
|
768
|
+
so
|
769
|
+
Sofia
|
770
|
+
solid
|
771
|
+
some
|
772
|
+
somebody
|
773
|
+
somehow
|
774
|
+
someone
|
775
|
+
something
|
776
|
+
sometime
|
777
|
+
sometimes
|
778
|
+
somewhat
|
779
|
+
somewhere
|
780
|
+
soon
|
781
|
+
Sophia
|
782
|
+
sorry
|
783
|
+
sounding
|
784
|
+
specified
|
785
|
+
specify
|
786
|
+
specifying
|
787
|
+
spoke
|
788
|
+
spread
|
789
|
+
sr
|
790
|
+
stand
|
791
|
+
started
|
792
|
+
step
|
793
|
+
Stephanie
|
794
|
+
Steven
|
795
|
+
still
|
796
|
+
stuff
|
797
|
+
sub
|
798
|
+
subscribe
|
799
|
+
such
|
800
|
+
suck
|
801
|
+
suite
|
802
|
+
sup
|
803
|
+
sur
|
804
|
+
sure
|
805
|
+
Sydney
|
806
|
+
t
|
807
|
+
t's
|
808
|
+
take
|
809
|
+
taken
|
810
|
+
Tanner
|
811
|
+
tat
|
812
|
+
Taylor
|
813
|
+
team
|
814
|
+
tedious
|
815
|
+
tell
|
816
|
+
tends
|
817
|
+
th
|
818
|
+
than
|
819
|
+
thank
|
820
|
+
thanks
|
821
|
+
thanx
|
822
|
+
that
|
823
|
+
that's
|
824
|
+
thats
|
825
|
+
the
|
826
|
+
their
|
827
|
+
theirs
|
828
|
+
them
|
829
|
+
themselves
|
830
|
+
then
|
831
|
+
thence
|
832
|
+
there
|
833
|
+
there's
|
834
|
+
thereafter
|
835
|
+
thereby
|
836
|
+
therefore
|
837
|
+
therein
|
838
|
+
theres
|
839
|
+
thereupon
|
840
|
+
these
|
841
|
+
they
|
842
|
+
they'd
|
843
|
+
they'll
|
844
|
+
they're
|
845
|
+
they've
|
846
|
+
thing
|
847
|
+
things
|
848
|
+
think
|
849
|
+
third
|
850
|
+
this
|
851
|
+
Thomas
|
852
|
+
thomas
|
853
|
+
thorough
|
854
|
+
thoroughly
|
855
|
+
those
|
856
|
+
though
|
857
|
+
three
|
858
|
+
through
|
859
|
+
throughout
|
860
|
+
thru
|
861
|
+
thus
|
862
|
+
Timothy
|
863
|
+
tit
|
864
|
+
to
|
865
|
+
today
|
866
|
+
together
|
867
|
+
told
|
868
|
+
too
|
869
|
+
took
|
870
|
+
toward
|
871
|
+
towards
|
872
|
+
Trevor
|
873
|
+
tried
|
874
|
+
tries
|
875
|
+
Trinity
|
876
|
+
Tristan
|
877
|
+
truly
|
878
|
+
try
|
879
|
+
trying
|
880
|
+
turn
|
881
|
+
turns
|
882
|
+
twice
|
883
|
+
two
|
884
|
+
Tyler
|
885
|
+
typically
|
886
|
+
u
|
887
|
+
ultra
|
888
|
+
un
|
889
|
+
unfortunately
|
890
|
+
unlikely
|
891
|
+
unsurprisingly
|
892
|
+
until
|
893
|
+
unto
|
894
|
+
up
|
895
|
+
upon
|
896
|
+
us
|
897
|
+
use
|
898
|
+
used
|
899
|
+
useful
|
900
|
+
uses
|
901
|
+
using
|
902
|
+
usually
|
903
|
+
uucp
|
904
|
+
v
|
905
|
+
value
|
906
|
+
Vanessa
|
907
|
+
various
|
908
|
+
very
|
909
|
+
via
|
910
|
+
Victor
|
911
|
+
Victoria
|
912
|
+
Vincent
|
913
|
+
viz
|
914
|
+
vs
|
915
|
+
w
|
916
|
+
walks
|
917
|
+
want
|
918
|
+
wants
|
919
|
+
was
|
920
|
+
wasn't
|
921
|
+
way
|
922
|
+
we
|
923
|
+
we'd
|
924
|
+
we'll
|
925
|
+
we're
|
926
|
+
we've
|
927
|
+
week
|
928
|
+
weekly
|
929
|
+
welcome
|
930
|
+
well
|
931
|
+
went
|
932
|
+
were
|
933
|
+
weren't
|
934
|
+
what
|
935
|
+
what's
|
936
|
+
whatever
|
937
|
+
when
|
938
|
+
whence
|
939
|
+
whenever
|
940
|
+
where
|
941
|
+
where's
|
942
|
+
whereafter
|
943
|
+
whereas
|
944
|
+
whereby
|
945
|
+
wherein
|
946
|
+
whereupon
|
947
|
+
wherever
|
948
|
+
whether
|
949
|
+
which
|
950
|
+
while
|
951
|
+
whither
|
952
|
+
who
|
953
|
+
who's
|
954
|
+
whoever
|
955
|
+
whole
|
956
|
+
whom
|
957
|
+
whose
|
958
|
+
why
|
959
|
+
will
|
960
|
+
William
|
961
|
+
willing
|
962
|
+
win
|
963
|
+
wish
|
964
|
+
with
|
965
|
+
within
|
966
|
+
without
|
967
|
+
won't
|
968
|
+
wonder
|
969
|
+
works
|
970
|
+
world
|
971
|
+
would
|
972
|
+
wouldn't
|
973
|
+
wrapped
|
974
|
+
Wyatt
|
975
|
+
Xavier
|
976
|
+
y
|
977
|
+
yeah
|
978
|
+
yes
|
979
|
+
yet
|
980
|
+
you
|
981
|
+
you'd
|
982
|
+
you'll
|
983
|
+
you're
|
984
|
+
you've
|
985
|
+
your
|
986
|
+
yours
|
987
|
+
yourself
|
988
|
+
yourselves
|
989
|
+
z
|
990
|
+
Zachary
|
991
|
+
zero
|
992
|
+
Zoe
|
993
|
+
0
|
994
|
+
1
|
995
|
+
2
|
996
|
+
3
|
997
|
+
4
|
998
|
+
5
|
999
|
+
6
|
1000
|
+
7
|
1001
|
+
8
|
1002
|
+
9
|