RubyGems - nddrylliog_pismo - Versions diffs - 0.7.3 - Mend

nddrylliog_pismo 0.7.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43) hide show

data/.document +5 -0
data/.gitignore +29 -0
data/Gemfile +4 -0
data/LICENSE +23 -0
data/NOTICE +4 -0
data/README.markdown +131 -0
data/Rakefile +72 -0
data/bin/pismo +45 -0
data/lib/pismo.rb +82 -0
data/lib/pismo/document.rb +67 -0
data/lib/pismo/external_attributes.rb +14 -0
data/lib/pismo/internal_attributes.rb +316 -0
data/lib/pismo/reader.rb +19 -0
data/lib/pismo/reader/base.rb +259 -0
data/lib/pismo/reader/cluster.rb +171 -0
data/lib/pismo/reader/tree.rb +154 -0
data/lib/pismo/stopwords.txt +1002 -0
data/lib/pismo/version.rb +3 -0
data/pismo.gemspec +30 -0
data/test/corpus/bbcnews.html +2131 -0
data/test/corpus/bbcnews2.html +1575 -0
data/test/corpus/briancray.html +269 -0
data/test/corpus/cant_read.html +426 -0
data/test/corpus/factor.html +1362 -0
data/test/corpus/gmane.html +138 -0
data/test/corpus/huffington.html +2932 -0
data/test/corpus/metadata_expected.yaml +72 -0
data/test/corpus/metadata_expected.yaml.old +122 -0
data/test/corpus/queness.html +919 -0
data/test/corpus/reader_expected.yaml +39 -0
data/test/corpus/readers/cluster_expected.yaml +45 -0
data/test/corpus/readers/tree_expected.yaml +55 -0
data/test/corpus/rubyinside.html +318 -0
data/test/corpus/rww.html +1351 -0
data/test/corpus/spolsky.html +298 -0
data/test/corpus/techcrunch.html +1285 -0
data/test/corpus/tweet.html +360 -0
data/test/corpus/youtube.html +2348 -0
data/test/corpus/zefrank.html +535 -0
data/test/helper.rb +15 -0
data/test/test_corpus.rb +54 -0
data/test/test_pismo_document.rb +34 -0
metadata +156 -0

data/lib/pismo/reader/cluster.rb ADDED

@@ -0,0 +1,171 @@
+# encoding: utf-8
+module Pismo
+  module Reader
+    class Cluster < Base
+      # Adapted from : http://rubyforge.org/projects/extractcontent/
+      #
+      # Portions of this code are :
+      # Copyright (c) 2007/2008 Nakatani Shuyo / Cybozu Labs Inc. All rights reserved.
+      #
+      # Permission is hereby granted, free of charge, to any person obtaining
+      # a copy of this software and associated documentation files (the
+      # "Software"), to deal in the Software without restriction, including
+      # without limitation the rights to use, copy, modify, merge, publish,
+      # distribute, sublicense, and/or sell copies of the Software, and to
+      # permit persons to whom the Software is furnished to do so, subject to
+      # the following conditions:
+      #
+      # The above copyright notice and this permission notice shall be
+      # included in all copies or substantial portions of the Software.
+      #
+      # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+      # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+      # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+      # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+      # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+      # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+      # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+      # Default option parameters
+      DEFAULTS = {
+        :threshold => 100,                                       # threshold for score of the text
+        :min_length => 80,                                       # minimum length of evaluated blocks
+        :decay_factor => 0.73,                                   # decay factor for block score
+        :continuous_factor => 1.62,                              # continuous factor for block score ( the larger, the harder to continue )
+        :no_body_factor => 0.72,                                 # no body factor that reduces block score if waste expressions are present
+        :punctuation_weight => 10,                               # score weight for punctuation
+        :punctuations => /(\.[^A-Za-z0-9]|,[^0-9]|!|\?)/,        # punctuation characters
+        :waste_expressions => /Copyright|All Rights Reserved/i,  # characteristic keywords including footer
+        :debug => false,                                         # if true, output block information to stdout
+      }
+      # Analyze the structure of the HTML document and score content blocks for likelihood of containing useful content
+      def analyze
+        opt = DEFAULTS.clone
+        opt.merge!(@options)
+        @sections = []
+        factor = continuous = 1.0
+        body = ''
+        score = 0
+        # The content is split into blocks of divs
+        list = @raw_content.split(/<\/?(?:div)[^>]*>/)
+        list.each do |block|
+          next unless block
+          block.gsub!(/\n/, '')
+          # Ignore blocks that have no tex
+          next if has_only_tags?(block)
+          # Each new block iterated over makes it less likely for it to belong
+          # to the existing block
+          continuous /= opt[:continuous_factor] if body.length > 0
+          # Clean up and strip block of html tags for scoring
+          clean = clean_block(block)
+          #clean = strip_tags(block)
+          next if clean.length < opt[:min_length]
+          # Calculate scores for clustering of blocks
+          # c represents how probable it is for this block to be a content block
+          c = (clean.length + clean.scan(opt[:punctuations]).length * opt[:punctuation_weight]) * factor
+          # The further down the document we go (i.e. the more blocks we see),
+          # the less likely they are to be valid content blocks
+          factor *= opt[:decay_factor]
+          # The not body rate represents how likely this is to be a junk block
+          not_body_rate = block.scan(opt[:waste_expressions]).length
+          # The block score is reduced if there is a not_body_rate
+          c *= (opt[:no_body_factor] ** not_body_rate) if not_body_rate>0
+          # c1 represents how probable it is for this block to belong to the
+          # existing block or if it is a new one
+          c1 = c * continuous
+          puts "----- #{c}*#{continuous}=#{c1} #{clean.length} \n\n" if opt[:debug]
+          if c1 > opt[:threshold]
+            # Treat continuous blocks as cluster
+            body += block + "\n"
+            score += c1
+            continuous = opt[:continuous_factor]
+          elsif c > opt[:threshold]
+            # Continuous block end
+            @sections << { :body => body, :score => score }
+            body = block + "\n"
+            score = c
+            continuous = opt[:continuous_factor]
+          else
+            # We drop blocks that don't have a high enough c score
+          end
+        end
+        # Add the last block as we've finished iterating
+        @sections << { :body => body, :score => score } if body
+        # Sort the sections by score
+        sorted_sections = @sections.sort_by { |section| section[:score] }
+        # Convert to nokogiri representation for compatibility with the content method
+        @content_candidates = sorted_sections.reverse.map { |section| Nokogiri::HTML(section[:body], nil, 'utf-8') }
+      end
+      def content_at(index)
+        @content_candidates[index]
+      end
+      protected
+      # Checks if the given block has only tags without text.
+      def has_only_tags?(block)
+        block.gsub(/<[^>]*>/im, '').strip.length == 0
+      end
+      # Eliminates link heavy blocks and blocks that are lists of links and
+      # then returns block stripped of tags
+      def clean_block(block)
+        # Return empty block if it is a list of links
+        return "" if is_link_list?(block)
+        # Return empty block if it is a very link heavy block
+        count = 0
+        no_links = block.gsub(/<a\s[^>]*>.*?<\/a\s*>/im){count+=1;''}.gsub(/<form\s[^>]*>.*?<\/form\s*>/im, '')
+        return "" if no_links.length < 20 * count
+        strip_tags(no_links)
+      end
+      # Determines whether a block is link list or not
+      def is_link_list?(st)
+        if st =~ /<(?:ul|dl|ol)(.+?)<\/(?:ul|dl|ol)>/im
+          listpart = $1
+          outside = st.gsub(/<(?:ul|dl)(.+?)<\/(?:ul|dl)>/imn, '').gsub(/<.+?>/mn, '').gsub(/\s+/, ' ')
+          list = listpart.split(/<li[^>]*>/)
+          list.shift
+          rate = evaluate_list(list)
+          outside.length <= st.length / (45 / rate)
+        end
+      end
+      # Estimates how much degree of link list
+      def evaluate_list(list)
+        return 1 if list.length == 0
+        hit = 0
+        list.each do |line|
+          hit +=1 if line =~ /<a\s+href=(['"]?)([^"'\s]+)\1/imn
+        end
+        return 9 * (1.0 * hit / list.length) ** 2 + 1
+      end
+      # Removes all html tags and attributes from html
+      def strip_tags(html)
+        strip(Sanitize.clean(html, :elements => [], :attributes => []))
+      end
+    end
+  end
+end

data/lib/pismo/reader/tree.rb ADDED

@@ -0,0 +1,154 @@
+module Pismo
+  module Reader
+    class Tree < Base
+      # Analyze the structure of the HTML document and score branches for likelihood of containing useful content
+      def analyze
+        @tree = {}
+        subels = {}
+        t1 = Time.now.to_i + (Time.now.usec.to_f / 1000000)
+        @doc.css(COULD_CONTAIN_FULL_CONTENT.join(", ")).each do |el|
+          # Assume that no content we'll want comes in a total package of fewer than 80 characters!
+          next unless el.text.to_s.strip.length >= 80
+          path_segments = el.path.scan(/[a-z]+/)[2..-1] || []
+          depth = path_segments.length
+          local_ids = (el['id'].to_s + ' ' + el['class'].to_s).downcase.strip.scan(/[a-z]+/)
+          ids = local_ids
+          cp = el.parent
+          (depth - 1).times do
+            ids += (cp['id'].to_s + ' ' + cp['class'].to_s).downcase.strip.scan(/[a-z]+/)
+            cp = cp.parent
+          end if depth > 1
+          #puts "IDS"
+          #ap ids
+          #puts "LOCAL IDS"
+          #ap local_ids
+          branch = {}
+          branch[:ids] = ids
+          branch[:local_ids] = local_ids
+          branch[:score] = -(BAD_WORDS & ids).size
+          branch[:score] += ((GOOD_WORDS & ids).size * 2)
+          next if branch[:score] < -5
+          #puts "#{ids.join(",")} - #{branch[:score].to_s} - #{el.text.to_s.strip.length}"
+          # Elements that have an ID or class are more likely to be our winners
+          branch[:score] += 2 unless local_ids.empty?
+          branch[:name] = el.name
+          branch[:depth] = depth
+          branch[:path] = el.path
+          branch[:raw_word_count] = 0
+          branch[:word_count] = 0
+          branch[:child_count] = 0
+          branch[:bad_child_count] = 0
+          branch[:score_steps] = []
+          el.traverse do |subel|
+            div_at_end_of_branch = false if subel.name == "div"
+            path = subel.path
+            subels[path] ||= {}
+            subels[path][:path_segments] ||= (path.scan(/[a-z]+/)[2..-1] || [])
+            subels[path][:is_text] ||= subel.text?
+            if subels[path][:is_text]
+              subels[path][:text] ||= subel.text.downcase.scan(/[a-z]+/)
+              next if subels[path][:text].empty?
+              subels[path][:raw_word_count] ||= subels[path][:text].size
+              subels[path][:word_count] ||= (%{a h1 h2 h3 h4 h5 h6 h6}.include?(subel.parent.name) ? 0 : subels[path][:text].select { |word| word.length > 3 }.size)
+              subels[path][:meta_matches] ||= (subels[path][:text] & META_WORDS).size
+              branch[:raw_word_count] += subels[path][:raw_word_count]
+              branch[:word_count] += subels[path][:word_count] - subels[path][:meta_matches]
+            end
+            subels[path][:ids] ||= (subel['id'].to_s + ' ' + subel['class'].to_s).gsub(/[^a-z]/, ' ').downcase.strip.split(/\s+/)
+            subels[path][:bad_child_count_inc] = (BAD_WORDS & subels[path][:ids]).size - (GOOD_WORDS & subels[path][:ids]).size
+            subels[path][:child_count_inc] = subels[path][:ids].empty? ? 0 : 1
+            branch[:bad_child_count] += subels[path][:bad_child_count_inc]
+            branch[:child_count] += subels[path][:child_count_inc]
+          end
+          branch[:score] += 2 if branch[:name] == "div"
+          branch[:score] += 4 if el.text.scan(/\,\s/).size > 10
+          branch[:score_steps] << "lots of commas!" if el.text.scan(/\,\s/).size > 5
+          branch[:score] *= 3
+          branch[:score] *= 0.7 if el.children && el.children.size < 3
+          branch[:score] *= 1.25 if branch[:raw_word_count] > 10
+          next if branch[:raw_word_count] < 10
+          branch[:score] += [branch[:word_count], 1].max ** 0.5
+          word_child_count_ratio = branch[:word_count].to_f / [branch[:child_count], 1].max
+          branch[:word_child_count_ratio] = word_child_count_ratio
+          if branch[:raw_word_count] > 100
+            good_word_ratio = branch[:word_count].to_f / branch[:raw_word_count]
+            branch[:score] += good_word_ratio * 12
+            if word_child_count_ratio > 50
+              branch[:score] *= 1.5
+            elsif word_child_count_ratio > 30
+              branch[:score] *= 1.2
+            elsif word_child_count_ratio > 15
+              branch[:score] *= 1.1
+            elsif word_child_count_ratio < 4
+              branch[:score] *= 0.9
+            end
+          end
+          branch[:score_steps] << "s1: #{branch[:score]}"
+          bad_child_ratio = branch[:bad_child_count].to_f / [branch[:child_count], 1].max
+          branch[:bad_child_ratio] = bad_child_ratio
+          branch[:score] += 3 if bad_child_ratio < 0.0
+          branch[:score] -= 3 if bad_child_ratio > 0.15
+          branch[:score] -= 2 if bad_child_ratio > 0.25
+          branch[:score] -= 2 if bad_child_ratio > 0.4
+          branch[:score] -= 4 if bad_child_ratio > 0.5
+          branch[:score] -= 5 if bad_child_ratio > 0.7
+          branch[:score] -= 5 if branch[:bad_child_count] > 20
+          branch[:score] += depth
+          branch[:score] *= 0.8 if ids.length > 10
+          @tree[el.path] = branch
+        end
+        sorted_tree = @tree.sort_by { |k, v| v[:score] }
+        #ap @doc.at(sorted_tree.first[0]).text
+        # Sort the branches by their score in reverse order
+        @content_candidates = sorted_tree.reverse.first([5, sorted_tree.length].min)
+        #ap @content_candidates #.map { |i| [i[0], i[1][:name], i[1][:ids].join(','), i[1][:score] ]}
+        #t2 = Time.now.to_i + (Time.now.usec.to_f / 1000000)
+        #puts t2 - t1
+        #exit
+      end
+      def content_at(index)
+        @doc.at(@content_candidates[index].first)
+      end
+    end
+  end
+end

data/lib/pismo/stopwords.txt ADDED

@@ -0,0 +1,1002 @@
+a
+a's
+Aaliyah
+Aaron
+Abigail
+ability
+able
+about
+above
+according
+accordingly
+across
+actually
+Adam
+Addison
+Adrian
+after
+afterwards
+again
+against
+ago
+Aidan
+Aiden
+ain't
+al
+Alejandro
+Alex
+Alexa
+Alexander
+Alexandra
+Alexis
+all
+Allison
+allow
+allowed
+allowing
+allows
+almost
+alone
+along
+alongside
+already
+also
+although
+always
+Alyssa
+am
+Amanda
+Amber
+among
+amongst
+an
+and
+Andrea
+Andrew
+Angel
+Angelina
+Anna
+annual
+another
+Anthony
+Antonio
+anybody
+anyhow
+anyone
+anything
+anyway
+anyways
+anywhere
+apart
+appear
+appreciate
+appropriate
+approximate
+approximately
+apr
+april
+are
+aren't
+Ariana
+Arianna
+around
+articles
+as
+Ashley
+Ashton
+aside
+ask
+asking
+asshole
+associated
+at
+Audrey
+aug
+august
+Austin
+Autumn
+Ava
+available
+Avery
+away
+awesome
+awfully
+Bailey
+based
+basically
+be
+became
+because
+become
+becomes
+becoming
+been
+beforehand
+behind
+being
+believe
+below
+benefit
+Benjamin
+beside
+besides
+best
+better
+beyond
+big
+biggest
+Blake
+both
+bother
+Brady
+Brandon
+Brayden
+Brian
+Brianna
+brief
+bring
+brings
+Brooke
+Brooklyn
+Bryan
+Bryce
+but
+by
+c'mon
+c's
+Caden
+Caleb
+called
+came
+Cameron
+can
+can't
+cancel
+cannot
+cant
+carefully
+Carlos
+Caroline
+Carson
+Carter
+casually
+cause
+causes
+certain
+certainly
+changes
+Charles
+Chase
+check
+Chloe
+Christian
+Christopher
+Claire
+clearly
+co
+Cody
+Cole
+Colin
+Colton
+come
+comes
+coming
+comment
+company
+compelling
+concerning
+congratulations
+Connor
+consequently
+consider
+considering
+contain
+containing
+contains
+continued
+Cooper
+corresponding
+could
+couldn't
+country
+course
+covered
+covering
+cunt
+currently
+customizable
+damn
+Daniel
+Danielle
+dave
+David
+david
+de
+dead
+dec
+decade
+december
+definitely
+definitive
+described
+despite
+Destiny
+Devin
+did
+didn't
+Diego
+different
+direct
+discuss
+do
+does
+doesn
+doesn't
+doing
+Dominic
+don't
+done
+down
+downwards
+driven
+drove
+during
+Dylan
+e
+each
+easier
+edu
+Eduardo
+Edward
+eg
+eight
+either
+Elijah
+Elizabeth
+Ella
+else
+elsewhere
+Emily
+Emma
+end
+english
+enough
+entirely
+Eric
+Erin
+es
+especially
+et
+etc
+Ethan
+Evan
+Evelyn
+even
+eventually
+ever
+every
+everybody
+everyone
+everything
+everywhere
+ex
+exactly
+example
+except
+existing
+extensive
+extra
+extremely
+f
+Faith
+false
+fame
+far
+feb
+february
+feel
+feeling
+few
+fifth
+finally
+fine
+first
+five
+followed
+following
+follows
+for
+former
+formerly
+forth
+found
+four
+from
+fuck
+full
+further
+furthermore
+g
+Gabriel
+Gabriella
+Gabrielle
+Garrett
+gave
+Gavin
+generally
+get
+gets
+getting
+give
+given
+gives
+glory
+goal
+goes
+going
+gone
+good
+got
+gotten
+Grace
+great
+greetings
+h
+had
+hadn't
+Hailey
+Haley
+Hannah
+happens
+hardly
+has
+hasn't
+have
+haven't
+having
+Hayden
+he
+he's
+hello
+help
+hence
+Henry
+her
+here
+here's
+hereafter
+hereby
+herein
+hereupon
+hers
+herself
+hi
+high
+highly
+him
+himself
+hire
+his
+hither
+hopefully
+how
+howbeit
+however
+huge
+Hunter
+i
+i'd
+i'll
+i'm
+i've
+Ian
+ie
+if
+ignored
+imagine
+immediate
+implement
+important
+impromptu
+in
+inasmuch
+inc
+indeed
+indicate
+indicated
+indicates
+informative
+inhibits
+inner
+insofar
+instead
+interest
+interesting
+into
+inward
+is
+Isaac
+Isabel
+Isabella
+Isaiah
+isn
+isn't
+it
+it'd
+it'll
+it's
+its
+itself
+Ivan
+j
+Jack
+Jackson
+Jacob
+Jada
+Jaden
+Jake
+James
+jan
+january
+Jared
+Jasmine
+Jason
+Jayden
+Jenna
+Jennifer
+Jeremiah
+Jeremy
+Jesse
+Jessica
+Jesus
+jim
+jimmy
+jnr
+Jocelyn
+Joel
+John
+Jonathan
+Jordan
+Jorge
+Jose
+Joseph
+Joshua
+Josiah
+jr
+Juan
+jul
+Julia
+Julian
+july
+jun
+june
+just
+Justin
+k
+Kaden
+Kaitlyn
+Kaleb
+Katelyn
+Katherine
+Kayla
+Kaylee
+keep
+keeps
+Kenneth
+kept
+Kevin
+key
+kid
+Kimberly
+know
+known
+knows
+Kyle
+Kylie
+l
+la
+Landon
+last
+lately
+later
+latter
+latterly
+Lauren
+le
+Leah
+least
+les
+less
+lest
+let
+let's
+levels
+Liam
+like
+liked
+likely
+Lillian
+Lily
+line
+listing
+listings
+little
+Logan
+look
+looking
+looks
+lot
+lots
+love
+low
+ltd
+Lucas
+Luis
+Luke
+m
+Mackenzie
+Madeline
+Madison
+mainly
+Makayla
+many
+mar
+march
+Marcus
+Maria
+Mariah
+Marissa
+Mark
+Mary
+Mason
+Matthew
+maturity
+may
+Maya
+maybe
+me
+mean
+means
+meant
+meanwhile
+Megan
+Melanie
+member
+mentioned
+merely
+Mia
+Michael
+Michelle
+might
+Miguel
+mile
+more
+moreover
+Morgan
+most
+mostly
+moving
+much
+must
+my
+myself
+n
+name
+namely
+Natalie
+Nathan
+Nathaniel
+naturally
+nd
+near
+nearly
+necessary
+need
+needed
+needs
+neither
+Nevaeh
+never
+nevertheless
+new
+next
+Nicholas
+Nicole
+nine
+no
+Noah
+nobody
+non
+none
+noone
+nor
+normally
+not
+notably
+nothing
+nov
+novel
+november
+now
+nowhere
+o
+Obie
+obviously
+oct
+october
+of
+off
+official
+often
+oh
+ok
+okay
+old
+Olivia
+on
+once
+one
+ones
+online
+only
+onto
+open
+or
+org
+oriented
+Oscar
+others
+otherwise
+ought
+our
+ours
+ourselves
+out
+overall
+Owen
+own
+p
+Paige
+par
+Parker
+part
+particular
+particularly
+Patrick
+Paul
+peasy
+per
+perhaps
+piece
+placed
+play
+please
+plus
+possible
+posts
+pre
+preferences
+presumably
+pretty
+probably
+product
+products
+proud
+provide
+provides
+put
+q
+que
+quite
+qv
+r
+Rachel
+rather
+rd
+re
+reached
+read
+real
+really
+reasonably
+Rebecca
+recently
+regarding
+regardless
+regards
+related
+relatively
+replaced
+requirements
+respectively
+Richard
+right
+Riley
+Robert
+run
+Ryan
+s
+safest
+said
+Samantha
+same
+Samuel
+Sara
+Sarah
+Savannah
+saw
+say
+saying
+says
+Sean
+Sebastian
+second
+secondly
+seconds
+see
+seeing
+seem
+seemed
+seeming
+seems
+seen
+self
+selves
+sensible
+sent
+sep
+september
+serious
+seriously
+set
+settings
+seven
+several
+shall
+she
+shit
+shot
+should
+shouldn't
+Sierra
+simpler
+simply
+since
+site
+six
+size
+so
+Sofia
+solid
+some
+somebody
+somehow
+someone
+something
+sometime
+sometimes
+somewhat
+somewhere
+soon
+Sophia
+sorry
+sounding
+specified
+specify
+specifying
+spoke
+spread
+sr
+stand
+started
+step
+Stephanie
+Steven
+still
+stuff
+sub
+subscribe
+such
+suck
+suite
+sup
+sur
+sure
+Sydney
+t
+t's
+take
+taken
+Tanner
+tat
+Taylor
+team
+tedious
+tell
+tends
+th
+than
+thank
+thanks
+thanx
+that
+that's
+thats
+the
+their
+theirs
+them
+themselves
+then
+thence
+there
+there's
+thereafter
+thereby
+therefore
+therein
+theres
+thereupon
+these
+they
+they'd
+they'll
+they're
+they've
+thing
+things
+think
+third
+this
+Thomas
+thomas
+thorough
+thoroughly
+those
+though
+three
+through
+throughout
+thru
+thus
+Timothy
+tit
+to
+today
+together
+told
+too
+took
+toward
+towards
+Trevor
+tried
+tries
+Trinity
+Tristan
+truly
+try
+trying
+turn
+turns
+twice
+two
+Tyler
+typically
+u
+ultra
+un
+unfortunately
+unlikely
+unsurprisingly
+until
+unto
+up
+upon
+us
+use
+used
+useful
+uses
+using
+usually
+uucp
+v
+value
+Vanessa
+various
+very
+via
+Victor
+Victoria
+Vincent
+viz
+vs
+w
+walks
+want
+wants
+was
+wasn't
+way
+we
+we'd
+we'll
+we're
+we've
+week
+weekly
+welcome
+well
+went
+were
+weren't
+what
+what's
+whatever
+when
+whence
+whenever
+where
+where's
+whereafter
+whereas
+whereby
+wherein
+whereupon
+wherever
+whether
+which
+while
+whither
+who
+who's
+whoever
+whole
+whom
+whose
+why
+will
+William
+willing
+win
+wish
+with
+within
+without
+won't
+wonder
+works
+world
+would
+wouldn't
+wrapped
+Wyatt
+Xavier
+y
+yeah
+yes
+yet
+you
+you'd
+you'll
+you're
+you've
+your
+yours
+yourself
+yourselves
+z
+Zachary
+zero
+Zoe
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9