nddrylliog_pismo 0.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
 - data/.gitignore +29 -0
 - data/Gemfile +4 -0
 - data/LICENSE +23 -0
 - data/NOTICE +4 -0
 - data/README.markdown +131 -0
 - data/Rakefile +72 -0
 - data/bin/pismo +45 -0
 - data/lib/pismo.rb +82 -0
 - data/lib/pismo/document.rb +67 -0
 - data/lib/pismo/external_attributes.rb +14 -0
 - data/lib/pismo/internal_attributes.rb +316 -0
 - data/lib/pismo/reader.rb +19 -0
 - data/lib/pismo/reader/base.rb +259 -0
 - data/lib/pismo/reader/cluster.rb +171 -0
 - data/lib/pismo/reader/tree.rb +154 -0
 - data/lib/pismo/stopwords.txt +1002 -0
 - data/lib/pismo/version.rb +3 -0
 - data/pismo.gemspec +30 -0
 - data/test/corpus/bbcnews.html +2131 -0
 - data/test/corpus/bbcnews2.html +1575 -0
 - data/test/corpus/briancray.html +269 -0
 - data/test/corpus/cant_read.html +426 -0
 - data/test/corpus/factor.html +1362 -0
 - data/test/corpus/gmane.html +138 -0
 - data/test/corpus/huffington.html +2932 -0
 - data/test/corpus/metadata_expected.yaml +72 -0
 - data/test/corpus/metadata_expected.yaml.old +122 -0
 - data/test/corpus/queness.html +919 -0
 - data/test/corpus/reader_expected.yaml +39 -0
 - data/test/corpus/readers/cluster_expected.yaml +45 -0
 - data/test/corpus/readers/tree_expected.yaml +55 -0
 - data/test/corpus/rubyinside.html +318 -0
 - data/test/corpus/rww.html +1351 -0
 - data/test/corpus/spolsky.html +298 -0
 - data/test/corpus/techcrunch.html +1285 -0
 - data/test/corpus/tweet.html +360 -0
 - data/test/corpus/youtube.html +2348 -0
 - data/test/corpus/zefrank.html +535 -0
 - data/test/helper.rb +15 -0
 - data/test/test_corpus.rb +54 -0
 - data/test/test_pismo_document.rb +34 -0
 - metadata +156 -0
 
| 
         @@ -0,0 +1,14 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module Pismo
              # Home for attributes whose data comes from external services or programs
              # (e.g. Delicious tags).
              #
              # The module currently defines no methods: the Delicious lookup below is kept
              # purely as commented-out reference code.
              module ExternalAttributes
                # include HTTParty
                #
                # def delicious_tags
                #   delicious_info["top_tags"].sort_by { |k, v| v }.reverse.first(5) rescue []
                # end
                #
                # def delicious_info
                #   @delicious_info ||= self.class.get('http://feeds.delicious.com/v2/json/urlinfo/' + Digest::MD5.hexdigest(@url)).first rescue nil
                # end
              end
            end
         
     | 
| 
         @@ -0,0 +1,316 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module Pismo
         
     | 
| 
      
 2 
     | 
    
         
            +
              # Internal attributes are different pieces of data we can extract from a document's content
         
     | 
| 
      
 3 
     | 
    
         
            +
              module InternalAttributes
         
     | 
| 
      
 4 
     | 
    
         
            +
                # Returns the title of the page/content - attempts to strip site name, etc, if possible
         
     | 
| 
      
 5 
     | 
    
         
            +
                # Returns the title of the page/content, preferring well-known heading selectors
                # over the HTML <title> element (which often carries the site name as well).
                #
                # all - when false (default) a single value is returned: the selector match if
                #       there is one, otherwise html_title. When true an Array is returned holding
                #       the selector matches followed by html_title, with nils and duplicates
                #       removed (so an empty Array when nothing was found at all).
                #
                # NOTE(review): the matching itself is done by @doc.match, which is not visible
                # here; presumably selectors are tried in the order listed - confirm against its
                # definition.
                def title(all = false)
                  # TODO: Memoizations
                  selectors = [
                    '#pname a',                                                       # Google Code style
                    '.entryheader h1',                                                # Ruby Inside/Kubrick
                    '.entry-title a',                                                 # Common Blogger/Blogspot rules
                    '.post-title a',
                    '.post_title a',
                    '.posttitle a',
                    '.post-header h1',
                    '.entry-title',
                    '.post-title',
                    '.post h1',
                    '.post h3 a',
                    'a.datitle',                                                      # Slashdot style
                    '.posttitle',
                    '.post_title',
                    '.pageTitle',
                    '#main h1.title',
                    '.title h1',
                    '.post h2',
                    'h2.title',
                    '.entry h2 a',
                    '.entry h2',                                                      # Common style
                    '.boite_titre a',
                    ['meta[@name="title"]', lambda { |el| el.attr('content') }],
                    'h1.headermain',
                    'h1.title',
                    '.mxb h1',                                                        # BBC News
                    '#content h1',
                    '#content h2',
                    '#content h3',
                    'a[@rel="bookmark"]',
                    '.products h2',
                    '.caption h3',
                    '#main h2',
                    '#body h1',
                    '#wrapper h1',
                    '#page h1',
                    '.asset-header h1',
                    '#body_content h2'
                  ]

                  found = @doc.match(selectors, all)

                  # If all else fails, go to the HTML title
                  if all
                    candidates = found ? [*found] : []
                    # compact: html_title may be nil, and a nil entry is never a useful title
                    (candidates + [html_title]).compact.uniq
                  else
                    found || html_title
                  end
                end
         
     | 
| 
      
 60 
     | 
    
         
            +
                
         
     | 
| 
      
 61 
     | 
    
         
            +
                # Returns every candidate title rather than just one, by calling #title with
                # its `all` flag switched on.
                def titles
                  title(true)
                end
         
     | 
| 
      
 64 
     | 
    
         
            +
                
         
     | 
| 
      
 65 
     | 
    
         
            +
                
         
     | 
| 
      
 66 
     | 
    
         
            +
                # HTML title
         
     | 
| 
      
 67 
     | 
    
         
            +
                # Returns whatever @doc.match yields for the 'title' selector (the document's
                # <title> element), or nil when that lookup comes back falsy.
                def html_title
                  @doc.match('title') || nil
                end
         
     | 
| 
      
 72 
     | 
    
         
            +
                
         
     | 
| 
      
 73 
     | 
    
         
            +
                # Return an estimate of when the page/content was created
         
     | 
| 
      
 74 
     | 
    
         
            +
                # As clients of this library should be doing HTTP retrieval themselves, they can fall back to the
         
     | 
| 
      
 75 
     | 
    
         
            +
                # Last-Modified HTTP header if they so wish. This method is just rough and based on content only.
         
     | 
| 
      
 76 
     | 
    
         
            +
                # Estimates when the page/content was created by scanning the document's HTML for
                # the first thing that looks like a date.
                #
                # The patterns are tried in order and the first match wins. The matched text is
                # then tidied (weekday names, a leading "on", commas and ordinal suffixes removed)
                # and handed to Chronic.
                #
                # Returns whatever Chronic.parse produces, or the tidied String when Chronic cannot
                # parse it, or nil when no date-like text longer than four characters is found.
                def datetime
                  # TODO: Clean all this mess up

                  mo = %r{(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)}i

                  # Ordinal suffixes include "nd" so that "2nd"/"22nd" are handled like "1st"/"3rd"/"4th".
                  regexen = [
                    /#{mo}\b\s+\d+\D{1,10}\d{4}/i,
                    /(on\s+)?\d+\s+#{mo}\s+\D{1,10}\d+/i,
                    /(on[^\d+]{1,10})\d+(th|st|nd|rd)?.{1,10}#{mo}\b[^\d]{1,10}\d+/i,
                    /\b\d{4}\-\d{2}\-\d{2}\b/i,
                    /\d+(th|st|nd|rd).{1,10}#{mo}\b[^\d]{1,10}\d+/i,
                    /\d+\s+#{mo}\b[^\d]{1,10}\d+/i,
                    /on\s+#{mo}\s+\d+/i,
                    /#{mo}\s+\d+/i,
                    /\d{4}[\.\/\-]\d{2}[\.\/\-]\d{2}/,
                    /\d{2}[\.\/\-]\d{2}[\.\/\-]\d{4}/
                  ]

                  # Render the document once instead of once per pattern.
                  html = @doc.to_html

                  stamp = nil
                  regexen.each do |pattern|
                    stamp = html[pattern]
                    break if stamp
                  end

                  return unless stamp && stamp.length > 4

                  # Clean up the string for use by Chronic
                  stamp.strip!
                  stamp.gsub!(/(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)[^\w]*/i, '')
                  stamp.gsub!(/(mon|tues|tue|weds|wed|thurs|thur|thu|fri|sat|sun)[^\w]*/i, '')
                  stamp.sub!(/on\s+/, '')
                  stamp.gsub!(/\,/, '')
                  stamp.sub!(/(\d+)(th|st|nd|rd)/, '\1')

                  Chronic.parse(stamp) || stamp
                end
         
     | 
| 
      
 112 
     | 
    
         
            +
                
         
     | 
| 
      
 113 
     | 
    
         
            +
                # Returns the author of the page/content
         
     | 
| 
      
 114 
     | 
    
         
            +
                # Returns the author of the page/content, looked up through a list of byline
                # selectors and author meta tags handed to @doc.match.
                #
                # all - forwarded to @doc.match; when the lookup yields an Array, every entry has
                #       its leading "by"/"posted by" removed and duplicates are dropped.
                #
                # A String result is cleaned more thoroughly: the "by" prefix is stripped, every
                # character other than ASCII letters, digits, space and apostrophe becomes "|",
                # the text is cut at the first run of two or more "|", and leftover "|" and
                # surrounding whitespace are removed.
                #
                # Returns nil when nothing matched.
                def author(all = false)
                  content_attr = lambda { |el| el.attr('content') }

                  selectors = [
                    '.post-author .fn',
                    '.wire_author',
                    '.cnnByline b',
                    '.editorlink',
                    '.authors p',
                    ['meta[@name="author"]', content_attr],     # Traditional meta tag style
                    ['meta[@name="Author"]', content_attr],     # CNN style
                    ['meta[@name="AUTHOR"]', content_attr],     # CNN style
                    '.byline a',                                # Ruby Inside style
                    '.byline',
                    '.post_subheader_left a',                   # TechCrunch style
                    '.byl',                                     # BBC News style
                    '.articledata .author a',
                    '#owners a',                                # Google Code style
                    '.author a',
                    '.author',
                    '.auth a',
                    '.auth',
                    '.cT-storyDetails h5',                      # smh.com.au - worth dropping maybe..
                    ['meta[@name="byl"]', content_attr],
                    '.timestamp a',
                    '.fn a',
                    '.fn',
                    '.byline-author',
                    '.ArticleAuthor a',
                    '.blog_meta a',
                    'cite a',
                    'cite',
                    '.contributor_details h4 a',
                    '.meta a'
                  ]

                  byline = @doc.match(selectors, all)
                  return unless byline

                  # Strip off any "By [whoever]" section
                  by_prefix = /^(post(ed)?\s)?by\W+/i

                  case byline
                  when String
                    # These two edit the matched string in place, as before.
                    byline.sub!(by_prefix, '')
                    byline.tr!('^a-zA-Z 0-9\'', '|')
                    byline = byline.split(/\|{2,}/).first.to_s.gsub(/\s+/, ' ').gsub(/\|/, '').strip
                  when Array
                    byline.map! { |name| name.sub(by_prefix, '') }
                    byline.uniq!
                  end

                  byline
                end
         
     | 
| 
      
 164 
     | 
    
         
            +
                
         
     | 
| 
      
 165 
     | 
    
         
            +
                # Returns every candidate author rather than just one, by calling #author with
                # its `all` flag switched on.
                def authors
                  author(true)
                end
         
     | 
| 
      
 168 
     | 
    
         
            +
                
         
     | 
| 
      
 169 
     | 
    
         
            +
                
         
     | 
| 
      
 170 
     | 
    
         
            +
                # Returns the "description" of the page, usually comes from a meta tag
         
     | 
| 
      
 171 
     | 
    
         
            +
                # Returns the "description" of the page: the content attribute of a description
                # meta tag (three capitalisations are tried), else an RDF dc:description element,
                # else an element with class "description" - whichever @doc.match yields.
                def description
                  content_attr = lambda { |el| el.attr('content') }

                  selectors = [
                    ['meta[@name="description"]', content_attr],
                    ['meta[@name="Description"]', content_attr],
                    ['meta[@name="DESCRIPTION"]', content_attr],
                    'rdf:Description[@name="dc:description"]',
                    '.description'
                  ]

                  @doc.match(selectors)
                end
         
     | 
| 
      
 180 
     | 
    
         
            +
                
         
     | 
| 
      
 181 
     | 
    
         
            +
                # Returns the "lede(s)" or first paragraph(s) of the story/page
         
     | 
| 
      
 182 
     | 
    
         
            +
                # Returns the "lede(s)" or first paragraph(s) of the story/page.
                #
                # all - forwarded to @doc.match. A String result is trimmed to its first one to
                #       three sentences; an Array result has each entry trimmed the same way and
                #       duplicates removed. Text with no sentence terminator is kept whole.
                #
                # When the selectors yield neither a String nor an Array, the first four sentences
                # from reader_doc are joined with spaces instead; nil if that yields nothing
                # either.
                def lede(all = false)
                  # A horrible, horrible way to pluck out lead paras from crappy Blogspot blogs
                  # that don't use <p> tags: take the markup from the element's first few
                  # characters of text up to the first "<br". The text is escaped so that regex
                  # metacharacters in it are matched literally.
                  up_to_first_br = lambda do |el|
                    opening = Regexp.escape(el.inner_text[0..4].strip)
                    el.inner_html[/(#{opening}.*?)\<br/, 1]
                  end

                  selectors = [
                    '.post-text p',
                    '.post-body p',
                    '#blogpost p',
                    '.story-teaser',
                    '.article .body p',
                    '//div[@class="entrytext"]//p[string-length()>40]',                      # Ruby Inside / Kubrick style
                    'section p',
                    '.entry .text p',
                    '.hentry .content p',
                    '.entry-content p',
                    '#wikicontent p',                                                        # Google Code style
                    '.wikistyle p',                                                          # GitHub style
                    '//td[@class="storybody"]/p[string-length()>40]',                        # BBC News style
                    '//div[@class="entry"]//p[string-length()>100]',
                    ['.entry-content', up_to_first_br],
                    ['.entry', up_to_first_br],
                    '.entry',
                    '#content p',
                    '#article p',
                    '.post-body',
                    '.entry-content',
                    '.document_description_short p',                                         # Scribd
                    '.single-post p'
                  ]

                  found = @doc.match(selectors, all)

                  # TODO: Improve sentence extraction - this is dire even if it "works for now"
                  leading_sentences = /^(.*?[\.\!\?]\s){1,3}/m

                  case found
                  when String
                    (found[leading_sentences] || found).to_s.strip
                  when Array
                    # Fall back to the whole entry *before* stripping, so an entry without a
                    # sentence terminator no longer raises on nil.
                    found.map { |para| (para.to_s[leading_sentences] || para).to_s.strip }.uniq
                  else
                    reader = reader_doc
                    return nil unless reader

                    opening = reader.sentences(4)
                    opening.empty? ? nil : opening.join(' ')
                  end
                end
         
     | 
| 
      
 220 
     | 
    
         
            +
                
         
     | 
| 
      
 221 
     | 
    
         
            +
                # Returns what #lede gives with its `all` flag switched on. Best effort: any
                # StandardError raised along the way is swallowed and an empty Array is returned.
                def ledes
                  lede(true)
                rescue StandardError
                  []
                end
         
     | 
| 
      
 224 
     | 
    
         
            +
                
         
     | 
| 
      
 225 
     | 
    
         
            +
                # Returns a string containing the first [limit] sentences as determined by the Reader algorithm
         
     | 
| 
      
 226 
     | 
    
         
            +
                # Returns a String holding the first `limit` sentences reported by reader_doc,
                # joined with single spaces.
                #
                # limit - how many sentences to ask reader_doc for (default 3).
                #
                # Returns nil when there is no reader_doc or it reports no sentences at all.
                def sentences(limit = 3)
                  return nil unless reader_doc
                  return nil if reader_doc.sentences.empty?

                  reader_doc.sentences(limit).join(' ')
                end
         
     | 
| 
      
 229 
     | 
    
         
            +
             
     | 
| 
      
 230 
     | 
    
         
            +
                # Returns any images with absolute URLs in the document
         
     | 
| 
      
 231 
     | 
    
         
            +
                def images(limit = 3)
                  # No reader document, or a reader that finds no images at all => nil
                  return nil unless reader_doc
                  return nil if reader_doc.images.empty?

                  # Otherwise hand back whatever the reader returns for this limit
                  reader_doc.images(limit)
                end
         
     | 
| 
      
 234 
     | 
    
         
            +
                
         
     | 
| 
      
 235 
     | 
    
         
            +
                # Returns the "keywords" in the document (not the meta keywords - they're next to useless now)
         
     | 
| 
      
 236 
     | 
    
         
            +
                def keywords(options = {})
                  # Recognised options (defaults shown below):
                  #   :stem_at           - stopwords longer than this are stemmed before comparison
                  #   :word_length_limit - words longer than this are ignored
                  #   :limit             - maximum number of [word, score] pairs returned
                  #   :remove_stopwords  - drop words found in Pismo.stopwords
                  #   :minimum_score     - drop words scoring below this
                  #   :hints             - optional collection; when given, only words it includes are counted
                  # Returns an array of [word, score] pairs, highest score first.
                  options = { :stem_at => 20, :word_length_limit => 15, :limit => 20, :remove_stopwords => true, :minimum_score => 2 }.merge(options)

                  words = {}

                  # Convert doc to lowercase, scrub out most HTML tags, then keep track of words.
                  # The title is downcased once here because it is consulted for every word.
                  title_text = title.to_s.downcase
                  content_to_use = body.to_s.downcase + " " + description.to_s.downcase

                  # Per-word score increment (5 if the word appears in the title, otherwise 1),
                  # cached so the title is only searched once per distinct word.
                  title_weights = {}

                  # old regex for safe keeping -- \b[a-z][a-z\+\.\'\+\#\-]*\b
                  candidates = content_to_use.gsub(/\<[^\>]{1,100}\>/, '').gsub(/\.+\s+/, ' ').gsub(/\&\w+\;/, '').scan(/(\b|\s|\A)([a-z0-9][a-z0-9\+\.\'\+\#\-\\]*)(\b|\s|\Z)/i).map { |ta1| ta1[1] }.compact
                  candidates.each do |word|
                    next if word.length > options[:word_length_limit]
                    word.gsub!(/^[\']/, '')
                    word.gsub!(/[\.\-\']$/, '')
                    next if options[:hints] && !options[:hints].include?(word)

                    unless title_weights.key?(word)
                      # Words may contain regex metacharacters (\ . + #), so escape them; an
                      # unescaped "foo\users" would raise RegexpError and "node.js" would match
                      # "nodexjs". A trailing \b only works after a word character, so words
                      # ending in punctuation (e.g. "c++") use a lookahead instead.
                      trailing = (word =~ /[a-z0-9]\z/i) ? '\b' : '(?!\w)'
                      title_weights[word] = (title_text =~ /\b#{Regexp.escape(word)}#{trailing}/) ? 5 : 1
                    end

                    words[word] ||= 0
                    words[word] += title_weights[word]
                  end

                  words.delete_if { |k1, v1| v1 < options[:minimum_score] }

                  if options[:remove_stopwords]
                    # Stem long stopwords the same way long words would be stemmed
                    s = Pismo.stopwords.map { |a| a.length > options[:stem_at] ? a.stem : a }
                    words.delete_if { |k1, v1| s.include?(k1) }
                  end

                  words.sort_by { |k2, v2| v2 }.reverse.first(options[:limit])
                end
         
     | 
| 
      
 263 
     | 
    
         
            +
                
         
     | 
| 
      
 264 
     | 
    
         
            +
                # Returns the reader document built from @doc's string form and @options.
                # The result is created on first use and cached in @reader_doc.
                def reader_doc
                  return @reader_doc if @reader_doc

                  @reader_doc = Reader::Document.create(@doc.to_s, @options)
                end
         
     | 
| 
      
 267 
     | 
    
         
            +
                
         
     | 
| 
      
 268 
     | 
    
         
            +
                # Returns body text as determined by Reader algorithm
         
     | 
| 
      
 269 
     | 
    
         
            +
                def body
                  # Cached after the first call; content(true) asks the reader for cleaned text
                  return @body if @body

                  @body = reader_doc.content(true).strip
                end
         
     | 
| 
      
 272 
     | 
    
         
            +
                
         
     | 
| 
      
 273 
     | 
    
         
            +
                # Returns body text as determined by Reader algorithm WITH basic HTML formatting intact
         
     | 
| 
      
 274 
     | 
    
         
            +
                def html_body
                  # Cached after the first call; content without arguments keeps the markup
                  return @html_body if @html_body

                  @html_body = reader_doc.content.strip
                end
         
     | 
| 
      
 277 
     | 
    
         
            +
                
         
     | 
| 
      
 278 
     | 
    
         
            +
                # Returns URL to the site's favicon
         
     | 
| 
      
 279 
     | 
    
         
            +
                def favicon
                  # Selectors are tried in the order listed; each lambda extracts the href
                  url = @doc.match([['link[@rel="fluid-icon"]', lambda { |el| el.attr('href') }],      # Get a Fluid icon if possible..
                                    ['link[@rel="shortcut icon"]', lambda { |el| el.attr('href') }],
                                    ['link[@rel="icon"]', lambda { |el| el.attr('href') }]])

                  # Resolve against the page URL unless the href already is an absolute http(s) URL.
                  # The check is anchored on "http://" / "https://" at the start of the string so
                  # that relative paths which merely begin with "http" (e.g. "httpdocs/favicon.ico")
                  # are still resolved.
                  if url && url !~ %r{\Ahttps?://} && @url
                    url = URI.join(@url, url).to_s
                  end

                  url
                end
         
     | 
| 
      
 289 
     | 
    
         
            +
                
         
     | 
| 
      
 290 
     | 
    
         
            +
                # Returns URL(s) of Web feed(s)
         
     | 
| 
      
 291 
     | 
    
         
            +
                def feed(all = false)
                  # The all flag is passed straight through to @doc.match; the result is handled
                  # below as either a single String or an Array of Strings.
                  url = @doc.match([['link[@type="application/rss+xml"]', lambda { |el| el.attr('href') }],
                                    ['link[@type="application/atom+xml"]', lambda { |el| el.attr('href') }]], all
                  )

                  # Resolves one href against the page URL unless it already is an absolute
                  # http(s) URL or no page URL is known. Anchored on "http://" / "https://" so
                  # relative paths that merely begin with "http" are still resolved.
                  absolutize = lambda do |href|
                    (href !~ %r{\Ahttps?://} && @url) ? URI.join(@url, href).to_s : href
                  end

                  if url && String === url
                    url = absolutize.call(url)
                  elsif url && Array === url
                    url.map! { |u| absolutize.call(u) }
                    url.uniq!
                  end

                  url
                end
         
     | 
| 
      
 311 
     | 
    
         
            +
                
         
     | 
| 
      
 312 
     | 
    
         
            +
                # Convenience wrapper: calls feed with its all flag set to true.
                def feeds
                  every_feed = true
                  feed(every_feed)
                end
         
     | 
| 
      
 315 
     | 
    
         
            +
              end
         
     | 
| 
      
 316 
     | 
    
         
            +
            end
         
     | 
    
        data/lib/pismo/reader.rb
    ADDED
    
    | 
         @@ -0,0 +1,19 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module Pismo
              module Reader
                # Factory that picks the reader implementation for a piece of raw content.
                class Document

                  # Builds a reader for raw_content.
                  #
                  # options[:reader] selects the implementation: :cluster builds a
                  # Pismo::Reader::Cluster; :score, a missing key, or any other value builds a
                  # Pismo::Reader::Tree. The remaining options (without :reader) are passed on
                  # to the reader's constructor.
                  def self.create(raw_content, options = {})
                    # Work on a copy so the caller's hash keeps its :reader entry
                    options = options.dup

                    case options.delete(:reader)
                    when :cluster
                      Pismo::Reader::Cluster.new(raw_content, options)
                    else
                      Pismo::Reader::Tree.new(raw_content, options)
                    end
                  end

                end
              end
            end
         
     | 
| 
         @@ -0,0 +1,259 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require 'nokogiri'
         
     | 
| 
      
 2 
     | 
    
         
            +
            require 'sanitize'
         
     | 
| 
      
 3 
     | 
    
         
            +
            begin; require 'ap'; rescue LoadError; end
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            module Pismo
         
     | 
| 
      
 6 
     | 
    
         
            +
              module Reader
         
     | 
| 
      
 7 
     | 
    
         
            +
                class Base
         
     | 
| 
      
 8 
     | 
    
         
            +
                  attr_reader :raw_content, :doc, :content_candidates, :options
         
     | 
| 
      
 9 
     | 
    
         
            +
                  
         
     | 
| 
      
 10 
     | 
    
         
            +
                  # Elements that survive /input/ sanitization
                  OK_ELEMENTS = %w[a td br th tbody table tr div span img strong em b i body html head title p h1 h2 h3 h4 h5 h6 pre code tt ul li ol blockquote font big small section article abbr audio video cite dd dt figure caption sup form dl dt dd center]

                  # Per-element attribute whitelists, keyed by element name (the shape Sanitize expects).
                  # Every element gets its own array instance.
                  OK_ATTRIBUTES = Hash[OK_ELEMENTS.map { |el| [el, %w[id class href name content type alt title src]] }]
                  OK_CLEAN_ATTRIBUTES = Hash[OK_ELEMENTS.map { |el| [el, %w[href title src alt]] }]

                  # Words that we'd like to see in class and ID names for "content"
                  GOOD_WORDS = %w[content post blogpost main story body entry text desc asset hentry single entrytext postcontent bodycontent].uniq

                  # Words that indicate crap in general
                  BAD_WORDS = %w[reply metadata options commenting comments comment about footer header outer credit sidebar widget subscribe clearfix date social bookmarks links share video watch excerpt related supplement accessibility offscreen meta title signup blq secondary feedback featured clearfix small job jobs listing listings navigation nav byline addcomment postcomment trackback neighbor ads commentform fbfans login similar thumb link blogroll grid twitter wrapper container nav sitesub printfooter editsection visualclear catlinks hidden toc contentsub caption disqus rss shoutbox sponsor blogcomments].uniq

                  # Words that kill a branch dead
                  FATAL_WORDS = %w[comments comment bookmarks social links ads related similar footer digg totop metadata sitesub nav sidebar commenting options addcomment leaderboard offscreen job prevlink prevnext navigation reply-link hide hidden sidebox archives vcard]

                  # Date, byline and boilerplate vocabulary
                  META_WORDS = %w[january february march april may june july august september october november december jan feb mar apr may jun jul aug sep oct nov dec st th rd nd comments written posted on at published 2000 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 updated last gmt est pst pdt edt cet cdt cst article feature featured filed under comment comments follow twitter facebook email e-mail register story continue continues reading read inside more page next related response responses respond contact street phone tel email e-mail fax info tags tagged tag thanks credit creative commons copy nbsp lt gt this friend printable version subscribe rss mail follow twitter article via leave].uniq

                  WONT_CONTAIN_FULL_CONTENT = %w[h1 h2 h3 h4 h5 h6 h6 li ol ul br a img meta cite strong em i b input head small big code title sup sub dd dt]
                  COULD_CONTAIN_FULL_CONTENT = %w[body div p table tr td article pre blockquote tbody section]

                  ## Output sanitization element sets
                  BLOCK_OUTPUT_ELEMENTS = %w[div p h2 h3 h4 h5 h6 li dl pre ul ol blockquote section article audio video cite dd dt figure caption br table tr td thead tbody tfoot]
                  INLINE_OUTPUT_ELEMENTS = %w[a img b strong em i br code sup font small big dd dt]
                  OUTPUT_ELEMENTS = BLOCK_OUTPUT_ELEMENTS.dup.concat(INLINE_OUTPUT_ELEMENTS)
                  NON_HEADER_ELEMENTS = %w[p br]
         
     | 
| 
      
 39 
     | 
    
         
            +
             
     | 
| 
      
 40 
     | 
    
         
            +
                  # Create a document object based on the raw HTML content provided
         
     | 
| 
      
 41 
     | 
    
         
            +
                  def initialize(raw_content, options = {})
                    # Keep the options and the HTML as returned by Pismo::Document.clean_html,
                    # then hand over to build_doc.
                    @options, @raw_content = options, Pismo::Document.clean_html(raw_content)
                    build_doc
                  end
         
     | 
| 
      
 46 
     | 
    
         
            +
             
     | 
| 
      
 47 
     | 
    
         
            +
                  # Prepares @raw_content and parses it into @doc:
                  #   1. resets the @content cache
                  #   2. normalizes encoding, whitespace and "smart" punctuation
                  #   3. strips <script ...> blocks and sanitizes down to OK_ELEMENTS / OK_ATTRIBUTES
                  #   4. parses with Nokogiri, renames container elements to p/div and removes
                  #      elements whose id/class contains one of FATAL_WORDS
                  #   5. calls analyze
                  def build_doc
                    @content = {}

                    # String encodings only exist from Ruby 1.9 onwards
                    encoding_aware = RUBY_VERSION > "1.9"

                    if encoding_aware
                      # Compare against the Encoding object; an Encoding never equals the String "UTF-8"
                      @raw_content.encode!("UTF-8", :invalid => :replace, :replace => '?') if @raw_content.encoding != Encoding::UTF_8
                      @raw_content.encode!("ASCII-8BIT", :invalid => :replace, :replace => '?') if !@raw_content.valid_encoding?
                    end

                    # Normalize whitespace (as much to make debugging sessions look nice as anything else)
                    @raw_content.gsub!(/\s{2,}/, ' ')
                    @raw_content.gsub!(/\r/, "\n")
                    @raw_content.gsub!(/\n{3,}/, "\n\n")
                    @raw_content.gsub!(/(\<br(\s\/)?\>){2,}/, "</p><p>")

                    # Remove scripts manually, Sanitize and/or Nokogiri seem to go a bit funny with them
                    @raw_content.gsub!(/\<script .*?\<\/script\>/im, '')

                    # Get rid of bullshit "smart" quotes and other Unicode nonsense.
                    # The content is treated as raw bytes while replacing, so each pattern is
                    # made binary too; a UTF-8 pattern against a binary string holding non-ASCII
                    # bytes raises Encoding::CompatibilityError.
                    byte_replacements = [
                      ["\xe2\x80\x89", " "],
                      ["\xe2\x80\x99", "'"],
                      ["\xe2\x80\x98", "'"],
                      ["\xe2\x80\x9c", '"'],
                      ["\xe2\x80\x9d", '"'],
                      # NOTE(review): \xe2\x80\xf6 is not a valid UTF-8 sequence; possibly meant
                      # \xe2\x80\xa6 (ellipsis) - confirm before changing
                      ["\xe2\x80\xf6", '.']
                    ]

                    @raw_content.force_encoding("ASCII-8BIT") if encoding_aware
                    byte_replacements.each do |bytes, replacement|
                      bytes = bytes.dup.force_encoding("ASCII-8BIT") if encoding_aware
                      @raw_content.gsub!(bytes, replacement)
                    end
                    @raw_content.force_encoding("UTF-8") if encoding_aware

                    # Sanitize the HTML
                    @raw_content = Sanitize.clean(@raw_content,
                      :elements => OK_ELEMENTS,
                      :attributes => OK_ATTRIBUTES,
                      :remove_contents => true,
                      :output_encoding => 'utf-8'
                    )

                    @doc = Nokogiri::HTML(@raw_content, nil, 'utf-8')

                    # Do a pre clean up of elements.
                    @doc.css("div, span, table, tr, td, pre").each do |el|
                      # Any block elements with no child block elements can become paragraphs
                      if (BLOCK_OUTPUT_ELEMENTS & el.inner_html.scan(/\<(\w+)/).flatten).empty?
                        el.name = "p"
                      elsif el.name != "span"
                        el.name = "div"
                      end

                      # Any SPANs that aren't within paragraphs can become paragraphs too
                      el.name = "p" if el.name == "span" && !el.path.scan(/[a-z]+/).include?('p')

                      # Drop the element when any word of its id/class is in FATAL_WORDS
                      el.remove if (FATAL_WORDS & (el['id'].to_s + ' ' + el['class'].to_s).downcase.strip.scan(/[a-z]+/)).size > 0
                    end

                    analyze
                  end
         
     | 
| 
      
 102 
     | 
    
         
            +
                
         
     | 
| 
      
 103 
     | 
    
         
            +
                  # Return the content from best match number of index (default 0) and, optionally, clean it to plain-text
         
     | 
| 
      
 104 
     | 
    
         
            +
                  def content(clean = false, index = 0)
         
     | 
| 
      
 105 
     | 
    
         
            +
                    return @content[[clean, index]] if @content[[clean, index]]
         
     | 
| 
      
 106 
     | 
    
         
            +
                    return '' if !@content_candidates || @content_candidates.empty?
         
     | 
| 
      
 107 
     | 
    
         
            +
                    
         
     | 
| 
      
 108 
     | 
    
         
            +
                    content_branch = content_at(index)
         
     | 
| 
      
 109 
     | 
    
         
            +
                    orphans_to_remove = []
         
     | 
| 
      
 110 
     | 
    
         
            +
                    
         
     | 
| 
      
 111 
     | 
    
         
            +
                    #ap content_branch.to_html
         
     | 
| 
      
 112 
     | 
    
         
            +
                    #exit
         
     | 
| 
      
 113 
     | 
    
         
            +
                    
         
     | 
| 
      
 114 
     | 
    
         
            +
                    # Go through every piece of the content and rip out sections that contain too many tags compared to words
         
     | 
| 
      
 115 
     | 
    
         
            +
                    # This is usually indicative of "widgets" or link bar sections
         
     | 
| 
      
 116 
     | 
    
         
            +
                    content_branch.css('*').each_with_index do |el, i|
         
     | 
| 
      
 117 
     | 
    
         
            +
                      next unless el
         
     | 
| 
      
 118 
     | 
    
         
            +
             
     | 
| 
      
 119 
     | 
    
         
            +
                      if el.name == "h1"
         
     | 
| 
      
 120 
     | 
    
         
            +
                        el.remove
         
     | 
| 
      
 121 
     | 
    
         
            +
                        next
         
     | 
| 
      
 122 
     | 
    
         
            +
                      end
         
     | 
| 
      
 123 
     | 
    
         
            +
             
     | 
| 
      
 124 
     | 
    
         
            +
                      if el.name == "h2" && content_branch.inner_html.scan('<h2').size == 1
         
     | 
| 
      
 125 
     | 
    
         
            +
                        el.remove
         
     | 
| 
      
 126 
     | 
    
         
            +
                      end
         
     | 
| 
      
 127 
     | 
    
         
            +
             
     | 
| 
      
 128 
     | 
    
         
            +
                      # Remove elements that contain words but there are more tags than words overall
         
     | 
| 
      
 129 
     | 
    
         
            +
                      # First, count the words
         
     | 
| 
      
 130 
     | 
    
         
            +
                      #word_count = 0
         
     | 
| 
      
 131 
     | 
    
         
            +
                      #el.traverse do |subel|
         
     | 
| 
      
 132 
     | 
    
         
            +
                      #  if subel.text? && subel.path !~ /\/a\// && subel.path !~ /\/(h1|h2|h3|h4|h5|h6)\//
         
     | 
| 
      
 133 
     | 
    
         
            +
                      #    word_count += (subel.text.downcase.scan(/[a-z]{4,}/) - META_WORDS).size
         
     | 
| 
      
 134 
     | 
    
         
            +
                      #  end
         
     | 
| 
      
 135 
     | 
    
         
            +
                      #end
         
     | 
| 
      
 136 
     | 
    
         
            +
                      #
         
     | 
| 
      
 137 
     | 
    
         
            +
                      ## .. then count the tags
         
     | 
| 
      
 138 
     | 
    
         
            +
                      #
         
     | 
| 
      
 139 
     | 
    
         
            +
                      #inner_tags = el.inner_html.scan(/\<\w.*?\>/).size
         
     | 
| 
      
 140 
     | 
    
         
            +
                      #if word_count < inner_tags && inner_tags > 3 && word_count < 250
         
     | 
| 
      
 141 
     | 
    
         
            +
                      #  puts "At #{el.name} #{el['id']} #{el['class']} containing '#{el.text[0..20]}' we have #{word_count} valid words to #{el.inner_html.scan(/\<\w.*?\>/).size} tags"
         
     | 
| 
      
 142 
     | 
    
         
            +
                      #  #puts "Removing #{el.name} #{el['id']} #{el['class']} TOO MANY TAGS FOR WORDS"
         
     | 
| 
      
 143 
     | 
    
         
            +
                      #  el.remove
         
     | 
| 
      
 144 
     | 
    
         
            +
                      #  next
         
     | 
| 
      
 145 
     | 
    
         
            +
                      #end
         
     | 
| 
      
 146 
     | 
    
         
            +
             
     | 
| 
      
 147 
     | 
    
         
            +
                      # If there are at least 2 words and a third of them are "meta words," remove the element
         
     | 
| 
      
 148 
     | 
    
         
            +
                      #inner_words = el.text.to_s.downcase.scan(/[a-z]{3,}/)
         
     | 
| 
      
 149 
     | 
    
         
            +
                      #if BLOCK_OUTPUT_ELEMENTS.include?(el.name) && inner_words.size >= 2
         
     | 
| 
      
 150 
     | 
    
         
            +
                      #  if ((inner_words & META_WORDS).size >= (inner_words.size / 3))
         
     | 
| 
      
 151 
     | 
    
         
            +
                      #    el.remove
         
     | 
| 
      
 152 
     | 
    
         
            +
                      #  end
         
     | 
| 
      
 153 
     | 
    
         
            +
                      #end
         
     | 
| 
      
 154 
     | 
    
         
            +
             
     | 
| 
      
 155 
     | 
    
         
            +
                      if el.text && el.text.strip.length < 3 && !%w{img}.include?(el.name) && el.inner_html !~ /\<img/
         
     | 
| 
      
 156 
     | 
    
         
            +
                        el.remove
         
     | 
| 
      
 157 
     | 
    
         
            +
                        next
         
     | 
| 
      
 158 
     | 
    
         
            +
                      end
         
     | 
| 
      
 159 
     | 
    
         
            +
             
     | 
| 
      
 160 
     | 
    
         
            +
                      if el.name == "p" && el.text !~ /(\.|\?|\!|\"|\')(\s|$)/ && el.inner_html !~ /\<img/
         
     | 
| 
      
 161 
     | 
    
         
            +
                        el.remove
         
     | 
| 
      
 162 
     | 
    
         
            +
                        next
         
     | 
| 
      
 163 
     | 
    
         
            +
                      end
         
     | 
| 
      
 164 
     | 
    
         
            +
             
     | 
| 
      
 165 
     | 
    
         
            +
                      # If the ID or class of the element contains a fatally bad word, get rid of it
         
     | 
| 
      
 166 
     | 
    
         
            +
                      if (BAD_WORDS & (el['id'].to_s + ' ' + el['class'].to_s).downcase.scan(/[a-z]+/)).length > 0
         
     | 
| 
      
 167 
     | 
    
         
            +
                        #puts "Removing #{el.name} #{el['id']} #{el['class']} BAD"
         
     | 
| 
      
 168 
     | 
    
         
            +
                        el.remove
         
     | 
| 
      
 169 
     | 
    
         
            +
                        next
         
     | 
| 
      
 170 
     | 
    
         
            +
                      end
         
     | 
| 
      
 171 
     | 
    
         
            +
                    end
         
     | 
| 
      
 172 
     | 
    
         
            +
             
     | 
| 
      
 173 
     | 
    
         
            +
                    # If a title was found early in the result document but had text before it, remove that text - it's probably crap
         
     | 
| 
      
 174 
     | 
    
         
            +
                    orphans_to_remove.each { |el| el.remove }
         
     | 
| 
      
 175 
     | 
    
         
            +
                    
         
     | 
| 
      
 176 
     | 
    
         
            +
                    # Clean up the HTML again - Nokogiri outputs it with full doctype and crap
         
     | 
| 
      
 177 
     | 
    
         
            +
                    clean_html = strip(Sanitize.clean(content_branch.to_html, :elements => (clean ? BLOCK_OUTPUT_ELEMENTS : OUTPUT_ELEMENTS), :attributes => (clean ? OK_CLEAN_ATTRIBUTES : OK_ATTRIBUTES)))
         
     | 
| 
      
 178 
     | 
    
         
            +
                    
         
     | 
| 
      
 179 
     | 
    
         
            +
                    # If the content is desired as "clean" (i.e. plain-text), do some quick fix-ups
         
     | 
| 
      
 180 
     | 
    
         
            +
                    if clean
         
     | 
| 
      
 181 
     | 
    
         
            +
                      # Get rid of line break tags, make list items look nice, remove all other HTML tags, and clean up spaces and newlines
         
     | 
| 
      
 182 
     | 
    
         
            +
                      clean_html.gsub!(/<br.*?>/, "\n")
         
     | 
| 
      
 183 
     | 
    
         
            +
                      clean_html.gsub!(/<li>/, '* ')
         
     | 
| 
      
 184 
     | 
    
         
            +
                      clean_html.gsub!(/<\w+>/, '')
         
     | 
| 
      
 185 
     | 
    
         
            +
                      clean_html.gsub!(/<\/\w+>/, "\n")
         
     | 
| 
      
 186 
     | 
    
         
            +
                      clean_html.gsub!(/\ +/, ' ')
         
     | 
| 
      
 187 
     | 
    
         
            +
                      clean_html.gsub!(/^\s+\n/, "\n")
         
     | 
| 
      
 188 
     | 
    
         
            +
                      clean_html.gsub!(/\n{2,}/, "\n")
         
     | 
| 
      
 189 
     | 
    
         
            +
                      clean_html.strip!
         
     | 
| 
      
 190 
     | 
    
         
            +
                    end
         
     | 
| 
      
 191 
     | 
    
         
            +
                    
         
     | 
| 
      
 192 
     | 
    
         
            +
                    # If tags butt up against each other across lines, remove the line break(s)
         
     | 
| 
      
 193 
     | 
    
         
            +
                    clean_html.gsub!(/\>\n+\</, '><')
         
     | 
| 
      
 194 
     | 
    
         
            +
             
     | 
| 
      
 195 
     | 
    
         
            +
                    # Get rid of images whose sources are relative (TODO: Make this optional)
         
     | 
| 
      
 196 
     | 
    
         
            +
                    clean_html.gsub!(/\<img .*?\>/i) do |img_tag|
         
     | 
| 
      
 197 
     | 
    
         
            +
                      img_tag =~ /\Whttp/ ? img_tag : ''
         
     | 
| 
      
 198 
     | 
    
         
            +
                    end
         
     | 
| 
      
 199 
     | 
    
         
            +
             
     | 
| 
      
 200 
     | 
    
         
            +
                    # Remove empty tags
         
     | 
| 
      
 201 
     | 
    
         
            +
                    clean_html.gsub!(/<(\w+)><\/\1>/, "")
         
     | 
| 
      
 202 
     | 
    
         
            +
             
     | 
| 
      
 203 
     | 
    
         
            +
                    # Just a messy, hacky way to make output look nicer with subsequent paragraphs..
         
     | 
| 
      
 204 
     | 
    
         
            +
                    clean_html.gsub!(/<\/(div|p|h1|h2|h3|h4|h5|h6)>/, '</\1>' + "\n\n")
         
     | 
| 
      
 205 
     | 
    
         
            +
                    
         
     | 
| 
      
 206 
     | 
    
         
            +
                    @content[[clean, index]] = clean_html
         
     | 
| 
      
 207 
     | 
    
         
            +
                  end
         
     | 
| 
      
 208 
     | 
    
         
            +
                      
         
     | 
| 
      
 209 
     | 
    
         
            +
                  # Returns up to +qty+ leading sentences of the extracted content as an
                  # Array of Strings.
                  #
                  # The content is first re-sanitized so that only NON_HEADER_ELEMENTS remain
                  # and heading elements (h1-h6) are dropped together with their text. The text
                  # nodes of the result are then gathered, one per line, and split into
                  # sentences on '.', '?' or '!'.
                  #
                  # qty - maximum number of sentences to return (default 3).
                  def sentences(qty = 3)
                    clean_content = Sanitize.clean(content, :elements => NON_HEADER_ELEMENTS, :attributes => OK_CLEAN_ATTRIBUTES, :remove_contents => %w{h1 h2 h3 h4 h5 h6})

                    fragments = []
                    doc = Nokogiri::HTML(clean_content, nil, 'utf-8')

                    doc.traverse do |el|
                      # Element names along the node's path, minus the first two segments
                      # (presumably the html/body wrapper Nokogiri adds - confirm).
                      path_segments = el.path.scan(/[a-z]+/)[2..-1]
                      next unless path_segments && path_segments.length > 1
                      next unless el.text?

                      text = el.text.strip

                      # Text nodes shorter than three characters are noise; drop them from the tree.
                      if text.length < 3
                        el.remove
                        next
                      end

                      # Only keep text whose parent element is one of the non-header elements.
                      next unless NON_HEADER_ELEMENTS.include?(path_segments[-2])

                      # Make sure every fragment ends like a sentence so the scan below can split it.
                      text = "#{text}." if text !~ /[\.\!\?\"\']$/
                      fragments << text
                    end

                    fodder = fragments.map { |fragment| "#{fragment}\n" }.join

                    # Too little text found: fall back to the plain-text rendering of the content.
                    # to_s is a guard in case content(true) yields nil.
                    fodder = content(true).to_s if fodder.length < 50

                    # Deliberately NOT gsub!: after the fallback above, fodder is the very String
                    # handed back by content(true). The content method stores the string it
                    # returns in @content, so editing it in place would corrupt that stored copy
                    # for every later caller.
                    fodder = fodder.gsub(/\b\w\W\s/, '')

                    #sentences = fodder.scan(/([\&\w\s\-\'\,\+\.\/\\\:\#\(\)\=\"\?\!]+?[\.\?\!])(\s|\Z)/im).map { |s| s.first }
                    found = fodder.scan(/(.+?[\.\?\!])(\s|\Z)/im).map do |match|
                      sentence = match.first.strip
                      # Trim junk characters from the start and end, then collapse whitespace runs.
                      sentence = sentence.sub(/^[^\"\'a-z0-9\(\[]+/im, '')
                      sentence = sentence.sub(/[^a-z0-9\'\"\)\]\.\!\:\?]+$/im, '')
                      sentence.gsub(/\s+/m, ' ')
                    end

                    found.first(qty)
                  end
         
     | 
| 
      
 242 
     | 
    
         
            +
                  
         
     | 
| 
      
 243 
     | 
    
         
            +
                  # Returns the +src+ attribute of the first +qty+ <img> elements found in the
                  # extracted content, in document order.
                  #
                  # qty - maximum number of image sources to return (default 3). Zero or a
                  #       negative number yields an empty Array; nil means "no limit".
                  #
                  # An entry is nil when the corresponding <img> has no src attribute.
                  def images(qty = 3)
                    # The previous loop only tested the limit after pushing an element, so a
                    # limit of 0 (or less) was never reached and every image was returned.
                    return [] if qty && qty <= 0

                    doc = Nokogiri::HTML(content, nil, 'utf-8')
                    sources = doc.css("img").map { |img| img['src'] }
                    qty ? sources.first(qty) : sources
                  end
         
     | 
| 
      
 252 
     | 
    
         
            +
                  
         
     | 
| 
      
 253 
     | 
    
         
            +
                  # Remove leading and trailing spaces on lines throughout a string (a bit like String#strip, but for multi-lines)
         
     | 
| 
      
 254 
     | 
    
         
            +
                  # Line-wise counterpart of String#strip: deletes whitespace at the start of
                  # every line, then whitespace at the end of every line. Because \s also
                  # matches newlines, blank lines are removed as well. Returns a new String;
                  # the argument is left untouched.
                  def strip(s)
                    [/^\s+/, /\s+$/].inject(s) { |text, pattern| text.gsub(pattern, '') }
                  end
         
     | 
| 
      
 257 
     | 
    
         
            +
                end  
         
     | 
| 
      
 258 
     | 
    
         
            +
              end
         
     | 
| 
      
 259 
     | 
    
         
            +
            end
         
     |