pismo 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +19 -28
- data/NOTICE +4 -0
- data/README.markdown +37 -40
- data/Rakefile +3 -2
- data/VERSION +1 -1
- data/bin/pismo +15 -7
- data/lib/pismo/document.rb +2 -2
- data/lib/pismo/internal_attributes.rb +23 -16
- data/lib/pismo/reader.rb +390 -0
- data/lib/pismo.rb +3 -2
- data/pismo.gemspec +23 -15
- data/test/corpus/bbcnews2.html +1575 -0
- data/test/corpus/gmane.html +138 -0
- data/test/corpus/metadata_expected.yaml +20 -5
- data/test/corpus/queness.html +919 -0
- data/test/corpus/reader_expected.yaml +45 -0
- data/test/corpus/tweet.html +360 -0
- data/test/corpus/zefrank.html +535 -0
- data/test/test_corpus.rb +9 -1
- metadata +89 -34
- data/lib/pismo/readability.rb +0 -342
- data/test/test_readability.rb +0 -152
    
        data/lib/pismo/reader.rb
    ADDED
    
    | @@ -0,0 +1,390 @@ | |
| 1 | 
            +
            require 'nokogiri'
         | 
| 2 | 
            +
            require 'sanitize'
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            module Pismo
         | 
| 5 | 
            +
              module Reader
         | 
| 6 | 
            +
                class Document
         | 
| 7 | 
            +
                  attr_reader :raw_content, :doc, :content_candidates
         | 
| 8 | 
            +
                  
         | 
| 9 | 
            +
                  # Tags that survive the first (/input/) sanitization pass; everything else is dropped
                  OK_ELEMENTS = %w{a td br th tbody table tr div span img strong em b i body html head title p h1 h2 h3 h4 h5 h6 pre code tt ul li ol blockquote font big small section article abbr audio video cite dd dt figure caption sup form dl dt dd}

                  # Sanitize takes its attribute whitelist per element, so each allowed tag is mapped to its
                  # own copy of the attribute list. OK_CLEAN_ATTRIBUTES is the narrower variant.
                  OK_ATTRIBUTES = OK_ELEMENTS.inject({}) do |table, tag|
                    table[tag] = %w{id class href name content type alt title src}
                    table
                  end
                  OK_CLEAN_ATTRIBUTES = OK_ELEMENTS.inject({}) do |table, tag|
                    table[tag] = %w{href title src alt}
                    table
                  end

                  # Class/ID tokens that hint at real "content"
                  GOOD_WORDS = %w{content post blogpost main story body entry text desc asset hentry single entrytext postcontent bodycontent}.uniq

                  # Class/ID tokens that hint at boilerplate in general
                  BAD_WORDS = %w{reply metadata options commenting comments comment about footer header outer credit sidebar widget subscribe clearfix date social bookmarks links share video watch excerpt related supplement accessibility offscreen meta title signup blq secondary feedback featured clearfix small job jobs listing listings navigation nav byline addcomment postcomment trackback neighbor snap nopreview ads commentform fbfans login similar thumb link blogroll grid twitter wrapper container nav sitesub printfooter editsection visualclear catlinks hidden toc contentsub caption disqus rss shoutbox sponsor}.uniq

                  # Class/ID tokens that get an element removed outright
                  FATAL_WORDS = %w{comments comment bookmarks social links ads related similar footer digg totop metadata sitesub nav sidebar commenting options addcomment leaderboard offscreen job prevlink prevnext navigation reply-link hide hidden sidebox archives vcard}

                  # Words that are discounted when counting the words of a text node
                  META_WORDS = %w{january february march april may june july august september october november december jan feb mar apr may jun jul aug sep oct nov dec st th rd nd comments written posted on at published 2000 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 updated last gmt est pst pdt edt cet cdt cst article feature featured filed under comment comments follow twitter facebook email e-mail register story continue continues reading read inside more page next related response responses respond contact street phone tel email e-mail fax info tags tagged tag thanks credit creative commons copy nbsp lt gt this friend printable version subscribe rss mail follow twitter article via leave}.uniq

                  # Element names grouped by whether they can plausibly wrap the whole article
                  WONT_CONTAIN_FULL_CONTENT = %w{h1 h2 h3 h4 h5 h6 h6 li ol ul br a img meta cite strong em i b input head small big code title sup sub dd dt}
                  COULD_CONTAIN_FULL_CONTENT = %w{body div p table tr td article pre blockquote tbody section}

                  ## Output sanitization element sets
                  BLOCK_OUTPUT_ELEMENTS = %w{div p h2 h3 h4 h5 h6 li dl pre ul ol blockquote section article audio video cite dd dt figure caption br table tr td thead tbody tfoot}
                  INLINE_OUTPUT_ELEMENTS = %w{a img b strong em i br code sup font small big dd dt}
                  OUTPUT_ELEMENTS = BLOCK_OUTPUT_ELEMENTS + INLINE_OUTPUT_ELEMENTS
                  NON_HEADER_ELEMENTS = %w{p br}
         | 
| 38 | 
            +
              
         | 
| 39 | 
            +
                  # Wrap a raw HTML string in a reader document. The string is stored and then
                  # handed straight to build_doc, which sanitizes, parses and scores it.
                  #
                  # raw_content - the HTML source of the page
                  def initialize(raw_content)
                    @raw_content = raw_content

                    build_doc
                  end
         | 
| 44 | 
            +
              
         | 
| 45 | 
            +
                  # Prepare the raw HTML for analysis: repair its encoding, normalize whitespace, strip
                  # scripts, run it through Sanitize with the input whitelist, parse it with Nokogiri into
                  # @doc and finally score it via build_analysis_tree.
                  #
                  # Resets the per-document content cache (@content) and replaces @raw_content with the
                  # sanitized markup. Returns whatever build_analysis_tree returns.
                  def build_doc
                    @content = {}

                    # Work on a private, unfrozen copy: every clean-up step below edits the string in place,
                    # which would otherwise rewrite the caller's string (or raise on a frozen one).
                    @raw_content = @raw_content.dup

                    if RUBY_VERSION > "1.9"
                      # String#encoding returns an Encoding object, so compare against Encoding::UTF_8;
                      # a comparison with the String "UTF-8" is never equal.
                      @raw_content.encode!("UTF-8", :invalid => :replace, :replace => '?') if @raw_content.encoding != Encoding::UTF_8

                      unless @raw_content.valid_encoding?
                        if @raw_content.respond_to?(:scrub!)
                          # Replaces only the broken byte sequences and keeps valid multibyte characters
                          @raw_content.scrub!('?')
                        else
                          # Rubies without String#scrub!: :undef is needed too, otherwise any valid
                          # non-ASCII character raises Encoding::UndefinedConversionError here
                          @raw_content.encode!("ASCII-8BIT", :invalid => :replace, :undef => :replace, :replace => '?')
                        end
                      end
                    end

                    # Normalize whitespace (as much to make debugging sessions look nice as anything else)
                    @raw_content.gsub!(/\s{2,}/, ' ')
                    @raw_content.gsub!(/\r/, "\n")
                    @raw_content.gsub!(/\n{3,}/, "\n\n")
                    # Runs of two or more <br> tags are treated as paragraph breaks
                    @raw_content.gsub!(/(\<br(\s\/)?\>){2,}/, "</p><p>")

                    # Remove scripts manually, Sanitize and/or Nokogiri seem to go a bit funny with them.
                    # [\s>] also catches a bare "<script>" and a tag name followed by a newline.
                    @raw_content.gsub!(/\<script[\s>].*?\<\/script\>/im, '')

                    # Sanitize the HTML
                    @raw_content = Sanitize.clean(@raw_content,
                      :elements => OK_ELEMENTS,
                      :attributes => OK_ATTRIBUTES,
                      :remove_contents => true,
                      :output_encoding => 'utf-8'
                    )

                    @doc = Nokogiri::HTML(@raw_content, nil, 'utf-8')

                    build_analysis_tree
                  end
         | 
| 74 | 
            +
                
         | 
| 75 | 
            +
                  
         | 
| 76 | 
            +
                  # Analyze the structure of the HTML document and score branches for likelihood of
                  # containing useful content.
                  #
                  # Side effects: renames and removes nodes in @doc, fills @tree (XPath => branch statistics)
                  # and sets @content_candidates to at most five [path, branch] pairs, highest score first.
                  # Returns @content_candidates.
                  def build_analysis_tree
                    @tree = {}
                    # Per-node statistics keyed by XPath, shared across branches so that nested
                    # candidates do not re-tokenize the same nodes
                    subels = {}

                    # Text directly inside links and headings contributes no "good" words.
                    # This must be a word array (%w): on a plain string, include? is only a substring test.
                    unweighted_parents = %w{a h1 h2 h3 h4 h5 h6}

                    # Do a pre clean up of elements.
                    @doc.css("div, span, table, tr, td, pre").each do |el|
                      # Any block elements with no child block elements can become paragraphs
                      if (BLOCK_OUTPUT_ELEMENTS & el.inner_html.scan(/\<(\w+)/).flatten).empty?
                        el.name = "p"
                      elsif el.name != "span"
                        el.name = "div"
                      end

                      # Any SPANs that aren't within paragraphs can become paragraphs too
                      el.name = "p" if el.name == "span" && !el.path.scan(/[a-z]+/).include?('p')

                      # A fatal word in the id/class removes the element and everything below it
                      el.remove if (FATAL_WORDS & (el['id'].to_s + ' ' + el['class'].to_s).downcase.strip.scan(/[a-z]+/)).size > 0
                    end

                    @doc.css(COULD_CONTAIN_FULL_CONTENT.join(", ")).each do |el|
                      el_text = el.text.to_s

                      # Assume that no content we'll want comes in a total package of fewer than 80 characters!
                      next unless el_text.strip.length >= 80

                      ids = (el['id'].to_s + ' ' + el['class'].to_s).downcase.strip.scan(/[a-z]+/)
                      path_segments = el.path.scan(/[a-z]+/)[2..-1] || []
                      depth = path_segments.length

                      branch = {}
                      branch[:ids] = ids
                      branch[:score] = -(BAD_WORDS & ids).size
                      branch[:score] += (GOOD_WORDS & ids).size
                      next if branch[:score] < 0

                      # Elements that have an ID or class are more likely to be our winners
                      branch[:score] += 2 unless ids.empty?

                      branch[:name] = el.name
                      branch[:depth] = depth
                      branch[:path] = el.path

                      branch[:raw_word_count] = 0
                      branch[:word_count] = 0
                      branch[:child_count] = 0
                      branch[:bad_child_count] = 0
                      branch[:score_steps] = []

                      el.traverse do |subel|
                        path = subel.path
                        subels[path] ||= {}
                        subels[path][:path_segments] ||= (path.scan(/[a-z]+/)[2..-1] || [])
                        subels[path][:is_text] ||= subel.text?

                        if subels[path][:is_text]
                          subels[path][:text] ||= subel.text.downcase.scan(/[a-z]+/)
                          next if subels[path][:text].empty?

                          subels[path][:raw_word_count] ||= subels[path][:text].size
                          # Only words longer than three letters count, and none at all inside links/headings
                          subels[path][:word_count] ||= (unweighted_parents.include?(subel.parent.name) ? 0 : subels[path][:text].select { |word| word.length > 3 }.size)
                          subels[path][:meta_matches] ||= (subels[path][:text] & META_WORDS).size

                          branch[:raw_word_count] += subels[path][:raw_word_count]
                          branch[:word_count] += subels[path][:word_count] - subels[path][:meta_matches]
                        end

                        # NOTE(review): non-letters are blanked *before* downcasing here, so an uppercase
                        # letter splits the token ("mainContent" => "main", "ontent"), unlike the
                        # branch-level ids above. Left as is because changing it shifts the scores.
                        subels[path][:ids] ||= (subel['id'].to_s + ' ' + subel['class'].to_s).gsub(/[^a-z]/, ' ').downcase.strip.split(/\s+/)
                        subels[path][:bad_child_count_inc] = (BAD_WORDS & subels[path][:ids]).size - (GOOD_WORDS & subels[path][:ids]).size
                        subels[path][:child_count_inc] = subels[path][:ids].empty? ? 0 : 1

                        branch[:bad_child_count] += subels[path][:bad_child_count_inc]
                        branch[:child_count] += subels[path][:child_count_inc]
                      end

                      comma_count = el_text.scan(/\,\s/).size

                      branch[:score] += 2 if branch[:name] == "div"
                      branch[:score] += 4 if comma_count > 10
                      branch[:score_steps] << "lots of commas!" if comma_count > 5
                      branch[:score] *= 3

                      branch[:score] *= 0.7 if el.children && el.children.size < 3
                      branch[:score] *= 1.25 if branch[:raw_word_count] > 10
                      # Branches with fewer than ten words are never candidates
                      next if branch[:raw_word_count] < 10
                      branch[:score] += [branch[:word_count], 1].max ** 0.5

                      word_child_count_ratio = branch[:word_count].to_f / [branch[:child_count], 1].max
                      branch[:word_child_count_ratio] = word_child_count_ratio

                      if branch[:raw_word_count] > 100
                        good_word_ratio = branch[:word_count].to_f / branch[:raw_word_count]
                        branch[:score] += good_word_ratio * 12

                        # Many words per id/class-bearing child raises the score, very few lowers it
                        if word_child_count_ratio > 50
                          branch[:score] *= 1.5
                        elsif word_child_count_ratio > 30
                          branch[:score] *= 1.2
                        elsif word_child_count_ratio > 15
                          branch[:score] *= 1.1
                        elsif word_child_count_ratio < 4
                          branch[:score] *= 0.9
                        end
                      end

                      branch[:score_steps] << "s1: #{branch[:score]}"

                      # The penalties below are cumulative: a ratio above 0.7 triggers all five deductions
                      bad_child_ratio = branch[:bad_child_count].to_f / [branch[:child_count], 1].max
                      branch[:bad_child_ratio] = bad_child_ratio
                      branch[:score] += 3 if bad_child_ratio < 0.0
                      branch[:score] -= 3 if bad_child_ratio > 0.15
                      branch[:score] -= 2 if bad_child_ratio > 0.25
                      branch[:score] -= 2 if bad_child_ratio > 0.4
                      branch[:score] -= 4 if bad_child_ratio > 0.5
                      branch[:score] -= 5 if bad_child_ratio > 0.7
                      branch[:score] -= 5 if branch[:bad_child_count] > 20

                      # Deeper branches get a bonus equal to their depth
                      branch[:score] += depth

                      @tree[el.path] = branch
                    end

                    sorted_tree = @tree.sort_by { |_path, stats| stats[:score] }

                    # Sort the branches by their score in reverse order and keep the best five
                    # (Array#first already stops at the array's length)
                    @content_candidates = sorted_tree.reverse.first(5)
                  end
         | 
| 219 | 
            +
                  
         | 
| 220 | 
            +
                  
         | 
| 221 | 
            +
                  # Return the content from best match number of index (default 0) and, optionally, clean it
                  # to plain-text.
                  #
                  # clean - when true, only block elements and OK_CLEAN_ATTRIBUTES are kept and the remaining
                  #         tags are then flattened into plain text; when false, HTML is returned
                  # index - position in @content_candidates (0 is the highest-scoring branch)
                  #
                  # Returns a String; '' when there are no candidates or none at that index. Non-empty
                  # results are cached per [clean, index] pair in @content.
                  def content(clean = false, index = 0)
                    cache_key = [clean, index]
                    return @content[cache_key] if @content[cache_key]
                    return '' unless @content_candidates && !@content_candidates.empty?

                    candidate = @content_candidates[index]
                    return '' unless candidate

                    source_branch = @doc.at(candidate.first)
                    return '' unless source_branch

                    # Prune a deep copy rather than @doc itself. Removing nodes from the shared document
                    # makes the result depend on which variants were requested earlier and can shift the
                    # positional XPaths stored for the other candidates.
                    content_branch = source_branch.dup

                    content_branch.css('*').each do |el|
                      next unless el

                      if el.name == "h1"
                        el.remove
                        next
                      end

                      # A lone H2 is dropped as well
                      if el.name == "h2" && content_branch.inner_html.scan('<h2').size == 1
                        el.remove
                        next
                      end

                      # Drop elements that are (almost) empty, unless they are or contain an image
                      if el.text && el.text.strip.length < 3 && el.name != "img" && el.inner_html !~ /\<img/
                        el.remove
                        next
                      end

                      # Drop paragraphs with no sentence-ending full stop, unless they contain an image
                      if el.name == "p" && el.text !~ /\.(\s|$)/ && el.inner_html !~ /\<img/
                        el.remove
                        next
                      end

                      # If the ID or class of the element contains a bad word (BAD_WORDS), get rid of it
                      if (BAD_WORDS & (el['id'].to_s + ' ' + el['class'].to_s).downcase.scan(/[a-z]+/)).length > 0
                        el.remove
                        next
                      end
                    end

                    # Clean up the HTML again - Nokogiri outputs it with full doctype and crap
                    clean_html = strip(Sanitize.clean(content_branch.to_html, :elements => (clean ? BLOCK_OUTPUT_ELEMENTS : OUTPUT_ELEMENTS), :attributes => (clean ? OK_CLEAN_ATTRIBUTES : OK_ATTRIBUTES)))

                    # If the content is desired as "clean" (i.e. plain-text), do some quick fix-ups
                    if clean
                      # Get rid of line break tags, make list items look nice, remove all other HTML tags, and clean up spaces and newlines
                      clean_html.gsub!(/<br.*?>/, "\n")
                      clean_html.gsub!(/<li>/, '* ')
                      clean_html.gsub!(/<\w+>/, '')
                      clean_html.gsub!(/<\/\w+>/, "\n")
                      clean_html.gsub!(/\ +/, ' ')
                      clean_html.gsub!(/^\s+\n/, "\n")
                      clean_html.gsub!(/\n{2,}/, "\n")
                      clean_html.strip!
                    end

                    # If tags butt up against each other across lines, remove the line break(s)
                    clean_html.gsub!(/\>\n+\</, '><')

                    # Get rid of images whose sources are relative (TODO: Make this optional)
                    clean_html.gsub!(/\<img .*?\>/i) do |img_tag|
                      img_tag =~ /\Whttp/ ? img_tag : ''
                    end

                    # Remove empty tags
                    clean_html.gsub!(/<(\w+)><\/\1>/, "")

                    # Just a messy, hacky way to make output look nicer with subsequent paragraphs..
                    clean_html.gsub!(/<\/(div|p|h1|h2|h3|h4|h5|h6)>/, '</\1>' + "\n\n")

                    # Replace the thin space and "smart" quotes with plain ASCII. The patterns are binary
                    # (/n) regexps so they match the byte-tagged string on every Ruby version; String
                    # patterns written with \x escapes are UTF-8 on Ruby 2.0+ and are incompatible with a
                    # non-ASCII binary string.
                    clean_html.force_encoding("ASCII-8BIT") if RUBY_VERSION > "1.9"
                    clean_html.gsub!(/\xe2\x80\x89/n, " ")
                    clean_html.gsub!(/\xe2\x80\x99/n, "'")
                    clean_html.gsub!(/\xe2\x80\x98/n, "'")
                    clean_html.gsub!(/\xe2\x80\x9c/n, '"')
                    clean_html.gsub!(/\xe2\x80\x9d/n, '"')
                    clean_html.force_encoding("UTF-8") if RUBY_VERSION > "1.9"

                    @content[cache_key] = clean_html
                  end
         | 
| 338 | 
            +
                      
         | 
| 339 | 
            +
                  # Returns up to +qty+ sentences (an Array of Strings) pulled from the
                  # document's content.
                  #
                  # The content is sanitized down to NON_HEADER_ELEMENTS with the contents
                  # of h1-h6 removed, the remaining text nodes are joined one per line, and
                  # the result is split into sentences with a regular expression. Each
                  # sentence is trimmed of leading/trailing junk and has internal
                  # whitespace runs collapsed to a single space.
                  def sentences(qty = 3)
                    clean_content = Sanitize.clean(content, :elements => NON_HEADER_ELEMENTS, :attributes => OK_CLEAN_ATTRIBUTES, :remove_contents => %w{h1 h2 h3 h4 h5 h6})
                    doc = Nokogiri::HTML(clean_content, nil, 'utf-8')

                    fodder = ''
                    doc.traverse do |el|
                      # Lower-case names along the node's path, minus the first two
                      # segments (html/body for a Nokogiri::HTML document)
                      path_segments = el.path.scan(/[a-z]+/)[2..-1]
                      next unless path_segments && path_segments.length > 1
                      next unless el.text?

                      text = el.text.strip

                      # Text nodes with fewer than 3 visible characters are removed from the tree
                      if text.length < 3
                        el.remove
                        next
                      end

                      # Only use text whose parent element is one of the allowed non-header elements
                      next unless NON_HEADER_ELEMENTS.include?(path_segments[-2])

                      # Make sure every chunk ends like a sentence so the scan below can split on it
                      text += "." if text !~ /[\.\!\?\"\']$/
                      fodder += text + "\n"
                    end

                    # Too little text was gathered from the DOM: fall back to content(true)
                    fodder = content(true) if fodder.to_s.length < 50

                    # Deliberately the non-destructive gsub: after the fallback above,
                    # fodder can be the very string object held in the @content cache, and
                    # gsub! would silently alter the cached content for later callers.
                    fodder = fodder.gsub(/\b\w\W\s/, '')

                    found = fodder.scan(/([\&\w\s\-\'\,\+\.\/\\\:\#\(\)\=\"\?\!]+?[\.\?\!])(\s|\Z)/im).map { |s| s.first }

                    # Only the sentences that will actually be returned need tidying up
                    found.compact.first(qty).map do |s|
                      s = s.strip
                      s = s.sub(/^[^\"\'a-z0-9\(\[]+/im, '')
                      s = s.sub(/[^a-z0-9\'\"\)\]\.\!\:\?]+$/im, '')
                      s.gsub(/\s+/m, ' ')
                    end
                  end
         | 
| 373 | 
            +
                  
         | 
| 374 | 
            +
                  # Returns an Array of up to +qty+ image source strings taken, in
                  # document order, from the <img> elements found in the content.
                  #
                  # <img> elements without a src attribute (or with a blank one) are
                  # skipped, so the result never contains nil or empty entries and such
                  # elements do not count towards +qty+.
                  def images(qty = 3)
                    doc = Nokogiri::HTML(content, nil, 'utf-8')
                    sources = []
                    doc.css("img").each do |img|
                      src = img['src']
                      # Attribute lookup yields nil when the attribute is absent
                      next if src.nil? || src.strip.empty?
                      sources << src
                      break if sources.length == qty
                    end
                    sources
                  end
         | 
| 383 | 
            +
                  
         | 
| 384 | 
            +
                  # Remove leading and trailing whitespace from every line of a string (like String#strip, but applied per line); blank lines are dropped too, because \s also matches newlines
         | 
| 385 | 
            +
                  def strip(s)
                    # First pass: whitespace at the start of each line (the match may
                    # span newlines, so empty lines vanish as well)
                    without_leading = s.gsub(/^\s+/, '')

                    # Second pass: whitespace at the end of each line
                    without_leading.gsub(/\s+$/, '')
                  end
         | 
| 388 | 
            +
                end  
         | 
| 389 | 
            +
              end
         | 
| 390 | 
            +
            end
         | 
    
        data/lib/pismo.rb
    CHANGED
    
    | @@ -4,11 +4,12 @@ require 'open-uri' | |
| 4 4 | 
             
            require 'nokogiri'
         | 
| 5 5 | 
             
            require 'fast_stemmer'
         | 
| 6 6 | 
             
            require 'chronic'
         | 
| 7 | 
            +
            require 'sanitize'
         | 
| 7 8 | 
             
            require 'tempfile'
         | 
| 8 9 |  | 
| 9 10 | 
             
            $: << File.dirname(__FILE__)
         | 
| 10 11 | 
             
            require 'pismo/document'
         | 
| 11 | 
            -
            require 'pismo/ | 
| 12 | 
            +
            require 'pismo/reader'
         | 
| 12 13 |  | 
| 13 14 | 
             
            module Pismo
         | 
| 14 15 | 
             
              # Sugar methods to make creating document objects nicer
         | 
| @@ -20,7 +21,7 @@ module Pismo | |
| 20 21 | 
             
              # (mostly useful for debugging use)
         | 
| 21 22 | 
             
              def self.[](url)
         | 
| 22 23 | 
             
                @docs ||= {}
         | 
| 23 | 
            -
                @docs[url] ||= Pismo::Document.new( | 
| 24 | 
            +
                @docs[url] ||= Pismo::Document.new(url)
         | 
| 24 25 | 
             
              end
         | 
| 25 26 |  | 
| 26 27 |  | 
    
        data/pismo.gemspec
    CHANGED
    
    | @@ -5,11 +5,11 @@ | |
| 5 5 |  | 
| 6 6 | 
             
            Gem::Specification.new do |s|
         | 
| 7 7 | 
             
              s.name = %q{pismo}
         | 
| 8 | 
            -
              s.version = "0. | 
| 8 | 
            +
              s.version = "0.6.0"
         | 
| 9 9 |  | 
| 10 10 | 
             
              s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
         | 
| 11 11 | 
             
              s.authors = ["Peter Cooper"]
         | 
| 12 | 
            -
              s.date = %q{2010-06- | 
| 12 | 
            +
              s.date = %q{2010-06-20}
         | 
| 13 13 | 
             
              s.default_executable = %q{pismo}
         | 
| 14 14 | 
             
              s.description = %q{Pismo extracts and retrieves content-related metadata from HTML pages - you can use the resulting data in an organized way, such as a summary/first paragraph, body text, keywords, RSS feed URL, favicon, etc.}
         | 
| 15 15 | 
             
              s.email = %q{git@peterc.org}
         | 
| @@ -22,6 +22,7 @@ Gem::Specification.new do |s| | |
| 22 22 | 
             
                ".document",
         | 
| 23 23 | 
             
                 ".gitignore",
         | 
| 24 24 | 
             
                 "LICENSE",
         | 
| 25 | 
            +
                 "NOTICE",
         | 
| 25 26 | 
             
                 "README.markdown",
         | 
| 26 27 | 
             
                 "Rakefile",
         | 
| 27 28 | 
             
                 "VERSION",
         | 
| @@ -30,62 +31,69 @@ Gem::Specification.new do |s| | |
| 30 31 | 
             
                 "lib/pismo/document.rb",
         | 
| 31 32 | 
             
                 "lib/pismo/external_attributes.rb",
         | 
| 32 33 | 
             
                 "lib/pismo/internal_attributes.rb",
         | 
| 33 | 
            -
                 "lib/pismo/ | 
| 34 | 
            +
                 "lib/pismo/reader.rb",
         | 
| 34 35 | 
             
                 "lib/pismo/stopwords.txt",
         | 
| 35 36 | 
             
                 "pismo.gemspec",
         | 
| 36 37 | 
             
                 "test/corpus/bbcnews.html",
         | 
| 38 | 
            +
                 "test/corpus/bbcnews2.html",
         | 
| 37 39 | 
             
                 "test/corpus/briancray.html",
         | 
| 38 40 | 
             
                 "test/corpus/cant_read.html",
         | 
| 39 41 | 
             
                 "test/corpus/factor.html",
         | 
| 42 | 
            +
                 "test/corpus/gmane.html",
         | 
| 40 43 | 
             
                 "test/corpus/huffington.html",
         | 
| 41 44 | 
             
                 "test/corpus/metadata_expected.yaml",
         | 
| 42 45 | 
             
                 "test/corpus/metadata_expected.yaml.old",
         | 
| 46 | 
            +
                 "test/corpus/queness.html",
         | 
| 47 | 
            +
                 "test/corpus/reader_expected.yaml",
         | 
| 43 48 | 
             
                 "test/corpus/rubyinside.html",
         | 
| 44 49 | 
             
                 "test/corpus/rww.html",
         | 
| 45 50 | 
             
                 "test/corpus/spolsky.html",
         | 
| 46 51 | 
             
                 "test/corpus/techcrunch.html",
         | 
| 52 | 
            +
                 "test/corpus/tweet.html",
         | 
| 47 53 | 
             
                 "test/corpus/youtube.html",
         | 
| 54 | 
            +
                 "test/corpus/zefrank.html",
         | 
| 48 55 | 
             
                 "test/helper.rb",
         | 
| 49 56 | 
             
                 "test/test_corpus.rb",
         | 
| 50 | 
            -
                 "test/test_pismo_document.rb" | 
| 51 | 
            -
                 "test/test_readability.rb"
         | 
| 57 | 
            +
                 "test/test_pismo_document.rb"
         | 
| 52 58 | 
             
              ]
         | 
| 53 59 | 
             
              s.homepage = %q{http://github.com/peterc/pismo}
         | 
| 54 60 | 
             
              s.rdoc_options = ["--charset=UTF-8"]
         | 
| 55 61 | 
             
              s.require_paths = ["lib"]
         | 
| 56 | 
            -
              s.rubygems_version = %q{1.3. | 
| 62 | 
            +
              s.rubygems_version = %q{1.3.7}
         | 
| 57 63 | 
             
              s.summary = %q{Extracts or retrieves content-related metadata from HTML pages}
         | 
| 58 64 | 
             
              s.test_files = [
         | 
| 59 65 | 
             
                "test/helper.rb",
         | 
| 60 66 | 
             
                 "test/test_corpus.rb",
         | 
| 61 | 
            -
                 "test/test_pismo_document.rb" | 
| 62 | 
            -
                 "test/test_readability.rb"
         | 
| 67 | 
            +
                 "test/test_pismo_document.rb"
         | 
| 63 68 | 
             
              ]
         | 
| 64 69 |  | 
| 65 70 | 
             
              if s.respond_to? :specification_version then
         | 
| 66 71 | 
             
                current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
         | 
| 67 72 | 
             
                s.specification_version = 3
         | 
| 68 73 |  | 
| 69 | 
            -
                if Gem::Version.new(Gem:: | 
| 74 | 
            +
                if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
         | 
| 70 75 | 
             
                  s.add_development_dependency(%q<shoulda>, [">= 0"])
         | 
| 76 | 
            +
                  s.add_development_dependency(%q<awesome_print>, [">= 0"])
         | 
| 77 | 
            +
                  s.add_runtime_dependency(%q<jeweler>, [">= 0"])
         | 
| 71 78 | 
             
                  s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
         | 
| 72 | 
            -
                  s.add_runtime_dependency(%q< | 
| 73 | 
            -
                  s.add_runtime_dependency(%q<httparty>, [">= 0"])
         | 
| 79 | 
            +
                  s.add_runtime_dependency(%q<sanitize>, [">= 0"])
         | 
| 74 80 | 
             
                  s.add_runtime_dependency(%q<fast-stemmer>, [">= 0"])
         | 
| 75 81 | 
             
                  s.add_runtime_dependency(%q<chronic>, [">= 0"])
         | 
| 76 82 | 
             
                else
         | 
| 77 83 | 
             
                  s.add_dependency(%q<shoulda>, [">= 0"])
         | 
| 84 | 
            +
                  s.add_dependency(%q<awesome_print>, [">= 0"])
         | 
| 85 | 
            +
                  s.add_dependency(%q<jeweler>, [">= 0"])
         | 
| 78 86 | 
             
                  s.add_dependency(%q<nokogiri>, [">= 0"])
         | 
| 79 | 
            -
                  s.add_dependency(%q< | 
| 80 | 
            -
                  s.add_dependency(%q<httparty>, [">= 0"])
         | 
| 87 | 
            +
                  s.add_dependency(%q<sanitize>, [">= 0"])
         | 
| 81 88 | 
             
                  s.add_dependency(%q<fast-stemmer>, [">= 0"])
         | 
| 82 89 | 
             
                  s.add_dependency(%q<chronic>, [">= 0"])
         | 
| 83 90 | 
             
                end
         | 
| 84 91 | 
             
              else
         | 
| 85 92 | 
             
                s.add_dependency(%q<shoulda>, [">= 0"])
         | 
| 93 | 
            +
                s.add_dependency(%q<awesome_print>, [">= 0"])
         | 
| 94 | 
            +
                s.add_dependency(%q<jeweler>, [">= 0"])
         | 
| 86 95 | 
             
                s.add_dependency(%q<nokogiri>, [">= 0"])
         | 
| 87 | 
            -
                s.add_dependency(%q< | 
| 88 | 
            -
                s.add_dependency(%q<httparty>, [">= 0"])
         | 
| 96 | 
            +
                s.add_dependency(%q<sanitize>, [">= 0"])
         | 
| 89 97 | 
             
                s.add_dependency(%q<fast-stemmer>, [">= 0"])
         | 
| 90 98 | 
             
                s.add_dependency(%q<chronic>, [">= 0"])
         | 
| 91 99 | 
             
              end
         |