mechanize_content 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.rvmrc +1 -0
- data/Rakefile +5 -0
- data/lib/mechanize_content/image.rb +61 -0
- data/lib/mechanize_content/page.rb +115 -0
- data/lib/mechanize_content/util.rb +4 -31
- data/lib/mechanize_content/version.rb +1 -1
- data/lib/mechanize_content.rb +8 -168
- data/mechanize_content.gemspec +2 -1
- data/spec/cassettes/MechanizeContent.yml +33742 -0
- data/spec/cassettes/MechanizeContent_Image.yml +279 -0
- data/spec/mechanize_content/image_spec.rb +39 -0
- data/spec/mechanize_content/page_spec.rb +90 -0
- data/spec/mechanize_content_spec.rb +149 -0
- data/spec/spec_helper.rb +11 -1
- metadata +29 -45
- data/spec/fixtures/a-fistful-of-red-dead-redemption-ps3-for-a-few-dollars-less-on.html +0 -754
- data/spec/fixtures/another-world-15th-anniversary-edition-now-on-gog-com.html +0 -2416
- data/spec/fixtures/another_world_15th_anniversary_edition.html +0 -805
- data/spec/fixtures/cmp.html +0 -333
- data/spec/fixtures/episodes_from_liberty_city_now_coming_to_playstation_3_and_pc_this_april.html +0 -1593
- data/spec/fixtures/gdc_2010_rounds_off_indie_cove.html +0 -698
- data/spec/fixtures/google.html +0 -42
- data/spec/fixtures/gta-iv-episodes-from-liberty-city-sees-slight-delay-on-pc-and-ps3.html +0 -1012
- data/spec/fixtures/johnny.jpg +0 -0
- data/spec/fixtures/joystiq-xbox-usb-support-580.jpg +0 -0
- data/spec/fixtures/mutiny.html +0 -264
- data/spec/fixtures/nuff-said-good-old-games-gets-another-world-168150.html +0 -5492
- data/spec/fixtures/rock-band-3-out-this-holiday-will-revolutionize-genre.html +0 -1157
- data/spec/fixtures/rockband_facebook.html +0 -93
- data/spec/fixtures/spartan.html +0 -391
- data/spec/fixtures/techmeme.html +0 -2216
- data/spec/fixtures/time-warner-retail-egm.html +0 -49
- data/spec/fixtures/witcher.html +0 -458
- data/spec/fixtures/xbox-360-gaining-usb-storage-support-in-2010-update.html +0 -2462
- data/spec/mechanize-content_spec.rb +0 -202
    
        data/.rvmrc
    ADDED
    
    | @@ -0,0 +1 @@ | |
| 1 | 
            +
            rvm 1.9.2@mechanize_content
         | 
    
        data/Rakefile
    CHANGED
    
    
| @@ -0,0 +1,61 @@ | |
| 1 | 
            +
            module MechanizeContent
              # Wraps one <img> element found in a page and decides whether it is a
              # usable illustration: large enough, not an advert, and fetchable when a
              # foreign Referer header is sent.
              #
              # The element only has to answer ["src"], ["width"] and ["height"].
              class Image
                # An image must be strictly larger than these in both dimensions.
                MIN_WIDTH  = 64
                MIN_HEIGHT = 64
                # An image of exactly this size is treated as an advertising banner.
                AD_WIDTH = 728
                AD_HEIGHT = 90

                # Picks the first acceptable image and returns its absolute URL.
                #
                # Dimensions declared in the markup are tried first because they need no
                # download of the image body; only if no image qualifies that way are
                # the files themselves fetched and measured. The search stops at the
                # first match, so no further images are requested once one is found.
                #
                # images   - enumerable of <img> elements
                # base_url - URL (String or URI) that relative src values resolve against
                #
                # Returns the absolute URL String, or nil when no image qualifies.
                def self.best_image(images, base_url)
                  candidates = images.map { |element| new(element, base_url) }
                  best = candidates.find { |candidate| candidate.interesting_css? } ||
                         candidates.find { |candidate| candidate.interesting_file? }
                  best.absolute_url if best
                end

                # image    - the <img> element
                # base_url - URL that a relative src is resolved against
                def initialize(image, base_url)
                  # A missing src attribute becomes "" so later String calls cannot hit nil.
                  @src      = image["src"].to_s
                  @width    = image["width"].to_i
                  @height   = image["height"].to_i
                  @base_url = base_url
                end

                # True when the width/height attributes from the markup describe a valid image.
                def interesting_css?
                  valid_image?(@width, @height)
                end

                # True when the downloaded file's real dimensions describe a valid image.
                # Returns false (instead of raising) when the file cannot be fetched.
                def interesting_file?
                  result = with_remote_image do |io|
                    size = ImageSize.new(io.read)
                    valid_image?(size.width, size.height)
                  end
                  result ? true : false
                end

                # The cheap checks run first; the network round trip runs last.
                def valid_image?(width, height)
                  big_enough?(width, height) && not_advertising?(width, height) && allows_hotlinking?
                end

                # True when the image can be fetched while sending a foreign Referer header.
                def allows_hotlinking?
                  with_remote_image("Referer" => "http://splitstate.com") { true } ? true : false
                end

                # True when the src looks like a banner or a GIF, or the size is the ad size.
                def advertising?(width, height)
                  @src.include?("banner") || @src.include?(".gif") || ((width == AD_WIDTH) && (height == AD_HEIGHT))
                end

                def not_advertising?(width, height)
                  !advertising?(width, height)
                end

                def big_enough?(width, height)
                  width > MIN_WIDTH && height > MIN_HEIGHT
                end

                # The src resolved against the base URL; an already absolute src is returned as is.
                # Raises URI::InvalidURIError when the src cannot be parsed.
                def absolute_url
                  URI.parse(@src).relative? ? (URI.parse(@base_url.to_s) + @src).to_s : @src
                end

                private

                # Opens the image through open-uri in binary mode, yields the response IO
                # and returns the block's value. The block form lets open-uri close the
                # IO, and going through the parsed URI (rather than Kernel#open) means a
                # scraped src can only ever be opened as a URL.
                #
                # Returns false when there is no src, the URL cannot be parsed or opened,
                # or the request fails. Timeouts are not rescued and still propagate.
                def with_remote_image(options = {})
                  return false if @src.empty?
                  uri = URI.parse(absolute_url)
                  return false unless uri.respond_to?(:open)
                  uri.open("rb", options) { |io| yield io }
                rescue URI::InvalidURIError, OpenURI::HTTPError, SocketError, SystemCallError
                  false
                end
              end
            end
         | 
| @@ -0,0 +1,115 @@ | |
| 1 | 
            +
            module MechanizeContent
              # One URL to extract content from. The page is fetched with Mechanize the
              # first time any accessor needs it, and the outcome (including a failed
              # fetch) is remembered so the URL is requested at most once.
              class Page
                # class/id values that mark a container as not being article content.
                NEGATIVE_HINT = /(comment|meta|footer|footnote)/
                # class/id values that mark a container as likely article content.
                POSITIVE_HINT = /((^|\s)(post|hentry|entry[-]?(content|text|body)?|article[-_]?(content|text|body)?)(\s|$))/
                # Links whose presence anywhere in the document causes the page to be rejected.
                FLASH_LINKS = [
                  "http://get.adobe.com/flashplayer/",
                  "http://www.adobe.com/go/getflashplayer"
                ].freeze
                # Elements removed from the winning container before it is returned.
                STRIPPED_SELECTORS = [
                  'script', 'iframe', 'h1', 'h2',
                  "div#date-byline", "p.date", "div#facebook-like-button"
                ].freeze

                attr_accessor :url

                def initialize(url)
                  @url = url
                end

                # The page title, or nil when the page could not be fetched.
                def title
                  content.title if content
                end

                # The cleaned text of the best content container, or nil when there is
                # none or its text is 50 characters or shorter.
                def text
                  node = best_content
                  Util.force_utf8(node.text) if node && node.text.size > 50
                end

                # Absolute URL of the best image inside the best content container, or nil.
                # The result is cached even when it is nil, so images are not re-checked.
                def image
                  unless defined?(@image)
                    @image = best_content ? Image.best_image(images, base_url) : nil
                  end
                  @image
                end

                # The <img> elements inside the best content container ([] when there is none).
                def images
                  best_content ? best_content.css('img') : []
                end

                # The document's <base href> when present, otherwise the page's own URI.
                # Returns nil when the page could not be fetched.
                def base_url
                  return nil unless content
                  base = content.parser.xpath("//base/@href").first
                  base ? base.value : content.uri
                end

                # The container chosen by find_content; computed once, nil included.
                def best_content
                  @best_content = find_content unless defined?(@best_content)
                  @best_content
                end

                # Scores the parent of every <p> and returns the highest scoring parent
                # with scripts, iframes, headings and a few known byline/widget elements
                # removed. Note that the removal mutates the parsed document.
                #
                # Per paragraph the parent gets: -50 / +25 for a negative / positive
                # class, the same again for its id, +1 when the paragraph has more than
                # 10 characters, and +1 per comma in the paragraph.
                #
                # Returns nil when the page could not be fetched, has no paragraphs, or
                # links to the Flash player download.
                def find_content
                  return nil unless content
                  scores = {}
                  content.parser.css('p').each do |paragraph|
                    parent = paragraph.parent
                    scores[parent] ||= 0
                    scores[parent] += hint_score(parent['class']) + hint_score(parent['id'])

                    paragraph_text = paragraph.inner_text
                    scores[parent] += 1 if paragraph_text.length > 10

                    # Scanning stops for the whole document at the first paragraph whose
                    # parent's first attribute mentions "comment"; that paragraph's
                    # commas are not counted.
                    first_attribute = parent.attributes.values.first
                    break if first_attribute && first_attribute.value.include?("comment")

                    scores[parent] += paragraph_text.count(',')
                  end

                  winner = scores.sort_by { |_parent, score| -score }.first
                  return nil unless winner
                  top_result = winner.first

                  # NOTE(review): "//a" is an absolute XPath, so this matches a Flash link
                  # anywhere in the document, not only inside the winning container.
                  # Kept as is; confirm whether ".//a" was intended.
                  return nil if FLASH_LINKS.any? { |href| !top_result.xpath("//a[@href='#{href}']").empty? }

                  STRIPPED_SELECTORS.each { |selector| top_result.css(selector).unlink }
                  top_result
                end

                # The fetched Mechanize::Page, or nil when fetching failed. A failure is
                # cached as well, so a dead URL is not requested again by every accessor.
                def content
                  @page_content = fetch_content unless defined?(@page_content)
                  @page_content
                end

                # Requests @url. Returns the response only when it is a Mechanize::Page.
                # The failures rescued below are reported on stdout and yield nil (the
                # return value of puts); any other error propagates to the caller.
                def fetch_content
                  page_content = agent.get(@url)
                  page_content if page_content.is_a?(Mechanize::Page)
                rescue Timeout::Error
                  # Interpolation rather than String#+ so a non-String url cannot raise here.
                  puts "Timeout - #{@url}"
                rescue Errno::ECONNRESET
                  puts "Connection reset by peer - #{@url}"
                rescue Mechanize::ResponseCodeError
                  puts "Invalid url"
                rescue Mechanize::UnsupportedSchemeError
                  puts "Unsupported Scheme"
                rescue SocketError => e
                  puts e
                end

                # The Mechanize agent used for fetching, created on first use.
                def agent
                  @agent ||= Mechanize.new { |a| a.user_agent_alias = 'Mac Safari' }
                end

                private

                # Score contribution of one class/id attribute value (nil counts as empty).
                def hint_score(value)
                  value ||= ""
                  if value =~ NEGATIVE_HINT
                    -50
                  elsif value =~ POSITIVE_HINT
                    25
                  else
                    0
                  end
                end
              end
            end
         | 
| @@ -1,35 +1,8 @@ | |
| 1 1 | 
             
            module MechanizeContent
         | 
| 2 | 
            -
              class Util
         | 
| 3 | 
            -
             | 
| 4 | 
            -
             | 
| 5 | 
            -
             | 
| 6 | 
            -
                AD_WIDTH = 728
         | 
| 7 | 
            -
                AD_HEIGHT = 90
         | 
| 8 | 
            -
              
         | 
| 9 | 
            -
                def self.get_base_url(doc, url)
         | 
| 10 | 
            -
                  base_url = doc.xpath("//base/@href").first
         | 
| 11 | 
            -
                  if base_url.nil?
         | 
| 12 | 
            -
                    return url
         | 
| 13 | 
            -
                  else
         | 
| 14 | 
            -
                    return base_url.value
         | 
| 15 | 
            -
                  end
         | 
| 2 | 
            +
              class Util          
         | 
| 3 | 
            +
                # Returns a copy of +string+ as valid UTF-8 with tabs and newlines removed
                # and surrounding whitespace stripped.
                #
                # The bytes are interpreted as UTF-8 and any invalid sequence is dropped.
                # Iconv, which this used to rely on, is no longer part of Ruby's standard
                # library, so String#scrub is used where available and a round trip
                # through UTF-16 otherwise. Invalid bytes are removed before delete/strip
                # because those can raise on a string with a broken encoding.
                #
                # string - the text to clean (nil is treated as an empty string)
                def self.force_utf8(string)
                  utf8 = string.to_s.dup.force_encoding('UTF-8')
                  utf8 = if utf8.respond_to?(:scrub)
                           utf8.scrub('')
                         else
                           utf8.encode('UTF-16BE', :invalid => :replace, :undef => :replace, :replace => '').encode('UTF-8')
                         end
                  utf8.delete("\t\n").strip
                end
         | 
| 17 | 
            -
              
         | 
| 18 | 
            -
                def self.build_absolute_url(current_src, url)
         | 
| 19 | 
            -
                  if URI.parse(current_src).relative?
         | 
| 20 | 
            -
                    current_src = (URI.parse(url.to_s)+current_src).to_s
         | 
| 21 | 
            -
                  end
         | 
| 22 | 
            -
                  current_src
         | 
| 23 | 
            -
                end
         | 
| 24 | 
            -
              
         | 
| 25 | 
            -
                def self.valid_image?(width, height, src)
         | 
| 26 | 
            -
                  if width > MIN_WIDTH && height > MIN_HEIGHT && !src.include?("banner") && !src.include?(".gif")
         | 
| 27 | 
            -
                    if (!(width == AD_WIDTH) && !(height == AD_HEIGHT))
         | 
| 28 | 
            -
                      return true
         | 
| 29 | 
            -
                    end
         | 
| 30 | 
            -
                  end
         | 
| 31 | 
            -
                  return false
         | 
| 32 | 
            -
                end
         | 
| 33 | 
            -
              
         | 
| 34 7 | 
             
              end
         | 
| 35 8 | 
             
            end
         | 
    
        data/lib/mechanize_content.rb
    CHANGED
    
    | @@ -3,187 +3,27 @@ require 'mechanize' | |
| 3 3 | 
             
            require 'image_size'
         | 
| 4 4 | 
             
            require 'open-uri'
         | 
| 5 5 | 
             
            require 'mechanize_content/util'
         | 
| 6 | 
            +
            require 'mechanize_content/page'
         | 
| 7 | 
            +
            require 'mechanize_content/image'
         | 
| 6 8 |  | 
| 7 9 | 
             
            module MechanizeContent
         | 
| 8 10 | 
             
              class Parser
         | 
| 9 | 
            -
             | 
| 10 | 
            -
                 | 
| 11 | 
            -
             | 
| 11 | 
            +
                attr_accessor :pages
         | 
| 12 | 
            +
                
         | 
| 12 13 | 
             
                def initialize(*args)
         | 
| 13 | 
            -
                  @ | 
| 14 | 
            +
                  @pages = *args.flatten.map{|url| Page.new(url)}
         | 
| 14 15 | 
             
                end
         | 
| 15 16 |  | 
| 16 17 | 
             
                # Returns the title of the first page that has one. Pages are asked in
                # order and the search stops at the first title, so later pages are not
                # consulted once a title is found.
                #
                # Falls back to the URL of the first page when no page has a title, and
                # to nil when there are no pages at all.
                def best_title
                  @pages.each do |page|
                    title = page.title
                    return title unless title.nil?
                  end
                  first_page = @pages.first
                  first_page.url if first_page
                end
         | 
| 19 20 |  | 
| 20 21 | 
             
                # Returns the text of the first page that yields any, or nil when none
                # does. Pages are asked in order and the search stops at the first
                # result, so later pages are not consulted once text is found.
                def best_text
                  @pages.each do |page|
                    text = page.text
                    return text unless text.nil?
                  end
                  nil
                end
         | 
| 23 24 |  | 
| 24 25 | 
             
                def best_image
         | 
| 25 | 
            -
                  @ | 
| 26 | 
            -
                end
         | 
| 27 | 
            -
             | 
| 28 | 
            -
                def fetch_images
         | 
| 29 | 
            -
                  (@pages || fetch_pages).each do |page|
         | 
| 30 | 
            -
                    image = fetch_image(page)
         | 
| 31 | 
            -
                    return image unless image.nil?
         | 
| 32 | 
            -
                  end
         | 
| 33 | 
            -
                  return nil
         | 
| 34 | 
            -
                end
         | 
| 35 | 
            -
             | 
| 36 | 
            -
                def fetch_texts
         | 
| 37 | 
            -
                  (@pages || fetch_pages).each do |page|
         | 
| 38 | 
            -
                    text = fetch_text(page)
         | 
| 39 | 
            -
                    return text unless text.nil? || text.empty?
         | 
| 40 | 
            -
                  end
         | 
| 41 | 
            -
                  return nil
         | 
| 42 | 
            -
                end
         | 
| 43 | 
            -
             | 
| 44 | 
            -
                def fetch_titles
         | 
| 45 | 
            -
                  (@pages || fetch_pages).each do |page|
         | 
| 46 | 
            -
                    title = page.title
         | 
| 47 | 
            -
                    unless title.nil?
         | 
| 48 | 
            -
                      ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
         | 
| 49 | 
            -
                      title = ic.iconv(title + ' ')[0..-2]
         | 
| 50 | 
            -
                      return title
         | 
| 51 | 
            -
                    end
         | 
| 52 | 
            -
             | 
| 53 | 
            -
                  end
         | 
| 54 | 
            -
                  return @urls.first
         | 
| 55 | 
            -
                end
         | 
| 56 | 
            -
             | 
| 57 | 
            -
                def fetch_pages
         | 
| 58 | 
            -
                  @pages = []
         | 
| 59 | 
            -
                  @urls.each do |url|
         | 
| 60 | 
            -
                    page = fetch_page(url)
         | 
| 61 | 
            -
                    @pages << page unless page.nil?
         | 
| 62 | 
            -
                  end
         | 
| 63 | 
            -
                  @pages
         | 
| 64 | 
            -
                end
         | 
| 65 | 
            -
             | 
| 66 | 
            -
                def fetch_page(url)
         | 
| 67 | 
            -
                  begin
         | 
| 68 | 
            -
                    page = (@agent || init_agent).get(url)
         | 
| 69 | 
            -
                    if page.class ==  Mechanize::Page
         | 
| 70 | 
            -
                      return page
         | 
| 71 | 
            -
                    else
         | 
| 72 | 
            -
                      return nil
         | 
| 73 | 
            -
                    end
         | 
| 74 | 
            -
                  rescue Timeout::Error
         | 
| 75 | 
            -
                    puts "Timeout - "+url
         | 
| 76 | 
            -
                  rescue Errno::ECONNRESET
         | 
| 77 | 
            -
                    puts "Connection reset by peer - "+url
         | 
| 78 | 
            -
                  rescue Mechanize::ResponseCodeError
         | 
| 79 | 
            -
                    puts "Invalid url"
         | 
| 80 | 
            -
                  rescue Mechanize::UnsupportedSchemeError
         | 
| 81 | 
            -
                    puts "Unsupported Scheme"
         | 
| 82 | 
            -
                  rescue
         | 
| 83 | 
            -
                    puts "There was a problem connecting - "+url
         | 
| 84 | 
            -
                  end
         | 
| 26 | 
            +
                  @pages.map{|page| page.image}.compact.first
         | 
| 85 27 | 
             
                end
         | 
| 86 | 
            -
             | 
| 87 | 
            -
                def init_agent
         | 
| 88 | 
            -
                  agent = Mechanize.new
         | 
| 89 | 
            -
                  agent.user_agent_alias = 'Mac Safari'
         | 
| 90 | 
            -
                  return @agent = agent
         | 
| 91 | 
            -
                end
         | 
| 92 | 
            -
             | 
| 93 | 
            -
                def fetch_text(page)
         | 
| 94 | 
            -
                  top_content = fetch_content(page)
         | 
| 95 | 
            -
                  if top_content
         | 
| 96 | 
            -
                    text = top_content.text.delete("\t").delete("\n").strip
         | 
| 97 | 
            -
                    ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
         | 
| 98 | 
            -
                    text = ic.iconv(text + ' ')[0..-2]
         | 
| 99 | 
            -
                  else
         | 
| 100 | 
            -
                    return nil
         | 
| 101 | 
            -
                  end
         | 
| 102 | 
            -
                end
         | 
| 103 | 
            -
             | 
| 104 | 
            -
                def fetch_content(page)
         | 
| 105 | 
            -
                  doc = page.parser
         | 
| 106 | 
            -
                  readability = {}
         | 
| 107 | 
            -
                  doc.css('p').each do |paragraph|
         | 
| 108 | 
            -
                    if readability[paragraph.parent].nil?
         | 
| 109 | 
            -
                      readability[paragraph.parent] = 0
         | 
| 110 | 
            -
                    end
         | 
| 111 | 
            -
                    parent_class = paragraph.parent['class'] || ""
         | 
| 112 | 
            -
                    parent_id = paragraph.parent['id'] || ""
         | 
| 113 | 
            -
                    if !parent_class.match('(comment|meta|footer|footnote)').nil?
         | 
| 114 | 
            -
                      readability[paragraph.parent] -= 50
         | 
| 115 | 
            -
                    elsif !parent_class.match('((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-_]?(content|text|body)?)(\\s|$))').nil?
         | 
| 116 | 
            -
                      readability[paragraph.parent] += 25
         | 
| 117 | 
            -
                    end
         | 
| 118 | 
            -
             | 
| 119 | 
            -
                    if !parent_id.match('(comment|meta|footer|footnote)').nil?
         | 
| 120 | 
            -
                      readability[paragraph.parent] -= 50
         | 
| 121 | 
            -
                    elsif !parent_id.match('((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-_]?(content|text|body)?)(\\s|$))').nil?
         | 
| 122 | 
            -
                      readability[paragraph.parent] += 25
         | 
| 123 | 
            -
                    end
         | 
| 124 | 
            -
             | 
| 125 | 
            -
                    if paragraph.inner_text().length > 10
         | 
| 126 | 
            -
                      readability[paragraph.parent] += 1
         | 
| 127 | 
            -
                    end
         | 
| 128 | 
            -
                    if !paragraph.parent.attributes.values.nil?
         | 
| 129 | 
            -
                      if !paragraph.parent.attributes.values.first.nil?
         | 
| 130 | 
            -
                        if paragraph.parent.attributes.values.first.value.include? "comment"
         | 
| 131 | 
            -
                          break
         | 
| 132 | 
            -
                        end
         | 
| 133 | 
            -
                      end
         | 
| 134 | 
            -
                    end
         | 
| 135 | 
            -
                    readability[paragraph.parent] += paragraph.inner_text().count(',')
         | 
| 136 | 
            -
                  end
         | 
| 137 | 
            -
                  sorted_results = readability.sort_by { |parent,score| -score }
         | 
| 138 | 
            -
                  if sorted_results.nil? || sorted_results.first.nil?
         | 
| 139 | 
            -
                    return nil
         | 
| 140 | 
            -
                  elsif !sorted_results.first.first.xpath("//a[@href='http://get.adobe.com/flashplayer/']").empty?
         | 
| 141 | 
            -
                    return nil
         | 
| 142 | 
            -
                  else
         | 
| 143 | 
            -
                    top_result = sorted_results.first.first
         | 
| 144 | 
            -
                    top_result.css('script').unlink
         | 
| 145 | 
            -
                    top_result.css('iframe').unlink
         | 
| 146 | 
            -
                    top_result.css('h1').unlink
         | 
| 147 | 
            -
                    top_result.css('h2').unlink
         | 
| 148 | 
            -
                    return top_result
         | 
| 149 | 
            -
                  end
         | 
| 150 | 
            -
                end
         | 
| 151 | 
            -
             | 
| 152 | 
            -
                def fetch_image(page)
         | 
| 153 | 
            -
                  top_content = fetch_content(page)
         | 
| 154 | 
            -
                  if top_content
         | 
| 155 | 
            -
                    return find_best_image(top_content.css('img'), Util.get_base_url(page.parser, page.uri))
         | 
| 156 | 
            -
                  else
         | 
| 157 | 
            -
                    return nil
         | 
| 158 | 
            -
                  end
         | 
| 159 | 
            -
                end  
         | 
| 160 | 
            -
             | 
| 161 | 
            -
                def find_best_image(all_images, url)
         | 
| 162 | 
            -
                  begin
         | 
| 163 | 
            -
                    current_src = nil
         | 
| 164 | 
            -
                    all_images.each do |img|
         | 
| 165 | 
            -
                      current_src = img["src"]
         | 
| 166 | 
            -
                      if Util.valid_image?(img['width'].to_i, img['height'].to_i, current_src)
         | 
| 167 | 
            -
                        return Util.build_absolute_url(current_src, url)
         | 
| 168 | 
            -
                      end
         | 
| 169 | 
            -
                    end
         | 
| 170 | 
            -
                    all_images.each do |img|
         | 
| 171 | 
            -
                      current_src = img["src"]
         | 
| 172 | 
            -
                      current_src = Util.build_absolute_url(current_src, url)
         | 
| 173 | 
            -
                      open(current_src, "rb") do |fh|
         | 
| 174 | 
            -
                        is = ImageSize.new(fh.read)
         | 
| 175 | 
            -
                        if Util.valid_image?(is.width, is.height, current_src)
         | 
| 176 | 
            -
                          return current_src
         | 
| 177 | 
            -
                        end
         | 
| 178 | 
            -
                      end
         | 
| 179 | 
            -
                    end
         | 
| 180 | 
            -
                    return nil
         | 
| 181 | 
            -
                  rescue Errno::ENOENT
         | 
| 182 | 
            -
                    puts "No such file - " + current_src
         | 
| 183 | 
            -
                  rescue 
         | 
| 184 | 
            -
                    puts "There was a problem connecting - " + current_src
         | 
| 185 | 
            -
                  end
         | 
| 186 | 
            -
                end
         | 
| 187 | 
            -
             | 
| 188 28 | 
             
              end
         | 
| 189 29 | 
             
            end
         | 
    
        data/mechanize_content.gemspec
    CHANGED
    
    | @@ -20,6 +20,7 @@ Gem::Specification.new do |s| | |
| 20 20 | 
             
              s.require_paths = ["lib"]
         | 
| 21 21 | 
             
              s.add_dependency("mechanize", "~> 1.0.0")
         | 
| 22 22 | 
             
              s.add_dependency("imagesize", "~> 0.1.1")
         | 
| 23 | 
            -
              s.add_development_dependency('rspec', "~> 2. | 
| 23 | 
            +
              s.add_development_dependency('rspec', "~> 2.6.0")
         | 
| 24 | 
            +
              s.add_development_dependency('vcr', "~> 1.9.0")
         | 
| 24 25 | 
             
              s.add_development_dependency('fakeweb', "~> 1.3.0")
         | 
| 25 26 | 
             
            end
         |