scrapers 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.rspec-example +2 -0
  4. data/.watchr +27 -0
  5. data/Gemfile +4 -0
  6. data/Guardfile +6 -0
  7. data/LICENSE.txt +22 -0
  8. data/README.md +32 -0
  9. data/Rakefile +1 -0
  10. data/lib/scrapers/allrecipes.rb +49 -0
  11. data/lib/scrapers/discoverynews.rb +28 -0
  12. data/lib/scrapers/download.rb +37 -0
  13. data/lib/scrapers/gocomics.rb +43 -0
  14. data/lib/scrapers/imgur.rb +56 -0
  15. data/lib/scrapers/nasa_apod.rb +60 -0
  16. data/lib/scrapers/version.rb +7 -0
  17. data/lib/scrapers.rb +9 -0
  18. data/scrapers.gemspec +29 -0
  19. data/spec/scrapers/allrecipes_spec.rb +29 -0
  20. data/spec/scrapers/discoverynews_spec.rb +39 -0
  21. data/spec/scrapers/download_spec.rb +36 -0
  22. data/spec/scrapers/gocomics_spec.rb +46 -0
  23. data/spec/scrapers/imgur_spec.rb +31 -0
  24. data/spec/scrapers/nasa_apod_spec.rb +30 -0
  25. data/spec/scrapers_spec.rb +9 -0
  26. data/spec/spec_helper.rb +31 -0
  27. data/vcr_cassettes/allrecipes_morning-glory-muffins-i.yml +1047 -0
  28. data/vcr_cassettes/disconews_history-of-space.yml +385 -0
  29. data/vcr_cassettes/download_cassette.yml +431 -0
  30. data/vcr_cassettes/exampleatspecscrapersnasaapodspecrb16_cassette.yml +326 -0
  31. data/vcr_cassettes/exampleatspecscrapersnasaapodspecrb21_cassette.yml +326 -0
  32. data/vcr_cassettes/exampleatspecscrapersnasaapodspecrb22_cassette.yml +312 -0
  33. data/vcr_cassettes/gocomics_nonsequitur.yml +336 -0
  34. data/vcr_cassettes/shouldincludecontentencoded_cassette.yml +326 -0
  35. data/vcr_cassettes/shouldincludedescription_cassette.yml +326 -0
  36. data/vcr_cassettes/shouldincludeguid_cassette.yml +326 -0
  37. data/vcr_cassettes/shouldincludelink_cassette.yml +326 -0
  38. data/vcr_cassettes/shouldincludepubDate_cassette.yml +326 -0
  39. data/vcr_cassettes/shouldincludetitle_cassette.yml +326 -0
  40. metadata +203 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: eef6a353955404ad8beaf88e4db8ab4b0a4f3cec
4
+ data.tar.gz: 87ec094c87d315640c1ebb274b6f334b6dd73646
5
+ SHA512:
6
+ metadata.gz: ece9775696f757216e7c9fd9c2ef2a6ff4a1bd3a9dcbf1fd754caed7c751bdeb1f055a2ea6694b216dfaf61f7a1678cedc777fd0281d660d73e68f34399f2c90
7
+ data.tar.gz: 370b23a91c38f349a8f02a6fe4ed8770bb58017b43876ad736c449087da7b303aae605f2a314e76b6489bdfc89aaceaa2588f94a6e6e1d7b3e0423c50f2c5f63
data/.gitignore ADDED
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ .rspec
data/.rspec-example ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
data/.watchr ADDED
@@ -0,0 +1,27 @@
1
# -*- ruby -*-

# Run a single spec file through rspec, skipping paths that no
# longer exist (watchr may fire for deleted files).
def run_spec(file)
  unless File.exist?(file)
    puts "#{file} does not exist"
    return
  end

  puts "Running #{file}"
  system "bundle exec rspec #{file}"
  puts
end

# A changed top-level spec file re-runs itself.
# FIX: the old pattern "^spec/*_spec.rb$" treated "/*" as
# "zero-or-more slashes" and so never matched real spec files.
watch("^spec/[^/]*_spec\.rb$") do |match|
  run_spec match[0]
end

# A changed nested spec file re-runs itself (same "/*" fix).
watch("^spec/.*/[^/]*_spec\.rb$") do |match|
  run_spec match[0]
end

# A changed lib file re-runs its mirrored spec.
watch("^lib/(.*)\.rb$") do |match|
  run_spec "spec/#{match[1]}_spec.rb"
end

# Nested lib files map to their mirrored nested spec.
watch("^lib/(.*/.*)\.rb$") do |match|
  run_spec "spec/#{match[1]}_spec.rb"
end
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Gems are fetched from the canonical public registry.
source 'https://rubygems.org'

# Specify your gem's dependencies in scrapers.gemspec
gemspec
data/Guardfile ADDED
@@ -0,0 +1,6 @@
1
# Guard configuration: automatically re-run specs when files change.
guard :rspec do
  # A changed spec file re-runs itself.
  watch(%r{^spec/.+_spec\.rb$})
  # A changed lib file re-runs its mirrored spec.
  watch(%r{^lib/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
  # Changing the spec helper re-runs the whole suite.
  watch('spec/spec_helper.rb') { "spec" }
end
6
+
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Tamara Temple
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,32 @@
1
+ # Scrapers
2
+
3
+ A library of web site scrapers utilizing mechanize and other goodies. Helpful in gathering images, moving things, saving things, etc.
4
+
5
+ LICENSE:: MIT
6
+ WEBSITE:: http://github.com/tamouse/scrapers
7
+
8
+ ## Installation
9
+
10
+ Add this line to your application's Gemfile:
11
+
12
+ gem 'scrapers'
13
+
14
+ And then execute:
15
+
16
+ $ bundle
17
+
18
+ Or install it yourself as:
19
+
20
+ $ gem install scrapers
21
+
22
+ ## Usage
23
+
24
+ See the various RDoc for explanation of each item.
25
+
26
+ ## Contributing
27
+
28
+ 1. Fork it
29
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
30
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
31
+ 4. Push to the branch (`git push origin my-new-feature`)
32
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,49 @@
1
+ require 'mechanize'
2
+
3
+
4
module Scrapers

  # Scraper for allrecipes.com recipe detail pages.
  module AllRecipes

    # Scrape a single recipe page.
    #
    # url - String URL of an allrecipes.com recipe detail page.
    #
    # Returns a Hash with :url, :title, :ingredients (Array of "*"-
    # prefixed strings), :directions (Array of "# "-prefixed strings)
    # and :photo (Hash of the recipe image's attributes).
    def self.scrape(url)
      results = {}
      Scrapers.agent.get(url).tap do |page|
        results[:url]         = page.uri.to_s
        results[:title]       = page.title.strip
        results[:ingredients] = scrape_ingredients(page)
        results[:directions]  = scrape_directions(page)
        results[:photo]       = scrape_photo(page)
      end
      results
    end

    # Collect the ingredient list, collapsing runs of whitespace and
    # prefixing each entry with "*" (markdown bullet).
    def self.scrape_ingredients(page)
      page.
        search("ul.ingredient-wrap").
        search(".//li").
        map { |i| i.text.gsub(/[[:space:]]+/, ' ').sub(/^/, '*') }
    end

    # Collect the direction steps, prefixing each with "# "
    # (markdown ordered list). Returns [] when the directions
    # container is missing (previously raised NoMethodError on nil).
    def self.scrape_directions(page)
      container = page.search("div.directLeft").first
      return [] if container.nil? # page layout changed or empty page
      container.
        search("li").
        map { |i| i.text.gsub(/[[:space:]]+/, ' ').sub(/^/, '# ') }
    end

    # Return the recipe photo's attributes as a plain Hash.
    # Returns {} when the page has no img#imgPhoto element
    # (previously raised NoMethodError on nil).
    def self.scrape_photo(page)
      photo = page.search("img#imgPhoto").first
      return {} if photo.nil?
      Hash[photo.attributes.map { |k, v| [k, v.value] }]
    end

  end

end
@@ -0,0 +1,28 @@
1
+ =begin rdoc
2
+
3
+ = DISCOVERYNEWS.RB
4
+
5
+ *Author*:: Tamara Temple <tamouse@gmail.com>
6
+ *Since*:: 2013-06-15
7
+ *Copyright*:: (c) 2013 Tamara Temple Web Development
8
+ *License*:: MIT
9
+
10
+ Scraper for disco news pictures of the week
11
+
12
+ =end
13
+
14
+ require 'mechanize'
15
+
16
module Scrapers

  # Scraper for Discovery News "pictures of the week" galleries.
  module DiscoNews

    # Return the src URLs of the gallery's hero images.
    #
    # url - String URL of a Discovery News picture-of-the-week page.
    #
    # Returns an Array of image source URL strings.
    #
    # (Cleanup: the old body stored write-only @url/@page instance
    # variables on the module and assigned the result to an unused
    # local before implicitly returning it.)
    def self.disco_downloads(url)
      page = Scrapers.agent.get(url)
      # Only the "media-hero" images are the gallery pictures.
      page.images_with(:class => "media-hero").map(&:src)
    end

  end

end
@@ -0,0 +1,37 @@
1
+ =begin rdoc
2
+
3
+ = DOWNLOAD.RB
4
+
5
+ *Author*:: Tamara Temple <tamara@tamaratemple.com>
6
+ *Since*:: 2013-05-27
7
+ *Copyright*:: (c) 2013 Tamara Temple Web Development
8
+ *License*:: MIT
9
+
10
+ =end
11
+
12
+ require 'mechanize'
13
+
14
module Scrapers

  # Fetch a URL and save its body as a file on disk.
  module Download

    # Download url into dir (default: current directory).
    #
    # Returns the path of the saved file (dir joined with the
    # filename Mechanize derived from the response).
    #
    # Raises RuntimeError when dir is not a writable directory.
    def self.download(url, dir = ".")
      # Force Mechanize to treat every response as a raw download
      # instead of parsing it as a page.
      Scrapers.agent.pluggable_parser.default = Mechanize::Download
      @dir = validate_directory(dir)
      dl = Scrapers.agent.get(url)
      # Mechanize saves relative to the process cwd, so chdir first.
      Dir.chdir(@dir) { dl.save }
      File.join(@dir, dl.filename)
    end

    # Ensure d is an existing writable directory; returns d.
    # FIX: removed leftover STDERR debug tracing that printed the
    # caller and directory state on every invocation.
    def self.validate_directory(d)
      raise "#{d} is not a writable directory!" unless File.directory?(d) and File.writable?(d)
      d
    end

  end

end
@@ -0,0 +1,43 @@
1
+ require 'open-uri'
2
+ require 'nokogiri'
3
+
4
+
5
module Scrapers

  # Scraper for comic strips hosted on gocomics.com.
  module GoComics

    GOCOMIC_URL = "http://www.gocomics.com/"

    # Scrape today's strip for the named comic (e.g. "nonsequitur").
    #
    # Returns a Hash with :url, :title, :pubdate (ISO-8601 date
    # string) and :img_src (URL of the strip image).
    def self.scrape(comic)
      results = Hash.new

      url = URI.parse GOCOMIC_URL
      url.path = "/#{comic}"

      # FIX: use URI.open — Kernel#open no longer accepts URIs as of
      # Ruby 3.0 (and the URI form was deprecated since 2.6).
      page = Nokogiri::HTML(URI.open(url.to_s))
      results[:url] = url.to_s
      results[:title] = scrape_title(page)
      results[:pubdate] = scrape_pubdate(page)
      results[:img_src] = scrape_image_source(page)
      results
    end

    # Page title with all whitespace normalized to single spaces.
    def self.scrape_title(page)
      page.at_css("title").content.strip.gsub(/[[:space:]]/, ' ').squeeze(" ")
    end

    # Publication date parsed from the first feature-nav entry.
    def self.scrape_pubdate(page)
      Date.parse(page.at_css("ul.feature-nav > li").content).to_s
    end

    # src attribute of the strip image inside the feature item.
    def self.scrape_image_source(page)
      page.
        at_css("p.feature_item").
        at_css("img").
        attr("src")
    end

  end

end
@@ -0,0 +1,56 @@
1
+ =begin rdoc
2
+
3
+ = IMGUR.RB
4
+
5
+ *Author*:: Tamara Temple <tamara@tamaratemple.com>
6
+ *Since*:: 2013-05-27
7
+ *Copyright*:: (c) 2013 Tamara Temple Web Development
8
+ *License*:: MIT
9
+
10
+ =end
11
+
12
module Scrapers

  # Template URL; the path component is rewritten per request.
  IMGUR_TEMPLATE = "http://imgur.com/path"

  # Resolves an imgur page to its direct download link by fetching
  # the page and following its "Download" anchor.
  class Imgur

    attr_accessor :agent, :url, :download, :page

    def initialize
      @agent    = Mechanize.new
      @url      = URI.parse(IMGUR_TEMPLATE)
      @download = URI.parse(IMGUR_TEMPLATE)
    end

    # Full pipeline for an imgur code: build the page URL, fetch the
    # page, locate the download anchor, return the URL as a String.
    def download_link(code)
      make_url(code)
      retrieve_page
      find_download
      @download.to_s
    end

    # Point @url's path at the given imgur code.
    def make_url(imgur_code)
      @url.path = "/#{imgur_code}"
    end

    # Fetch the page currently addressed by @url.
    def retrieve_page
      @page = @agent.get(@url.to_s)
    end

    # Record on @download the href of the page link whose text
    # matches link_text. Raises when no such link exists.
    def find_download(link_text = /Download/)
      link = @page.link_with(:text => link_text)
      raise "#{link_text} not found on #{@page.uri.to_s}" if link.nil?
      @download.path = link.href
    end

  end

  module_function

  # Shortcut: derive the direct download URL for an imgur page URL
  # purely from the URL itself — no HTTP involved.
  def imgur(url)
    code = File.basename(url).sub(/\.[^.]+$/, '')
    "http://imgur.com/download/#{code}/"
  end

end
@@ -0,0 +1,60 @@
1
+ =begin rdoc
2
+
3
+ nasa_apod.rb -- oneline desc
4
+
5
+ Time-stamp: <2013-08-23 22:47:58 tamara>
6
+ Copyright (C) 2013 Tamara Temple Web Development
7
+ Author: Tamara Temple <tamouse@gmail.com>
8
+ License: MIT
9
+
10
+ == Discussion
11
+
12
+ NASA's Astronomy Picture of the Day is a great source for nice astro
13
+ photos and various other information. But it isn't something I
14
+ remember to go see every day, so I'd like it to drop in my in-box or
15
+ an evernote notebook. But the feed does not include the image, for
16
+ some ungodly reason, so I'm adding a scraper to grab the nice info off
17
+ the page including the photo.
18
+
19
+ =end
20
+
21
+
22
module Scrapers

  # Scraper for NASA's Astronomy Picture of the Day (APOD). The RSS
  # feed omits the image, so this pulls the whole page body.
  module NasaApod

    module_function

    # Scrape the canonical APOD page reachable from url.
    #
    # Returns an RSS-item-shaped Hash with :title, :link,
    # :description, :pubDate, :guid and :content_encoded.
    # Returns an empty Hash when url is nil.
    def scrape(url)
      apod = {}
      return apod if url.nil?

      Mechanize.start do |browser|
        browser.get url

        # APOD has a funky rotating entry page: step back one day via
        # the '<' link, then forward via '>' to land on the dated,
        # canonical page for today.
        previous_href = browser.current_page.link_with(:text => '<').href
        browser.get previous_href
        canonical_href = browser.current_page.link_with(:text => '>').href
        browser.get canonical_href

        page = browser.current_page
        apod[:title]           = page.title.strip
        apod[:link]            = page.uri.to_s
        apod[:description]     = (page/("body")).text
        apod[:pubDate]         = page.response['date'].to_s
        apod[:guid]            = page.uri.to_s
        apod[:content_encoded] = (page/("body")).to_html
      end

      apod
    end

  end

end
@@ -0,0 +1,7 @@
1
module Scrapers
  # Gem version and gemspec metadata, kept in one place so the
  # gemspec can read them. Strings are frozen: they are constants
  # and should never be mutated in place.
  VERSION = "0.2.0".freeze
  DESCRIPTION = "A library of web site scrapers utilizing mechanize and other goodies. Helpful in gathering images, moving things, saving things, etc.".freeze
  SUMMARY = "Web site scrapers".freeze
  LICENSE = "MIT".freeze
  WEBSITE = "http://github.com/tamouse/scrapers".freeze
end
data/lib/scrapers.rb ADDED
@@ -0,0 +1,9 @@
1
+ require 'mechanize'
2
+
3
+ Dir[File.join(File.expand_path('../', __FILE__),'**','*.rb')].each {|file| require file}
4
+
5
module Scrapers
  # Shared Mechanize instance, created lazily on first use so every
  # scraper reuses one agent (cookie jar, connection reuse).
  def self.agent
    @agent = Mechanize.new if @agent.nil?
    @agent
  end
end
data/scrapers.gemspec ADDED
@@ -0,0 +1,29 @@
1
# coding: utf-8
# Gem packaging metadata. Version, description, summary, license and
# website all live in lib/scrapers/version.rb so they are defined in
# exactly one place.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'scrapers/version'

Gem::Specification.new do |gem|
  gem.name     = "scrapers"
  gem.version  = Scrapers::VERSION
  gem.authors  = ["Tamara Temple"]
  gem.email    = ["tamouse@gmail.com"]

  gem.description = Scrapers::DESCRIPTION
  gem.summary     = Scrapers::SUMMARY
  gem.homepage    = Scrapers::WEBSITE
  gem.license     = Scrapers::LICENSE

  # Package exactly what git tracks; bin/ entries become executables.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}) { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Runtime dependency.
  gem.add_dependency "mechanize"

  # Development-only dependencies.
  gem.add_development_dependency "bundler"
  gem.add_development_dependency "rake"
  gem.add_development_dependency "rspec"
  gem.add_development_dependency "guard"
  gem.add_development_dependency "guard-rspec"
  gem.add_development_dependency "webmock"
  gem.add_development_dependency "vcr"
end
@@ -0,0 +1,29 @@
1
+ require 'spec_helper'
2
+
3
module Scrapers

  describe AllRecipes do
    it{should respond_to :scrape}

    context "scraping" do
      before(:all) do
        @url = "http://allrecipes.com/recipe/morning-glory-muffins-i/detail.aspx"
        # Replay the recorded HTTP exchange instead of hitting the site.
        @recipe = VCR.use_cassette('allrecipes.morning-glory-muffins-i') do
          Scrapers::AllRecipes.scrape(@url)
        end
      end

      it "retrieves a recipe" do
        @recipe.should_not be_nil
      end

      it "should be a Hash" do
        @recipe.should be_a(Hash)
      end

      # Every scraped recipe exposes this fixed set of keys.
      [:title, :url, :ingredients, :directions, :photo].each do |key|
        it "should have key #{key}" do
          @recipe.should have_key(key)
        end
      end
    end
  end
end
29
+
@@ -0,0 +1,39 @@
1
+ =begin rdoc
2
+
3
+ = DISCOVERYNEWS_SPEC.RB
4
+
5
+ *Author*:: Tamara Temple <tamouse@gmail.com>
6
+ *Since*:: 2013-06-15
7
+ *Copyright*:: (c) 2013 Tamara Temple Web Development
8
+ *License*:: MIT
9
+
10
+ =end
11
+
12
+ require 'spec_helper'
13
+
14
+
15
module Scrapers

  describe DiscoNews do
    it {should respond_to(:disco_downloads)}

    context "scraping" do
      let(:url) {"http://news.discovery.com/space/history-of-space/stunning-space-photos-week-june-9-14-pictures-130614.htm"}
      # Replay the recorded gallery page via VCR.
      let(:images) do
        VCR.use_cassette('disconews.history-of-space') do
          Scrapers::DiscoNews.disco_downloads(url)
        end
      end

      it "retrieves an array of images" do
        images.should be_a(Array)
        # Every entry must look like an http image URL.
        images.each do |img|
          img.should =~ /^http:\/\/.*(jpe?g|png|gif)/
        end
      end
    end

  end

end
39
+
@@ -0,0 +1,36 @@
1
+ =begin rdoc
2
+
3
+ = DOWNLOAD_SPEC.RB
4
+
5
+ *Author*:: Tamara Temple <tamara@tamaratemple.com>
6
+ *Since*:: 2013-05-27
7
+ *Copyright*:: (c) 2013 Tamara Temple Web Development
8
+ *License*:: MIT
9
+
10
+ =end
11
+
12
+ require 'spec_helper'
13
+ require 'tmpdir'
14
+
15
module Scrapers

  describe Download do
    it {Scrapers::Download.should respond_to :download}

    context "download" do
      before(:all) do
        @url = "http://imgur.com/download/v70StgA/%2Asnrrrrrrrrrrrf%21%2A"
        # Replay the recorded download via VCR; file lands in tmp/.
        VCR.use_cassette("download.cassette") do
          @file = Scrapers::Download.download(@url, 'tmp')
        end
      end

      it "saves the file" do
        # Path keeps the decoded name and Mechanize-derived suffix.
        @file.should =~ /.*snrrrrrrrrrrrf.*Imgur\.jpg/
        File.exist?(@file).should be_true
      end
    end

  end

end
@@ -0,0 +1,46 @@
1
+ require 'spec_helper'
2
+
3
module Scrapers

  describe GoComics do
    it{should respond_to :scrape}

    context "scraping" do
      before(:all) do
        @comic_strip = 'nonsequitur'
        # Replay the recorded strip page via VCR.
        @comic = VCR.use_cassette('gocomics.nonsequitur') do
          Scrapers::GoComics.scrape(@comic_strip)
        end
      end

      it "retrieves a comic" do
        @comic.should_not be_nil
      end

      it "should be a Hash" do
        @comic.should be_a(Hash)
      end

      # Every scraped strip exposes this fixed set of keys.
      [:title, :url, :pubdate, :img_src].each do |key|
        it "should have key #{key}" do
          @comic.should have_key(key)
        end
      end

      context "title" do
        it{@comic[:title].should_not be_empty}
        it{@comic[:title].should match /Non Sequitur Comic Strip on GoComics.com/}
      end

      context "url" do
        it{@comic[:url].should_not be_empty}
        it{@comic[:url].should match /www\.gocomics\.com\/nonsequitur/}
      end

      context "pubdate" do
        it{@comic[:pubdate].should_not be_empty}
        it{Date.parse(@comic[:pubdate]).should be_a(Date)}
      end

      context "img_src" do
        it{@comic[:img_src].should_not be_empty}
        it{URI.parse(@comic[:img_src]).should be_a(URI::HTTP)}
      end

    end
  end
end
46
+
@@ -0,0 +1,31 @@
1
+ =begin rdoc
2
+
3
+ = IMGUR_SPEC.RB
4
+
5
+ *Author*:: Tamara Temple <tamara@tamaratemple.com>
6
+ *Since*:: 2013-05-27
7
+ *Copyright*:: (c) 2013 Tamara Temple Web Development
8
+ *License*:: MIT
9
+
10
+ =end
11
+
12
+ require 'spec_helper'
13
+
14
module Scrapers

  describe "Scrapers" do
    it {Scrapers.should respond_to(:imgur)}
  end

  describe "Fetch the download link" do
    let(:url) {"http://imgur.com/v70StgA"}

    # No HTTP involved: imgur derives the download URL from the page
    # URL alone, so no cassette is needed here.
    it "should return the download link from a given url" do
      Scrapers.imgur(url).should =~ %r{http://imgur.com/download/v70StgA/}
    end
  end

end
@@ -0,0 +1,30 @@
1
+
2
+ require 'spec_helper'
3
+
4
+
5
module Scrapers

  describe NasaApod do
    it {Scrapers::NasaApod.should respond_to :scrape}

    context "scrape" do
      let(:url){"http://apod.nasa.gov/apod/astropix.html"}
      # Cassette name is derived from the example description, so
      # each example replays its own recorded exchange.
      let(:apod_hash){
        VCR.use_cassette("#{example.description.gsub(/[^-[:alnum:]]/,'')}.cassette", :record => :new_episodes) do
          Scrapers::NasaApod.scrape(url)
        end}

      it {apod_hash.should be_a(Hash)}

      # Each RSS-item attribute must be present as a non-nil String.
      [:title, :link, :description, :pubDate, :guid, :content_encoded].each do |attr|
        it "should include #{attr}" do
          apod_hash.keys.should include attr
        end
        it {apod_hash[attr].should_not be_nil}
        it {apod_hash[attr].should be_a(String)}
      end

    end

  end

end
@@ -0,0 +1,9 @@
1
+ require 'spec_helper'
2
+
3
module Scrapers

  describe Scrapers do
    # The top-level module exposes the shared Mechanize agent.
    it{should respond_to(:agent)}
  end

end
+ end