RubyGems - scrapers - Versions diffs - 0.4.3 → 1.0.0 - Mend

scrapers 0.4.3 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

checksums.yaml +4 -4
data/lib/scrapers/download.rb +5 -3
data/lib/scrapers/nasa_apod.rb +20 -22
data/lib/scrapers/sinfest.rb +24 -0
data/lib/scrapers/version.rb +10 -1
data/spec/scrapers/download_spec.rb +45 -8
data/spec/scrapers/sinfest_spec.rb +46 -0
data/spec/spec_helper.rb +1 -1
data/vcr_cassettes/allrecipes_morning-glory-muffins-i.yml +879 -851
data/vcr_cassettes/disconews_history-of-space.yml +347 -340
data/vcr_cassettes/download-newfile.yml +631 -0
data/vcr_cassettes/download-overwrite.yml +631 -0
data/vcr_cassettes/download_cassette.yml +7 -7
data/vcr_cassettes/gocomics_nonsequitur.yml +907 -765
data/vcr_cassettes/sinfest.yml +393 -0
data/vcr_cassettes/xkcd.yml +8 -8
metadata +8 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 155c10cf4c011c7b4eed63949eebe2839dabcc56
-  data.tar.gz: 54794bf2c19e302ecfcf44bce9317eb5dc17b97e
+  metadata.gz: 960eadf205104f76a6be3e1bf820dd3bb401fae8
+  data.tar.gz: 56ec817fe56cbe1adc9f82060b5fe8d5b96222be
 SHA512:
-  metadata.gz: 1cf0bae23ef4a72e1057e801663cb038e876ebfdcedbcb113c97973773e542905e10c806cbffb30286fb1528d725346b7662e8033ce02efa097ba2650cded5b2
-  data.tar.gz: 58eef7cdad244caceae175958d4bffdffa520b622b828acbad6362db255ee6ecee61a97fac190e3e14cafad5d30e5416e48ab3cb15225af10495e0c5189b607e
+  metadata.gz: fa018e7d9a1ca5faeebca9eb478832784fa3cb377b8b382f6a1d7c28299e1df31297eeeb70a0031748de0f1ce1b7e19d6348ddbeb853e73bda88f760b2db6ab3
+  data.tar.gz: 223214a05801135f0e5df20d60393104a865c9446cdcb40d810a08d764bb73202ba4902e642531a3ffb81470611fd20a4c581e7a6074c37dde97e3d49921b900

data/lib/scrapers/download.rb CHANGED Viewed

@@ -16,12 +16,14 @@ module Scrapers
   module Download
     def self.download(url,dir=".",overwrite=false)
-      Scrapers.agent.pluggable_parser.default = Mechanize::Download
+      # need a new agent each time!
+      agent = Mechanize.new
+      agent.pluggable_parser.default = Mechanize::Download
       @dir = validate_directory(dir)
-      dl = Scrapers.agent.get(url)
+      dl = agent.get(url)
       Dir.chdir(@dir) do |dir|
         if overwrite
-          dl.save!()
+          dl.save!(dl.filename)
         else
           dl.save()
         end

data/lib/scrapers/nasa_apod.rb CHANGED Viewed

@@ -2,7 +2,7 @@
 nasa_apod.rb -- oneline desc
-Time-stamp: <2013-08-23 22:47:58 tamara>
+Time-stamp: <2013-10-15 00:17:15 tamara>
 Copyright (C) 2013 Tamara Temple Web Development
 Author:     Tamara Temple <tamouse@gmail.com>
 License:    MIT
@@ -23,35 +23,33 @@ module Scrapers
   module NasaApod
+    NASA_APOD_URL="http://asterisk.apod.com/library/APOD/APOD%20mirror/astropix.html"
     module_function
-    def scrape(url)
+    def scrape(url=nil)
+      url ||= NASA_APOD_URL
       apod = Hash.new
-      unless url.nil?
-        Mechanize.start do |m|
+      Mechanize.start do |m|
-          m.get url
+        m.get url
-          # APOD has a funky entry page, but we want the actual page
-          prev = m.current_page.link_with(:text => '<').href
-          m.get prev
-          canonical = m.current_page.link_with(:text => '>' ).href
-          m.get canonical
-          m.current_page.tap do |page|
-            apod[:title] = page.title.strip
-            apod[:link] = page.uri.to_s
-            apod[:description] = (page/("body")).text
-            apod[:pubDate] = page.response['date'].to_s
-            apod[:guid] = page.uri.to_s
-            apod[:content_encoded] = (page/("body")).to_html
-          end
+        # APOD has a funky entry page, but we want the actual page
+        prev = m.current_page.link_with(:text => '<').href
+        m.get prev
+        canonical = m.current_page.link_with(:text => '>' ).href
+        m.get canonical
+        m.current_page.tap do |page|
+          apod[:title] = page.title.strip
+          apod[:link] = page.uri.to_s
+          apod[:description] = (page/("body")).text
+          apod[:pubDate] = page.response['date'].to_s
+          apod[:guid] = page.uri.to_s
+          apod[:content_encoded] = (page/("body")).to_html
         end
       end
       apod
     end

data/lib/scrapers/sinfest.rb ADDED Viewed

@@ -0,0 +1,24 @@
+require 'mechanize'
+module Scrapers
+  module Sinfest
+    SINFEST_URL = "http://sinfest.net"
+    def self.scrape
+      results = Hash.new
+      results[:comic] = 'Sinfest'
+      results[:url] = SINFEST_URL
+      Mechanize.start do |agent|
+        agent.get SINFEST_URL
+        agent.current_page.image(src: %r{comikaze/comics}).tap do |comic|
+          results[:title] = comic.alt.to_s
+          results[:img_src] = comic.src.to_s
+          comicdate = Date.parse(File.basename(comic.src.to_s,'.gif'))
+          pubdate = Time.utc(comicdate.year,comicdate.month,comicdate.day)
+          results[:pubdate] = pubdate.to_s
+        end
+      end
+      results.tap{|t| $stderr.puts "DEBUG: #{caller(0,1).first} results #{t.inspect}"}
+    end
+  end
+end

data/lib/scrapers/version.rb CHANGED Viewed

@@ -1,5 +1,14 @@
 module Scrapers
-  VERSION = "0.4.3"
+  module Version
+    MAJOR = 1
+    MINOR = 0
+    BUILD = 0
+  end
+  VERSION = [Version::MAJOR,Version::MINOR,Version::BUILD].map(&:to_s).join(".")
   DESCRIPTION = "A library of web site scrapers utilizing mechanize and other goodies. Helpful in gathering images, moving things, saving things, etc."
   SUMMARY = "Web site scrapers"
   LICENSE = "MIT"

data/spec/scrapers/download_spec.rb CHANGED Viewed

@@ -12,25 +12,62 @@
 require 'spec_helper'
 require 'tmpdir'
+def in_tmpdir
+  Dir.mktmpdir do |dir|
+    Dir.chdir(dir) do |dir|
+      yield dir
+    end
+  end
+end
 module Scrapers
   describe Download do
     it {Scrapers::Download.should respond_to :download}
-    context "download" do
-      before(:all) do
+    it "should download and save the file" do
+      in_tmpdir do |dir|
         @url="http://imgur.com/download/v70StgA/%2Asnrrrrrrrrrrrf%21%2A"
         VCR.use_cassette("download.cassette") do
-          @file = Scrapers::Download.download(@url,'tmp')
+          @file = Scrapers::Download.download(@url,dir)
         end
-      end
-      it "saves the file" do
         @file.should =~ /.*snrrrrrrrrrrrf.*Imgur\.jpg/
         File.exist?(@file).should be_true
       end
     end
-  end
+    it "should overwrite file with second download" do
+      in_tmpdir do |dir|
+        @url="http://imgs.xkcd.com/comics/sandwich.png"
+        VCR.use_cassette("download-overwrite") do
+          @file1 = Scrapers::Download.download(@url,dir)
+          @file2 = Scrapers::Download.download(@url,dir,true)
+        end
+        @file1.should eq @file2
+        @file1.should eq File.join(dir,'sandwich.png')
+        File.exist?(@file1).should be_true
+      end
+    end
+    it "should make a new file on second download" do
+      in_tmpdir do |dir|
+        @url="http://imgs.xkcd.com/comics/sandwich.png"
+        VCR.use_cassette("download-newfile") do
+          @file1 = Scrapers::Download.download(@url,dir)
+          @file2 = Scrapers::Download.download(@url,dir)
+        end
+        # Filed issue against mechanise to make save return
+        # the actual file name saved under. Until that's fixed
+        # have to work around it.
+        @file2 += '.1'
+        @file1.should_not eq @file2
+        @file1.should eq File.join(dir,'sandwich.png')
+        File.exist?(@file1).should be_true
+        @file2.should eq File.join(dir,'sandwich.png.1')
+        File.exist?(@file2).should be_true
+      end
+    end
+  end
 end

data/spec/scrapers/sinfest_spec.rb ADDED Viewed

@@ -0,0 +1,46 @@
+require 'spec_helper'
+module Scrapers
+  describe Sinfest do
+    it{should respond_to :scrape}
+    context "scraping" do
+      before(:all) do
+        @comic = VCR.use_cassette('sinfest') do
+          Scrapers::Sinfest.scrape
+        end
+      end
+      it "retrieves a comic" do
+        @comic.should_not be_nil
+      end
+      it "should be a Hash" do
+        @comic.should be_a(Hash)
+      end
+      %w{title url pubdate img_src}.map(&:to_sym).each do |key|
+        it "should have key #{key}" do
+          @comic.should have_key(key)
+        end
+      end
+      context "title" do
+        it{@comic[:title].should_not be_empty}
+        it{@comic[:title].should match /Nails/}
+      end
+      context "url" do
+        it{@comic[:url].should_not be_empty}
+        it{@comic[:url].should match /http:\/\/sinfest.net/}
+      end
+      context "pubdate" do
+        it{@comic[:pubdate].should_not be_empty}
+        it{Date.parse(@comic[:pubdate]).should be_a(Date)}
+      end
+      context "img_src" do
+        it{@comic[:img_src].should_not be_empty}
+        it{URI.parse(@comic[:img_src]).should be_a(URI::HTTP)}
+        it{@comic[:img_src].should eq 'http://sinfest.net/comikaze/comics/2013-10-19.gif'}
+      end
+    end
+  end
+end

data/spec/spec_helper.rb CHANGED Viewed

@@ -1,7 +1,6 @@
 #require 'webmock/rspec'
 require 'vcr'
 # This file was generated by the `rspec --init` command. Conventionally, all
 # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
 # Require this file using `require "spec_helper"` to ensure that it is only
@@ -29,3 +28,4 @@ end
 require 'scrapers.rb'