RubyGems - scrapers - Versions diffs - 0.3.0 → 0.4.0 - Mend

scrapers 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 0ed963b518db6e326e8f86caf549821942e95248
-  data.tar.gz: fec6828d9b6b9923d0f595852accb4ba282454a0
+  metadata.gz: e5742674b339e7f902b7fdb75c0a95ac8ac5063d
+  data.tar.gz: 8bfeea7a843d8657ef54e6bc1c42cd935c5e2865
 SHA512:
-  metadata.gz: 644b43272415ef5c765bd75d9e788564f861430b84914aa79e37f664f691eb1436f7b98934283521331d6411da51605af2c8937f9d4a1b856686badf454cf1e0
-  data.tar.gz: e47dc6eadcd7c7ca05197cdff5e19c60010e760380264c1bf515f832fa814bc7bde29d7c02df980b239dae75507198dc9b6b27a40840ada8398d7f09065a2f5d
+  metadata.gz: adf3e62d1e53474a3e11bac3c239f1d0a21b69a1b4aa2fc277e32850913495248ca98424906dd7367bfd09b90aae6b81e9a7534ecf7a413a9b1c90484653f694
+  data.tar.gz: d68d1694e09f800d18de780432e6998a78627eb58aae10185dac0261d3ab1f866af8d8b4a8cb3845bdb6a2ea04aa4e23c20567c6645143d73fd6775902c5d50c

data/README.md CHANGED Viewed

@@ -2,8 +2,8 @@
 A library of web site scrapers utilizing mechanize and other goodies. Helpful in gathering images, moving things, saving things, etc.
-LICENSE:: MIT
-WEBSITE:: http://github.com/tamouse/scrapers
+* LICENSE: MIT
+* WEBSITE: http://github.com/tamouse/scrapers
 ## Installation

data/lib/scrapers/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 module Scrapers
-  VERSION = "0.3.0"
+  VERSION = "0.4.0"
   DESCRIPTION = "A library of web site scrapers utilizing mechanize and other goodies. Helpful in gathering images, moving things, saving things, etc."
   SUMMARY = "Web site scrapers"
   LICENSE = "MIT"

data/lib/scrapers/xkcd.rb ADDED Viewed

@@ -0,0 +1,23 @@
+require 'open-uri'
+require 'nokogiri'
+module Scrapers
+  module Xkcd
+    XKCD_URL = "http://xkcd.com"
+    def self.scrape(comic=nil)
+      results = Hash.new
+      url = URI.parse XKCD_URL
+      url.path = "/#{comic}/" unless comic.nil?
+      results[:url] = url.to_s
+      doc = Nokogiri::HTML(open(url.to_s))
+      comic = doc.at_css("#comic img")
+      results[:img_src] = comic.attr("src")
+      results[:hover_text] = comic.attr("title")
+      results[:title] = comic.attr("alt")
+      results
+    end
+  end
+end

data/spec/scrapers/xkcd_spec.rb ADDED Viewed

@@ -0,0 +1,27 @@
+require 'spec_helper'
+# Specs for Scrapers::Xkcd.scrape, run inside the "xkcd" VCR cassette.
+describe Scrapers::Xkcd do
+  # The module itself must expose a .scrape entry point.
+  it {should respond_to :scrape}
+  context "scraping" do
+    # Scrape comic 149 once for the whole context (inside the VCR cassette
+    # block) and build the hash the result is compared against.
+    before(:all) do
+      @comic = VCR.use_cassette("xkcd") do
+        Scrapers::Xkcd.scrape 149
+      end
+      # Expected values for comic 149: title is the image "alt" text and
+      # hover_text is the image "title" attribute.
+      @expected =
+        {:title => "Sandwich",
+        :url => "http://xkcd.com/149/",
+        :img_src => "http://imgs.xkcd.com/comics/sandwich.png",
+        :hover_text => "Proper User Policy apparently means Simon Says."
+      }
+    end
+    it "should retrieve the comic" do
+      @comic.should_not be_nil
+    end
+    it "should be a Hash" do
+      @comic.should be_a(Hash)
+    end
+    # Full equality check: every key and value must match @expected.
+    it "should return expected" do
+      @comic.should eq @expected
+    end
+  end
+end

data/vcr_cassettes/xkcd.yml ADDED Viewed

@@ -0,0 +1,205 @@
+---
+http_interactions:
+- request:
+    method: get
+    uri: http://xkcd.com/149
+    body:
+      encoding: US-ASCII
+      string: ''
+    headers:
+      Accept-Encoding:
+      - gzip;q=1.0,deflate;q=0.6,identity;q=0.3
+      Accept:
+      - '*/*'
+      User-Agent:
+      - Ruby
+  response:
+    status:
+      code: 301
+      message: Moved Permanently
+    headers:
+      Location:
+      - http://xkcd.com/149/
+      Content-Length:
+      - '0'
+      Date:
+      - Mon, 14 Oct 2013 05:13:16 GMT
+      Server:
+      - lighttpd/1.4.28
+    body:
+      encoding: UTF-8
+      string: ''
+    http_version:
+  recorded_at: Mon, 14 Oct 2013 05:13:17 GMT
+- request:
+    method: get
+    uri: http://xkcd.com/149/
+    body:
+      encoding: US-ASCII
+      string: ''
+    headers:
+      Accept-Encoding:
+      - gzip;q=1.0,deflate;q=0.6,identity;q=0.3
+      Accept:
+      - '*/*'
+      User-Agent:
+      - Ruby
+  response:
+    status:
+      code: 200
+      message: OK
+    headers:
+      Vary:
+      - Accept-Encoding
+      Last-Modified:
+      - Mon, 14 Oct 2013 04:00:05 GMT
+      Etag:
+      - '"871292384"'
+      Content-Type:
+      - text/html; charset=utf-8
+      Accept-Ranges:
+      - bytes
+      Content-Length:
+      - '2685'
+      Date:
+      - Mon, 14 Oct 2013 05:13:17 GMT
+      Server:
+      - lighttpd/1.4.28
+    body:
+      encoding: UTF-8
+      string: |+
+        <?xml version="1.0" encoding="UTF-8" ?>
+        <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
+            "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+        <html version="-//W3C//DTD XHTML 1.1//EN" xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
+        <head>
+        <link rel="stylesheet" type="text/css" href="/s/d16ebb.css" title="Default"/>
+        <title>xkcd: Sandwich</title>
+        <meta http-equiv="X-UA-Compatible" content="IE=edge"/>
+        <link rel="shortcut icon" href="/s/919f27.ico" type="image/x-icon"/>
+        <link rel="icon" href="/s/919f27.ico" type="image/x-icon"/>
+        <link rel="alternate" type="application/atom+xml" title="Atom 1.0" href="/atom.xml"/>
+        <link rel="alternate" type="application/rss+xml" title="RSS 2.0" href="/rss.xml"/>
+        <link rel="apple-touch-icon-precomposed" href="/s/d9522a.png" />
+        <script type="text/javascript">
+          var _gaq = _gaq || [];
+          _gaq.push(['_setAccount', 'UA-25700708-7']);
+          _gaq.push(['_setDomainName', 'xkcd.com']);
+          _gaq.push(['_setAllowLinker', true]);
+          _gaq.push(['_trackPageview']);
+          (function() {
+            var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
+            ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
+            var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
+          })();
+        </script>
+        </head>
+        <body>
+        <div id="topContainer">
+        <div id="topLeft">
+        <ul>
+        <li><a href="/archive">Archive</a></li>
+        <li><a href="http://what-if.xkcd.com">What If?</a></li>
+        <li><a href="http://blag.xkcd.com">Blag</a></li>
+        <li><a href="http://store.xkcd.com/">Store</a></li>
+        <li><a rel="author" href="/about">About</a></li>
+        </ul>
+        </div>
+        <div id="topRight">
+        <div id="masthead">
+        <span><a href="/"><img src="http://imgs.xkcd.com/static/terrible_small_logo.png" alt="xkcd.com logo" height="83" width="185"/></a></span>
+        <span id="slogan">A webcomic of romance,<br/> sarcasm, math, and language.</span>
+        </div>
+        <div id="news">
+        You can get the Subways comic as a <a href="http://store-xkcd-com.myshopify.com/products/subways">poster</a>!
+        </div>
+        </div>
+        <div id="bgLeft" class="bg box"></div>
+        <div id="bgRight" class="bg box"></div>
+        </div>
+        <div id="middleContainer" class="box">
+        <div id="ctitle">Sandwich</div>
+        <ul class="comicNav">
+        <li><a href="/1/">|&lt;</a></li>
+        <li><a rel="prev" href="/148/" accesskey="p">&lt; Prev</a></li>
+        <li><a href="http://dynamic.xkcd.com/random/comic/">Random</a></li>
+        <li><a rel="next" href="/150/" accesskey="n">Next &gt;</a></li>
+        <li><a href="/">&gt;|</a></li>
+        </ul>
+        <div id="comic">
+        <img src="http://imgs.xkcd.com/comics/sandwich.png" title="Proper User Policy apparently means Simon Says." alt="Sandwich" />
+        </div>
+        <ul class="comicNav">
+        <li><a href="/1/">|&lt;</a></li>
+        <li><a rel="prev" href="/148/" accesskey="p">&lt; Prev</a></li>
+        <li><a href="http://dynamic.xkcd.com/random/comic/">Random</a></li>
+        <li><a rel="next" href="/150/" accesskey="n">Next &gt;</a></li>
+        <li><a href="/">&gt;|</a></li>
+        </ul>
+        <br />
+        Permanent link to this comic: http://xkcd.com/149/<br />
+        Image URL (for hotlinking/embedding): http://imgs.xkcd.com/comics/sandwich.png
+        <div id="transcript" style="display: none">[[ A man is sitting on a couch, talking to another man.  They are both stick figures. ]]
+        First man:  Make me a sandwich.
+        Second man:  What?  Make it yourself.
+        First man:  Sudo make me a sandwich.
+        Second man:  Okay</div>
+        </div>
+        <div id="bottom" class="box">
+        <img src="http://imgs.xkcd.com/s/a899e84.jpg" width="520" height="100" alt="Selected Comics" usemap="#comicmap"/>
+        <map id="comicmap" name="comicmap">
+        <!-- http://code.google.com/p/chromium/issues/detail?id=108489 Might be MIME dependent. -->
+        <area shape="rect" coords="0,0,100,100" href="/150/" alt="Grownups"/>
+        <area shape="rect" coords="104,0,204,100" href="/730/" alt="Circuit Diagram"/>
+        <area shape="rect" coords="208,0,308,100" href="/162/" alt="Angular Momentum"/>
+        <area shape="rect" coords="312,0,412,100" href="/688/" alt="Self-Description"/>
+        <area shape="rect" coords="416,0,520,100" href="/556/" alt="Alternative Energy Revolution"/>
+        </map>
+        <div>
+        Search comic titles and transcripts:
+        <script type="text/javascript" src="//www.google.com/jsapi"></script>
+        <script type="text/javascript">google.load('search', '1');google.setOnLoadCallback(function() {google.search.CustomSearchControl.attachAutoCompletion('012652707207066138651:zudjtuwe28q',document.getElementById('q'),'cse-search-box');});</script>
+        <form action="//www.google.com/cse" id="cse-search-box">
+        <div>
+        <input type="hidden" name="cx" value="012652707207066138651:zudjtuwe28q"/>
+        <input type="hidden" name="ie" value="UTF-8"/>
+        <input type="text" name="q" id="q" size="31"/>
+        <input type="submit" name="sa" value="Search"/>
+        </div>
+        </form>
+        <script type="text/javascript" src="//www.google.com/cse/brand?form=cse-search-box&amp;lang=en"></script>
+        <a href="/rss.xml">RSS Feed</a> - <a href="/atom.xml">Atom Feed</a>
+        </div>
+        <br />
+        <div id="comicLinks">
+        Comics I enjoy:<br/>
+                <a href="http://threewordphrase.com/">Three Word Phrase</a>,
+                <a href="http://oglaf.com/">Oglaf</a> (nsfw),
+                <a href="http://www.smbc-comics.com/">SMBC</a>,
+                <a href="http://www.qwantz.com">Dinosaur Comics</a>,
+                <a href="http://www.asofterworld.com">A Softer World</a>,
+                <a href="http://buttersafe.com/">Buttersafe</a>,
+                <a href="http://pbfcomics.com/">Perry Bible Fellowship</a>,
+                <a href="http://questionablecontent.net/">Questionable Content</a>,
+                <a href="http://www.buttercupfestival.com/">Buttercup Festival</a>
+        </div>
+        <p>Warning: this comic occasionally contains strong language (which may be unsuitable for children), unusual humor (which may be unsuitable for adults), and advanced mathematics (which may be unsuitable for liberal-arts majors).</p>
+        <div id="footnote">BTC 1NfBXWqseXc9rCBc3Cbbu6HjxYssFUgkH6<br />We did not invent the algorithm. The algorithm consistently finds Jesus. The algorithm killed Jeeves. <br/>The algorithm is banned in China. The algorithm is from Jersey. The algorithm constantly finds Jesus.<br/>This is not the algorithm. This is close.</div>
+        <div id="licenseText">
+        <p>
+        This work is licensed under a
+        <a href="http://creativecommons.org/licenses/by-nc/2.5/">Creative Commons Attribution-NonCommercial 2.5 License</a>.
+        </p><p>
+        This means you're free to copy and share these comics (but not to sell them). <a rel="license" href="/license.html">More details</a>.</p>
+        </div>
+        </div>
+        </body>
+        <!-- Layout by Ian Clasbey, davean, and chromakode -->
+        </html>
+    http_version:
+  recorded_at: Mon, 14 Oct 2013 05:13:17 GMT
+recorded_with: VCR 2.5.0

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: scrapers
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.4.0
 platform: ruby
 authors:
 - Tamara Temple
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-09-29 00:00:00.000000000 Z
+date: 2013-10-14 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -145,6 +145,7 @@ files:
 - lib/scrapers/imgur.rb
 - lib/scrapers/nasa_apod.rb
 - lib/scrapers/version.rb
+- lib/scrapers/xkcd.rb
 - scrapers.gemspec
 - spec/scrapers/allrecipes_spec.rb
 - spec/scrapers/discoverynews_spec.rb
@@ -152,6 +153,7 @@ files:
 - spec/scrapers/gocomics_spec.rb
 - spec/scrapers/imgur_spec.rb
 - spec/scrapers/nasa_apod_spec.rb
+- spec/scrapers/xkcd_spec.rb
 - spec/scrapers_spec.rb
 - spec/spec_helper.rb
 - vcr_cassettes/allrecipes_morning-glory-muffins-i.yml
@@ -167,6 +169,7 @@ files:
 - vcr_cassettes/shouldincludelink_cassette.yml
 - vcr_cassettes/shouldincludepubDate_cassette.yml
 - vcr_cassettes/shouldincludetitle_cassette.yml
+- vcr_cassettes/xkcd.yml
 homepage: http://github.com/tamouse/scrapers
 licenses:
 - MIT
@@ -198,5 +201,6 @@ test_files:
 - spec/scrapers/gocomics_spec.rb
 - spec/scrapers/imgur_spec.rb
 - spec/scrapers/nasa_apod_spec.rb
+- spec/scrapers/xkcd_spec.rb
 - spec/scrapers_spec.rb
 - spec/spec_helper.rb