scrapers 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0ed963b518db6e326e8f86caf549821942e95248
4
- data.tar.gz: fec6828d9b6b9923d0f595852accb4ba282454a0
3
+ metadata.gz: e5742674b339e7f902b7fdb75c0a95ac8ac5063d
4
+ data.tar.gz: 8bfeea7a843d8657ef54e6bc1c42cd935c5e2865
5
5
  SHA512:
6
- metadata.gz: 644b43272415ef5c765bd75d9e788564f861430b84914aa79e37f664f691eb1436f7b98934283521331d6411da51605af2c8937f9d4a1b856686badf454cf1e0
7
- data.tar.gz: e47dc6eadcd7c7ca05197cdff5e19c60010e760380264c1bf515f832fa814bc7bde29d7c02df980b239dae75507198dc9b6b27a40840ada8398d7f09065a2f5d
6
+ metadata.gz: adf3e62d1e53474a3e11bac3c239f1d0a21b69a1b4aa2fc277e32850913495248ca98424906dd7367bfd09b90aae6b81e9a7534ecf7a413a9b1c90484653f694
7
+ data.tar.gz: d68d1694e09f800d18de780432e6998a78627eb58aae10185dac0261d3ab1f866af8d8b4a8cb3845bdb6a2ea04aa4e23c20567c6645143d73fd6775902c5d50c
data/README.md CHANGED
@@ -2,8 +2,8 @@
2
2
 
3
3
  A library of web site scrapers utilizing mechanize and other goodies. Helpful in gathering images, moving things, saving things, etc.
4
4
 
5
- LICENSE:: MIT
6
- WEBSITE:: http://github.com/tamouse/scrapers
5
+ * LICENSE: MIT
6
+ * WEBSITE: http://github.com/tamouse/scrapers
7
7
 
8
8
  ## Installation
9
9
 
@@ -1,5 +1,5 @@
1
1
  module Scrapers
2
- VERSION = "0.3.0"
2
+ VERSION = "0.4.0"
3
3
  DESCRIPTION = "A library of web site scrapers utilizing mechanize and other goodies. Helpful in gathering images, moving things, saving things, etc."
4
4
  SUMMARY = "Web site scrapers"
5
5
  LICENSE = "MIT"
@@ -0,0 +1,23 @@
1
+ require 'open-uri'
2
+ require 'nokogiri'
3
+
4
+ module Scrapers
5
+ module Xkcd
6
+ XKCD_URL = "http://xkcd.com"
7
+ def self.scrape(comic=nil)
8
+ results = Hash.new
9
+
10
+ url = URI.parse XKCD_URL
11
+ url.path = "/#{comic}/" unless comic.nil?
12
+ results[:url] = url.to_s
13
+ doc = Nokogiri::HTML(open(url.to_s))
14
+ comic = doc.at_css("#comic img")
15
+ results[:img_src] = comic.attr("src")
16
+ results[:hover_text] = comic.attr("title")
17
+ results[:title] = comic.attr("alt")
18
+
19
+ results
20
+ end
21
+
22
+ end
23
+ end
@@ -0,0 +1,27 @@
1
+ require 'spec_helper'
2
+
3
+ describe Scrapers::Xkcd do
4
+ it {should respond_to :scrape}
5
+ context "scraping" do
6
+ before(:all) do
7
+ @comic = VCR.use_cassette("xkcd") do
8
+ Scrapers::Xkcd.scrape 149
9
+ end
10
+ @expected =
11
+ {:title => "Sandwich",
12
+ :url => "http://xkcd.com/149/",
13
+ :img_src => "http://imgs.xkcd.com/comics/sandwich.png",
14
+ :hover_text => "Proper User Policy apparently means Simon Says."
15
+ }
16
+ end
17
+ it "should retrieve the comic" do
18
+ @comic.should_not be_nil
19
+ end
20
+ it "should be a Hash" do
21
+ @comic.should be_a(Hash)
22
+ end
23
+ it "should return expected" do
24
+ @comic.should eq @expected
25
+ end
26
+ end
27
+ end
@@ -0,0 +1,205 @@
1
+ ---
2
+ http_interactions:
3
+ - request:
4
+ method: get
5
+ uri: http://xkcd.com/149
6
+ body:
7
+ encoding: US-ASCII
8
+ string: ''
9
+ headers:
10
+ Accept-Encoding:
11
+ - gzip;q=1.0,deflate;q=0.6,identity;q=0.3
12
+ Accept:
13
+ - '*/*'
14
+ User-Agent:
15
+ - Ruby
16
+ response:
17
+ status:
18
+ code: 301
19
+ message: Moved Permanently
20
+ headers:
21
+ Location:
22
+ - http://xkcd.com/149/
23
+ Content-Length:
24
+ - '0'
25
+ Date:
26
+ - Mon, 14 Oct 2013 05:13:16 GMT
27
+ Server:
28
+ - lighttpd/1.4.28
29
+ body:
30
+ encoding: UTF-8
31
+ string: ''
32
+ http_version:
33
+ recorded_at: Mon, 14 Oct 2013 05:13:17 GMT
34
+ - request:
35
+ method: get
36
+ uri: http://xkcd.com/149/
37
+ body:
38
+ encoding: US-ASCII
39
+ string: ''
40
+ headers:
41
+ Accept-Encoding:
42
+ - gzip;q=1.0,deflate;q=0.6,identity;q=0.3
43
+ Accept:
44
+ - '*/*'
45
+ User-Agent:
46
+ - Ruby
47
+ response:
48
+ status:
49
+ code: 200
50
+ message: OK
51
+ headers:
52
+ Vary:
53
+ - Accept-Encoding
54
+ Last-Modified:
55
+ - Mon, 14 Oct 2013 04:00:05 GMT
56
+ Etag:
57
+ - '"871292384"'
58
+ Content-Type:
59
+ - text/html; charset=utf-8
60
+ Accept-Ranges:
61
+ - bytes
62
+ Content-Length:
63
+ - '2685'
64
+ Date:
65
+ - Mon, 14 Oct 2013 05:13:17 GMT
66
+ Server:
67
+ - lighttpd/1.4.28
68
+ body:
69
+ encoding: UTF-8
70
+ string: |+
71
+ <?xml version="1.0" encoding="UTF-8" ?>
72
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
73
+ "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
74
+ <html version="-//W3C//DTD XHTML 1.1//EN" xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
75
+ <head>
76
+ <link rel="stylesheet" type="text/css" href="/s/d16ebb.css" title="Default"/>
77
+ <title>xkcd: Sandwich</title>
78
+ <meta http-equiv="X-UA-Compatible" content="IE=edge"/>
79
+ <link rel="shortcut icon" href="/s/919f27.ico" type="image/x-icon"/>
80
+ <link rel="icon" href="/s/919f27.ico" type="image/x-icon"/>
81
+ <link rel="alternate" type="application/atom+xml" title="Atom 1.0" href="/atom.xml"/>
82
+ <link rel="alternate" type="application/rss+xml" title="RSS 2.0" href="/rss.xml"/>
83
+ <link rel="apple-touch-icon-precomposed" href="/s/d9522a.png" />
84
+ <script type="text/javascript">
85
+ var _gaq = _gaq || [];
86
+ _gaq.push(['_setAccount', 'UA-25700708-7']);
87
+ _gaq.push(['_setDomainName', 'xkcd.com']);
88
+ _gaq.push(['_setAllowLinker', true]);
89
+ _gaq.push(['_trackPageview']);
90
+
91
+ (function() {
92
+ var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
93
+ ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
94
+ var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
95
+ })();
96
+ </script>
97
+
98
+ </head>
99
+ <body>
100
+ <div id="topContainer">
101
+ <div id="topLeft">
102
+ <ul>
103
+ <li><a href="/archive">Archive</a></li>
104
+ <li><a href="http://what-if.xkcd.com">What If?</a></li>
105
+ <li><a href="http://blag.xkcd.com">Blag</a></li>
106
+ <li><a href="http://store.xkcd.com/">Store</a></li>
107
+ <li><a rel="author" href="/about">About</a></li>
108
+ </ul>
109
+ </div>
110
+ <div id="topRight">
111
+ <div id="masthead">
112
+ <span><a href="/"><img src="http://imgs.xkcd.com/static/terrible_small_logo.png" alt="xkcd.com logo" height="83" width="185"/></a></span>
113
+ <span id="slogan">A webcomic of romance,<br/> sarcasm, math, and language.</span>
114
+ </div>
115
+ <div id="news">
116
+ You can get the Subways comic as a <a href="http://store-xkcd-com.myshopify.com/products/subways">poster</a>!
117
+ </div>
118
+ </div>
119
+ <div id="bgLeft" class="bg box"></div>
120
+ <div id="bgRight" class="bg box"></div>
121
+ </div>
122
+ <div id="middleContainer" class="box">
123
+
124
+ <div id="ctitle">Sandwich</div>
125
+ <ul class="comicNav">
126
+ <li><a href="/1/">|&lt;</a></li>
127
+ <li><a rel="prev" href="/148/" accesskey="p">&lt; Prev</a></li>
128
+ <li><a href="http://dynamic.xkcd.com/random/comic/">Random</a></li>
129
+ <li><a rel="next" href="/150/" accesskey="n">Next &gt;</a></li>
130
+ <li><a href="/">&gt;|</a></li>
131
+ </ul>
132
+ <div id="comic">
133
+ <img src="http://imgs.xkcd.com/comics/sandwich.png" title="Proper User Policy apparently means Simon Says." alt="Sandwich" />
134
+ </div>
135
+ <ul class="comicNav">
136
+ <li><a href="/1/">|&lt;</a></li>
137
+ <li><a rel="prev" href="/148/" accesskey="p">&lt; Prev</a></li>
138
+ <li><a href="http://dynamic.xkcd.com/random/comic/">Random</a></li>
139
+ <li><a rel="next" href="/150/" accesskey="n">Next &gt;</a></li>
140
+ <li><a href="/">&gt;|</a></li>
141
+ </ul>
142
+ <br />
143
+ Permanent link to this comic: http://xkcd.com/149/<br />
144
+ Image URL (for hotlinking/embedding): http://imgs.xkcd.com/comics/sandwich.png
145
+ <div id="transcript" style="display: none">[[ A man is sitting on a couch, talking to another man. They are both stick figures. ]]
146
+ First man: Make me a sandwich.
147
+ Second man: What? Make it yourself.
148
+ First man: Sudo make me a sandwich.
149
+ Second man: Okay</div>
150
+ </div>
151
+ <div id="bottom" class="box">
152
+ <img src="http://imgs.xkcd.com/s/a899e84.jpg" width="520" height="100" alt="Selected Comics" usemap="#comicmap"/>
153
+ <map id="comicmap" name="comicmap">
154
+ <!-- http://code.google.com/p/chromium/issues/detail?id=108489 Might be MIME dependent. -->
155
+ <area shape="rect" coords="0,0,100,100" href="/150/" alt="Grownups"/>
156
+ <area shape="rect" coords="104,0,204,100" href="/730/" alt="Circuit Diagram"/>
157
+ <area shape="rect" coords="208,0,308,100" href="/162/" alt="Angular Momentum"/>
158
+ <area shape="rect" coords="312,0,412,100" href="/688/" alt="Self-Description"/>
159
+ <area shape="rect" coords="416,0,520,100" href="/556/" alt="Alternative Energy Revolution"/>
160
+ </map>
161
+ <div>
162
+ Search comic titles and transcripts:
163
+ <script type="text/javascript" src="//www.google.com/jsapi"></script>
164
+ <script type="text/javascript">google.load('search', '1');google.setOnLoadCallback(function() {google.search.CustomSearchControl.attachAutoCompletion('012652707207066138651:zudjtuwe28q',document.getElementById('q'),'cse-search-box');});</script>
165
+ <form action="//www.google.com/cse" id="cse-search-box">
166
+ <div>
167
+ <input type="hidden" name="cx" value="012652707207066138651:zudjtuwe28q"/>
168
+ <input type="hidden" name="ie" value="UTF-8"/>
169
+ <input type="text" name="q" id="q" size="31"/>
170
+ <input type="submit" name="sa" value="Search"/>
171
+ </div>
172
+ </form>
173
+ <script type="text/javascript" src="//www.google.com/cse/brand?form=cse-search-box&amp;lang=en"></script>
174
+ <a href="/rss.xml">RSS Feed</a> - <a href="/atom.xml">Atom Feed</a>
175
+ </div>
176
+ <br />
177
+ <div id="comicLinks">
178
+ Comics I enjoy:<br/>
179
+ <a href="http://threewordphrase.com/">Three Word Phrase</a>,
180
+ <a href="http://oglaf.com/">Oglaf</a> (nsfw),
181
+ <a href="http://www.smbc-comics.com/">SMBC</a>,
182
+ <a href="http://www.qwantz.com">Dinosaur Comics</a>,
183
+ <a href="http://www.asofterworld.com">A Softer World</a>,
184
+ <a href="http://buttersafe.com/">Buttersafe</a>,
185
+ <a href="http://pbfcomics.com/">Perry Bible Fellowship</a>,
186
+ <a href="http://questionablecontent.net/">Questionable Content</a>,
187
+ <a href="http://www.buttercupfestival.com/">Buttercup Festival</a>
188
+ </div>
189
+ <p>Warning: this comic occasionally contains strong language (which may be unsuitable for children), unusual humor (which may be unsuitable for adults), and advanced mathematics (which may be unsuitable for liberal-arts majors).</p>
190
+ <div id="footnote">BTC 1NfBXWqseXc9rCBc3Cbbu6HjxYssFUgkH6<br />We did not invent the algorithm. The algorithm consistently finds Jesus. The algorithm killed Jeeves. <br/>The algorithm is banned in China. The algorithm is from Jersey. The algorithm constantly finds Jesus.<br/>This is not the algorithm. This is close.</div>
191
+ <div id="licenseText">
192
+ <p>
193
+ This work is licensed under a
194
+ <a href="http://creativecommons.org/licenses/by-nc/2.5/">Creative Commons Attribution-NonCommercial 2.5 License</a>.
195
+ </p><p>
196
+ This means you're free to copy and share these comics (but not to sell them). <a rel="license" href="/license.html">More details</a>.</p>
197
+ </div>
198
+ </div>
199
+ </body>
200
+ <!-- Layout by Ian Clasbey, davean, and chromakode -->
201
+ </html>
202
+
203
+ http_version:
204
+ recorded_at: Mon, 14 Oct 2013 05:13:17 GMT
205
+ recorded_with: VCR 2.5.0
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrapers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tamara Temple
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-09-29 00:00:00.000000000 Z
11
+ date: 2013-10-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -145,6 +145,7 @@ files:
145
145
  - lib/scrapers/imgur.rb
146
146
  - lib/scrapers/nasa_apod.rb
147
147
  - lib/scrapers/version.rb
148
+ - lib/scrapers/xkcd.rb
148
149
  - scrapers.gemspec
149
150
  - spec/scrapers/allrecipes_spec.rb
150
151
  - spec/scrapers/discoverynews_spec.rb
@@ -152,6 +153,7 @@ files:
152
153
  - spec/scrapers/gocomics_spec.rb
153
154
  - spec/scrapers/imgur_spec.rb
154
155
  - spec/scrapers/nasa_apod_spec.rb
156
+ - spec/scrapers/xkcd_spec.rb
155
157
  - spec/scrapers_spec.rb
156
158
  - spec/spec_helper.rb
157
159
  - vcr_cassettes/allrecipes_morning-glory-muffins-i.yml
@@ -167,6 +169,7 @@ files:
167
169
  - vcr_cassettes/shouldincludelink_cassette.yml
168
170
  - vcr_cassettes/shouldincludepubDate_cassette.yml
169
171
  - vcr_cassettes/shouldincludetitle_cassette.yml
172
+ - vcr_cassettes/xkcd.yml
170
173
  homepage: http://github.com/tamouse/scrapers
171
174
  licenses:
172
175
  - MIT
@@ -198,5 +201,6 @@ test_files:
198
201
  - spec/scrapers/gocomics_spec.rb
199
202
  - spec/scrapers/imgur_spec.rb
200
203
  - spec/scrapers/nasa_apod_spec.rb
204
+ - spec/scrapers/xkcd_spec.rb
201
205
  - spec/scrapers_spec.rb
202
206
  - spec/spec_helper.rb