scrapers 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -2
- data/lib/scrapers/version.rb +1 -1
- data/lib/scrapers/xkcd.rb +23 -0
- data/spec/scrapers/xkcd_spec.rb +27 -0
- data/vcr_cassettes/xkcd.yml +205 -0
- metadata +6 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e5742674b339e7f902b7fdb75c0a95ac8ac5063d
|
4
|
+
data.tar.gz: 8bfeea7a843d8657ef54e6bc1c42cd935c5e2865
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: adf3e62d1e53474a3e11bac3c239f1d0a21b69a1b4aa2fc277e32850913495248ca98424906dd7367bfd09b90aae6b81e9a7534ecf7a413a9b1c90484653f694
|
7
|
+
data.tar.gz: d68d1694e09f800d18de780432e6998a78627eb58aae10185dac0261d3ab1f866af8d8b4a8cb3845bdb6a2ea04aa4e23c20567c6645143d73fd6775902c5d50c
|
data/README.md
CHANGED
@@ -2,8 +2,8 @@
|
|
2
2
|
|
3
3
|
A library of web site scrapers utilizing mechanize and other goodies. Helpful in gathering images, moving things, saving things, etc.
|
4
4
|
|
5
|
-
LICENSE
|
6
|
-
WEBSITE
|
5
|
+
* LICENSE: MIT
|
6
|
+
* WEBSITE: http://github.com/tamouse/scrapers
|
7
7
|
|
8
8
|
## Installation
|
9
9
|
|
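data/lib/scrapers/version.rb
CHANGED
[hunk not captured in this extract: +1 -1, version string 0.3.0 → 0.4.0]
data/lib/scrapers/xkcd.rb
ADDED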
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'open-uri'
|
2
|
+
require 'nokogiri'
|
3
|
+
|
4
|
+
module Scrapers
|
5
|
+
module Xkcd
|
6
|
+
XKCD_URL = "http://xkcd.com"
|
7
|
+
def self.scrape(comic=nil)
|
8
|
+
results = Hash.new
|
9
|
+
|
10
|
+
url = URI.parse XKCD_URL
|
11
|
+
url.path = "/#{comic}/" unless comic.nil?
|
12
|
+
results[:url] = url.to_s
|
13
|
+
doc = Nokogiri::HTML(open(url.to_s))
|
14
|
+
comic = doc.at_css("#comic img")
|
15
|
+
results[:img_src] = comic.attr("src")
|
16
|
+
results[:hover_text] = comic.attr("title")
|
17
|
+
results[:title] = comic.attr("alt")
|
18
|
+
|
19
|
+
results
|
20
|
+
end
|
21
|
+
|
22
|
+
end
|
23
|
+
end
|
data/spec/scrapers/xkcd_spec.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Scrapers::Xkcd do
|
4
|
+
it {should respond_to :scrape}
|
5
|
+
context "scraping" do
|
6
|
+
before(:all) do
|
7
|
+
@comic = VCR.use_cassette("xkcd") do
|
8
|
+
Scrapers::Xkcd.scrape 149
|
9
|
+
end
|
10
|
+
@expected =
|
11
|
+
{:title => "Sandwich",
|
12
|
+
:url => "http://xkcd.com/149/",
|
13
|
+
:img_src => "http://imgs.xkcd.com/comics/sandwich.png",
|
14
|
+
:hover_text => "Proper User Policy apparently means Simon Says."
|
15
|
+
}
|
16
|
+
end
|
17
|
+
it "should retrieve the comic" do
|
18
|
+
@comic.should_not be_nil
|
19
|
+
end
|
20
|
+
it "should be a Hash" do
|
21
|
+
@comic.should be_a(Hash)
|
22
|
+
end
|
23
|
+
it "should return expected" do
|
24
|
+
@comic.should eq @expected
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
data/vcr_cassettes/xkcd.yml
ADDED
@@ -0,0 +1,205 @@
|
|
1
|
+
---
|
2
|
+
http_interactions:
|
3
|
+
- request:
|
4
|
+
method: get
|
5
|
+
uri: http://xkcd.com/149
|
6
|
+
body:
|
7
|
+
encoding: US-ASCII
|
8
|
+
string: ''
|
9
|
+
headers:
|
10
|
+
Accept-Encoding:
|
11
|
+
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
|
12
|
+
Accept:
|
13
|
+
- '*/*'
|
14
|
+
User-Agent:
|
15
|
+
- Ruby
|
16
|
+
response:
|
17
|
+
status:
|
18
|
+
code: 301
|
19
|
+
message: Moved Permanently
|
20
|
+
headers:
|
21
|
+
Location:
|
22
|
+
- http://xkcd.com/149/
|
23
|
+
Content-Length:
|
24
|
+
- '0'
|
25
|
+
Date:
|
26
|
+
- Mon, 14 Oct 2013 05:13:16 GMT
|
27
|
+
Server:
|
28
|
+
- lighttpd/1.4.28
|
29
|
+
body:
|
30
|
+
encoding: UTF-8
|
31
|
+
string: ''
|
32
|
+
http_version:
|
33
|
+
recorded_at: Mon, 14 Oct 2013 05:13:17 GMT
|
34
|
+
- request:
|
35
|
+
method: get
|
36
|
+
uri: http://xkcd.com/149/
|
37
|
+
body:
|
38
|
+
encoding: US-ASCII
|
39
|
+
string: ''
|
40
|
+
headers:
|
41
|
+
Accept-Encoding:
|
42
|
+
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
|
43
|
+
Accept:
|
44
|
+
- '*/*'
|
45
|
+
User-Agent:
|
46
|
+
- Ruby
|
47
|
+
response:
|
48
|
+
status:
|
49
|
+
code: 200
|
50
|
+
message: OK
|
51
|
+
headers:
|
52
|
+
Vary:
|
53
|
+
- Accept-Encoding
|
54
|
+
Last-Modified:
|
55
|
+
- Mon, 14 Oct 2013 04:00:05 GMT
|
56
|
+
Etag:
|
57
|
+
- '"871292384"'
|
58
|
+
Content-Type:
|
59
|
+
- text/html; charset=utf-8
|
60
|
+
Accept-Ranges:
|
61
|
+
- bytes
|
62
|
+
Content-Length:
|
63
|
+
- '2685'
|
64
|
+
Date:
|
65
|
+
- Mon, 14 Oct 2013 05:13:17 GMT
|
66
|
+
Server:
|
67
|
+
- lighttpd/1.4.28
|
68
|
+
body:
|
69
|
+
encoding: UTF-8
|
70
|
+
string: |+
|
71
|
+
<?xml version="1.0" encoding="UTF-8" ?>
|
72
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
|
73
|
+
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
74
|
+
<html version="-//W3C//DTD XHTML 1.1//EN" xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
|
75
|
+
<head>
|
76
|
+
<link rel="stylesheet" type="text/css" href="/s/d16ebb.css" title="Default"/>
|
77
|
+
<title>xkcd: Sandwich</title>
|
78
|
+
<meta http-equiv="X-UA-Compatible" content="IE=edge"/>
|
79
|
+
<link rel="shortcut icon" href="/s/919f27.ico" type="image/x-icon"/>
|
80
|
+
<link rel="icon" href="/s/919f27.ico" type="image/x-icon"/>
|
81
|
+
<link rel="alternate" type="application/atom+xml" title="Atom 1.0" href="/atom.xml"/>
|
82
|
+
<link rel="alternate" type="application/rss+xml" title="RSS 2.0" href="/rss.xml"/>
|
83
|
+
<link rel="apple-touch-icon-precomposed" href="/s/d9522a.png" />
|
84
|
+
<script type="text/javascript">
|
85
|
+
var _gaq = _gaq || [];
|
86
|
+
_gaq.push(['_setAccount', 'UA-25700708-7']);
|
87
|
+
_gaq.push(['_setDomainName', 'xkcd.com']);
|
88
|
+
_gaq.push(['_setAllowLinker', true]);
|
89
|
+
_gaq.push(['_trackPageview']);
|
90
|
+
|
91
|
+
(function() {
|
92
|
+
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
|
93
|
+
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
|
94
|
+
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
|
95
|
+
})();
|
96
|
+
</script>
|
97
|
+
|
98
|
+
</head>
|
99
|
+
<body>
|
100
|
+
<div id="topContainer">
|
101
|
+
<div id="topLeft">
|
102
|
+
<ul>
|
103
|
+
<li><a href="/archive">Archive</a></li>
|
104
|
+
<li><a href="http://what-if.xkcd.com">What If?</a></li>
|
105
|
+
<li><a href="http://blag.xkcd.com">Blag</a></li>
|
106
|
+
<li><a href="http://store.xkcd.com/">Store</a></li>
|
107
|
+
<li><a rel="author" href="/about">About</a></li>
|
108
|
+
</ul>
|
109
|
+
</div>
|
110
|
+
<div id="topRight">
|
111
|
+
<div id="masthead">
|
112
|
+
<span><a href="/"><img src="http://imgs.xkcd.com/static/terrible_small_logo.png" alt="xkcd.com logo" height="83" width="185"/></a></span>
|
113
|
+
<span id="slogan">A webcomic of romance,<br/> sarcasm, math, and language.</span>
|
114
|
+
</div>
|
115
|
+
<div id="news">
|
116
|
+
You can get the Subways comic as a <a href="http://store-xkcd-com.myshopify.com/products/subways">poster</a>!
|
117
|
+
</div>
|
118
|
+
</div>
|
119
|
+
<div id="bgLeft" class="bg box"></div>
|
120
|
+
<div id="bgRight" class="bg box"></div>
|
121
|
+
</div>
|
122
|
+
<div id="middleContainer" class="box">
|
123
|
+
|
124
|
+
<div id="ctitle">Sandwich</div>
|
125
|
+
<ul class="comicNav">
|
126
|
+
<li><a href="/1/">|<</a></li>
|
127
|
+
<li><a rel="prev" href="/148/" accesskey="p">< Prev</a></li>
|
128
|
+
<li><a href="http://dynamic.xkcd.com/random/comic/">Random</a></li>
|
129
|
+
<li><a rel="next" href="/150/" accesskey="n">Next ></a></li>
|
130
|
+
<li><a href="/">>|</a></li>
|
131
|
+
</ul>
|
132
|
+
<div id="comic">
|
133
|
+
<img src="http://imgs.xkcd.com/comics/sandwich.png" title="Proper User Policy apparently means Simon Says." alt="Sandwich" />
|
134
|
+
</div>
|
135
|
+
<ul class="comicNav">
|
136
|
+
<li><a href="/1/">|<</a></li>
|
137
|
+
<li><a rel="prev" href="/148/" accesskey="p">< Prev</a></li>
|
138
|
+
<li><a href="http://dynamic.xkcd.com/random/comic/">Random</a></li>
|
139
|
+
<li><a rel="next" href="/150/" accesskey="n">Next ></a></li>
|
140
|
+
<li><a href="/">>|</a></li>
|
141
|
+
</ul>
|
142
|
+
<br />
|
143
|
+
Permanent link to this comic: http://xkcd.com/149/<br />
|
144
|
+
Image URL (for hotlinking/embedding): http://imgs.xkcd.com/comics/sandwich.png
|
145
|
+
<div id="transcript" style="display: none">[[ A man is sitting on a couch, talking to another man. They are both stick figures. ]]
|
146
|
+
First man: Make me a sandwich.
|
147
|
+
Second man: What? Make it yourself.
|
148
|
+
First man: Sudo make me a sandwich.
|
149
|
+
Second man: Okay</div>
|
150
|
+
</div>
|
151
|
+
<div id="bottom" class="box">
|
152
|
+
<img src="http://imgs.xkcd.com/s/a899e84.jpg" width="520" height="100" alt="Selected Comics" usemap="#comicmap"/>
|
153
|
+
<map id="comicmap" name="comicmap">
|
154
|
+
<!-- http://code.google.com/p/chromium/issues/detail?id=108489 Might be MIME dependent. -->
|
155
|
+
<area shape="rect" coords="0,0,100,100" href="/150/" alt="Grownups"/>
|
156
|
+
<area shape="rect" coords="104,0,204,100" href="/730/" alt="Circuit Diagram"/>
|
157
|
+
<area shape="rect" coords="208,0,308,100" href="/162/" alt="Angular Momentum"/>
|
158
|
+
<area shape="rect" coords="312,0,412,100" href="/688/" alt="Self-Description"/>
|
159
|
+
<area shape="rect" coords="416,0,520,100" href="/556/" alt="Alternative Energy Revolution"/>
|
160
|
+
</map>
|
161
|
+
<div>
|
162
|
+
Search comic titles and transcripts:
|
163
|
+
<script type="text/javascript" src="//www.google.com/jsapi"></script>
|
164
|
+
<script type="text/javascript">google.load('search', '1');google.setOnLoadCallback(function() {google.search.CustomSearchControl.attachAutoCompletion('012652707207066138651:zudjtuwe28q',document.getElementById('q'),'cse-search-box');});</script>
|
165
|
+
<form action="//www.google.com/cse" id="cse-search-box">
|
166
|
+
<div>
|
167
|
+
<input type="hidden" name="cx" value="012652707207066138651:zudjtuwe28q"/>
|
168
|
+
<input type="hidden" name="ie" value="UTF-8"/>
|
169
|
+
<input type="text" name="q" id="q" size="31"/>
|
170
|
+
<input type="submit" name="sa" value="Search"/>
|
171
|
+
</div>
|
172
|
+
</form>
|
173
|
+
<script type="text/javascript" src="//www.google.com/cse/brand?form=cse-search-box&lang=en"></script>
|
174
|
+
<a href="/rss.xml">RSS Feed</a> - <a href="/atom.xml">Atom Feed</a>
|
175
|
+
</div>
|
176
|
+
<br />
|
177
|
+
<div id="comicLinks">
|
178
|
+
Comics I enjoy:<br/>
|
179
|
+
<a href="http://threewordphrase.com/">Three Word Phrase</a>,
|
180
|
+
<a href="http://oglaf.com/">Oglaf</a> (nsfw),
|
181
|
+
<a href="http://www.smbc-comics.com/">SMBC</a>,
|
182
|
+
<a href="http://www.qwantz.com">Dinosaur Comics</a>,
|
183
|
+
<a href="http://www.asofterworld.com">A Softer World</a>,
|
184
|
+
<a href="http://buttersafe.com/">Buttersafe</a>,
|
185
|
+
<a href="http://pbfcomics.com/">Perry Bible Fellowship</a>,
|
186
|
+
<a href="http://questionablecontent.net/">Questionable Content</a>,
|
187
|
+
<a href="http://www.buttercupfestival.com/">Buttercup Festival</a>
|
188
|
+
</div>
|
189
|
+
<p>Warning: this comic occasionally contains strong language (which may be unsuitable for children), unusual humor (which may be unsuitable for adults), and advanced mathematics (which may be unsuitable for liberal-arts majors).</p>
|
190
|
+
<div id="footnote">BTC 1NfBXWqseXc9rCBc3Cbbu6HjxYssFUgkH6<br />We did not invent the algorithm. The algorithm consistently finds Jesus. The algorithm killed Jeeves. <br/>The algorithm is banned in China. The algorithm is from Jersey. The algorithm constantly finds Jesus.<br/>This is not the algorithm. This is close.</div>
|
191
|
+
<div id="licenseText">
|
192
|
+
<p>
|
193
|
+
This work is licensed under a
|
194
|
+
<a href="http://creativecommons.org/licenses/by-nc/2.5/">Creative Commons Attribution-NonCommercial 2.5 License</a>.
|
195
|
+
</p><p>
|
196
|
+
This means you're free to copy and share these comics (but not to sell them). <a rel="license" href="/license.html">More details</a>.</p>
|
197
|
+
</div>
|
198
|
+
</div>
|
199
|
+
</body>
|
200
|
+
<!-- Layout by Ian Clasbey, davean, and chromakode -->
|
201
|
+
</html>
|
202
|
+
|
203
|
+
http_version:
|
204
|
+
recorded_at: Mon, 14 Oct 2013 05:13:17 GMT
|
205
|
+
recorded_with: VCR 2.5.0
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrapers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tamara Temple
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-10-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -145,6 +145,7 @@ files:
|
|
145
145
|
- lib/scrapers/imgur.rb
|
146
146
|
- lib/scrapers/nasa_apod.rb
|
147
147
|
- lib/scrapers/version.rb
|
148
|
+
- lib/scrapers/xkcd.rb
|
148
149
|
- scrapers.gemspec
|
149
150
|
- spec/scrapers/allrecipes_spec.rb
|
150
151
|
- spec/scrapers/discoverynews_spec.rb
|
@@ -152,6 +153,7 @@ files:
|
|
152
153
|
- spec/scrapers/gocomics_spec.rb
|
153
154
|
- spec/scrapers/imgur_spec.rb
|
154
155
|
- spec/scrapers/nasa_apod_spec.rb
|
156
|
+
- spec/scrapers/xkcd_spec.rb
|
155
157
|
- spec/scrapers_spec.rb
|
156
158
|
- spec/spec_helper.rb
|
157
159
|
- vcr_cassettes/allrecipes_morning-glory-muffins-i.yml
|
@@ -167,6 +169,7 @@ files:
|
|
167
169
|
- vcr_cassettes/shouldincludelink_cassette.yml
|
168
170
|
- vcr_cassettes/shouldincludepubDate_cassette.yml
|
169
171
|
- vcr_cassettes/shouldincludetitle_cassette.yml
|
172
|
+
- vcr_cassettes/xkcd.yml
|
170
173
|
homepage: http://github.com/tamouse/scrapers
|
171
174
|
licenses:
|
172
175
|
- MIT
|
@@ -198,5 +201,6 @@ test_files:
|
|
198
201
|
- spec/scrapers/gocomics_spec.rb
|
199
202
|
- spec/scrapers/imgur_spec.rb
|
200
203
|
- spec/scrapers/nasa_apod_spec.rb
|
204
|
+
- spec/scrapers/xkcd_spec.rb
|
201
205
|
- spec/scrapers_spec.rb
|
202
206
|
- spec/spec_helper.rb
|