scrapers 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +2 -2
- data/lib/scrapers/version.rb +1 -1
- data/lib/scrapers/xkcd.rb +23 -0
- data/spec/scrapers/xkcd_spec.rb +27 -0
- data/vcr_cassettes/xkcd.yml +205 -0
- metadata +6 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e5742674b339e7f902b7fdb75c0a95ac8ac5063d
|
4
|
+
data.tar.gz: 8bfeea7a843d8657ef54e6bc1c42cd935c5e2865
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: adf3e62d1e53474a3e11bac3c239f1d0a21b69a1b4aa2fc277e32850913495248ca98424906dd7367bfd09b90aae6b81e9a7534ecf7a413a9b1c90484653f694
|
7
|
+
data.tar.gz: d68d1694e09f800d18de780432e6998a78627eb58aae10185dac0261d3ab1f866af8d8b4a8cb3845bdb6a2ea04aa4e23c20567c6645143d73fd6775902c5d50c
|
data/README.md
CHANGED
@@ -2,8 +2,8 @@
|
|
2
2
|
|
3
3
|
A library of web site scrapers utilizing mechanize and other goodies. Helpful in gathering images, moving things, saving things, etc.
|
4
4
|
|
5
|
-
LICENSE
|
6
|
-
WEBSITE
|
5
|
+
* LICENSE: MIT
|
6
|
+
* WEBSITE: http://github.com/tamouse/scrapers
|
7
7
|
|
8
8
|
## Installation
|
9
9
|
|
data/lib/scrapers/version.rb
CHANGED
data/lib/scrapers/xkcd.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'open-uri'
|
2
|
+
require 'nokogiri'
|
3
|
+
|
4
|
+
module Scrapers
|
5
|
+
module Xkcd
|
6
|
+
XKCD_URL = "http://xkcd.com"
|
7
|
+
def self.scrape(comic=nil)
|
8
|
+
results = Hash.new
|
9
|
+
|
10
|
+
url = URI.parse XKCD_URL
|
11
|
+
url.path = "/#{comic}/" unless comic.nil?
|
12
|
+
results[:url] = url.to_s
|
13
|
+
doc = Nokogiri::HTML(open(url.to_s))
|
14
|
+
comic = doc.at_css("#comic img")
|
15
|
+
results[:img_src] = comic.attr("src")
|
16
|
+
results[:hover_text] = comic.attr("title")
|
17
|
+
results[:title] = comic.attr("alt")
|
18
|
+
|
19
|
+
results
|
20
|
+
end
|
21
|
+
|
22
|
+
end
|
23
|
+
end
|
data/spec/scrapers/xkcd_spec.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Scrapers::Xkcd do
|
4
|
+
it {should respond_to :scrape}
|
5
|
+
context "scraping" do
|
6
|
+
before(:all) do
|
7
|
+
@comic = VCR.use_cassette("xkcd") do
|
8
|
+
Scrapers::Xkcd.scrape 149
|
9
|
+
end
|
10
|
+
@expected =
|
11
|
+
{:title => "Sandwich",
|
12
|
+
:url => "http://xkcd.com/149/",
|
13
|
+
:img_src => "http://imgs.xkcd.com/comics/sandwich.png",
|
14
|
+
:hover_text => "Proper User Policy apparently means Simon Says."
|
15
|
+
}
|
16
|
+
end
|
17
|
+
it "should retrieve the comic" do
|
18
|
+
@comic.should_not be_nil
|
19
|
+
end
|
20
|
+
it "should be a Hash" do
|
21
|
+
@comic.should be_a(Hash)
|
22
|
+
end
|
23
|
+
it "should return expected" do
|
24
|
+
@comic.should eq @expected
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
data/vcr_cassettes/xkcd.yml
ADDED
@@ -0,0 +1,205 @@
|
|
1
|
+
---
|
2
|
+
http_interactions:
|
3
|
+
- request:
|
4
|
+
method: get
|
5
|
+
uri: http://xkcd.com/149
|
6
|
+
body:
|
7
|
+
encoding: US-ASCII
|
8
|
+
string: ''
|
9
|
+
headers:
|
10
|
+
Accept-Encoding:
|
11
|
+
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
|
12
|
+
Accept:
|
13
|
+
- '*/*'
|
14
|
+
User-Agent:
|
15
|
+
- Ruby
|
16
|
+
response:
|
17
|
+
status:
|
18
|
+
code: 301
|
19
|
+
message: Moved Permanently
|
20
|
+
headers:
|
21
|
+
Location:
|
22
|
+
- http://xkcd.com/149/
|
23
|
+
Content-Length:
|
24
|
+
- '0'
|
25
|
+
Date:
|
26
|
+
- Mon, 14 Oct 2013 05:13:16 GMT
|
27
|
+
Server:
|
28
|
+
- lighttpd/1.4.28
|
29
|
+
body:
|
30
|
+
encoding: UTF-8
|
31
|
+
string: ''
|
32
|
+
http_version:
|
33
|
+
recorded_at: Mon, 14 Oct 2013 05:13:17 GMT
|
34
|
+
- request:
|
35
|
+
method: get
|
36
|
+
uri: http://xkcd.com/149/
|
37
|
+
body:
|
38
|
+
encoding: US-ASCII
|
39
|
+
string: ''
|
40
|
+
headers:
|
41
|
+
Accept-Encoding:
|
42
|
+
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
|
43
|
+
Accept:
|
44
|
+
- '*/*'
|
45
|
+
User-Agent:
|
46
|
+
- Ruby
|
47
|
+
response:
|
48
|
+
status:
|
49
|
+
code: 200
|
50
|
+
message: OK
|
51
|
+
headers:
|
52
|
+
Vary:
|
53
|
+
- Accept-Encoding
|
54
|
+
Last-Modified:
|
55
|
+
- Mon, 14 Oct 2013 04:00:05 GMT
|
56
|
+
Etag:
|
57
|
+
- '"871292384"'
|
58
|
+
Content-Type:
|
59
|
+
- text/html; charset=utf-8
|
60
|
+
Accept-Ranges:
|
61
|
+
- bytes
|
62
|
+
Content-Length:
|
63
|
+
- '2685'
|
64
|
+
Date:
|
65
|
+
- Mon, 14 Oct 2013 05:13:17 GMT
|
66
|
+
Server:
|
67
|
+
- lighttpd/1.4.28
|
68
|
+
body:
|
69
|
+
encoding: UTF-8
|
70
|
+
string: |+
|
71
|
+
<?xml version="1.0" encoding="UTF-8" ?>
|
72
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
|
73
|
+
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
74
|
+
<html version="-//W3C//DTD XHTML 1.1//EN" xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
|
75
|
+
<head>
|
76
|
+
<link rel="stylesheet" type="text/css" href="/s/d16ebb.css" title="Default"/>
|
77
|
+
<title>xkcd: Sandwich</title>
|
78
|
+
<meta http-equiv="X-UA-Compatible" content="IE=edge"/>
|
79
|
+
<link rel="shortcut icon" href="/s/919f27.ico" type="image/x-icon"/>
|
80
|
+
<link rel="icon" href="/s/919f27.ico" type="image/x-icon"/>
|
81
|
+
<link rel="alternate" type="application/atom+xml" title="Atom 1.0" href="/atom.xml"/>
|
82
|
+
<link rel="alternate" type="application/rss+xml" title="RSS 2.0" href="/rss.xml"/>
|
83
|
+
<link rel="apple-touch-icon-precomposed" href="/s/d9522a.png" />
|
84
|
+
<script type="text/javascript">
|
85
|
+
var _gaq = _gaq || [];
|
86
|
+
_gaq.push(['_setAccount', 'UA-25700708-7']);
|
87
|
+
_gaq.push(['_setDomainName', 'xkcd.com']);
|
88
|
+
_gaq.push(['_setAllowLinker', true]);
|
89
|
+
_gaq.push(['_trackPageview']);
|
90
|
+
|
91
|
+
(function() {
|
92
|
+
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
|
93
|
+
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
|
94
|
+
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
|
95
|
+
})();
|
96
|
+
</script>
|
97
|
+
|
98
|
+
</head>
|
99
|
+
<body>
|
100
|
+
<div id="topContainer">
|
101
|
+
<div id="topLeft">
|
102
|
+
<ul>
|
103
|
+
<li><a href="/archive">Archive</a></li>
|
104
|
+
<li><a href="http://what-if.xkcd.com">What If?</a></li>
|
105
|
+
<li><a href="http://blag.xkcd.com">Blag</a></li>
|
106
|
+
<li><a href="http://store.xkcd.com/">Store</a></li>
|
107
|
+
<li><a rel="author" href="/about">About</a></li>
|
108
|
+
</ul>
|
109
|
+
</div>
|
110
|
+
<div id="topRight">
|
111
|
+
<div id="masthead">
|
112
|
+
<span><a href="/"><img src="http://imgs.xkcd.com/static/terrible_small_logo.png" alt="xkcd.com logo" height="83" width="185"/></a></span>
|
113
|
+
<span id="slogan">A webcomic of romance,<br/> sarcasm, math, and language.</span>
|
114
|
+
</div>
|
115
|
+
<div id="news">
|
116
|
+
You can get the Subways comic as a <a href="http://store-xkcd-com.myshopify.com/products/subways">poster</a>!
|
117
|
+
</div>
|
118
|
+
</div>
|
119
|
+
<div id="bgLeft" class="bg box"></div>
|
120
|
+
<div id="bgRight" class="bg box"></div>
|
121
|
+
</div>
|
122
|
+
<div id="middleContainer" class="box">
|
123
|
+
|
124
|
+
<div id="ctitle">Sandwich</div>
|
125
|
+
<ul class="comicNav">
|
126
|
+
<li><a href="/1/">|<</a></li>
|
127
|
+
<li><a rel="prev" href="/148/" accesskey="p">< Prev</a></li>
|
128
|
+
<li><a href="http://dynamic.xkcd.com/random/comic/">Random</a></li>
|
129
|
+
<li><a rel="next" href="/150/" accesskey="n">Next ></a></li>
|
130
|
+
<li><a href="/">>|</a></li>
|
131
|
+
</ul>
|
132
|
+
<div id="comic">
|
133
|
+
<img src="http://imgs.xkcd.com/comics/sandwich.png" title="Proper User Policy apparently means Simon Says." alt="Sandwich" />
|
134
|
+
</div>
|
135
|
+
<ul class="comicNav">
|
136
|
+
<li><a href="/1/">|<</a></li>
|
137
|
+
<li><a rel="prev" href="/148/" accesskey="p">< Prev</a></li>
|
138
|
+
<li><a href="http://dynamic.xkcd.com/random/comic/">Random</a></li>
|
139
|
+
<li><a rel="next" href="/150/" accesskey="n">Next ></a></li>
|
140
|
+
<li><a href="/">>|</a></li>
|
141
|
+
</ul>
|
142
|
+
<br />
|
143
|
+
Permanent link to this comic: http://xkcd.com/149/<br />
|
144
|
+
Image URL (for hotlinking/embedding): http://imgs.xkcd.com/comics/sandwich.png
|
145
|
+
<div id="transcript" style="display: none">[[ A man is sitting on a couch, talking to another man. They are both stick figures. ]]
|
146
|
+
First man: Make me a sandwich.
|
147
|
+
Second man: What? Make it yourself.
|
148
|
+
First man: Sudo make me a sandwich.
|
149
|
+
Second man: Okay</div>
|
150
|
+
</div>
|
151
|
+
<div id="bottom" class="box">
|
152
|
+
<img src="http://imgs.xkcd.com/s/a899e84.jpg" width="520" height="100" alt="Selected Comics" usemap="#comicmap"/>
|
153
|
+
<map id="comicmap" name="comicmap">
|
154
|
+
<!-- http://code.google.com/p/chromium/issues/detail?id=108489 Might be MIME dependent. -->
|
155
|
+
<area shape="rect" coords="0,0,100,100" href="/150/" alt="Grownups"/>
|
156
|
+
<area shape="rect" coords="104,0,204,100" href="/730/" alt="Circuit Diagram"/>
|
157
|
+
<area shape="rect" coords="208,0,308,100" href="/162/" alt="Angular Momentum"/>
|
158
|
+
<area shape="rect" coords="312,0,412,100" href="/688/" alt="Self-Description"/>
|
159
|
+
<area shape="rect" coords="416,0,520,100" href="/556/" alt="Alternative Energy Revolution"/>
|
160
|
+
</map>
|
161
|
+
<div>
|
162
|
+
Search comic titles and transcripts:
|
163
|
+
<script type="text/javascript" src="//www.google.com/jsapi"></script>
|
164
|
+
<script type="text/javascript">google.load('search', '1');google.setOnLoadCallback(function() {google.search.CustomSearchControl.attachAutoCompletion('012652707207066138651:zudjtuwe28q',document.getElementById('q'),'cse-search-box');});</script>
|
165
|
+
<form action="//www.google.com/cse" id="cse-search-box">
|
166
|
+
<div>
|
167
|
+
<input type="hidden" name="cx" value="012652707207066138651:zudjtuwe28q"/>
|
168
|
+
<input type="hidden" name="ie" value="UTF-8"/>
|
169
|
+
<input type="text" name="q" id="q" size="31"/>
|
170
|
+
<input type="submit" name="sa" value="Search"/>
|
171
|
+
</div>
|
172
|
+
</form>
|
173
|
+
<script type="text/javascript" src="//www.google.com/cse/brand?form=cse-search-box&lang=en"></script>
|
174
|
+
<a href="/rss.xml">RSS Feed</a> - <a href="/atom.xml">Atom Feed</a>
|
175
|
+
</div>
|
176
|
+
<br />
|
177
|
+
<div id="comicLinks">
|
178
|
+
Comics I enjoy:<br/>
|
179
|
+
<a href="http://threewordphrase.com/">Three Word Phrase</a>,
|
180
|
+
<a href="http://oglaf.com/">Oglaf</a> (nsfw),
|
181
|
+
<a href="http://www.smbc-comics.com/">SMBC</a>,
|
182
|
+
<a href="http://www.qwantz.com">Dinosaur Comics</a>,
|
183
|
+
<a href="http://www.asofterworld.com">A Softer World</a>,
|
184
|
+
<a href="http://buttersafe.com/">Buttersafe</a>,
|
185
|
+
<a href="http://pbfcomics.com/">Perry Bible Fellowship</a>,
|
186
|
+
<a href="http://questionablecontent.net/">Questionable Content</a>,
|
187
|
+
<a href="http://www.buttercupfestival.com/">Buttercup Festival</a>
|
188
|
+
</div>
|
189
|
+
<p>Warning: this comic occasionally contains strong language (which may be unsuitable for children), unusual humor (which may be unsuitable for adults), and advanced mathematics (which may be unsuitable for liberal-arts majors).</p>
|
190
|
+
<div id="footnote">BTC 1NfBXWqseXc9rCBc3Cbbu6HjxYssFUgkH6<br />We did not invent the algorithm. The algorithm consistently finds Jesus. The algorithm killed Jeeves. <br/>The algorithm is banned in China. The algorithm is from Jersey. The algorithm constantly finds Jesus.<br/>This is not the algorithm. This is close.</div>
|
191
|
+
<div id="licenseText">
|
192
|
+
<p>
|
193
|
+
This work is licensed under a
|
194
|
+
<a href="http://creativecommons.org/licenses/by-nc/2.5/">Creative Commons Attribution-NonCommercial 2.5 License</a>.
|
195
|
+
</p><p>
|
196
|
+
This means you're free to copy and share these comics (but not to sell them). <a rel="license" href="/license.html">More details</a>.</p>
|
197
|
+
</div>
|
198
|
+
</div>
|
199
|
+
</body>
|
200
|
+
<!-- Layout by Ian Clasbey, davean, and chromakode -->
|
201
|
+
</html>
|
202
|
+
|
203
|
+
http_version:
|
204
|
+
recorded_at: Mon, 14 Oct 2013 05:13:17 GMT
|
205
|
+
recorded_with: VCR 2.5.0
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrapers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tamara Temple
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-10-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -145,6 +145,7 @@ files:
|
|
145
145
|
- lib/scrapers/imgur.rb
|
146
146
|
- lib/scrapers/nasa_apod.rb
|
147
147
|
- lib/scrapers/version.rb
|
148
|
+
- lib/scrapers/xkcd.rb
|
148
149
|
- scrapers.gemspec
|
149
150
|
- spec/scrapers/allrecipes_spec.rb
|
150
151
|
- spec/scrapers/discoverynews_spec.rb
|
@@ -152,6 +153,7 @@ files:
|
|
152
153
|
- spec/scrapers/gocomics_spec.rb
|
153
154
|
- spec/scrapers/imgur_spec.rb
|
154
155
|
- spec/scrapers/nasa_apod_spec.rb
|
156
|
+
- spec/scrapers/xkcd_spec.rb
|
155
157
|
- spec/scrapers_spec.rb
|
156
158
|
- spec/spec_helper.rb
|
157
159
|
- vcr_cassettes/allrecipes_morning-glory-muffins-i.yml
|
@@ -167,6 +169,7 @@ files:
|
|
167
169
|
- vcr_cassettes/shouldincludelink_cassette.yml
|
168
170
|
- vcr_cassettes/shouldincludepubDate_cassette.yml
|
169
171
|
- vcr_cassettes/shouldincludetitle_cassette.yml
|
172
|
+
- vcr_cassettes/xkcd.yml
|
170
173
|
homepage: http://github.com/tamouse/scrapers
|
171
174
|
licenses:
|
172
175
|
- MIT
|
@@ -198,5 +201,6 @@ test_files:
|
|
198
201
|
- spec/scrapers/gocomics_spec.rb
|
199
202
|
- spec/scrapers/imgur_spec.rb
|
200
203
|
- spec/scrapers/nasa_apod_spec.rb
|
204
|
+
- spec/scrapers/xkcd_spec.rb
|
201
205
|
- spec/scrapers_spec.rb
|
202
206
|
- spec/spec_helper.rb
|