scrapers 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0ed963b518db6e326e8f86caf549821942e95248
4
- data.tar.gz: fec6828d9b6b9923d0f595852accb4ba282454a0
3
+ metadata.gz: e5742674b339e7f902b7fdb75c0a95ac8ac5063d
4
+ data.tar.gz: 8bfeea7a843d8657ef54e6bc1c42cd935c5e2865
5
5
  SHA512:
6
- metadata.gz: 644b43272415ef5c765bd75d9e788564f861430b84914aa79e37f664f691eb1436f7b98934283521331d6411da51605af2c8937f9d4a1b856686badf454cf1e0
7
- data.tar.gz: e47dc6eadcd7c7ca05197cdff5e19c60010e760380264c1bf515f832fa814bc7bde29d7c02df980b239dae75507198dc9b6b27a40840ada8398d7f09065a2f5d
6
+ metadata.gz: adf3e62d1e53474a3e11bac3c239f1d0a21b69a1b4aa2fc277e32850913495248ca98424906dd7367bfd09b90aae6b81e9a7534ecf7a413a9b1c90484653f694
7
+ data.tar.gz: d68d1694e09f800d18de780432e6998a78627eb58aae10185dac0261d3ab1f866af8d8b4a8cb3845bdb6a2ea04aa4e23c20567c6645143d73fd6775902c5d50c
data/README.md CHANGED
@@ -2,8 +2,8 @@
2
2
 
3
3
  A library of web site scrapers utilizing mechanize and other goodies. Helpful in gathering images, moving things, saving things, etc.
4
4
 
5
- LICENSE:: MIT
6
- WEBSITE:: http://github.com/tamouse/scrapers
5
+ * LICENSE: MIT
6
+ * WEBSITE: http://github.com/tamouse/scrapers
7
7
 
8
8
  ## Installation
9
9
 
@@ -1,5 +1,5 @@
1
1
  module Scrapers
2
- VERSION = "0.3.0"
2
+ VERSION = "0.4.0"
3
3
  DESCRIPTION = "A library of web site scrapers utilizing mechanize and other goodies. Helpful in gathering images, moving things, saving things, etc."
4
4
  SUMMARY = "Web site scrapers"
5
5
  LICENSE = "MIT"
@@ -0,0 +1,23 @@
1
+ require 'open-uri'
2
+ require 'nokogiri'
3
+
4
module Scrapers
  # Scraper for the xkcd web comic.
  module Xkcd
    # Base address of the site. When no comic number is given, this exact
    # URL is the page that gets fetched.
    XKCD_URL = "http://xkcd.com"

    # Fetches one xkcd page and extracts the details of its comic image.
    #
    # @param comic [Integer, String, nil] comic number used to build the
    #   "/<comic>/" path; nil leaves XKCD_URL unchanged
    # @return [Hash] with the keys :url (page address that was fetched),
    #   :img_src (the image's "src"), :hover_text (its "title") and
    #   :title (its "alt")
    # @raise [RuntimeError] when the page contains no "#comic img" element
    def self.scrape(comic = nil)
      url = URI.parse(XKCD_URL)
      url.path = "/#{comic}/" unless comic.nil?

      # Open through the URI object (open-uri's OpenURI::OpenRead#open)
      # rather than Kernel#open: Kernel#open no longer handles URLs on
      # Ruby 3.0+ and treats strings starting with "|" as commands.
      # The block form closes the stream once the page has been parsed.
      doc = url.open { |page| Nokogiri::HTML(page) }

      # Kept in its own local instead of overwriting the +comic+ argument.
      image = doc.at_css("#comic img")
      raise "No comic image found at #{url}" if image.nil?

      {
        :url        => url.to_s,
        :img_src    => image["src"],
        :hover_text => image["title"],
        :title      => image["alt"]
      }
    end
  end
end
@@ -0,0 +1,27 @@
1
+ require 'spec_helper'
2
+
3
# Specs for Scrapers::Xkcd; the scrape runs inside the "xkcd" VCR cassette.
describe Scrapers::Xkcd do
  it { should respond_to(:scrape) }

  context "scraping" do
    before(:all) do
      # Values the scraper is expected to return for comic 149.
      @expected = {
        :title      => "Sandwich",
        :url        => "http://xkcd.com/149/",
        :img_src    => "http://imgs.xkcd.com/comics/sandwich.png",
        :hover_text => "Proper User Policy apparently means Simon Says."
      }
      # Scraped once for the whole context and shared by the examples below.
      @comic = VCR.use_cassette("xkcd") { Scrapers::Xkcd.scrape(149) }
    end

    it "should retrieve the comic" do
      @comic.should_not be_nil
    end

    it "should be a Hash" do
      @comic.should be_a(Hash)
    end

    it "should return expected" do
      @comic.should eq(@expected)
    end
  end
end
@@ -0,0 +1,205 @@
1
+ ---
2
+ http_interactions:
3
+ - request:
4
+ method: get
5
+ uri: http://xkcd.com/149
6
+ body:
7
+ encoding: US-ASCII
8
+ string: ''
9
+ headers:
10
+ Accept-Encoding:
11
+ - gzip;q=1.0,deflate;q=0.6,identity;q=0.3
12
+ Accept:
13
+ - '*/*'
14
+ User-Agent:
15
+ - Ruby
16
+ response:
17
+ status:
18
+ code: 301
19
+ message: Moved Permanently
20
+ headers:
21
+ Location:
22
+ - http://xkcd.com/149/
23
+ Content-Length:
24
+ - '0'
25
+ Date:
26
+ - Mon, 14 Oct 2013 05:13:16 GMT
27
+ Server:
28
+ - lighttpd/1.4.28
29
+ body:
30
+ encoding: UTF-8
31
+ string: ''
32
+ http_version:
33
+ recorded_at: Mon, 14 Oct 2013 05:13:17 GMT
34
+ - request:
35
+ method: get
36
+ uri: http://xkcd.com/149/
37
+ body:
38
+ encoding: US-ASCII
39
+ string: ''
40
+ headers:
41
+ Accept-Encoding:
42
+ - gzip;q=1.0,deflate;q=0.6,identity;q=0.3
43
+ Accept:
44
+ - '*/*'
45
+ User-Agent:
46
+ - Ruby
47
+ response:
48
+ status:
49
+ code: 200
50
+ message: OK
51
+ headers:
52
+ Vary:
53
+ - Accept-Encoding
54
+ Last-Modified:
55
+ - Mon, 14 Oct 2013 04:00:05 GMT
56
+ Etag:
57
+ - '"871292384"'
58
+ Content-Type:
59
+ - text/html; charset=utf-8
60
+ Accept-Ranges:
61
+ - bytes
62
+ Content-Length:
63
+ - '2685'
64
+ Date:
65
+ - Mon, 14 Oct 2013 05:13:17 GMT
66
+ Server:
67
+ - lighttpd/1.4.28
68
+ body:
69
+ encoding: UTF-8
70
+ string: |+
71
+ <?xml version="1.0" encoding="UTF-8" ?>
72
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
73
+ "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
74
+ <html version="-//W3C//DTD XHTML 1.1//EN" xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
75
+ <head>
76
+ <link rel="stylesheet" type="text/css" href="/s/d16ebb.css" title="Default"/>
77
+ <title>xkcd: Sandwich</title>
78
+ <meta http-equiv="X-UA-Compatible" content="IE=edge"/>
79
+ <link rel="shortcut icon" href="/s/919f27.ico" type="image/x-icon"/>
80
+ <link rel="icon" href="/s/919f27.ico" type="image/x-icon"/>
81
+ <link rel="alternate" type="application/atom+xml" title="Atom 1.0" href="/atom.xml"/>
82
+ <link rel="alternate" type="application/rss+xml" title="RSS 2.0" href="/rss.xml"/>
83
+ <link rel="apple-touch-icon-precomposed" href="/s/d9522a.png" />
84
+ <script type="text/javascript">
85
+ var _gaq = _gaq || [];
86
+ _gaq.push(['_setAccount', 'UA-25700708-7']);
87
+ _gaq.push(['_setDomainName', 'xkcd.com']);
88
+ _gaq.push(['_setAllowLinker', true]);
89
+ _gaq.push(['_trackPageview']);
90
+
91
+ (function() {
92
+ var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
93
+ ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
94
+ var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
95
+ })();
96
+ </script>
97
+
98
+ </head>
99
+ <body>
100
+ <div id="topContainer">
101
+ <div id="topLeft">
102
+ <ul>
103
+ <li><a href="/archive">Archive</a></li>
104
+ <li><a href="http://what-if.xkcd.com">What If?</a></li>
105
+ <li><a href="http://blag.xkcd.com">Blag</a></li>
106
+ <li><a href="http://store.xkcd.com/">Store</a></li>
107
+ <li><a rel="author" href="/about">About</a></li>
108
+ </ul>
109
+ </div>
110
+ <div id="topRight">
111
+ <div id="masthead">
112
+ <span><a href="/"><img src="http://imgs.xkcd.com/static/terrible_small_logo.png" alt="xkcd.com logo" height="83" width="185"/></a></span>
113
+ <span id="slogan">A webcomic of romance,<br/> sarcasm, math, and language.</span>
114
+ </div>
115
+ <div id="news">
116
+ You can get the Subways comic as a <a href="http://store-xkcd-com.myshopify.com/products/subways">poster</a>!
117
+ </div>
118
+ </div>
119
+ <div id="bgLeft" class="bg box"></div>
120
+ <div id="bgRight" class="bg box"></div>
121
+ </div>
122
+ <div id="middleContainer" class="box">
123
+
124
+ <div id="ctitle">Sandwich</div>
125
+ <ul class="comicNav">
126
+ <li><a href="/1/">|&lt;</a></li>
127
+ <li><a rel="prev" href="/148/" accesskey="p">&lt; Prev</a></li>
128
+ <li><a href="http://dynamic.xkcd.com/random/comic/">Random</a></li>
129
+ <li><a rel="next" href="/150/" accesskey="n">Next &gt;</a></li>
130
+ <li><a href="/">&gt;|</a></li>
131
+ </ul>
132
+ <div id="comic">
133
+ <img src="http://imgs.xkcd.com/comics/sandwich.png" title="Proper User Policy apparently means Simon Says." alt="Sandwich" />
134
+ </div>
135
+ <ul class="comicNav">
136
+ <li><a href="/1/">|&lt;</a></li>
137
+ <li><a rel="prev" href="/148/" accesskey="p">&lt; Prev</a></li>
138
+ <li><a href="http://dynamic.xkcd.com/random/comic/">Random</a></li>
139
+ <li><a rel="next" href="/150/" accesskey="n">Next &gt;</a></li>
140
+ <li><a href="/">&gt;|</a></li>
141
+ </ul>
142
+ <br />
143
+ Permanent link to this comic: http://xkcd.com/149/<br />
144
+ Image URL (for hotlinking/embedding): http://imgs.xkcd.com/comics/sandwich.png
145
+ <div id="transcript" style="display: none">[[ A man is sitting on a couch, talking to another man. They are both stick figures. ]]
146
+ First man: Make me a sandwich.
147
+ Second man: What? Make it yourself.
148
+ First man: Sudo make me a sandwich.
149
+ Second man: Okay</div>
150
+ </div>
151
+ <div id="bottom" class="box">
152
+ <img src="http://imgs.xkcd.com/s/a899e84.jpg" width="520" height="100" alt="Selected Comics" usemap="#comicmap"/>
153
+ <map id="comicmap" name="comicmap">
154
+ <!-- http://code.google.com/p/chromium/issues/detail?id=108489 Might be MIME dependent. -->
155
+ <area shape="rect" coords="0,0,100,100" href="/150/" alt="Grownups"/>
156
+ <area shape="rect" coords="104,0,204,100" href="/730/" alt="Circuit Diagram"/>
157
+ <area shape="rect" coords="208,0,308,100" href="/162/" alt="Angular Momentum"/>
158
+ <area shape="rect" coords="312,0,412,100" href="/688/" alt="Self-Description"/>
159
+ <area shape="rect" coords="416,0,520,100" href="/556/" alt="Alternative Energy Revolution"/>
160
+ </map>
161
+ <div>
162
+ Search comic titles and transcripts:
163
+ <script type="text/javascript" src="//www.google.com/jsapi"></script>
164
+ <script type="text/javascript">google.load('search', '1');google.setOnLoadCallback(function() {google.search.CustomSearchControl.attachAutoCompletion('012652707207066138651:zudjtuwe28q',document.getElementById('q'),'cse-search-box');});</script>
165
+ <form action="//www.google.com/cse" id="cse-search-box">
166
+ <div>
167
+ <input type="hidden" name="cx" value="012652707207066138651:zudjtuwe28q"/>
168
+ <input type="hidden" name="ie" value="UTF-8"/>
169
+ <input type="text" name="q" id="q" size="31"/>
170
+ <input type="submit" name="sa" value="Search"/>
171
+ </div>
172
+ </form>
173
+ <script type="text/javascript" src="//www.google.com/cse/brand?form=cse-search-box&amp;lang=en"></script>
174
+ <a href="/rss.xml">RSS Feed</a> - <a href="/atom.xml">Atom Feed</a>
175
+ </div>
176
+ <br />
177
+ <div id="comicLinks">
178
+ Comics I enjoy:<br/>
179
+ <a href="http://threewordphrase.com/">Three Word Phrase</a>,
180
+ <a href="http://oglaf.com/">Oglaf</a> (nsfw),
181
+ <a href="http://www.smbc-comics.com/">SMBC</a>,
182
+ <a href="http://www.qwantz.com">Dinosaur Comics</a>,
183
+ <a href="http://www.asofterworld.com">A Softer World</a>,
184
+ <a href="http://buttersafe.com/">Buttersafe</a>,
185
+ <a href="http://pbfcomics.com/">Perry Bible Fellowship</a>,
186
+ <a href="http://questionablecontent.net/">Questionable Content</a>,
187
+ <a href="http://www.buttercupfestival.com/">Buttercup Festival</a>
188
+ </div>
189
+ <p>Warning: this comic occasionally contains strong language (which may be unsuitable for children), unusual humor (which may be unsuitable for adults), and advanced mathematics (which may be unsuitable for liberal-arts majors).</p>
190
+ <div id="footnote">BTC 1NfBXWqseXc9rCBc3Cbbu6HjxYssFUgkH6<br />We did not invent the algorithm. The algorithm consistently finds Jesus. The algorithm killed Jeeves. <br/>The algorithm is banned in China. The algorithm is from Jersey. The algorithm constantly finds Jesus.<br/>This is not the algorithm. This is close.</div>
191
+ <div id="licenseText">
192
+ <p>
193
+ This work is licensed under a
194
+ <a href="http://creativecommons.org/licenses/by-nc/2.5/">Creative Commons Attribution-NonCommercial 2.5 License</a>.
195
+ </p><p>
196
+ This means you're free to copy and share these comics (but not to sell them). <a rel="license" href="/license.html">More details</a>.</p>
197
+ </div>
198
+ </div>
199
+ </body>
200
+ <!-- Layout by Ian Clasbey, davean, and chromakode -->
201
+ </html>
202
+
203
+ http_version:
204
+ recorded_at: Mon, 14 Oct 2013 05:13:17 GMT
205
+ recorded_with: VCR 2.5.0
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrapers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tamara Temple
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-09-29 00:00:00.000000000 Z
11
+ date: 2013-10-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -145,6 +145,7 @@ files:
145
145
  - lib/scrapers/imgur.rb
146
146
  - lib/scrapers/nasa_apod.rb
147
147
  - lib/scrapers/version.rb
148
+ - lib/scrapers/xkcd.rb
148
149
  - scrapers.gemspec
149
150
  - spec/scrapers/allrecipes_spec.rb
150
151
  - spec/scrapers/discoverynews_spec.rb
@@ -152,6 +153,7 @@ files:
152
153
  - spec/scrapers/gocomics_spec.rb
153
154
  - spec/scrapers/imgur_spec.rb
154
155
  - spec/scrapers/nasa_apod_spec.rb
156
+ - spec/scrapers/xkcd_spec.rb
155
157
  - spec/scrapers_spec.rb
156
158
  - spec/spec_helper.rb
157
159
  - vcr_cassettes/allrecipes_morning-glory-muffins-i.yml
@@ -167,6 +169,7 @@ files:
167
169
  - vcr_cassettes/shouldincludelink_cassette.yml
168
170
  - vcr_cassettes/shouldincludepubDate_cassette.yml
169
171
  - vcr_cassettes/shouldincludetitle_cassette.yml
172
+ - vcr_cassettes/xkcd.yml
170
173
  homepage: http://github.com/tamouse/scrapers
171
174
  licenses:
172
175
  - MIT
@@ -198,5 +201,6 @@ test_files:
198
201
  - spec/scrapers/gocomics_spec.rb
199
202
  - spec/scrapers/imgur_spec.rb
200
203
  - spec/scrapers/nasa_apod_spec.rb
204
+ - spec/scrapers/xkcd_spec.rb
201
205
  - spec/scrapers_spec.rb
202
206
  - spec/spec_helper.rb