scrapers 0.4.3 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/scrapers/download.rb +5 -3
- data/lib/scrapers/nasa_apod.rb +20 -22
- data/lib/scrapers/sinfest.rb +24 -0
- data/lib/scrapers/version.rb +10 -1
- data/spec/scrapers/download_spec.rb +45 -8
- data/spec/scrapers/sinfest_spec.rb +46 -0
- data/spec/spec_helper.rb +1 -1
- data/vcr_cassettes/allrecipes_morning-glory-muffins-i.yml +879 -851
- data/vcr_cassettes/disconews_history-of-space.yml +347 -340
- data/vcr_cassettes/download-newfile.yml +631 -0
- data/vcr_cassettes/download-overwrite.yml +631 -0
- data/vcr_cassettes/download_cassette.yml +7 -7
- data/vcr_cassettes/gocomics_nonsequitur.yml +907 -765
- data/vcr_cassettes/sinfest.yml +393 -0
- data/vcr_cassettes/xkcd.yml +8 -8
- metadata +8 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 960eadf205104f76a6be3e1bf820dd3bb401fae8
|
4
|
+
data.tar.gz: 56ec817fe56cbe1adc9f82060b5fe8d5b96222be
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fa018e7d9a1ca5faeebca9eb478832784fa3cb377b8b382f6a1d7c28299e1df31297eeeb70a0031748de0f1ce1b7e19d6348ddbeb853e73bda88f760b2db6ab3
|
7
|
+
data.tar.gz: 223214a05801135f0e5df20d60393104a865c9446cdcb40d810a08d764bb73202ba4902e642531a3ffb81470611fd20a4c581e7a6074c37dde97e3d49921b900
|
data/lib/scrapers/download.rb
CHANGED
@@ -16,12 +16,14 @@ module Scrapers
|
|
16
16
|
module Download
|
17
17
|
|
18
18
|
def self.download(url,dir=".",overwrite=false)
|
19
|
-
|
19
|
+
# need a new agent each time!
|
20
|
+
agent = Mechanize.new
|
21
|
+
agent.pluggable_parser.default = Mechanize::Download
|
20
22
|
@dir = validate_directory(dir)
|
21
|
-
dl =
|
23
|
+
dl = agent.get(url)
|
22
24
|
Dir.chdir(@dir) do |dir|
|
23
25
|
if overwrite
|
24
|
-
dl.save!()
|
26
|
+
dl.save!(dl.filename)
|
25
27
|
else
|
26
28
|
dl.save()
|
27
29
|
end
|
data/lib/scrapers/nasa_apod.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
nasa_apod.rb -- oneline desc
|
4
4
|
|
5
|
-
Time-stamp: <2013-
|
5
|
+
Time-stamp: <2013-10-15 00:17:15 tamara>
|
6
6
|
Copyright (C) 2013 Tamara Temple Web Development
|
7
7
|
Author: Tamara Temple <tamouse@gmail.com>
|
8
8
|
License: MIT
|
@@ -23,35 +23,33 @@ module Scrapers
|
|
23
23
|
|
24
24
|
module NasaApod
|
25
25
|
|
26
|
+
NASA_APOD_URL="http://asterisk.apod.com/library/APOD/APOD%20mirror/astropix.html"
|
27
|
+
|
26
28
|
module_function
|
27
29
|
|
28
|
-
def scrape(url)
|
30
|
+
def scrape(url=nil)
|
31
|
+
url ||= NASA_APOD_URL
|
29
32
|
apod = Hash.new
|
30
|
-
|
31
|
-
|
32
|
-
Mechanize.start do |m|
|
33
|
+
Mechanize.start do |m|
|
33
34
|
|
34
|
-
|
35
|
+
m.get url
|
35
36
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
end
|
50
|
-
|
37
|
+
# APOD has a funky entry page, but we want the actual page
|
38
|
+
prev = m.current_page.link_with(:text => '<').href
|
39
|
+
m.get prev
|
40
|
+
canonical = m.current_page.link_with(:text => '>' ).href
|
41
|
+
m.get canonical
|
42
|
+
|
43
|
+
m.current_page.tap do |page|
|
44
|
+
apod[:title] = page.title.strip
|
45
|
+
apod[:link] = page.uri.to_s
|
46
|
+
apod[:description] = (page/("body")).text
|
47
|
+
apod[:pubDate] = page.response['date'].to_s
|
48
|
+
apod[:guid] = page.uri.to_s
|
49
|
+
apod[:content_encoded] = (page/("body")).to_html
|
51
50
|
end
|
52
51
|
|
53
52
|
end
|
54
|
-
|
55
53
|
apod
|
56
54
|
end
|
57
55
|
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
|
3
|
+
module Scrapers
|
4
|
+
module Sinfest
|
5
|
+
SINFEST_URL = "http://sinfest.net"
|
6
|
+
|
7
|
+
def self.scrape
|
8
|
+
results = Hash.new
|
9
|
+
results[:comic] = 'Sinfest'
|
10
|
+
results[:url] = SINFEST_URL
|
11
|
+
Mechanize.start do |agent|
|
12
|
+
agent.get SINFEST_URL
|
13
|
+
agent.current_page.image(src: %r{comikaze/comics}).tap do |comic|
|
14
|
+
results[:title] = comic.alt.to_s
|
15
|
+
results[:img_src] = comic.src.to_s
|
16
|
+
comicdate = Date.parse(File.basename(comic.src.to_s,'.gif'))
|
17
|
+
pubdate = Time.utc(comicdate.year,comicdate.month,comicdate.day)
|
18
|
+
results[:pubdate] = pubdate.to_s
|
19
|
+
end
|
20
|
+
end
|
21
|
+
results.tap{|t| $stderr.puts "DEBUG: #{caller(0,1).first} results #{t.inspect}"}
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
data/lib/scrapers/version.rb
CHANGED
@@ -1,5 +1,14 @@
|
|
1
1
|
module Scrapers
|
2
|
-
|
2
|
+
module Version
|
3
|
+
|
4
|
+
MAJOR = 1
|
5
|
+
MINOR = 0
|
6
|
+
BUILD = 0
|
7
|
+
|
8
|
+
end
|
9
|
+
|
10
|
+
VERSION = [Version::MAJOR,Version::MINOR,Version::BUILD].map(&:to_s).join(".")
|
11
|
+
|
3
12
|
DESCRIPTION = "A library of web site scrapers utilizing mechanize and other goodies. Helpful in gathering images, moving things, saving things, etc."
|
4
13
|
SUMMARY = "Web site scrapers"
|
5
14
|
LICENSE = "MIT"
|
@@ -12,25 +12,62 @@
|
|
12
12
|
require 'spec_helper'
|
13
13
|
require 'tmpdir'
|
14
14
|
|
15
|
+
def in_tmpdir
|
16
|
+
Dir.mktmpdir do |dir|
|
17
|
+
Dir.chdir(dir) do |dir|
|
18
|
+
yield dir
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
|
15
24
|
module Scrapers
|
16
25
|
|
17
26
|
describe Download do
|
27
|
+
|
18
28
|
it {Scrapers::Download.should respond_to :download}
|
19
29
|
|
20
|
-
|
21
|
-
|
30
|
+
it "should download and save the file" do
|
31
|
+
in_tmpdir do |dir|
|
22
32
|
@url="http://imgur.com/download/v70StgA/%2Asnrrrrrrrrrrrf%21%2A"
|
23
33
|
VCR.use_cassette("download.cassette") do
|
24
|
-
@file = Scrapers::Download.download(@url,
|
34
|
+
@file = Scrapers::Download.download(@url,dir)
|
25
35
|
end
|
26
|
-
end
|
27
|
-
|
28
|
-
it "saves the file" do
|
29
36
|
@file.should =~ /.*snrrrrrrrrrrrf.*Imgur\.jpg/
|
30
37
|
File.exist?(@file).should be_true
|
31
38
|
end
|
32
39
|
end
|
33
|
-
|
34
|
-
|
40
|
+
it "should overwrite file with second download" do
|
41
|
+
in_tmpdir do |dir|
|
42
|
+
@url="http://imgs.xkcd.com/comics/sandwich.png"
|
43
|
+
VCR.use_cassette("download-overwrite") do
|
44
|
+
@file1 = Scrapers::Download.download(@url,dir)
|
45
|
+
@file2 = Scrapers::Download.download(@url,dir,true)
|
46
|
+
end
|
47
|
+
@file1.should eq @file2
|
48
|
+
@file1.should eq File.join(dir,'sandwich.png')
|
49
|
+
File.exist?(@file1).should be_true
|
50
|
+
end
|
51
|
+
end
|
52
|
+
it "should make a new file on second download" do
|
53
|
+
in_tmpdir do |dir|
|
54
|
+
@url="http://imgs.xkcd.com/comics/sandwich.png"
|
55
|
+
VCR.use_cassette("download-newfile") do
|
56
|
+
@file1 = Scrapers::Download.download(@url,dir)
|
57
|
+
@file2 = Scrapers::Download.download(@url,dir)
|
58
|
+
end
|
35
59
|
|
60
|
+
# Filed issue against mechanise to make save return
|
61
|
+
# the actual file name saved under. Until that's fixed
|
62
|
+
# have to work around it.
|
63
|
+
@file2 += '.1'
|
64
|
+
|
65
|
+
@file1.should_not eq @file2
|
66
|
+
@file1.should eq File.join(dir,'sandwich.png')
|
67
|
+
File.exist?(@file1).should be_true
|
68
|
+
@file2.should eq File.join(dir,'sandwich.png.1')
|
69
|
+
File.exist?(@file2).should be_true
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
36
73
|
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
module Scrapers
|
4
|
+
|
5
|
+
describe Sinfest do
|
6
|
+
it{should respond_to :scrape}
|
7
|
+
context "scraping" do
|
8
|
+
before(:all) do
|
9
|
+
@comic = VCR.use_cassette('sinfest') do
|
10
|
+
Scrapers::Sinfest.scrape
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
it "retrieves a comic" do
|
15
|
+
@comic.should_not be_nil
|
16
|
+
end
|
17
|
+
it "should be a Hash" do
|
18
|
+
@comic.should be_a(Hash)
|
19
|
+
end
|
20
|
+
%w{title url pubdate img_src}.map(&:to_sym).each do |key|
|
21
|
+
it "should have key #{key}" do
|
22
|
+
@comic.should have_key(key)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
context "title" do
|
26
|
+
it{@comic[:title].should_not be_empty}
|
27
|
+
it{@comic[:title].should match /Nails/}
|
28
|
+
end
|
29
|
+
context "url" do
|
30
|
+
it{@comic[:url].should_not be_empty}
|
31
|
+
it{@comic[:url].should match /http:\/\/sinfest.net/}
|
32
|
+
end
|
33
|
+
context "pubdate" do
|
34
|
+
it{@comic[:pubdate].should_not be_empty}
|
35
|
+
it{Date.parse(@comic[:pubdate]).should be_a(Date)}
|
36
|
+
end
|
37
|
+
context "img_src" do
|
38
|
+
it{@comic[:img_src].should_not be_empty}
|
39
|
+
it{URI.parse(@comic[:img_src]).should be_a(URI::HTTP)}
|
40
|
+
it{@comic[:img_src].should eq 'http://sinfest.net/comikaze/comics/2013-10-19.gif'}
|
41
|
+
end
|
42
|
+
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
data/spec/spec_helper.rb
CHANGED
@@ -1,7 +1,6 @@
|
|
1
1
|
#require 'webmock/rspec'
|
2
2
|
require 'vcr'
|
3
3
|
|
4
|
-
|
5
4
|
# This file was generated by the `rspec --init` command. Conventionally, all
|
6
5
|
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
7
6
|
# Require this file using `require "spec_helper"` to ensure that it is only
|
@@ -29,3 +28,4 @@ end
|
|
29
28
|
|
30
29
|
|
31
30
|
require 'scrapers.rb'
|
31
|
+
|