scrapers 0.4.3 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/scrapers/download.rb +5 -3
- data/lib/scrapers/nasa_apod.rb +20 -22
- data/lib/scrapers/sinfest.rb +24 -0
- data/lib/scrapers/version.rb +10 -1
- data/spec/scrapers/download_spec.rb +45 -8
- data/spec/scrapers/sinfest_spec.rb +46 -0
- data/spec/spec_helper.rb +1 -1
- data/vcr_cassettes/allrecipes_morning-glory-muffins-i.yml +879 -851
- data/vcr_cassettes/disconews_history-of-space.yml +347 -340
- data/vcr_cassettes/download-newfile.yml +631 -0
- data/vcr_cassettes/download-overwrite.yml +631 -0
- data/vcr_cassettes/download_cassette.yml +7 -7
- data/vcr_cassettes/gocomics_nonsequitur.yml +907 -765
- data/vcr_cassettes/sinfest.yml +393 -0
- data/vcr_cassettes/xkcd.yml +8 -8
- metadata +8 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 960eadf205104f76a6be3e1bf820dd3bb401fae8
|
4
|
+
data.tar.gz: 56ec817fe56cbe1adc9f82060b5fe8d5b96222be
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fa018e7d9a1ca5faeebca9eb478832784fa3cb377b8b382f6a1d7c28299e1df31297eeeb70a0031748de0f1ce1b7e19d6348ddbeb853e73bda88f760b2db6ab3
|
7
|
+
data.tar.gz: 223214a05801135f0e5df20d60393104a865c9446cdcb40d810a08d764bb73202ba4902e642531a3ffb81470611fd20a4c581e7a6074c37dde97e3d49921b900
|
data/lib/scrapers/download.rb
CHANGED
@@ -16,12 +16,14 @@ module Scrapers
|
|
16
16
|
module Download
|
17
17
|
|
18
18
|
def self.download(url,dir=".",overwrite=false)
|
19
|
-
|
19
|
+
# need a new agent each time!
|
20
|
+
agent = Mechanize.new
|
21
|
+
agent.pluggable_parser.default = Mechanize::Download
|
20
22
|
@dir = validate_directory(dir)
|
21
|
-
dl =
|
23
|
+
dl = agent.get(url)
|
22
24
|
Dir.chdir(@dir) do |dir|
|
23
25
|
if overwrite
|
24
|
-
dl.save!()
|
26
|
+
dl.save!(dl.filename)
|
25
27
|
else
|
26
28
|
dl.save()
|
27
29
|
end
|
data/lib/scrapers/nasa_apod.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
nasa_apod.rb -- oneline desc
|
4
4
|
|
5
|
-
Time-stamp: <2013-
|
5
|
+
Time-stamp: <2013-10-15 00:17:15 tamara>
|
6
6
|
Copyright (C) 2013 Tamara Temple Web Development
|
7
7
|
Author: Tamara Temple <tamouse@gmail.com>
|
8
8
|
License: MIT
|
@@ -23,35 +23,33 @@ module Scrapers
|
|
23
23
|
|
24
24
|
module NasaApod
|
25
25
|
|
26
|
+
NASA_APOD_URL="http://asterisk.apod.com/library/APOD/APOD%20mirror/astropix.html"
|
27
|
+
|
26
28
|
module_function
|
27
29
|
|
28
|
-
def scrape(url)
|
30
|
+
def scrape(url=nil)
|
31
|
+
url ||= NASA_APOD_URL
|
29
32
|
apod = Hash.new
|
30
|
-
|
31
|
-
|
32
|
-
Mechanize.start do |m|
|
33
|
+
Mechanize.start do |m|
|
33
34
|
|
34
|
-
|
35
|
+
m.get url
|
35
36
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
end
|
50
|
-
|
37
|
+
# APOD has a funky entry page, but we want the actual page
|
38
|
+
prev = m.current_page.link_with(:text => '<').href
|
39
|
+
m.get prev
|
40
|
+
canonical = m.current_page.link_with(:text => '>' ).href
|
41
|
+
m.get canonical
|
42
|
+
|
43
|
+
m.current_page.tap do |page|
|
44
|
+
apod[:title] = page.title.strip
|
45
|
+
apod[:link] = page.uri.to_s
|
46
|
+
apod[:description] = (page/("body")).text
|
47
|
+
apod[:pubDate] = page.response['date'].to_s
|
48
|
+
apod[:guid] = page.uri.to_s
|
49
|
+
apod[:content_encoded] = (page/("body")).to_html
|
51
50
|
end
|
52
51
|
|
53
52
|
end
|
54
|
-
|
55
53
|
apod
|
56
54
|
end
|
57
55
|
|
data/lib/scrapers/sinfest.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
|
3
|
+
module Scrapers
|
4
|
+
module Sinfest
|
5
|
+
SINFEST_URL = "http://sinfest.net"
|
6
|
+
|
7
|
+
def self.scrape
|
8
|
+
results = Hash.new
|
9
|
+
results[:comic] = 'Sinfest'
|
10
|
+
results[:url] = SINFEST_URL
|
11
|
+
Mechanize.start do |agent|
|
12
|
+
agent.get SINFEST_URL
|
13
|
+
agent.current_page.image(src: %r{comikaze/comics}).tap do |comic|
|
14
|
+
results[:title] = comic.alt.to_s
|
15
|
+
results[:img_src] = comic.src.to_s
|
16
|
+
comicdate = Date.parse(File.basename(comic.src.to_s,'.gif'))
|
17
|
+
pubdate = Time.utc(comicdate.year,comicdate.month,comicdate.day)
|
18
|
+
results[:pubdate] = pubdate.to_s
|
19
|
+
end
|
20
|
+
end
|
21
|
+
results.tap{|t| $stderr.puts "DEBUG: #{caller(0,1).first} results #{t.inspect}"}
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
data/lib/scrapers/version.rb
CHANGED
@@ -1,5 +1,14 @@
|
|
1
1
|
module Scrapers
|
2
|
-
|
2
|
+
module Version
|
3
|
+
|
4
|
+
MAJOR = 1
|
5
|
+
MINOR = 0
|
6
|
+
BUILD = 0
|
7
|
+
|
8
|
+
end
|
9
|
+
|
10
|
+
VERSION = [Version::MAJOR,Version::MINOR,Version::BUILD].map(&:to_s).join(".")
|
11
|
+
|
3
12
|
DESCRIPTION = "A library of web site scrapers utilizing mechanize and other goodies. Helpful in gathering images, moving things, saving things, etc."
|
4
13
|
SUMMARY = "Web site scrapers"
|
5
14
|
LICENSE = "MIT"
|
data/spec/scrapers/download_spec.rb
CHANGED
@@ -12,25 +12,62 @@
|
|
12
12
|
require 'spec_helper'
|
13
13
|
require 'tmpdir'
|
14
14
|
|
15
|
+
def in_tmpdir
|
16
|
+
Dir.mktmpdir do |dir|
|
17
|
+
Dir.chdir(dir) do |dir|
|
18
|
+
yield dir
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
|
15
24
|
module Scrapers
|
16
25
|
|
17
26
|
describe Download do
|
27
|
+
|
18
28
|
it {Scrapers::Download.should respond_to :download}
|
19
29
|
|
20
|
-
|
21
|
-
|
30
|
+
it "should download and save the file" do
|
31
|
+
in_tmpdir do |dir|
|
22
32
|
@url="http://imgur.com/download/v70StgA/%2Asnrrrrrrrrrrrf%21%2A"
|
23
33
|
VCR.use_cassette("download.cassette") do
|
24
|
-
@file = Scrapers::Download.download(@url,
|
34
|
+
@file = Scrapers::Download.download(@url,dir)
|
25
35
|
end
|
26
|
-
end
|
27
|
-
|
28
|
-
it "saves the file" do
|
29
36
|
@file.should =~ /.*snrrrrrrrrrrrf.*Imgur\.jpg/
|
30
37
|
File.exist?(@file).should be_true
|
31
38
|
end
|
32
39
|
end
|
33
|
-
|
34
|
-
|
40
|
+
it "should overwrite file with second download" do
|
41
|
+
in_tmpdir do |dir|
|
42
|
+
@url="http://imgs.xkcd.com/comics/sandwich.png"
|
43
|
+
VCR.use_cassette("download-overwrite") do
|
44
|
+
@file1 = Scrapers::Download.download(@url,dir)
|
45
|
+
@file2 = Scrapers::Download.download(@url,dir,true)
|
46
|
+
end
|
47
|
+
@file1.should eq @file2
|
48
|
+
@file1.should eq File.join(dir,'sandwich.png')
|
49
|
+
File.exist?(@file1).should be_true
|
50
|
+
end
|
51
|
+
end
|
52
|
+
it "should make a new file on second download" do
|
53
|
+
in_tmpdir do |dir|
|
54
|
+
@url="http://imgs.xkcd.com/comics/sandwich.png"
|
55
|
+
VCR.use_cassette("download-newfile") do
|
56
|
+
@file1 = Scrapers::Download.download(@url,dir)
|
57
|
+
@file2 = Scrapers::Download.download(@url,dir)
|
58
|
+
end
|
35
59
|
|
60
|
+
# Filed issue against mechanize to make save return
|
61
|
+
# the actual file name saved under. Until that's fixed
|
62
|
+
# have to work around it.
|
63
|
+
@file2 += '.1'
|
64
|
+
|
65
|
+
@file1.should_not eq @file2
|
66
|
+
@file1.should eq File.join(dir,'sandwich.png')
|
67
|
+
File.exist?(@file1).should be_true
|
68
|
+
@file2.should eq File.join(dir,'sandwich.png.1')
|
69
|
+
File.exist?(@file2).should be_true
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
36
73
|
end
|
data/spec/scrapers/sinfest_spec.rb
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
module Scrapers
|
4
|
+
|
5
|
+
describe Sinfest do
|
6
|
+
it{should respond_to :scrape}
|
7
|
+
context "scraping" do
|
8
|
+
before(:all) do
|
9
|
+
@comic = VCR.use_cassette('sinfest') do
|
10
|
+
Scrapers::Sinfest.scrape
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
it "retrieves a comic" do
|
15
|
+
@comic.should_not be_nil
|
16
|
+
end
|
17
|
+
it "should be a Hash" do
|
18
|
+
@comic.should be_a(Hash)
|
19
|
+
end
|
20
|
+
%w{title url pubdate img_src}.map(&:to_sym).each do |key|
|
21
|
+
it "should have key #{key}" do
|
22
|
+
@comic.should have_key(key)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
context "title" do
|
26
|
+
it{@comic[:title].should_not be_empty}
|
27
|
+
it{@comic[:title].should match /Nails/}
|
28
|
+
end
|
29
|
+
context "url" do
|
30
|
+
it{@comic[:url].should_not be_empty}
|
31
|
+
it{@comic[:url].should match /http:\/\/sinfest.net/}
|
32
|
+
end
|
33
|
+
context "pubdate" do
|
34
|
+
it{@comic[:pubdate].should_not be_empty}
|
35
|
+
it{Date.parse(@comic[:pubdate]).should be_a(Date)}
|
36
|
+
end
|
37
|
+
context "img_src" do
|
38
|
+
it{@comic[:img_src].should_not be_empty}
|
39
|
+
it{URI.parse(@comic[:img_src]).should be_a(URI::HTTP)}
|
40
|
+
it{@comic[:img_src].should eq 'http://sinfest.net/comikaze/comics/2013-10-19.gif'}
|
41
|
+
end
|
42
|
+
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
data/spec/spec_helper.rb
CHANGED
@@ -1,7 +1,6 @@
|
|
1
1
|
#require 'webmock/rspec'
|
2
2
|
require 'vcr'
|
3
3
|
|
4
|
-
|
5
4
|
# This file was generated by the `rspec --init` command. Conventionally, all
|
6
5
|
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
7
6
|
# Require this file using `require "spec_helper"` to ensure that it is only
|
@@ -29,3 +28,4 @@ end
|
|
29
28
|
|
30
29
|
|
31
30
|
require 'scrapers.rb'
|
31
|
+
|