scrapers 0.4.3 → 1.0.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
-   metadata.gz: 155c10cf4c011c7b4eed63949eebe2839dabcc56
-   data.tar.gz: 54794bf2c19e302ecfcf44bce9317eb5dc17b97e
+   metadata.gz: 960eadf205104f76a6be3e1bf820dd3bb401fae8
+   data.tar.gz: 56ec817fe56cbe1adc9f82060b5fe8d5b96222be
  SHA512:
-   metadata.gz: 1cf0bae23ef4a72e1057e801663cb038e876ebfdcedbcb113c97973773e542905e10c806cbffb30286fb1528d725346b7662e8033ce02efa097ba2650cded5b2
-   data.tar.gz: 58eef7cdad244caceae175958d4bffdffa520b622b828acbad6362db255ee6ecee61a97fac190e3e14cafad5d30e5416e48ab3cb15225af10495e0c5189b607e
+   metadata.gz: fa018e7d9a1ca5faeebca9eb478832784fa3cb377b8b382f6a1d7c28299e1df31297eeeb70a0031748de0f1ce1b7e19d6348ddbeb853e73bda88f760b2db6ab3
+   data.tar.gz: 223214a05801135f0e5df20d60393104a865c9446cdcb40d810a08d764bb73202ba4902e642531a3ffb81470611fd20a4c581e7a6074c37dde97e3d49921b900
@@ -16,12 +16,14 @@ module Scrapers
    module Download

      def self.download(url,dir=".",overwrite=false)
-       Scrapers.agent.pluggable_parser.default = Mechanize::Download
+       # need a new agent each time!
+       agent = Mechanize.new
+       agent.pluggable_parser.default = Mechanize::Download
        @dir = validate_directory(dir)
-       dl = Scrapers.agent.get(url)
+       dl = agent.get(url)
        Dir.chdir(@dir) do |dir|
          if overwrite
-           dl.save!()
+           dl.save!(dl.filename)
          else
            dl.save()
          end
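
The Scrapers::Download change above builds a fresh Mechanize agent on every call, so the Mechanize::Download pluggable parser no longer leaks onto a shared agent, and save! now receives an explicit filename. A minimal usage sketch, assuming the gem is installed; the URL and target directory are illustrative, not from the gem:

    require 'scrapers'

    # Fetch into ./images; the third argument forces an overwrite of an
    # existing file instead of letting Mechanize append a numeric suffix.
    file = Scrapers::Download.download("http://example.com/pic.jpg", "images", true)
    puts "saved to #{file}"
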
@@ -2,7 +2,7 @@
 
  nasa_apod.rb -- oneline desc
 
- Time-stamp: <2013-08-23 22:47:58 tamara>
+ Time-stamp: <2013-10-15 00:17:15 tamara>
  Copyright (C) 2013 Tamara Temple Web Development
  Author: Tamara Temple <tamouse@gmail.com>
  License: MIT
@@ -23,35 +23,33 @@ module Scrapers
 
    module NasaApod
 
+     NASA_APOD_URL="http://asterisk.apod.com/library/APOD/APOD%20mirror/astropix.html"
+
      module_function
 
-     def scrape(url)
+     def scrape(url=nil)
+       url ||= NASA_APOD_URL
        apod = Hash.new
-       unless url.nil?
-
-         Mechanize.start do |m|
+       Mechanize.start do |m|
 
-           m.get url
+         m.get url
 
-           # APOD has a funky entry page, but we want the actual page
-           prev = m.current_page.link_with(:text => '<').href
-           m.get prev
-           canonical = m.current_page.link_with(:text => '>' ).href
-           m.get canonical
-
-           m.current_page.tap do |page|
-             apod[:title] = page.title.strip
-             apod[:link] = page.uri.to_s
-             apod[:description] = (page/("body")).text
-             apod[:pubDate] = page.response['date'].to_s
-             apod[:guid] = page.uri.to_s
-             apod[:content_encoded] = (page/("body")).to_html
-           end
-
+         # APOD has a funky entry page, but we want the actual page
+         prev = m.current_page.link_with(:text => '<').href
+         m.get prev
+         canonical = m.current_page.link_with(:text => '>' ).href
+         m.get canonical
+
+         m.current_page.tap do |page|
+           apod[:title] = page.title.strip
+           apod[:link] = page.uri.to_s
+           apod[:description] = (page/("body")).text
+           apod[:pubDate] = page.response['date'].to_s
+           apod[:guid] = page.uri.to_s
+           apod[:content_encoded] = (page/("body")).to_html
          end
 
        end
-
        apod
      end
 
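
With the new NASA_APOD_URL default, scrape can now be called with no arguments; passing a URL still overrides it. A quick sketch of both call styles, with field access taken from the keys the scraper populates above:

    require 'scrapers'

    apod = Scrapers::NasaApod.scrape                                     # uses NASA_APOD_URL
    apod = Scrapers::NasaApod.scrape(Scrapers::NasaApod::NASA_APOD_URL)  # explicit URL
    puts apod[:title]   # page title, stripped
    puts apod[:link]    # canonical page URI
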
@@ -0,0 +1,24 @@
+ require 'mechanize'
+
+ module Scrapers
+   module Sinfest
+     SINFEST_URL = "http://sinfest.net"
+
+     def self.scrape
+       results = Hash.new
+       results[:comic] = 'Sinfest'
+       results[:url] = SINFEST_URL
+       Mechanize.start do |agent|
+         agent.get SINFEST_URL
+         agent.current_page.image(src: %r{comikaze/comics}).tap do |comic|
+           results[:title] = comic.alt.to_s
+           results[:img_src] = comic.src.to_s
+           comicdate = Date.parse(File.basename(comic.src.to_s,'.gif'))
+           pubdate = Time.utc(comicdate.year,comicdate.month,comicdate.day)
+           results[:pubdate] = pubdate.to_s
+         end
+       end
+       results.tap{|t| $stderr.puts "DEBUG: #{caller(0,1).first} results #{t.inspect}"}
+     end
+   end
+ end
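
The new Scrapers::Sinfest module locates the day's strip by matching the page's comic image src against comikaze/comics and derives the publication date from the GIF's date-stamped filename. A minimal sketch of calling it, with the return keys taken from the code above:

    require 'scrapers'

    comic = Scrapers::Sinfest.scrape
    comic[:title]    # the image's alt text
    comic[:img_src]  # absolute URL of the strip's GIF
    comic[:pubdate]  # UTC time parsed from the filename, as a string

Note that scrape also prints a DEBUG line to $stderr on every call; a caller wanting quiet output would need to remove that tap.
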
@@ -1,5 +1,14 @@
  module Scrapers
-   VERSION = "0.4.3"
+   module Version
+
+     MAJOR = 1
+     MINOR = 0
+     BUILD = 0
+
+   end
+
+   VERSION = [Version::MAJOR,Version::MINOR,Version::BUILD].map(&:to_s).join(".")
+
    DESCRIPTION = "A library of web site scrapers utilizing mechanize and other goodies. Helpful in gathering images, moving things, saving things, etc."
    SUMMARY = "Web site scrapers"
    LICENSE = "MIT"
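
The version string is now assembled from MAJOR/MINOR/BUILD parts instead of a literal. How the new constant resolves, using the values in the diff above:

    [1, 0, 0].map(&:to_s).join(".")  #=> "1.0.0"
    Scrapers::VERSION                #=> "1.0.0"
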
@@ -12,25 +12,62 @@
    require 'spec_helper'
    require 'tmpdir'
 
+   def in_tmpdir
+     Dir.mktmpdir do |dir|
+       Dir.chdir(dir) do |dir|
+         yield dir
+       end
+     end
+   end
+
+
    module Scrapers
 
      describe Download do
+
        it {Scrapers::Download.should respond_to :download}
 
-       context "download" do
-         before(:all) do
+       it "should download and save the file" do
+         in_tmpdir do |dir|
            @url="http://imgur.com/download/v70StgA/%2Asnrrrrrrrrrrrf%21%2A"
            VCR.use_cassette("download.cassette") do
-             @file = Scrapers::Download.download(@url,'tmp')
+             @file = Scrapers::Download.download(@url,dir)
            end
-         end
-
-         it "saves the file" do
            @file.should =~ /.*snrrrrrrrrrrrf.*Imgur\.jpg/
            File.exist?(@file).should be_true
          end
        end
-
-     end
+       it "should overwrite file with second download" do
+         in_tmpdir do |dir|
+           @url="http://imgs.xkcd.com/comics/sandwich.png"
+           VCR.use_cassette("download-overwrite") do
+             @file1 = Scrapers::Download.download(@url,dir)
+             @file2 = Scrapers::Download.download(@url,dir,true)
+           end
+           @file1.should eq @file2
+           @file1.should eq File.join(dir,'sandwich.png')
+           File.exist?(@file1).should be_true
+         end
+       end
+       it "should make a new file on second download" do
+         in_tmpdir do |dir|
+           @url="http://imgs.xkcd.com/comics/sandwich.png"
+           VCR.use_cassette("download-newfile") do
+             @file1 = Scrapers::Download.download(@url,dir)
+             @file2 = Scrapers::Download.download(@url,dir)
+           end
 
+           # Filed issue against mechanise to make save return
+           # the actual file name saved under. Until that's fixed
+           # have to work around it.
+           @file2 += '.1'
+
+           @file1.should_not eq @file2
+           @file1.should eq File.join(dir,'sandwich.png')
+           File.exist?(@file1).should be_true
+           @file2.should eq File.join(dir,'sandwich.png.1')
+           File.exist?(@file2).should be_true
+         end
+       end
+     end
    end
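
The new in_tmpdir helper wraps Dir.mktmpdir and Dir.chdir so each example runs in a throwaway working directory that is cleaned up on exit. A small sketch of the pattern outside the specs (the file name is illustrative):

    in_tmpdir do |dir|
      File.write("scratch.txt", "data")           # cwd is the temp dir
      File.exist?(File.join(dir, "scratch.txt"))  #=> true
    end                                           # temp dir removed afterwards

The `.1` adjustment in the last example reflects Mechanize's collision behavior: saving sandwich.png a second time without overwrite writes sandwich.png.1, but save() reported the originally requested name at the time, hence the workaround noted in the comment.
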
@@ -0,0 +1,46 @@
+ require 'spec_helper'
+
+ module Scrapers
+
+   describe Sinfest do
+     it{should respond_to :scrape}
+     context "scraping" do
+       before(:all) do
+         @comic = VCR.use_cassette('sinfest') do
+           Scrapers::Sinfest.scrape
+         end
+       end
+
+       it "retrieves a comic" do
+         @comic.should_not be_nil
+       end
+       it "should be a Hash" do
+         @comic.should be_a(Hash)
+       end
+       %w{title url pubdate img_src}.map(&:to_sym).each do |key|
+         it "should have key #{key}" do
+           @comic.should have_key(key)
+         end
+       end
+       context "title" do
+         it{@comic[:title].should_not be_empty}
+         it{@comic[:title].should match /Nails/}
+       end
+       context "url" do
+         it{@comic[:url].should_not be_empty}
+         it{@comic[:url].should match /http:\/\/sinfest.net/}
+       end
+       context "pubdate" do
+         it{@comic[:pubdate].should_not be_empty}
+         it{Date.parse(@comic[:pubdate]).should be_a(Date)}
+       end
+       context "img_src" do
+         it{@comic[:img_src].should_not be_empty}
+         it{URI.parse(@comic[:img_src]).should be_a(URI::HTTP)}
+         it{@comic[:img_src].should eq 'http://sinfest.net/comikaze/comics/2013-10-19.gif'}
+       end
+
+     end
+   end
+ end
+
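
These specs replay recorded HTTP traffic through VCR cassettes rather than hitting sinfest.net live, which is why exact values such as the 2013-10-19 image URL are assertable. A typical VCR setup is sketched below as an assumption; the gem's actual configuration lives in spec_helper.rb and is not part of this diff:

    VCR.configure do |c|
      c.cassette_library_dir = 'spec/cassettes'  # where 'sinfest' etc. would live (assumed path)
      c.hook_into :webmock                       # intercept HTTP at the WebMock layer
    end
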
data/spec/spec_helper.rb CHANGED
@@ -1,7 +1,6 @@
  #require 'webmock/rspec'
  require 'vcr'
 
-
  # This file was generated by the `rspec --init` command. Conventionally, all
  # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
  # Require this file using `require "spec_helper"` to ensure that it is only
@@ -29,3 +28,4 @@ end
 
 
  require 'scrapers.rb'
+