scrapers 0.4.3 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 155c10cf4c011c7b4eed63949eebe2839dabcc56
4
- data.tar.gz: 54794bf2c19e302ecfcf44bce9317eb5dc17b97e
3
+ metadata.gz: 960eadf205104f76a6be3e1bf820dd3bb401fae8
4
+ data.tar.gz: 56ec817fe56cbe1adc9f82060b5fe8d5b96222be
5
5
  SHA512:
6
- metadata.gz: 1cf0bae23ef4a72e1057e801663cb038e876ebfdcedbcb113c97973773e542905e10c806cbffb30286fb1528d725346b7662e8033ce02efa097ba2650cded5b2
7
- data.tar.gz: 58eef7cdad244caceae175958d4bffdffa520b622b828acbad6362db255ee6ecee61a97fac190e3e14cafad5d30e5416e48ab3cb15225af10495e0c5189b607e
6
+ metadata.gz: fa018e7d9a1ca5faeebca9eb478832784fa3cb377b8b382f6a1d7c28299e1df31297eeeb70a0031748de0f1ce1b7e19d6348ddbeb853e73bda88f760b2db6ab3
7
+ data.tar.gz: 223214a05801135f0e5df20d60393104a865c9446cdcb40d810a08d764bb73202ba4902e642531a3ffb81470611fd20a4c581e7a6074c37dde97e3d49921b900
@@ -16,12 +16,14 @@ module Scrapers
16
16
  module Download
17
17
 
18
18
  def self.download(url,dir=".",overwrite=false)
19
- Scrapers.agent.pluggable_parser.default = Mechanize::Download
19
+ # need a new agent each time!
20
+ agent = Mechanize.new
21
+ agent.pluggable_parser.default = Mechanize::Download
20
22
  @dir = validate_directory(dir)
21
- dl = Scrapers.agent.get(url)
23
+ dl = agent.get(url)
22
24
  Dir.chdir(@dir) do |dir|
23
25
  if overwrite
24
- dl.save!()
26
+ dl.save!(dl.filename)
25
27
  else
26
28
  dl.save()
27
29
  end
@@ -2,7 +2,7 @@
2
2
 
3
3
  nasa_apod.rb -- oneline desc
4
4
 
5
- Time-stamp: <2013-08-23 22:47:58 tamara>
5
+ Time-stamp: <2013-10-15 00:17:15 tamara>
6
6
  Copyright (C) 2013 Tamara Temple Web Development
7
7
  Author: Tamara Temple <tamouse@gmail.com>
8
8
  License: MIT
@@ -23,35 +23,33 @@ module Scrapers
23
23
 
24
24
  module NasaApod
25
25
 
26
+ NASA_APOD_URL="http://asterisk.apod.com/library/APOD/APOD%20mirror/astropix.html"
27
+
26
28
  module_function
27
29
 
28
- def scrape(url)
30
+ def scrape(url=nil)
31
+ url ||= NASA_APOD_URL
29
32
  apod = Hash.new
30
- unless url.nil?
31
-
32
- Mechanize.start do |m|
33
+ Mechanize.start do |m|
33
34
 
34
- m.get url
35
+ m.get url
35
36
 
36
- # APOD has a funky entry page, but we want the actual page
37
- prev = m.current_page.link_with(:text => '<').href
38
- m.get prev
39
- canonical = m.current_page.link_with(:text => '>' ).href
40
- m.get canonical
41
-
42
- m.current_page.tap do |page|
43
- apod[:title] = page.title.strip
44
- apod[:link] = page.uri.to_s
45
- apod[:description] = (page/("body")).text
46
- apod[:pubDate] = page.response['date'].to_s
47
- apod[:guid] = page.uri.to_s
48
- apod[:content_encoded] = (page/("body")).to_html
49
- end
50
-
37
+ # APOD has a funky entry page, but we want the actual page
38
+ prev = m.current_page.link_with(:text => '<').href
39
+ m.get prev
40
+ canonical = m.current_page.link_with(:text => '>' ).href
41
+ m.get canonical
42
+
43
+ m.current_page.tap do |page|
44
+ apod[:title] = page.title.strip
45
+ apod[:link] = page.uri.to_s
46
+ apod[:description] = (page/("body")).text
47
+ apod[:pubDate] = page.response['date'].to_s
48
+ apod[:guid] = page.uri.to_s
49
+ apod[:content_encoded] = (page/("body")).to_html
51
50
  end
52
51
 
53
52
  end
54
-
55
53
  apod
56
54
  end
57
55
 
@@ -0,0 +1,24 @@
1
+ require 'mechanize'
2
+
3
+ module Scrapers
4
+ module Sinfest
5
+ SINFEST_URL = "http://sinfest.net"
6
+
7
+ def self.scrape
8
+ results = Hash.new
9
+ results[:comic] = 'Sinfest'
10
+ results[:url] = SINFEST_URL
11
+ Mechanize.start do |agent|
12
+ agent.get SINFEST_URL
13
+ agent.current_page.image(src: %r{comikaze/comics}).tap do |comic|
14
+ results[:title] = comic.alt.to_s
15
+ results[:img_src] = comic.src.to_s
16
+ comicdate = Date.parse(File.basename(comic.src.to_s,'.gif'))
17
+ pubdate = Time.utc(comicdate.year,comicdate.month,comicdate.day)
18
+ results[:pubdate] = pubdate.to_s
19
+ end
20
+ end
21
+ results.tap{|t| $stderr.puts "DEBUG: #{caller(0,1).first} results #{t.inspect}"}
22
+ end
23
+ end
24
+ end
@@ -1,5 +1,14 @@
1
1
  module Scrapers
2
- VERSION = "0.4.3"
2
+ module Version
3
+
4
+ MAJOR = 1
5
+ MINOR = 0
6
+ BUILD = 0
7
+
8
+ end
9
+
10
+ VERSION = [Version::MAJOR,Version::MINOR,Version::BUILD].map(&:to_s).join(".")
11
+
3
12
  DESCRIPTION = "A library of web site scrapers utilizing mechanize and other goodies. Helpful in gathering images, moving things, saving things, etc."
4
13
  SUMMARY = "Web site scrapers"
5
14
  LICENSE = "MIT"
@@ -12,25 +12,62 @@
12
12
  require 'spec_helper'
13
13
  require 'tmpdir'
14
14
 
15
+ def in_tmpdir
16
+ Dir.mktmpdir do |dir|
17
+ Dir.chdir(dir) do |dir|
18
+ yield dir
19
+ end
20
+ end
21
+ end
22
+
23
+
15
24
  module Scrapers
16
25
 
17
26
  describe Download do
27
+
18
28
  it {Scrapers::Download.should respond_to :download}
19
29
 
20
- context "download" do
21
- before(:all) do
30
+ it "should download and save the file" do
31
+ in_tmpdir do |dir|
22
32
  @url="http://imgur.com/download/v70StgA/%2Asnrrrrrrrrrrrf%21%2A"
23
33
  VCR.use_cassette("download.cassette") do
24
- @file = Scrapers::Download.download(@url,'tmp')
34
+ @file = Scrapers::Download.download(@url,dir)
25
35
  end
26
- end
27
-
28
- it "saves the file" do
29
36
  @file.should =~ /.*snrrrrrrrrrrrf.*Imgur\.jpg/
30
37
  File.exist?(@file).should be_true
31
38
  end
32
39
  end
33
-
34
- end
40
+ it "should overwrite file with second download" do
41
+ in_tmpdir do |dir|
42
+ @url="http://imgs.xkcd.com/comics/sandwich.png"
43
+ VCR.use_cassette("download-overwrite") do
44
+ @file1 = Scrapers::Download.download(@url,dir)
45
+ @file2 = Scrapers::Download.download(@url,dir,true)
46
+ end
47
+ @file1.should eq @file2
48
+ @file1.should eq File.join(dir,'sandwich.png')
49
+ File.exist?(@file1).should be_true
50
+ end
51
+ end
52
+ it "should make a new file on second download" do
53
+ in_tmpdir do |dir|
54
+ @url="http://imgs.xkcd.com/comics/sandwich.png"
55
+ VCR.use_cassette("download-newfile") do
56
+ @file1 = Scrapers::Download.download(@url,dir)
57
+ @file2 = Scrapers::Download.download(@url,dir)
58
+ end
35
59
 
60
+ # Filed issue against mechanize to make save return
61
+ # the actual file name saved under. Until that's fixed
62
+ # have to work around it.
63
+ @file2 += '.1'
64
+
65
+ @file1.should_not eq @file2
66
+ @file1.should eq File.join(dir,'sandwich.png')
67
+ File.exist?(@file1).should be_true
68
+ @file2.should eq File.join(dir,'sandwich.png.1')
69
+ File.exist?(@file2).should be_true
70
+ end
71
+ end
72
+ end
36
73
  end
@@ -0,0 +1,46 @@
1
+ require 'spec_helper'
2
+
3
+ module Scrapers
4
+
5
+ describe Sinfest do
6
+ it{should respond_to :scrape}
7
+ context "scraping" do
8
+ before(:all) do
9
+ @comic = VCR.use_cassette('sinfest') do
10
+ Scrapers::Sinfest.scrape
11
+ end
12
+ end
13
+
14
+ it "retrieves a comic" do
15
+ @comic.should_not be_nil
16
+ end
17
+ it "should be a Hash" do
18
+ @comic.should be_a(Hash)
19
+ end
20
+ %w{title url pubdate img_src}.map(&:to_sym).each do |key|
21
+ it "should have key #{key}" do
22
+ @comic.should have_key(key)
23
+ end
24
+ end
25
+ context "title" do
26
+ it{@comic[:title].should_not be_empty}
27
+ it{@comic[:title].should match /Nails/}
28
+ end
29
+ context "url" do
30
+ it{@comic[:url].should_not be_empty}
31
+ it{@comic[:url].should match /http:\/\/sinfest.net/}
32
+ end
33
+ context "pubdate" do
34
+ it{@comic[:pubdate].should_not be_empty}
35
+ it{Date.parse(@comic[:pubdate]).should be_a(Date)}
36
+ end
37
+ context "img_src" do
38
+ it{@comic[:img_src].should_not be_empty}
39
+ it{URI.parse(@comic[:img_src]).should be_a(URI::HTTP)}
40
+ it{@comic[:img_src].should eq 'http://sinfest.net/comikaze/comics/2013-10-19.gif'}
41
+ end
42
+
43
+ end
44
+ end
45
+ end
46
+
data/spec/spec_helper.rb CHANGED
@@ -1,7 +1,6 @@
1
1
  #require 'webmock/rspec'
2
2
  require 'vcr'
3
3
 
4
-
5
4
  # This file was generated by the `rspec --init` command. Conventionally, all
6
5
  # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
7
6
  # Require this file using `require "spec_helper"` to ensure that it is only
@@ -29,3 +28,4 @@ end
29
28
 
30
29
 
31
30
  require 'scrapers.rb'
31
+