scrapers 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.rspec-example +2 -0
- data/.watchr +27 -0
- data/Gemfile +4 -0
- data/Guardfile +6 -0
- data/LICENSE.txt +22 -0
- data/README.md +32 -0
- data/Rakefile +1 -0
- data/lib/scrapers/allrecipes.rb +49 -0
- data/lib/scrapers/discoverynews.rb +28 -0
- data/lib/scrapers/download.rb +37 -0
- data/lib/scrapers/gocomics.rb +43 -0
- data/lib/scrapers/imgur.rb +56 -0
- data/lib/scrapers/nasa_apod.rb +60 -0
- data/lib/scrapers/version.rb +7 -0
- data/lib/scrapers.rb +9 -0
- data/scrapers.gemspec +29 -0
- data/spec/scrapers/allrecipes_spec.rb +29 -0
- data/spec/scrapers/discoverynews_spec.rb +39 -0
- data/spec/scrapers/download_spec.rb +36 -0
- data/spec/scrapers/gocomics_spec.rb +46 -0
- data/spec/scrapers/imgur_spec.rb +31 -0
- data/spec/scrapers/nasa_apod_spec.rb +30 -0
- data/spec/scrapers_spec.rb +9 -0
- data/spec/spec_helper.rb +31 -0
- data/vcr_cassettes/allrecipes_morning-glory-muffins-i.yml +1047 -0
- data/vcr_cassettes/disconews_history-of-space.yml +385 -0
- data/vcr_cassettes/download_cassette.yml +431 -0
- data/vcr_cassettes/exampleatspecscrapersnasaapodspecrb16_cassette.yml +326 -0
- data/vcr_cassettes/exampleatspecscrapersnasaapodspecrb21_cassette.yml +326 -0
- data/vcr_cassettes/exampleatspecscrapersnasaapodspecrb22_cassette.yml +312 -0
- data/vcr_cassettes/gocomics_nonsequitur.yml +336 -0
- data/vcr_cassettes/shouldincludecontentencoded_cassette.yml +326 -0
- data/vcr_cassettes/shouldincludedescription_cassette.yml +326 -0
- data/vcr_cassettes/shouldincludeguid_cassette.yml +326 -0
- data/vcr_cassettes/shouldincludelink_cassette.yml +326 -0
- data/vcr_cassettes/shouldincludepubDate_cassette.yml +326 -0
- data/vcr_cassettes/shouldincludetitle_cassette.yml +326 -0
- metadata +203 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: eef6a353955404ad8beaf88e4db8ab4b0a4f3cec
|
4
|
+
data.tar.gz: 87ec094c87d315640c1ebb274b6f334b6dd73646
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: ece9775696f757216e7c9fd9c2ef2a6ff4a1bd3a9dcbf1fd754caed7c751bdeb1f055a2ea6694b216dfaf61f7a1678cedc777fd0281d660d73e68f34399f2c90
|
7
|
+
data.tar.gz: 370b23a91c38f349a8f02a6fe4ed8770bb58017b43876ad736c449087da7b303aae605f2a314e76b6489bdfc89aaceaa2588f94a6e6e1d7b3e0423c50f2c5f63
|
data/.gitignore
ADDED
data/.rspec-example
ADDED
data/.watchr
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# -*- ruby -*-
|
2
|
+
# Run a single spec file through RSpec under Bundler.
#
# Prints a notice and returns without running anything when +file+ does
# not exist; otherwise announces the run, executes it, and prints a blank
# separator line.
#
# file:: path of the spec file to run
def run_spec(file)
  unless File.exist?(file)
    puts "#{file} does not exist"
    return
  end

  puts "Running #{file}"
  # Pass the command as an argument list so the path is handed to rspec
  # verbatim instead of being re-parsed by a shell (spaces or shell
  # metacharacters in a file name stay intact).
  system "bundle", "exec", "rspec", file
  puts
end
|
12
|
+
|
13
|
+
# NOTE: the strings handed to +watch+ are regular-expression sources (the
# blocks index into the resulting match), not shell globs, so "any name"
# has to be spelled [^/]* or .* and literal dots have to be escaped.

# A spec file directly under spec/ changed: run it.
watch('^spec/[^/]*_spec\.rb$') do |match|
  run_spec match[0]
end

# A spec file in a sub-directory of spec/ changed: run it.
watch('^spec/.*/[^/]*_spec\.rb$') do |match|
  run_spec match[0]
end

# A library file changed: run the spec with the matching relative path.
watch('^lib/(.*)\.rb$') do |match|
  run_spec "spec/#{match[1]}_spec.rb"
end

# Same as above, for library files inside a sub-directory of lib/.
watch('^lib/(.*/.*)\.rb$') do |match|
  run_spec "spec/#{match[1]}_spec.rb"
end
|
data/Gemfile
ADDED
data/Guardfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Tamara Temple
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
# Scrapers
|
2
|
+
|
3
|
+
A library of web site scrapers utilizing mechanize and other goodies. Helpful in gathering images, moving things, saving things, etc.
|
4
|
+
|
5
|
+
LICENSE:: MIT
|
6
|
+
WEBSITE:: http://github.com/tamouse/scrapers
|
7
|
+
|
8
|
+
## Installation
|
9
|
+
|
10
|
+
Add this line to your application's Gemfile:
|
11
|
+
|
12
|
+
gem 'scrapers'
|
13
|
+
|
14
|
+
And then execute:
|
15
|
+
|
16
|
+
$ bundle
|
17
|
+
|
18
|
+
Or install it yourself as:
|
19
|
+
|
20
|
+
$ gem install scrapers
|
21
|
+
|
22
|
+
## Usage
|
23
|
+
|
24
|
+
See the RDoc of each scraper module for an explanation of how to use it.
|
25
|
+
|
26
|
+
## Contributing
|
27
|
+
|
28
|
+
1. Fork it
|
29
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
30
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
31
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
32
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
|
3
|
+
|
4
|
+
module Scrapers

  # Scraper that pulls the parts of a recipe out of a recipe page.
  module AllRecipes

    # Fetch +url+ through the agent returned by Scrapers.agent and collect
    # the recipe details.
    #
    # Returns a Hash with the keys :url, :title, :ingredients, :directions
    # and :photo.
    def self.scrape(url)
      page = Scrapers.agent.get(url)

      {
        :url         => page.uri.to_s,
        :title       => page.title.strip,
        :ingredients => scrape_ingredients(page),
        :directions  => scrape_directions(page),
        :photo       => scrape_photo(page)
      }
    end

    # Collect the text of every <li> found under "ul.ingredient-wrap".
    #
    # Runs of whitespace are collapsed to a single space and each entry is
    # prefixed with "*". Returns an Array of Strings.
    def self.scrape_ingredients(page)
      items = page.search("ul.ingredient-wrap").search(".//li")
      items.map do |item|
        item.text.gsub(/[[:space:]]+/, ' ').sub(/^/, '*')
      end
    end

    # Collect the text of every <li> inside the first "div.directLeft".
    #
    # Runs of whitespace are collapsed to a single space and each entry is
    # prefixed with "# ". Returns an Array of Strings, empty when the page
    # has no such container.
    def self.scrape_directions(page)
      container = page.search("div.directLeft").first
      # No directions container on the page: report "no steps" instead of
      # failing with a NoMethodError on nil.
      return [] if container.nil?

      container.search("li").map do |step|
        step.text.gsub(/[[:space:]]+/, ' ').sub(/^/, '# ')
      end
    end

    # Return the attributes of the first "img#imgPhoto" element as a Hash
    # of attribute name => attribute value, or an empty Hash when the page
    # has no such image.
    def self.scrape_photo(page)
      photo = page.search("img#imgPhoto").first
      return {} if photo.nil?

      Hash[photo.attributes.map { |name, attribute| [name, attribute.value] }]
    end

  end

end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
=begin rdoc
|
2
|
+
|
3
|
+
= DISCOVERYNEWS.RB
|
4
|
+
|
5
|
+
*Author*:: Tamara Temple <tamouse@gmail.com>
|
6
|
+
*Since*:: 2013-06-15
|
7
|
+
*Copyright*:: (c) 2013 Tamara Temple Web Development
|
8
|
+
*License*:: MIT
|
9
|
+
|
10
|
+
Scraper for the Discovery News "pictures of the week" pages.
|
11
|
+
|
12
|
+
=end
|
13
|
+
|
14
|
+
require 'mechanize'
|
15
|
+
|
16
|
+
module Scrapers

  # Scraper for the hero images on a Discovery News page.
  module DiscoNews

    # Fetch +url+ through the agent returned by Scrapers.agent and return
    # an Array with the +src+ of every image on the page whose class is
    # "media-hero".
    #
    # A copy of the requested URL and the fetched page are also stored in
    # the module's @url and @page instance variables; nothing in this
    # method reads them back.
    def self.disco_downloads(url)
      @url = url.clone
      @page = Scrapers.agent.get(url)
      @page.images_with(:class => "media-hero").map(&:src)
    end

  end

end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
=begin rdoc
|
2
|
+
|
3
|
+
= DOWNLOAD.RB
|
4
|
+
|
5
|
+
*Author*:: Tamara Temple <tamara@tamaratemple.com>
|
6
|
+
*Since*:: 2013-05-27
|
7
|
+
*Copyright*:: (c) 2013 Tamara Temple Web Development
|
8
|
+
*License*:: MIT
|
9
|
+
|
10
|
+
=end
|
11
|
+
|
12
|
+
require 'mechanize'
|
13
|
+
|
14
|
+
module Scrapers

  # Saves the resource behind a URL into a local directory.
  module Download

    # Download +url+ and save it inside +dir+ (default: the current
    # directory).
    #
    # Returns the path of the saved file, built from +dir+ and the file
    # name reported by the download. Raises a RuntimeError when +dir+ is
    # not a writable directory.
    #
    # NOTE(review): the returned path is derived from +filename+, not from
    # what +save+ actually wrote -- confirm +save+ cannot pick another name.
    def self.download(url,dir=".")
      @dir = validate_directory(dir)

      agent = Scrapers.agent
      parsers = agent.pluggable_parser
      previous_default = parsers.default
      # Every response is to be handled as a download for this request only.
      parsers.default = Mechanize::Download
      begin
        dl = agent.get(url)
        # +save+ is called without a target so it writes into the working
        # directory; switch there for the duration of the call.
        Dir.chdir(@dir) { dl.save }
        File.join(@dir, dl.filename)
      ensure
        # Put the previous default parser back: if Scrapers.agent hands out
        # the same agent to the other scrapers, leaving the download
        # parser installed would change what their requests return.
        parsers.default = previous_default
      end
    end

    # Check that +d+ names an existing, writable directory.
    #
    # Returns +d+ unchanged; raises a RuntimeError otherwise.
    def self.validate_directory(d)
      raise "#{d} is not a writable directory!" unless File.directory?(d) && File.writable?(d)
      d
    end

  end

end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'open-uri'
|
2
|
+
require 'nokogiri'
|
3
|
+
|
4
|
+
|
5
|
+
module Scrapers

  # Scraper for a comic's page on gocomics.com.
  module GoComics

    # Base URL; the path is replaced with the comic's name.
    GOCOMIC_URL = "http://www.gocomics.com/".freeze

    # Fetch the page of +comic+ (the path segment naming the strip) and
    # collect its details.
    #
    # Returns a Hash with the keys :url, :title, :pubdate and :img_src.
    def self.scrape(comic)
      url = URI.parse GOCOMIC_URL
      url.path = "/#{comic}"

      # Open through the URI object (open-uri) rather than Kernel#open,
      # which no longer accepts URL strings on Ruby 3.
      page = Nokogiri::HTML(url.open)

      {
        :url     => url.to_s,
        :title   => scrape_title(page),
        :pubdate => scrape_pubdate(page),
        :img_src => scrape_image_source(page)
      }
    end

    # Return the content of the page's <title>, stripped, with every
    # whitespace character turned into a space and runs of spaces squeezed
    # to one.
    def self.scrape_title(page)
      page.at_css("title").content.strip.gsub(/[[:space:]]/,' ').squeeze(" ")
    end

    # Parse the content of the first "ul.feature-nav > li" as a date and
    # return it as a String (Date#to_s).
    def self.scrape_pubdate(page)
      Date.parse(page.at_css("ul.feature-nav > li").content).to_s
    end

    # Return the +src+ attribute of the first <img> inside the first
    # "p.feature_item".
    def self.scrape_image_source(page)
      feature = page.at_css("p.feature_item")
      feature.at_css("img").attr("src")
    end

  end

end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
=begin rdoc
|
2
|
+
|
3
|
+
= IMGUR.RB
|
4
|
+
|
5
|
+
*Author*:: Tamara Temple <tamara@tamaratemple.com>
|
6
|
+
*Since*:: 2013-05-27
|
7
|
+
*Copyright*:: (c) 2013 Tamara Temple Web Development
|
8
|
+
*License*:: MIT
|
9
|
+
|
10
|
+
=end
|
11
|
+
|
12
|
+
module Scrapers

  # URL whose path component gets replaced by Imgur when building page and
  # download URLs.
  IMGUR_TEMPLATE="http://imgur.com/path"

  # Finds the download link on an imgur image page.
  class Imgur

    attr_accessor :agent, :url, :download, :page

    # Set up a Mechanize agent and two URIs (page and download) that both
    # start out as IMGUR_TEMPLATE.
    def initialize
      @agent = Mechanize.new
      @url = URI.parse(IMGUR_TEMPLATE)
      @download = URI.parse(IMGUR_TEMPLATE)
    end

    # Fetch the page for the image +code+ and return the URL of its
    # download link as a String.
    #
    # Raises a RuntimeError when the page has no such link.
    def download_link(code)
      make_url(code)
      retrieve_page
      find_download
      @download.to_s
    end

    # Point @url at the page of +imgur_code+.
    def make_url(imgur_code)
      @url.path = "/#{imgur_code}"
    end

    # Fetch @url with the agent and remember the result in @page.
    def retrieve_page()
      @page = @agent.get(@url.to_s)
    end

    # Find the link on @page whose text matches +link_text+ and use its
    # href as the path of @download.
    #
    # Raises a RuntimeError when no link matches.
    # NOTE(review): assigning the href to URI#path= presumes the href is a
    # site-relative path -- confirm against real pages.
    def find_download(link_text=/Download/)
      link = @page.link_with(:text => link_text)
      raise "#{link_text} not found on #{@page.uri.to_s}" if link.nil?
      @download.path = link.href
    end

  end

  module_function

  # Build the imgur download URL for the image that +url+ refers to.
  #
  # The image code is the last path segment of +url+ without its file
  # extension; a query string or fragment is ignored.
  def imgur(url)
    # Drop "?query" / "#fragment" first: slashes or dots inside them
    # would otherwise end up in (or replace) the extracted code.
    location = url.sub(/[?#].*\z/m, '')
    code = File.basename(location).sub(/\.[^.]+$/,'')
    "http://imgur.com/download/#{code}/"
  end

end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
=begin rdoc
|
2
|
+
|
3
|
+
nasa_apod.rb -- scraper for NASA's Astronomy Picture of the Day (APOD)
|
4
|
+
|
5
|
+
Time-stamp: <2013-08-23 22:47:58 tamara>
|
6
|
+
Copyright (C) 2013 Tamara Temple Web Development
|
7
|
+
Author: Tamara Temple <tamouse@gmail.com>
|
8
|
+
License: MIT
|
9
|
+
|
10
|
+
== Discussion
|
11
|
+
|
12
|
+
NASA's Astronomy Picture of the Day is a great source for nice astro
|
13
|
+
photos and various other information. But it isn't something I
|
14
|
+
remember to go see every day, so I'd like it to drop in my in-box or
|
15
|
+
an evernote notebook. But the feed does not include the image, for
|
16
|
+
some ungodly reason, so I'm adding a scraper to grab the nice info off
|
17
|
+
the page including the photo.
|
18
|
+
|
19
|
+
=end
|
20
|
+
|
21
|
+
|
22
|
+
module Scrapers

  # Scraper for NASA's Astronomy Picture of the Day page.
  module NasaApod

    module_function

    # Scrape the picture-of-the-day page reachable from +url+.
    #
    # Returns a Hash with the keys :title, :link, :description, :pubDate,
    # :guid and :content_encoded, or an empty Hash when +url+ is nil.
    # Raises a RuntimeError when one of the "<" / ">" navigation links the
    # scraper follows is missing.
    def scrape(url)
      apod = Hash.new
      return apod if url.nil?

      # href of the link labelled +text+ on +page+, with a descriptive
      # failure instead of a NoMethodError on nil when there is none.
      href_of = lambda do |page, text|
        link = page.link_with(:text => text)
        raise "'#{text}' link not found on #{page.uri.to_s}" if link.nil?
        link.href
      end

      Mechanize.start do |m|
        entry = m.get(url)

        # APOD has a funky entry page, but we want the actual page: step
        # back one page, then forward again.
        previous = m.get(href_of.call(entry, '<'))
        page = m.get(href_of.call(previous, '>'))

        body = page / "body"
        apod[:title] = page.title.strip
        apod[:link] = page.uri.to_s
        apod[:description] = body.text
        apod[:pubDate] = page.response['date'].to_s
        apod[:guid] = page.uri.to_s
        apod[:content_encoded] = body.to_html
      end

      apod
    end

  end

end
|
@@ -0,0 +1,7 @@
|
|
1
|
+
# Gem-level metadata; the gemspec reads these constants.
module Scrapers
  # Version of the gem.
  VERSION = "0.2.0".freeze
  # Long description of the gem.
  DESCRIPTION = "A library of web site scrapers utilizing mechanize and other goodies. Helpful in gathering images, moving things, saving things, etc.".freeze
  # One-line summary of the gem.
  SUMMARY = "Web site scrapers".freeze
  # License the gem is released under.
  LICENSE = "MIT".freeze
  # Home page of the project.
  WEBSITE = "http://github.com/tamouse/scrapers".freeze
end
|
data/lib/scrapers.rb
ADDED
data/scrapers.gemspec
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# Make lib/ loadable so the version file can be required below.
lib = File.expand_path('lib', File.dirname(__FILE__))
$LOAD_PATH.unshift(lib) if !$LOAD_PATH.include?(lib)
require 'scrapers/version'

Gem::Specification.new do |gem|
  # Identity and descriptive metadata, taken from the Scrapers constants.
  gem.name          = "scrapers"
  gem.version       = Scrapers::VERSION
  gem.authors       = ["Tamara Temple"]
  gem.email         = ["tamouse@gmail.com"]
  gem.description   = Scrapers::DESCRIPTION
  gem.summary       = Scrapers::SUMMARY
  gem.homepage      = Scrapers::WEBSITE
  gem.license       = Scrapers::LICENSE

  # Package every file tracked by git; executables and test files are
  # picked out of that list by directory.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Dependencies, declared in their original order (mechanize is the only
  # runtime dependency).
  gem.add_development_dependency("bundler")
  gem.add_development_dependency("rake")
  gem.add_development_dependency("rspec")
  gem.add_dependency("mechanize")
  gem.add_development_dependency("guard")
  gem.add_development_dependency("guard-rspec")
  gem.add_development_dependency("webmock")
  gem.add_development_dependency("vcr")
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
module Scrapers

  # Specs for Scrapers::AllRecipes, replayed from a recorded cassette.
  describe AllRecipes do
    it { should respond_to(:scrape) }

    context "scraping" do
      # Scrape once for the whole context; the examples only inspect the
      # resulting hash.
      before(:all) do
        @url = "http://allrecipes.com/recipe/morning-glory-muffins-i/detail.aspx"
        @recipe = VCR.use_cassette('allrecipes.morning-glory-muffins-i') { Scrapers::AllRecipes.scrape(@url) }
      end

      it("retrieves a recipe") { @recipe.should_not be_nil }

      it("should be a Hash") { @recipe.should be_a(Hash) }

      [:title, :url, :ingredients, :directions, :photo].each do |key|
        it("should have key #{key}") { @recipe.should have_key(key) }
      end
    end
  end
end
|
29
|
+
|
@@ -0,0 +1,39 @@
|
|
1
|
+
=begin rdoc
|
2
|
+
|
3
|
+
= DISCOVERYNEWS_SPEC.RB
|
4
|
+
|
5
|
+
*Author*:: Tamara Temple <tamouse@gmail.com>
|
6
|
+
*Since*:: 2013-06-15
|
7
|
+
*Copyright*:: (c) 2013 Tamara Temple Web Development
|
8
|
+
*License*:: MIT
|
9
|
+
|
10
|
+
=end
|
11
|
+
|
12
|
+
require 'spec_helper'
|
13
|
+
|
14
|
+
|
15
|
+
module Scrapers

  # Specs for Scrapers::DiscoNews, replayed from a recorded cassette.
  describe DiscoNews do
    it { should respond_to(:disco_downloads) }

    context "scraping" do
      let(:url) do
        "http://news.discovery.com/space/history-of-space/stunning-space-photos-week-june-9-14-pictures-130614.htm"
      end

      # Image URLs scraped from the recorded page.
      let(:images) do
        VCR.use_cassette('disconews.history-of-space') { Scrapers::DiscoNews.disco_downloads(url) }
      end

      it "retrieves an array of images" do
        images.should be_a(Array)
        images.each { |src| src.should =~ /^http:\/\/.*(jpe?g|png|gif)/ }
      end
    end

  end

end
|
39
|
+
|
@@ -0,0 +1,36 @@
|
|
1
|
+
=begin rdoc
|
2
|
+
|
3
|
+
= DOWNLOAD_SPEC.RB
|
4
|
+
|
5
|
+
*Author*:: Tamara Temple <tamara@tamaratemple.com>
|
6
|
+
*Since*:: 2013-05-27
|
7
|
+
*Copyright*:: (c) 2013 Tamara Temple Web Development
|
8
|
+
*License*:: MIT
|
9
|
+
|
10
|
+
=end
|
11
|
+
|
12
|
+
require 'spec_helper'
|
13
|
+
require 'tmpdir'
|
14
|
+
|
15
|
+
module Scrapers

  # Specs for Scrapers::Download, replayed from a recorded cassette.
  describe Download do
    it {Scrapers::Download.should respond_to :download}

    context "download" do
      before(:all) do
        @url="http://imgur.com/download/v70StgA/%2Asnrrrrrrrrrrrf%21%2A"
        # Download into a fresh temporary directory so the spec does not
        # depend on a "tmp" directory already existing in the working
        # directory.
        @dir = Dir.mktmpdir
        VCR.use_cassette("download.cassette") do
          @file = Scrapers::Download.download(@url,@dir)
        end
      end

      # Remove the temporary directory and whatever was saved into it.
      after(:all) do
        FileUtils.remove_entry(@dir) if @dir && File.directory?(@dir)
      end

      it "saves the file" do
        @file.should =~ /.*snrrrrrrrrrrrf.*Imgur\.jpg/
        File.exist?(@file).should be_true
      end
    end

  end

end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
module Scrapers

  # Specs for Scrapers::GoComics, replayed from a recorded cassette.
  describe GoComics do
    it { should respond_to(:scrape) }

    context "scraping" do
      # Scrape once for the whole context; the examples only inspect the
      # resulting hash.
      before(:all) do
        @comic_strip = 'nonsequitur'
        @comic = VCR.use_cassette('gocomics.nonsequitur') { Scrapers::GoComics.scrape(@comic_strip) }
      end

      it("retrieves a comic") { @comic.should_not be_nil }

      it("should be a Hash") { @comic.should be_a(Hash) }

      [:title, :url, :pubdate, :img_src].each do |key|
        it("should have key #{key}") { @comic.should have_key(key) }
      end

      context "title" do
        it { @comic[:title].should_not be_empty }
        it { @comic[:title].should match(/Non Sequitur Comic Strip on GoComics.com/) }
      end

      context "url" do
        it { @comic[:url].should_not be_empty }
        it { @comic[:url].should match(/www\.gocomics\.com\/nonsequitur/) }
      end

      context "pubdate" do
        it { @comic[:pubdate].should_not be_empty }
        it { Date.parse(@comic[:pubdate]).should be_a(Date) }
      end

      context "img_src" do
        it { @comic[:img_src].should_not be_empty }
        it { URI.parse(@comic[:img_src]).should be_a(URI::HTTP) }
      end

    end
  end
end
|
46
|
+
|
@@ -0,0 +1,31 @@
|
|
1
|
+
=begin rdoc
|
2
|
+
|
3
|
+
= IMGUR_SPEC.RB
|
4
|
+
|
5
|
+
*Author*:: Tamara Temple <tamara@tamaratemple.com>
|
6
|
+
*Since*:: 2013-05-27
|
7
|
+
*Copyright*:: (c) 2013 Tamara Temple Web Development
|
8
|
+
*License*:: MIT
|
9
|
+
|
10
|
+
=end
|
11
|
+
|
12
|
+
require 'spec_helper'
|
13
|
+
|
14
|
+
module Scrapers

  # The module-level helper must be callable as Scrapers.imgur.
  describe "Scrapers" do
    it { Scrapers.should respond_to(:imgur) }
  end

  # Turning an image page URL into its download URL.
  describe "Fetch the download link" do
    let(:url) { "http://imgur.com/v70StgA" }

    it "should return the download link from a given url" do
      download_url = Scrapers.imgur(url)
      download_url.should =~ %r{http://imgur.com/download/v70StgA/}
    end

  end

end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
|
5
|
+
module Scrapers
|
6
|
+
|
7
|
+
describe NasaApod do  # specs for Scrapers::NasaApod, replayed through VCR
|
8
|
+
it {Scrapers::NasaApod.should respond_to :scrape}
|
9
|
+
|
10
|
+
context "scrape" do
|
11
|
+
let(:url){"http://apod.nasa.gov/apod/astropix.html"}
|
12
|
+
let(:apod_hash){  # scrape result; one cassette per example, named after the example's description
|
13
|
+
VCR.use_cassette("#{example.description.gsub(/[^-[:alnum:]]/,'')}.cassette", :record => :new_episodes) do
|
14
|
+
Scrapers::NasaApod.scrape(url)
|
15
|
+
end}
|
16
|
+
it {apod_hash.should be_a(Hash)}  # NOTE(review): no doc string; the recorded cassette exampleatspecscrapersnasaapodspecrb16 suggests the cassette name depends on this line's position, so keep line numbers stable
|
17
|
+
%w{title link description pubDate guid content_encoded}.map(&:to_sym).each do |attr|
|
18
|
+
it "should include #{attr}" do
|
19
|
+
apod_hash.keys.should include attr
|
20
|
+
end
|
21
|
+
it {apod_hash[attr].should_not be_nil}  # NOTE(review): string-less example; cassette name appears to be line-based (see ...rb21)
|
22
|
+
it {apod_hash[attr].should be_a(String)}  # NOTE(review): string-less example; cassette name appears to be line-based (see ...rb22)
|
23
|
+
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
27
|
+
|
28
|
+
end
|
29
|
+
|
30
|
+
end
|