pet_rescue-scraper 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: e0869a2e7eca1d31e68d6b428c5c470a94c73958
4
+ data.tar.gz: 7ac8ef13bcdde87b2cd716ab7bf8e3e9835e5429
5
+ SHA512:
6
+ metadata.gz: 0be3f54185d3d0d4fb90aef48ac500be43ea1bf94705350c1998ddb49751e620f1a0470671b09c3bb80fcc85cbf22407009b69daec8c7dd648ae7545ebf6efe5
7
+ data.tar.gz: d059b42dd88c4d0aab3d00313af90d08eed1346088f218246fb29119f50febbe96411da22216b69dde0cd8c389543264fd9abc73e7b77057c2804c87bec2def7
data/.gitignore ADDED
@@ -0,0 +1,14 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ *.bundle
11
+ *.so
12
+ *.o
13
+ *.a
14
+ mkmf.log
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in pet_rescue-scraper.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Alex Smith
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,31 @@
1
+ # PetRescue::Scraper
2
+
3
+ Scrapes pet listings from petrescue.com.au.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'pet_rescue-scraper'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install pet_rescue-scraper
20
+
21
+ ## Usage
22
+
23
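+ Require `pet_rescue/scraper`, then iterate `PetRescue::Scraper::DogListings.new(per_page: 48)`; each yielded listing page exposes `id` and `pet`, where `pet` downloads the listing and returns the parsed `PetRescue::Pet`.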
24
+
25
+ ## Contributing
26
+
27
+ 1. Fork it ( https://github.com/[my-github-username]/pet_rescue-scraper/fork )
28
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
29
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
30
+ 4. Push to the branch (`git push origin my-new-feature`)
31
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
+
4
# Register a rake task named `spec` through RSpec's rake integration.
RSpec::Core::RakeTask.new(:spec)

# A bare `rake` invocation runs the `spec` task.
task default: [:spec]
@@ -0,0 +1,18 @@
1
+ require 'open-uri'
2
+
3
module PetRescue
  # A single listing page on petrescue.com.au.
  #
  # The page is identified by its listing id; the HTML is only downloaded
  # when #pet is called, and it is downloaded again on every call.
  class ListingPage
    # @return [Object] the listing id this page was built with
    attr_reader :id

    # @param id [#to_s] listing id, interpolated into the listing URL
    # @param parser [#parse] object whose #parse receives the opened page
    def initialize(id, parser)
      @id = id
      @url = "http://www.petrescue.com.au/listings/#{@id}"
      @parser = parser
    end

    # Downloads the listing and hands the open handle to the parser.
    #
    # @return [Object] whatever the parser's #parse returns
    def pet
      open_listing { |io| @parser.parse(io) }
    end

    private

    # Opens the listing URL and yields the handle, closing it afterwards.
    #
    # Bare Kernel#open stopped handling URLs in Ruby 3.0, so URI.open is
    # used when open-uri provides it; older Rubies fall back to the
    # open-uri-patched Kernel#open.
    def open_listing(&block)
      if URI.respond_to?(:open)
        URI.open(@url, &block)
      else
        open(@url, &block)
      end
    end
  end
end
@@ -0,0 +1,106 @@
1
+ require 'nokogiri'
2
+ require 'pet_rescue/pet'
3
+
4
module PetRescue
  # Parses an HTML listing page from petrescue.com.au into a Pet.
  #
  # #parse stores the parsed document in @document; the individual pet_*
  # readers all read from that document, so they are only meaningful after
  # #parse has been called. Readers for nodes that are looked up without a
  # nil check raise NoMethodError when the node is absent from the page.
  class ListingPageParser
    # @param document [Object] raw HTML (or an IO), passed to Nokogiri::HTML
    # @return [PetRescue::Pet] a pet populated from the page
    def parse(document)
      @document = Nokogiri::HTML(document)

      Pet.new(name: pet_name,
              size: pet_size,
              gender: pet_gender,
              breed: pet_breed,
              age: pet_age,
              location: pet_location,
              vaccinated: pet_vaccinated?,
              desexed: pet_desexed?,
              biography: pet_biography,
              rescue_group: pet_rescue_group,
              small_photo_url: pet_small_photo_url,
              large_photo_url: pet_large_photo_url,
              adoption_fee: pet_adoption_fee,
              adoption_process: pet_adoption_process,
              contact_name: pet_contact_name,
              contact_number: pet_contact_number)
    end

    # @return [String] stripped text of the "#main h1" node
    def pet_name
      @document.at_css("#main h1").text.strip
    end

    # @return [String, nil] first word of the species text
    def pet_size
      species_words[0]
    end

    # @return [String, nil] second word of the species text
    def pet_gender
      species_words[1]
    end

    # @return [String] everything after the first two words of the species text
    def pet_breed
      species_words.drop(2).join(" ")
    end

    # @return [String] stripped text of the "dd.age" node
    def pet_age
      @document.at_css("dd.age").text.strip
    end

    # @return [String] the ".located_in" text with "Located in " removed
    def pet_location
      @document.at_css(".located_in").text.strip.gsub("Located in ", "").strip
    end

    # @return [Boolean] true only when the "dd.vaccinated" text is exactly "Yes"
    def pet_vaccinated?
      @document.at_css("dd.vaccinated").text.strip == "Yes"
    end

    # @return [Boolean] true only when the "dd.desexed" text is exactly "Yes"
    def pet_desexed?
      @document.at_css("dd.desexed").text.strip == "Yes"
    end

    # @return [String] stripped text of the ".personality" node
    def pet_biography
      @document.at_css(".personality").text.strip
    end

    # @return [Integer] the fee as a whole number; 0 when the text has no
    #   leading digits
    def pet_adoption_fee
      # String#to_i stops at the first non-digit, so both the currency sign
      # and any thousands separators are removed first ("$1,250" => 1250).
      @document.at_css("dd.adoption_fee").text.strip.delete("$,").to_i
    end

    # @return [String, nil] stripped ".adoption_process" text, nil when absent
    def pet_adoption_process
      node = @document.at_css(".adoption_process")
      node && node.text.strip
    end

    # @return [String] stripped text of the "dd.fostered_by" node
    def pet_rescue_group
      @document.at_css("dd.fostered_by").text.strip
    end

    # @return [String, nil] stripped "dd.contact_name" text, nil when absent
    def pet_contact_name
      node = @document.at_css("dd.contact_name")
      node && node.text.strip
    end

    # Reads the first child of the element that follows "dt.contact_number".
    #
    # @return [String, nil] the stripped text, nil when the dt or that first
    #   child is absent
    def pet_contact_number
      node = @document.at_css("dt.contact_number")
      node &&= node.next_element.children.first
      node && node.text.strip
    end

    # @return [String, nil] src of the img inside "#featured_photo", nil when
    #   there is no featured photo
    def pet_small_photo_url
      photo_node && photo_node.at_css("img").attributes["src"].value
    end

    # @return [String, nil] href of the link inside "#featured_photo", nil
    #   when there is no featured photo
    def pet_large_photo_url
      photo_node && photo_node.at_css("a").attributes["href"].value
    end

    private

    def photo_node
      @document.at_css("#featured_photo")
    end

    def species
      @document.at_css(".species").text.strip
    end

    # The species text split on whitespace: size, gender, then breed words.
    def species_words
      species.split(" ")
    end
  end
end
@@ -0,0 +1,26 @@
1
+ require 'virtus'
2
+
3
module PetRescue
  # Model for a pet on petrescue.com.au.
  #
  # Every field is a Virtus attribute; the table below lists each attribute
  # name with its declared type, in declaration order.
  class Pet
    include Virtus.model

    {
      id: String,
      name: String,
      size: String,
      gender: String,
      breed: String,
      age: String,
      location: String,
      vaccinated: Boolean,
      desexed: Boolean,
      biography: String,
      adoption_fee: Integer,
      adoption_process: String,
      rescue_group: String,
      contact_name: String,
      contact_number: String,
      small_photo_url: String,
      large_photo_url: String
    }.each do |field, type|
      attribute field, type
    end
  end
end
@@ -0,0 +1,5 @@
1
module PetRescue
  module Scraper
    # Released version of the gem. Frozen so the shared constant cannot be
    # mutated in place at runtime.
    VERSION = "1.0.0".freeze
  end
end
@@ -0,0 +1,33 @@
1
+ require 'pet_rescue/scraper/version'
2
+ require 'pet_rescue/search_results_page'
3
+
4
module PetRescue
  module Scraper
    # A collection of dog listings on petrescue.com.au.
    #
    # Iteration starts at page 1 of the dog search results and follows the
    # "next page" link until a page reports that it has none. Pages are
    # fetched as the iteration advances, one at a time.
    class DogListings
      include Enumerable

      # @param per_page [Integer] value sent as the per_page query parameter
      def initialize(per_page: 48)
        @per_page = per_page
      end

      # Yields every listing page of every search results page, in order.
      #
      # @yield [listing_page] one element of each page's listing_pages
      # @return [Enumerator] when called without a block (nothing is fetched
      #   until the enumerator is consumed)
      # @return [self] when called with a block
      def each(&block)
        # Without this guard a blockless call would walk every results page
        # over the network while yielding to nobody.
        return enum_for(:each) unless block_given?

        search_page = SearchResultsPage.from_url(first_page_url)

        loop do
          search_page.listing_pages.each(&block)

          break unless search_page.has_next_page?

          search_page = search_page.next_page
        end

        self
      end

      private

      # URL of the first page of dog search results for the configured
      # page size.
      def first_page_url
        "http://www.petrescue.com.au/listings/dogs?per_page=#{@per_page}&page=1"
      end
    end
  end
end
@@ -0,0 +1,46 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+ require 'pet_rescue/listing_page'
4
+ require 'pet_rescue/listing_page_parser'
5
+
6
module PetRescue
  # A page of search results on petrescue.com.au
  class SearchResultsPage
    # Downloads +url+ and builds a page from the response.
    #
    # @param url [String] address of a search results page
    # @return [SearchResultsPage]
    def self.from_url(url)
      # The document is built inside the block so the handle is closed as
      # soon as the page has been constructed.
      open_url(url) { |io| new(io) }
    end

    # Opens +url+ and yields the handle.
    #
    # Bare Kernel#open stopped handling URLs in Ruby 3.0, so URI.open is
    # used when open-uri provides it; older Rubies fall back to the
    # open-uri-patched Kernel#open.
    def self.open_url(url, &block)
      if URI.respond_to?(:open)
        URI.open(url, &block)
      else
        open(url, &block)
      end
    end
    private_class_method :open_url

    # @param document [Object] raw HTML (or an IO), passed to Nokogiri::HTML
    def initialize(document)
      @document = Nokogiri::HTML(document)
    end

    # @return [Array<ListingPage>] one ListingPage per listing link on this
    #   page, each with its own ListingPageParser
    def listing_pages
      listing_ids.map do |id|
        ListingPage.new(id, PetRescue::ListingPageParser.new)
      end
    end

    # @return [Boolean] whether the page contains a ".next a" link with an href
    def has_next_page?
      !!next_page_href
    end

    # Fetches the page the "next" link points at.
    #
    # When there is no next link the href is nil and the URL built here is
    # just the site root, so check #has_next_page? first.
    #
    # @return [SearchResultsPage]
    def next_page
      self.class.from_url("http://www.petrescue.com.au#{next_page_href}")
    end

    private

    def next_page_href
      node = @document.at_css(".next a")
      node && node[:href]
    end

    # Listing ids taken from the href of each ".listing .name a" link, with
    # the "/listings/" prefix removed. Integer() raises ArgumentError when
    # what remains is not a plain integer.
    def listing_ids
      @document
        .css(".listing .name a")
        .map { |node| Integer(node[:href].to_s.gsub("/listings/", "")) }
    end
  end
end
@@ -0,0 +1,30 @@
1
+ # coding: utf-8
2
# Put this gem's lib/ directory on the load path (once) so the version
# constant can be required below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'pet_rescue/scraper/version'

Gem::Specification.new do |gem|
  # Identity and authorship.
  gem.name          = 'pet_rescue-scraper'
  gem.version       = PetRescue::Scraper::VERSION
  gem.authors       = ['Alex Smith']
  gem.email         = ['alex@thatalexguy.com']
  gem.summary       = 'Scraper for petrescue.com.au'
  gem.description   = 'Allows scraping of pet listings from petrescue.com.au'
  gem.homepage      = ''
  gem.license       = 'MIT'

  # Package every file tracked by git; executables and test files are
  # picked out of that list by path prefix.
  gem.files         = `git ls-files -z`.split("\x0")
  gem.executables   = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ['lib']

  # Runtime dependencies.
  {
    'nokogiri' => '~> 1.6.3.1',
    'virtus'   => '~> 1.0.3'
  }.each do |name, requirement|
    gem.add_dependency name, requirement
  end

  # Development-only dependencies.
  {
    'bundler'   => '~> 1.7',
    'rake'      => '~> 10.0',
    'rspec'     => '~> 3.1.0',
    'vcr'       => '~> 2.9.3',
    'webmock'   => '~> 1.19.0',
    'simplecov' => '~> 0.9.1'
  }.each do |name, requirement|
    gem.add_development_dependency name, requirement
  end
end