RubyGems - pet_rescue-scraper - Versions diffs - 1.0.0 - Mend

pet_rescue-scraper 1.0.0

Files changed (26) hide show

checksums.yaml +7 -0
data/.gitignore +14 -0
data/Gemfile +4 -0
data/LICENSE.txt +22 -0
data/README.md +31 -0
data/Rakefile +6 -0
data/lib/pet_rescue/listing_page.rb +18 -0
data/lib/pet_rescue/listing_page_parser.rb +106 -0
data/lib/pet_rescue/pet.rb +26 -0
data/lib/pet_rescue/scraper/version.rb +5 -0
data/lib/pet_rescue/scraper.rb +33 -0
data/lib/pet_rescue/search_results_page.rb +46 -0
data/pet_rescue-scraper.gemspec +30 -0
data/spec/fixtures/dog_search_first_page.html +1492 -0
data/spec/fixtures/dog_search_last_page.html +742 -0
data/spec/fixtures/mau.html +354 -0
data/spec/fixtures/muttley.html +402 -0
data/spec/fixtures/vcr_cassettes/dogs.yml +3743 -0
data/spec/fixtures/wyatt.html +375 -0
data/spec/pet_rescue/listing_page_parser_spec.rb +140 -0
data/spec/pet_rescue/listing_page_spec.rb +20 -0
data/spec/pet_rescue/scraper_spec.rb +15 -0
data/spec/pet_rescue/search_results_page_spec.rb +32 -0
data/spec/spec_helper.rb +19 -0
data/spec/support/vcr.rb +7 -0
metadata +192 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: e0869a2e7eca1d31e68d6b428c5c470a94c73958
+  data.tar.gz: 7ac8ef13bcdde87b2cd716ab7bf8e3e9835e5429
+SHA512:
+  metadata.gz: 0be3f54185d3d0d4fb90aef48ac500be43ea1bf94705350c1998ddb49751e620f1a0470671b09c3bb80fcc85cbf22407009b69daec8c7dd648ae7545ebf6efe5
+  data.tar.gz: d059b42dd88c4d0aab3d00313af90d08eed1346088f218246fb29119f50febbe96411da22216b69dde0cd8c389543264fd9abc73e7b77057c2804c87bec2def7

data/.gitignore ADDED Viewed

@@ -0,0 +1,14 @@
+/.bundle/
+/.yardoc
+/Gemfile.lock
+/_yardoc/
+/coverage/
+/doc/
+/pkg/
+/spec/reports/
+/tmp/
+*.bundle
+*.so
+*.o
+*.a
+mkmf.log

data/Gemfile ADDED Viewed

@@ -0,0 +1,4 @@
+source 'https://rubygems.org'
+# Dependencies are declared in pet_rescue-scraper.gemspec; `gemspec` below
+# loads both the runtime and development dependencies from that file.
+gemspec

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,22 @@
+Copyright (c) 2014 Alex Smith
+MIT License
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,31 @@
+# PetRescue::Scraper
+Scrapes pet listings from petrescue.com.au and exposes each listing as a `PetRescue::Pet`.
+## Installation
+Add this line to your application's Gemfile:
+```ruby
+gem 'pet_rescue-scraper'
+```
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install pet_rescue-scraper
+## Usage
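+Iterate over the dog listings and parse each listing page into a pet:
+```ruby
+require 'pet_rescue/scraper'
+PetRescue::Scraper::DogListings.new(per_page: 48).each do |listing_page|
+  pet = listing_page.pet
+  puts "#{pet.name} (#{pet.breed}) - #{pet.location}"
+end
+```
+Each call to `listing_page.pet` downloads and parses one listing page.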
+## Contributing
+1. Fork it ( https://github.com/[my-github-username]/pet_rescue-scraper/fork )
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create a new Pull Request

data/Rakefile ADDED Viewed

@@ -0,0 +1,6 @@
+# Rake tasks for the gem: Bundler's gem packaging tasks plus an RSpec task.
+require 'bundler/gem_tasks'
+require 'rspec/core/rake_task'
+# Defines the `spec` task.
+RSpec::Core::RakeTask.new(:spec)
+# Running `rake` with no arguments runs the specs.
+task default: :spec

data/lib/pet_rescue/listing_page.rb ADDED Viewed

@@ -0,0 +1,18 @@
+require 'open-uri'
+module PetRescue
+  # A single pet listing page on petrescue.com.au.
+  #
+  # Holds the listing id, derives the page URL from it, and delegates HTML
+  # parsing to the injected parser.
+  class ListingPage
+    # The listing id given to the constructor.
+    attr_reader :id
+    # @param id [#to_s] listing id; interpolated into the listing URL
+    # @param parser [#parse] object whose #parse receives the downloaded page IO
+    def initialize(id, parser)
+      @id = id
+      @url = "http://www.petrescue.com.au/listings/#{@id}"
+      @parser = parser
+    end
+    # Downloads the listing page and returns whatever the parser builds from it.
+    #
+    # The page is opened through URI#open (added by open-uri) instead of
+    # Kernel#open: Kernel#open no longer accepts URLs as of Ruby 3.0, and it
+    # spawns a subprocess when handed a string that starts with "|". The block
+    # form closes the downloaded IO as soon as the parser returns.
+    #
+    # @return [Object] the value returned by the parser's #parse
+    def pet
+      URI.parse(@url).open { |io| @parser.parse(io) }
+    end
+  end
+end

data/lib/pet_rescue/listing_page_parser.rb ADDED Viewed

@@ -0,0 +1,106 @@
+require 'nokogiri'
+require 'pet_rescue/pet'
+module PetRescue
+  # Parses an HTML listing page from petrescue.com.au into a Pet.
+  #
+  # #parse stores the parsed document in an instance variable that the pet_*
+  # readers query, so the readers only return meaningful values after #parse
+  # has been called on the same instance.
+  class ListingPageParser
+    # Builds a Pet from the given listing HTML.
+    #
+    # @param document [String, IO] HTML passed straight to Nokogiri::HTML
+    # @return [PetRescue::Pet]
+    # @raise [NoMethodError] if an element read without a nil guard (name,
+    #   species, age, location, vaccinated, desexed, personality, adoption fee,
+    #   fostered_by) is missing from the page
+    def parse(document)
+      @document = Nokogiri::HTML(document)
+      Pet.new(name: pet_name,
+              size: pet_size,
+              gender: pet_gender,
+              breed: pet_breed,
+              age: pet_age,
+              location: pet_location,
+              vaccinated: pet_vaccinated?,
+              desexed: pet_desexed?,
+              biography: pet_biography,
+              rescue_group: pet_rescue_group,
+              small_photo_url: pet_small_photo_url,
+              large_photo_url: pet_large_photo_url,
+              adoption_fee: pet_adoption_fee,
+              adoption_process: pet_adoption_process,
+              contact_name: pet_contact_name,
+              contact_number: pet_contact_number)
+    end
+    # Text of the page heading ("#main h1").
+    def pet_name
+      @document.at_css("#main h1").text.strip
+    end
+    # First word of the ".species" text.
+    def pet_size
+      species_words[0]
+    end
+    # Second word of the ".species" text.
+    def pet_gender
+      species_words[1]
+    end
+    # Everything after the first two words of the ".species" text.
+    def pet_breed
+      species_words.drop(2).join(" ")
+    end
+    def pet_age
+      @document.at_css("dd.age").text.strip
+    end
+    # The ".located_in" text with the "Located in " label removed.
+    def pet_location
+      @document.at_css(".located_in").text.strip.gsub("Located in ", "").strip
+    end
+    # True only when the "dd.vaccinated" text is exactly "Yes".
+    def pet_vaccinated?
+      @document.at_css("dd.vaccinated").text.strip == "Yes"
+    end
+    # True only when the "dd.desexed" text is exactly "Yes".
+    def pet_desexed?
+      @document.at_css("dd.desexed").text.strip == "Yes"
+    end
+    def pet_biography
+      @document.at_css(".personality").text.strip
+    end
+    # Adoption fee as an Integer.
+    #
+    # Both "$" and "," are removed before conversion so that a fee written
+    # with a thousands separator ("$1,200") is not truncated at the comma.
+    # String#to_i still yields 0 when no leading digits remain.
+    def pet_adoption_fee
+      @document.at_css("dd.adoption_fee").text.strip.delete("$,").to_i
+    end
+    # @return [String, nil] nil when the page has no ".adoption_process" element
+    def pet_adoption_process
+      node = @document.at_css(".adoption_process")
+      node && node.text.strip
+    end
+    def pet_rescue_group
+      @document.at_css("dd.fostered_by").text.strip
+    end
+    # @return [String, nil] nil when the page has no "dd.contact_name" element
+    def pet_contact_name
+      node = @document.at_css("dd.contact_name")
+      node && node.text.strip
+    end
+    # Text of the first child of the element following "dt.contact_number".
+    #
+    # @return [String, nil] nil when the label, the element after it, or that
+    #   element's first child is missing
+    def pet_contact_number
+      label = @document.at_css("dt.contact_number")
+      value = label && label.next_element
+      first_child = value && value.children.first
+      first_child && first_child.text.strip
+    end
+    # @return [String, nil] "src" of the image inside "#featured_photo"
+    def pet_small_photo_url
+      photo_attribute("img", "src")
+    end
+    # @return [String, nil] "href" of the link inside "#featured_photo"
+    def pet_large_photo_url
+      photo_attribute("a", "href")
+    end
+    private
+    # Reads one attribute from an element inside the featured photo container,
+    # returning nil when the container, the element or the attribute is absent.
+    def photo_attribute(selector, attribute)
+      container = photo_node
+      element = container && container.at_css(selector)
+      element && element[attribute]
+    end
+    def photo_node
+      @document.at_css("#featured_photo")
+    end
+    def species
+      @document.at_css(".species").text.strip
+    end
+    # The ".species" text split on whitespace; size, gender and breed are read
+    # from it positionally.
+    def species_words
+      species.split(" ")
+    end
+  end
+end

data/lib/pet_rescue/pet.rb ADDED Viewed

@@ -0,0 +1,26 @@
+require 'virtus'
+module PetRescue
+  # Model for a pet on petrescue.com.au.
+  #
+  # A Virtus model: each `attribute` line declares an accessor together with
+  # the type Virtus associates with it.
+  class Pet
+    include Virtus.model
+    attribute :id,                String
+    attribute :name,              String
+    attribute :size,              String
+    attribute :gender,            String
+    attribute :breed,             String
+    attribute :age,               String
+    attribute :location,          String
+    attribute :vaccinated,        Boolean
+    attribute :desexed,           Boolean
+    attribute :biography,         String
+    attribute :adoption_fee,      Integer
+    attribute :adoption_process,  String
+    attribute :rescue_group,      String
+    attribute :contact_name,      String
+    attribute :contact_number,    String
+    attribute :small_photo_url,   String
+    attribute :large_photo_url,   String
+  end
+end

data/lib/pet_rescue/scraper/version.rb ADDED Viewed

@@ -0,0 +1,5 @@
+module PetRescue
+  module Scraper
+    # Gem version; read by pet_rescue-scraper.gemspec. Frozen so the constant
+    # cannot be mutated in place at runtime.
+    VERSION = "1.0.0".freeze
+  end
+end

data/lib/pet_rescue/scraper.rb ADDED Viewed

@@ -0,0 +1,33 @@
+require 'pet_rescue/scraper/version'
+require 'pet_rescue/search_results_page'
+module PetRescue
+  module Scraper
+    # A collection of dog listings on petrescue.com.au.
+    #
+    # Enumerable over ListingPage objects: iteration starts at the first
+    # search results page and follows "next page" links until none is left.
+    class DogListings
+      include Enumerable
+      # @param per_page [Integer] value sent as the per_page query parameter
+      def initialize(per_page: 48)
+        @per_page = per_page
+      end
+      # Yields every ListingPage found across all search result pages.
+      #
+      # Without a block an Enumerator is returned instead, so no page is
+      # downloaded until the enumerator is actually consumed.
+      #
+      # @yieldparam listing_page [PetRescue::ListingPage]
+      # @return [Enumerator, nil] an Enumerator when called without a block
+      def each(&block)
+        return enum_for(:each) unless block_given?
+        search_page = SearchResultsPage.from_url(first_page_url)
+        loop do
+          search_page.listing_pages.each(&block)
+          break unless search_page.has_next_page?
+          search_page = search_page.next_page
+        end
+      end
+      private
+      # URL of the first page of dog search results for the configured page size.
+      def first_page_url
+        "http://www.petrescue.com.au/listings/dogs?per_page=#{@per_page}&page=1"
+      end
+    end
+  end
+end

data/lib/pet_rescue/search_results_page.rb ADDED Viewed

@@ -0,0 +1,46 @@
+require 'nokogiri'
+require 'open-uri'
+require 'pet_rescue/listing_page'
+require 'pet_rescue/listing_page_parser'
+module PetRescue
+  # A page of search results on petrescue.com.au.
+  #
+  # Wraps the parsed HTML of one results page and exposes the listings it
+  # links to, plus navigation to the following results page.
+  class SearchResultsPage
+    # Downloads +url+ and wraps the response.
+    #
+    # Uses URI#open (added by open-uri) instead of Kernel#open, which no
+    # longer accepts URLs as of Ruby 3.0. The constructor parses the document
+    # inside the block, so the IO can be closed when the block returns.
+    #
+    # @param url [String] absolute URL of a search results page
+    # @return [SearchResultsPage]
+    def self.from_url(url)
+      URI.parse(url).open { |io| new(io) }
+    end
+    # @param document [String, IO] HTML passed straight to Nokogiri::HTML
+    def initialize(document)
+      @document = Nokogiri::HTML(document)
+    end
+    # One ListingPage per listing link on this page, each with its own parser.
+    #
+    # @return [Array<PetRescue::ListingPage>]
+    def listing_pages
+      listing_ids.map do |id|
+        parser = PetRescue::ListingPageParser.new
+        ListingPage.new(id, parser)
+      end
+    end
+    # True when the page contains a ".next a" link with an href.
+    def has_next_page?
+      !!next_page_href
+    end
+    # Downloads the page that the ".next a" link points to.
+    #
+    # @return [SearchResultsPage, nil] nil when there is no next-page link;
+    #   without this guard the site root would be fetched and treated as a
+    #   results page
+    def next_page
+      href = next_page_href
+      return nil unless href
+      SearchResultsPage.from_url("http://www.petrescue.com.au#{href}")
+    end
+    private
+    # href of the ".next a" link, or nil when the link is absent.
+    def next_page_href
+      node = @document.at_css(".next a")
+      node && node[:href]
+    end
+    # Integer ids taken from the ".listing .name a" hrefs.
+    # Integer() raises ArgumentError if anything other than an integer is left
+    # once "/listings/" has been removed from an href.
+    def listing_ids
+      @document
+        .css(".listing .name a")
+        .map { |node| Integer(node[:href].to_s.gsub("/listings/", "")) }
+    end
+  end
+end

data/pet_rescue-scraper.gemspec ADDED Viewed

@@ -0,0 +1,30 @@
+# coding: utf-8
+# Gem specification for pet_rescue-scraper.
+lib = File.expand_path('../lib', __FILE__)
+# Make lib/ loadable so the version constant can be required below.
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'pet_rescue/scraper/version'
+Gem::Specification.new do |spec|
+  spec.name          = "pet_rescue-scraper"
+  spec.version       = PetRescue::Scraper::VERSION
+  spec.authors       = ["Alex Smith"]
+  spec.email         = ["alex@thatalexguy.com"]
+  spec.summary       = %q{Scraper for petrescue.com.au}
+  spec.description   = %q{Allows scraping of pet listings from petrescue.com.au}
+  # NOTE(review): homepage is empty; fill in the project URL before release.
+  spec.homepage      = ""
+  spec.license       = "MIT"
+  # Package every file tracked by git.
+  spec.files         = `git ls-files -z`.split("\x0")
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ["lib"]
+  # "~> 1.6.3.1" allowed only the 1.6.3.x series, which blocks consumers from
+  # resolving any later nokogiri release. Keep the same floor but accept
+  # later 1.x versions.
+  spec.add_dependency "nokogiri", "~> 1.6", ">= 1.6.3.1"
+  spec.add_dependency "virtus", "~> 1.0.3"
+  spec.add_development_dependency "bundler", "~> 1.7"
+  spec.add_development_dependency "rake", "~> 10.0"
+  spec.add_development_dependency "rspec", "~> 3.1.0"
+  spec.add_development_dependency "vcr", "~> 2.9.3"
+  spec.add_development_dependency "webmock", "~> 1.19.0"
+  spec.add_development_dependency "simplecov", "~> 0.9.1"
+end