RubyGems - proto - Versions diffs - 0.0.6 → 0.0.7 - Mend

proto 0.0.6 → 0.0.7

Files changed (4)

@@ -1,37 +1,50 @@
 module Proto
   class Scraper
-    attr_accessor :url, :doc, :url_collection
+    attr_accessor :url, :doc, :url_collection, :traverse, :page_count
     def initialize(url)
       @url = url.chomp '/' #remove trailing slash
       @doc = Nokogiri::HTML(open(url))
+      @page_count     = 1
+      @url_collection = []
     end
-    def collect_urls(base_url=self.url, selector)
-      @url_collection = doc.css(selector).map do |link|
-        "#{base_url}#{link['href']}"
+    def collect_urls(base_url=self.url, pagination_selector=nil, url_selector)
+      number_of_pages = doc.css(pagination_selector).map.count if pagination_selector
+      page_urls = doc.css(url_selector).map { |link| "#{base_url}#{link['href']}" }
+      if pagination_selector && (@page_count < number_of_pages)
+        next_url = base_url << doc.css(pagination_selector)[@page_count]['href']
+        self.doc = Nokogiri::HTML(open(next_url))
+        @page_count += 1
+        url_collection << page_urls
+        collect_urls(base_url, pagination_selector, url_selector)
+      else
+        url_collection << page_urls
+        url_collection.flatten!
       end
     end
     def fetch(name='Type', args)
-      if url_collection
-        attributes = scrape_multiple_pages(args)
-      else
+      if url_collection.empty?
         attributes = scrape_single_page(args)
+      else
+        attributes = scrape_multiple_pages(args)
       end
       protos = create_return_objects(name, attributes)
-      return protos
+      protos
     end
     alias_method :fetch_and_create!, :fetch
-  private
+    private
     def scrape_multiple_pages(attributes)
       url_collection.map do |url|
          gather_data(url, attributes)
       end
     end
     def gather_data(url, attributes)
       page = Nokogiri::HTML(open(url))
       attributes.each_with_object({}) do |(key, selector), attrs|
@@ -41,7 +54,7 @@ module Proto
     def scrape_single_page(attributes)
       length_of_scrape = doc.css(attributes.first[1]).count
       length_of_scrape.times.map do |index|
         attributes.inject({}) do |hash, (attr_name, selector)|
           hash.merge(attr_name => doc.css(selector)[index].text.strip) if doc.css(selector)[index]
@@ -55,4 +68,4 @@ module Proto
       attributes.map { |hash| Proto.const_get(name).new(hash) }
     end
   end
-end
+end

data/lib/proto/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Proto
-  VERSION = "0.0.6"
+  VERSION = "0.0.7"
 end

data/spec/proto/scraper_spec.rb CHANGED

@@ -4,7 +4,7 @@ describe Proto::Scraper do
   it 'returns my objects!' do
     obj = Proto::Scraper.new('https://twitter.com/kcurtin')
     obj_collection = obj.fetch('Tweet', { :name => 'strong.fullname',
-                                          :content => 'p.js-tweet-text',
+                                          :content => 'p.js-tweet-text',
                                           :created_at => 'small.time' }
                               )
     obj_collection.first.class.to_s.should == 'Proto::Tweet'
@@ -35,15 +35,10 @@ describe Proto::Scraper do
     jobs.first.class.to_s.should == 'Proto::Type'
     jobs.first.title.should =~ /Ruby/
   end
-end
-# ruby_inside = Scraper.new('Ruby Inside', 'http://ruby.jobamatic.com/a/jbb/find-jobs/',
-#                           'http://ruby.jobamatic.com', job_database)
-# ruby_inside.compile_job_url_collection('tr.listing td.title a')
-# ruby_inside.scrape_away({
-#   title_text:       'h2.jam_headline',
-#   # company_text:     'h3 a.jam_link',
-#   location_text:    'div#c_address',
-#   type_text:         'div#c_jobtype',
-#   description_text: 'div#c_job_description'
-# })
+  it "should work with pagination" do
+    obj = Proto::Scraper.new('http://www.mediauk.com/radio/starting-with/a')
+    obj.collect_urls('http://www.mediauk.com', 'div.pages a', 'div.columns a')
+    obj.url_collection.length.should > 10
+  end
+end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: proto
 version: !ruby/object:Gem::Version
-  version: 0.0.6
+  version: 0.0.7
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-12-05 00:00:00.000000000 Z
+date: 2012-12-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec