proto 0.0.6 → 0.0.7

@@ -1,37 +1,50 @@
 module Proto
   class Scraper
-    attr_accessor :url, :doc, :url_collection
+    attr_accessor :url, :doc, :url_collection, :traverse, :page_count

     def initialize(url)
       @url = url.chomp '/' #remove trailing slash
       @doc = Nokogiri::HTML(open(url))
+      @page_count = 1
+      @url_collection = []
     end

-    def collect_urls(base_url=self.url, selector)
-      @url_collection = doc.css(selector).map do |link|
-        "#{base_url}#{link['href']}"
+    def collect_urls(base_url=self.url, pagination_selector=nil, url_selector)
+      number_of_pages = doc.css(pagination_selector).map.count if pagination_selector
+
+      page_urls = doc.css(url_selector).map { |link| "#{base_url}#{link['href']}" }
+
+      if pagination_selector && (@page_count < number_of_pages)
+        next_url = base_url << doc.css(pagination_selector)[@page_count]['href']
+        self.doc = Nokogiri::HTML(open(next_url))
+        @page_count += 1
+        url_collection << page_urls
+        collect_urls(base_url, pagination_selector, url_selector)
+      else
+        url_collection << page_urls
+        url_collection.flatten!
       end
     end

     def fetch(name='Type', args)
-      if url_collection
-        attributes = scrape_multiple_pages(args)
-      else
+      if url_collection.empty?
         attributes = scrape_single_page(args)
+      else
+        attributes = scrape_multiple_pages(args)
       end
       protos = create_return_objects(name, attributes)
-      return protos
+      protos
     end
     alias_method :fetch_and_create!, :fetch

-  private
+    private

     def scrape_multiple_pages(attributes)
       url_collection.map do |url|
         gather_data(url, attributes)
       end
     end
-
+
     def gather_data(url, attributes)
       page = Nokogiri::HTML(open(url))
       attributes.each_with_object({}) do |(key, selector), attrs|
@@ -41,7 +54,7 @@ module Proto

     def scrape_single_page(attributes)
       length_of_scrape = doc.css(attributes.first[1]).count
-
+
       length_of_scrape.times.map do |index|
         attributes.inject({}) do |hash, (attr_name, selector)|
           hash.merge(attr_name => doc.css(selector)[index].text.strip) if doc.css(selector)[index]
@@ -55,4 +68,4 @@ module Proto
       attributes.map { |hash| Proto.const_get(name).new(hash) }
     end
   end
-end
+end
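
For reference, a minimal usage sketch of the new pagination-aware collect_urls. The site URL, selectors, attribute names, and the Proto::Item class below are hypothetical placeholders, not part of the gem:

require 'proto'

# Hypothetical listing site with a paginated index page.
scraper = Proto::Scraper.new('http://example.com/listings')

# Passing a pagination selector ('div.pages a') makes collect_urls follow each
# pager link recursively, appending the item URLs from every page to
# scraper.url_collection before flattening the result.
scraper.collect_urls('http://example.com', 'div.pages a', 'div.items a')

# Omitting the pagination selector collects URLs from the current page only:
# scraper.collect_urls('http://example.com', 'div.items a')

# fetch now checks url_collection.empty? to decide between single-page and
# multi-page scraping; 'Item' must name a Proto::Item class you define yourself.
items = scraper.fetch('Item', { :title => 'h2.title', :location => 'div.location' })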
@@ -1,3 +1,3 @@
 module Proto
-  VERSION = "0.0.6"
+  VERSION = "0.0.7"
 end
@@ -4,7 +4,7 @@ describe Proto::Scraper do
   it 'returns my objects!' do
     obj = Proto::Scraper.new('https://twitter.com/kcurtin')
     obj_collection = obj.fetch('Tweet', { :name => 'strong.fullname',
-                                          :content => 'p.js-tweet-text',
+                                          :content => 'p.js-tweet-text',
                                           :created_at => 'small.time' }
     )
     obj_collection.first.class.to_s.should == 'Proto::Tweet'
@@ -35,15 +35,10 @@ describe Proto::Scraper do
     jobs.first.class.to_s.should == 'Proto::Type'
     jobs.first.title.should =~ /Ruby/
   end
-end

-  # ruby_inside = Scraper.new('Ruby Inside', 'http://ruby.jobamatic.com/a/jbb/find-jobs/',
-  #   'http://ruby.jobamatic.com', job_database)
-  # ruby_inside.compile_job_url_collection('tr.listing td.title a')
-  # ruby_inside.scrape_away({
-  #   title_text: 'h2.jam_headline',
-  #   # company_text: 'h3 a.jam_link',
-  #   location_text: 'div#c_address',
-  #   type_text: 'div#c_jobtype',
-  #   description_text: 'div#c_job_description'
-  # })
+  it "should work with pagination" do
+    obj = Proto::Scraper.new('http://www.mediauk.com/radio/starting-with/a')
+    obj.collect_urls('http://www.mediauk.com', 'div.pages a', 'div.columns a')
+    obj.url_collection.length.should > 10
+  end
+end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: proto
 version: !ruby/object:Gem::Version
-  version: 0.0.6
+  version: 0.0.7
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-12-05 00:00:00.000000000 Z
+date: 2012-12-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec