proto 0.0.6 → 0.0.7

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -1,37 +1,50 @@
1
1
  module Proto
2
2
  class Scraper
3
- attr_accessor :url, :doc, :url_collection
3
+ attr_accessor :url, :doc, :url_collection, :traverse, :page_count
4
4
 
5
5
  def initialize(url)
6
6
  @url = url.chomp '/' #remove trailing slash
7
7
  @doc = Nokogiri::HTML(open(url))
8
+ @page_count = 1
9
+ @url_collection = []
8
10
  end
9
11
 
10
- def collect_urls(base_url=self.url, selector)
11
- @url_collection = doc.css(selector).map do |link|
12
- "#{base_url}#{link['href']}"
12
+ def collect_urls(base_url=self.url, pagination_selector=nil, url_selector)
13
+ number_of_pages = doc.css(pagination_selector).map.count if pagination_selector
14
+
15
+ page_urls = doc.css(url_selector).map { |link| "#{base_url}#{link['href']}" }
16
+
17
+ if pagination_selector && (@page_count < number_of_pages)
18
+ next_url = base_url << doc.css(pagination_selector)[@page_count]['href']
19
+ self.doc = Nokogiri::HTML(open(next_url))
20
+ @page_count += 1
21
+ url_collection << page_urls
22
+ collect_urls(base_url, pagination_selector, url_selector)
23
+ else
24
+ url_collection << page_urls
25
+ url_collection.flatten!
13
26
  end
14
27
  end
15
28
 
16
29
  def fetch(name='Type', args)
17
- if url_collection
18
- attributes = scrape_multiple_pages(args)
19
- else
30
+ if url_collection.empty?
20
31
  attributes = scrape_single_page(args)
32
+ else
33
+ attributes = scrape_multiple_pages(args)
21
34
  end
22
35
  protos = create_return_objects(name, attributes)
23
- return protos
36
+ protos
24
37
  end
25
38
  alias_method :fetch_and_create!, :fetch
26
39
 
27
- private
40
+ private
28
41
 
29
42
  def scrape_multiple_pages(attributes)
30
43
  url_collection.map do |url|
31
44
  gather_data(url, attributes)
32
45
  end
33
46
  end
34
-
47
+
35
48
  def gather_data(url, attributes)
36
49
  page = Nokogiri::HTML(open(url))
37
50
  attributes.each_with_object({}) do |(key, selector), attrs|
@@ -41,7 +54,7 @@ module Proto
41
54
 
42
55
  def scrape_single_page(attributes)
43
56
  length_of_scrape = doc.css(attributes.first[1]).count
44
-
57
+
45
58
  length_of_scrape.times.map do |index|
46
59
  attributes.inject({}) do |hash, (attr_name, selector)|
47
60
  hash.merge(attr_name => doc.css(selector)[index].text.strip) if doc.css(selector)[index]
@@ -55,4 +68,4 @@ module Proto
55
68
  attributes.map { |hash| Proto.const_get(name).new(hash) }
56
69
  end
57
70
  end
58
- end
71
+ end
@@ -1,3 +1,3 @@
1
1
  module Proto
2
- VERSION = "0.0.6"
2
+ VERSION = "0.0.7"
3
3
  end
@@ -4,7 +4,7 @@ describe Proto::Scraper do
4
4
  it 'returns my objects!' do
5
5
  obj = Proto::Scraper.new('https://twitter.com/kcurtin')
6
6
  obj_collection = obj.fetch('Tweet', { :name => 'strong.fullname',
7
- :content => 'p.js-tweet-text',
7
+ :content => 'p.js-tweet-text',
8
8
  :created_at => 'small.time' }
9
9
  )
10
10
  obj_collection.first.class.to_s.should == 'Proto::Tweet'
@@ -35,15 +35,10 @@ describe Proto::Scraper do
35
35
  jobs.first.class.to_s.should == 'Proto::Type'
36
36
  jobs.first.title.should =~ /Ruby/
37
37
  end
38
- end
39
38
 
40
- # ruby_inside = Scraper.new('Ruby Inside', 'http://ruby.jobamatic.com/a/jbb/find-jobs/',
41
- # 'http://ruby.jobamatic.com', job_database)
42
- # ruby_inside.compile_job_url_collection('tr.listing td.title a')
43
- # ruby_inside.scrape_away({
44
- # title_text: 'h2.jam_headline',
45
- # # company_text: 'h3 a.jam_link',
46
- # location_text: 'div#c_address',
47
- # type_text: 'div#c_jobtype',
48
- # description_text: 'div#c_job_description'
49
- # })
39
+ it "should work with pagination" do
40
+ obj = Proto::Scraper.new('http://www.mediauk.com/radio/starting-with/a')
41
+ obj.collect_urls('http://www.mediauk.com', 'div.pages a', 'div.columns a')
42
+ obj.url_collection.length.should > 10
43
+ end
44
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: proto
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.0.7
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-12-05 00:00:00.000000000 Z
12
+ date: 2012-12-24 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec