proto 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/proto/scraper.rb +25 -12
- data/lib/proto/version.rb +1 -1
- data/spec/proto/scraper_spec.rb +7 -12
- metadata +2 -2
data/lib/proto/scraper.rb
CHANGED
@@ -1,37 +1,50 @@
|
|
1
1
|
module Proto
|
2
2
|
class Scraper
|
3
|
-
attr_accessor :url, :doc, :url_collection
|
3
|
+
attr_accessor :url, :doc, :url_collection, :traverse, :page_count
|
4
4
|
|
5
5
|
def initialize(url)
|
6
6
|
@url = url.chomp '/' #remove trailing slash
|
7
7
|
@doc = Nokogiri::HTML(open(url))
|
8
|
+
@page_count = 1
|
9
|
+
@url_collection = []
|
8
10
|
end
|
9
11
|
|
10
|
-
def collect_urls(base_url=self.url,
|
11
|
-
|
12
|
-
|
12
|
+
def collect_urls(base_url=self.url, pagination_selector=nil, url_selector)
|
13
|
+
number_of_pages = doc.css(pagination_selector).map.count if pagination_selector
|
14
|
+
|
15
|
+
page_urls = doc.css(url_selector).map { |link| "#{base_url}#{link['href']}" }
|
16
|
+
|
17
|
+
if pagination_selector && (@page_count < number_of_pages)
|
18
|
+
next_url = base_url << doc.css(pagination_selector)[@page_count]['href']
|
19
|
+
self.doc = Nokogiri::HTML(open(next_url))
|
20
|
+
@page_count += 1
|
21
|
+
url_collection << page_urls
|
22
|
+
collect_urls(base_url, pagination_selector, url_selector)
|
23
|
+
else
|
24
|
+
url_collection << page_urls
|
25
|
+
url_collection.flatten!
|
13
26
|
end
|
14
27
|
end
|
15
28
|
|
16
29
|
def fetch(name='Type', args)
|
17
|
-
if url_collection
|
18
|
-
attributes = scrape_multiple_pages(args)
|
19
|
-
else
|
30
|
+
if url_collection.empty?
|
20
31
|
attributes = scrape_single_page(args)
|
32
|
+
else
|
33
|
+
attributes = scrape_multiple_pages(args)
|
21
34
|
end
|
22
35
|
protos = create_return_objects(name, attributes)
|
23
|
-
|
36
|
+
protos
|
24
37
|
end
|
25
38
|
alias_method :fetch_and_create!, :fetch
|
26
39
|
|
27
|
-
|
40
|
+
private
|
28
41
|
|
29
42
|
def scrape_multiple_pages(attributes)
|
30
43
|
url_collection.map do |url|
|
31
44
|
gather_data(url, attributes)
|
32
45
|
end
|
33
46
|
end
|
34
|
-
|
47
|
+
|
35
48
|
def gather_data(url, attributes)
|
36
49
|
page = Nokogiri::HTML(open(url))
|
37
50
|
attributes.each_with_object({}) do |(key, selector), attrs|
|
@@ -41,7 +54,7 @@ module Proto
|
|
41
54
|
|
42
55
|
def scrape_single_page(attributes)
|
43
56
|
length_of_scrape = doc.css(attributes.first[1]).count
|
44
|
-
|
57
|
+
|
45
58
|
length_of_scrape.times.map do |index|
|
46
59
|
attributes.inject({}) do |hash, (attr_name, selector)|
|
47
60
|
hash.merge(attr_name => doc.css(selector)[index].text.strip) if doc.css(selector)[index]
|
@@ -55,4 +68,4 @@ module Proto
|
|
55
68
|
attributes.map { |hash| Proto.const_get(name).new(hash) }
|
56
69
|
end
|
57
70
|
end
|
58
|
-
end
|
71
|
+
end
|
data/lib/proto/version.rb
CHANGED
data/spec/proto/scraper_spec.rb
CHANGED
@@ -4,7 +4,7 @@ describe Proto::Scraper do
|
|
4
4
|
it 'returns my objects!' do
|
5
5
|
obj = Proto::Scraper.new('https://twitter.com/kcurtin')
|
6
6
|
obj_collection = obj.fetch('Tweet', { :name => 'strong.fullname',
|
7
|
-
:content => 'p.js-tweet-text',
|
7
|
+
:content => 'p.js-tweet-text',
|
8
8
|
:created_at => 'small.time' }
|
9
9
|
)
|
10
10
|
obj_collection.first.class.to_s.should == 'Proto::Tweet'
|
@@ -35,15 +35,10 @@ describe Proto::Scraper do
|
|
35
35
|
jobs.first.class.to_s.should == 'Proto::Type'
|
36
36
|
jobs.first.title.should =~ /Ruby/
|
37
37
|
end
|
38
|
-
end
|
39
38
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
# location_text: 'div#c_address',
|
47
|
-
# type_text: 'div#c_jobtype',
|
48
|
-
# description_text: 'div#c_job_description'
|
49
|
-
# })
|
39
|
+
it "should work with pagination" do
|
40
|
+
obj = Proto::Scraper.new('http://www.mediauk.com/radio/starting-with/a')
|
41
|
+
obj.collect_urls('http://www.mediauk.com', 'div.pages a', 'div.columns a')
|
42
|
+
obj.url_collection.length.should > 10
|
43
|
+
end
|
44
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: proto
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.6
|
4
|
+
version: 0.0.7
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-12-
|
12
|
+
date: 2012-12-24 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|