proto 0.0.6 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/proto/scraper.rb +25 -12
- data/lib/proto/version.rb +1 -1
- data/spec/proto/scraper_spec.rb +7 -12
- metadata +2 -2
data/lib/proto/scraper.rb
CHANGED
@@ -1,37 +1,50 @@
|
|
1
1
|
module Proto
|
2
2
|
class Scraper
|
3
|
-
attr_accessor :url, :doc, :url_collection
|
3
|
+
attr_accessor :url, :doc, :url_collection, :traverse, :page_count
|
4
4
|
|
5
5
|
def initialize(url)
|
6
6
|
@url = url.chomp '/' #remove trailing slash
|
7
7
|
@doc = Nokogiri::HTML(open(url))
|
8
|
+
@page_count = 1
|
9
|
+
@url_collection = []
|
8
10
|
end
|
9
11
|
|
10
|
-
def collect_urls(base_url=self.url,
|
11
|
-
|
12
|
-
|
12
|
+
def collect_urls(base_url=self.url, pagination_selector=nil, url_selector)
|
13
|
+
number_of_pages = doc.css(pagination_selector).map.count if pagination_selector
|
14
|
+
|
15
|
+
page_urls = doc.css(url_selector).map { |link| "#{base_url}#{link['href']}" }
|
16
|
+
|
17
|
+
if pagination_selector && (@page_count < number_of_pages)
|
18
|
+
next_url = base_url << doc.css(pagination_selector)[@page_count]['href']
|
19
|
+
self.doc = Nokogiri::HTML(open(next_url))
|
20
|
+
@page_count += 1
|
21
|
+
url_collection << page_urls
|
22
|
+
collect_urls(base_url, pagination_selector, url_selector)
|
23
|
+
else
|
24
|
+
url_collection << page_urls
|
25
|
+
url_collection.flatten!
|
13
26
|
end
|
14
27
|
end
|
15
28
|
|
16
29
|
def fetch(name='Type', args)
|
17
|
-
if url_collection
|
18
|
-
attributes = scrape_multiple_pages(args)
|
19
|
-
else
|
30
|
+
if url_collection.empty?
|
20
31
|
attributes = scrape_single_page(args)
|
32
|
+
else
|
33
|
+
attributes = scrape_multiple_pages(args)
|
21
34
|
end
|
22
35
|
protos = create_return_objects(name, attributes)
|
23
|
-
|
36
|
+
protos
|
24
37
|
end
|
25
38
|
alias_method :fetch_and_create!, :fetch
|
26
39
|
|
27
|
-
|
40
|
+
private
|
28
41
|
|
29
42
|
def scrape_multiple_pages(attributes)
|
30
43
|
url_collection.map do |url|
|
31
44
|
gather_data(url, attributes)
|
32
45
|
end
|
33
46
|
end
|
34
|
-
|
47
|
+
|
35
48
|
def gather_data(url, attributes)
|
36
49
|
page = Nokogiri::HTML(open(url))
|
37
50
|
attributes.each_with_object({}) do |(key, selector), attrs|
|
@@ -41,7 +54,7 @@ module Proto
|
|
41
54
|
|
42
55
|
def scrape_single_page(attributes)
|
43
56
|
length_of_scrape = doc.css(attributes.first[1]).count
|
44
|
-
|
57
|
+
|
45
58
|
length_of_scrape.times.map do |index|
|
46
59
|
attributes.inject({}) do |hash, (attr_name, selector)|
|
47
60
|
hash.merge(attr_name => doc.css(selector)[index].text.strip) if doc.css(selector)[index]
|
@@ -55,4 +68,4 @@ module Proto
|
|
55
68
|
attributes.map { |hash| Proto.const_get(name).new(hash) }
|
56
69
|
end
|
57
70
|
end
|
58
|
-
end
|
71
|
+
end
|
data/lib/proto/version.rb
CHANGED
data/spec/proto/scraper_spec.rb
CHANGED
@@ -4,7 +4,7 @@ describe Proto::Scraper do
|
|
4
4
|
it 'returns my objects!' do
|
5
5
|
obj = Proto::Scraper.new('https://twitter.com/kcurtin')
|
6
6
|
obj_collection = obj.fetch('Tweet', { :name => 'strong.fullname',
|
7
|
-
:content => 'p.js-tweet-text',
|
7
|
+
:content => 'p.js-tweet-text',
|
8
8
|
:created_at => 'small.time' }
|
9
9
|
)
|
10
10
|
obj_collection.first.class.to_s.should == 'Proto::Tweet'
|
@@ -35,15 +35,10 @@ describe Proto::Scraper do
|
|
35
35
|
jobs.first.class.to_s.should == 'Proto::Type'
|
36
36
|
jobs.first.title.should =~ /Ruby/
|
37
37
|
end
|
38
|
-
end
|
39
38
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
# location_text: 'div#c_address',
|
47
|
-
# type_text: 'div#c_jobtype',
|
48
|
-
# description_text: 'div#c_job_description'
|
49
|
-
# })
|
39
|
+
it "should work with pagination" do
|
40
|
+
obj = Proto::Scraper.new('http://www.mediauk.com/radio/starting-with/a')
|
41
|
+
obj.collect_urls('http://www.mediauk.com', 'div.pages a', 'div.columns a')
|
42
|
+
obj.url_collection.length.should > 10
|
43
|
+
end
|
44
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: proto
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.6
|
4
|
+
version: 0.0.7
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-12-
|
12
|
+
date: 2012-12-24 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|