proto 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/proto/scraper.rb +37 -7
- data/lib/proto/version.rb +1 -1
- data/proto.gemspec +1 -0
- data/spec/proto/scraper_spec.rb +31 -48
- data/spec/proto_spec.rb +1 -1
- data/test/test_proto.rb +12 -0
- metadata +21 -2
data/lib/proto/scraper.rb
CHANGED
@@ -1,25 +1,55 @@
|
|
1
1
|
module Proto
|
2
2
|
class Scraper
|
3
|
-
attr_accessor :doc
|
3
|
+
attr_accessor :url, :doc, :url_collection
|
4
4
|
|
5
5
|
def initialize(url)
|
6
|
+
@url = url.chomp '/'
|
6
7
|
@doc = Nokogiri::HTML(open(url))
|
7
8
|
end
|
8
9
|
|
10
|
+
def collect_urls(selector)
|
11
|
+
@url_collection = doc.css(selector).map do |link|
|
12
|
+
"#{url}#{link['href']}"
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
9
16
|
def fetch(name='Type', args)
|
10
|
-
|
11
|
-
|
12
|
-
|
17
|
+
if url_collection
|
18
|
+
attributes = visit_urls_and_fetch(args)
|
19
|
+
protos = create_return_objects(name, attributes)
|
20
|
+
return protos
|
21
|
+
else
|
22
|
+
attributes = scrape_attribute_data(args)
|
23
|
+
protos = create_return_objects(name, attributes)
|
24
|
+
return protos
|
25
|
+
end
|
13
26
|
end
|
14
27
|
alias_method :fetch_and_create!, :fetch
|
15
28
|
|
16
29
|
private
|
17
|
-
def scrape_attribute_data(attributes)
|
18
|
-
length_of_scrape = @doc.css(attributes.first[1]).count
|
19
30
|
|
31
|
+
def visit_urls_and_fetch(attributes)
|
32
|
+
hash_array = []
|
33
|
+
final_array = url_collection.map do |url|
|
34
|
+
page = Nokogiri::HTML(open(url))
|
35
|
+
attrs_hash = gather_data(page, attributes)
|
36
|
+
hash_array << attrs_hash
|
37
|
+
end
|
38
|
+
return hash_array
|
39
|
+
end
|
40
|
+
|
41
|
+
def gather_data(page, attributes)
|
42
|
+
job_hash = attributes.each_with_object({}) do |(key, selector), attrs|
|
43
|
+
attrs[key] = page.css(selector).text.strip
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
def scrape_attribute_data(document=self.doc, attributes)
|
48
|
+
length_of_scrape = document.css(attributes.first[1]).count
|
49
|
+
|
20
50
|
final_array = length_of_scrape.times.map do |index|
|
21
51
|
attributes.inject(Hash.new) do |hash, (attr_name, selector)|
|
22
|
-
hash.merge(attr_name =>
|
52
|
+
hash.merge(attr_name => document.css(selector)[index].text.strip) if document.css(selector)[index]
|
23
53
|
end
|
24
54
|
end
|
25
55
|
|
data/lib/proto/version.rb
CHANGED
data/proto.gemspec
CHANGED
data/spec/proto/scraper_spec.rb
CHANGED
@@ -1,18 +1,12 @@
|
|
1
1
|
require File.dirname(__FILE__) + '/../spec_helper'
|
2
2
|
|
3
3
|
describe Proto::Scraper do
|
4
|
-
before(:each) do
|
5
|
-
# Nokogiri::HTML.stub!(:open).and_return("doc")
|
6
|
-
# Nokogiri::HTML::Document.stub!(:parse)
|
7
|
-
# @scrape = Proto::Scraper.new('http://example.com')
|
8
|
-
# @scrape.stub_chain(:doc, :css, :each).and_return('STUBBED OUT')
|
9
|
-
end
|
10
|
-
|
11
4
|
it 'returns my objects!' do
|
12
5
|
obj = Proto::Scraper.new('https://twitter.com/kcurtin')
|
13
6
|
obj_collection = obj.fetch('Tweet', { :name => 'strong.fullname',
|
14
|
-
|
15
|
-
|
7
|
+
:content => 'p.js-tweet-text',
|
8
|
+
:created_at => 'small.time' }
|
9
|
+
)
|
16
10
|
obj_collection.first.class.to_s.should == 'Proto::Tweet'
|
17
11
|
obj_collection.first.name.should == 'Kevin Curtin'
|
18
12
|
end
|
@@ -22,45 +16,34 @@ describe Proto::Scraper do
|
|
22
16
|
Proto::Scraper.new('blah_url')
|
23
17
|
}.to raise_error(Errno::ENOENT)
|
24
18
|
end
|
25
|
-
# context ".fetch" do
|
26
|
-
# it "the default class name is 'Proto::Type'" do
|
27
|
-
# our_obj = @scrape.fetch({})
|
28
|
-
# our_obj.class.to_s.should == 'Proto::Type'
|
29
|
-
# end
|
30
|
-
|
31
|
-
# it "accepts only a hash and sets default class name" do
|
32
|
-
# our_obj = @scrape.fetch({:name => 'default const'})
|
33
|
-
# our_obj.class.to_s.should == 'Proto::Type'
|
34
|
-
# end
|
35
|
-
|
36
|
-
# it "returns a Proto object with attributes set" do
|
37
|
-
# our_obj = @scrape.fetch('Sample', {:name => "Kevin", :title => "Developer"})
|
38
|
-
# our_obj.name.should == "STUBBED OUT"
|
39
|
-
# our_obj.title.should == "STUBBED OUT"
|
40
|
-
# our_obj.class.to_s.should == "Proto::Sample"
|
41
|
-
# end
|
42
|
-
# end
|
43
19
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
# end
|
50
|
-
|
51
|
-
# it "accepts a hash and name and sets custom attrs" do
|
52
|
-
# our_obj = @scrape.send(:create_return_objects, 'Test', [{:name => 'Kevin'},{:title => "Title"}])
|
53
|
-
# our_obj.first.name.should == 'Kevin'
|
54
|
-
# our_obj.last.title.should == 'Title'
|
55
|
-
# our_obj.length.should == 2
|
56
|
-
# end
|
57
|
-
# end
|
20
|
+
it 'can collect a bunch of urls' do
|
21
|
+
obj = Proto::Scraper.new('http://jobs.rubynow.com/')
|
22
|
+
obj.collect_urls('ul.jobs li h2 a:first')
|
23
|
+
obj.url_collection.first.should =~ /http:\/\/jobs/
|
24
|
+
end
|
58
25
|
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
#
|
64
|
-
#
|
65
|
-
|
26
|
+
it "should create the objects this way too" do
|
27
|
+
obj = Proto::Scraper.new('http://jobs.rubynow.com/')
|
28
|
+
obj.collect_urls('ul.jobs li h2 a:first')
|
29
|
+
jobs = obj.fetch({ :title => 'h2#headline',
|
30
|
+
:company => 'h2#headline a',
|
31
|
+
:location => 'h3#location',
|
32
|
+
:type => 'strong:last',
|
33
|
+
:description => 'div#info' }
|
34
|
+
)
|
35
|
+
jobs.first.class.to_s.should == 'Proto::Type'
|
36
|
+
jobs.first.title.should =~ /Ruby/
|
37
|
+
end
|
66
38
|
end
|
39
|
+
|
40
|
+
# ruby_inside = Scraper.new('Ruby Inside', 'http://ruby.jobamatic.com/a/jbb/find-jobs/',
|
41
|
+
# 'http://ruby.jobamatic.com', job_database)
|
42
|
+
# ruby_inside.compile_job_url_collection('tr.listing td.title a')
|
43
|
+
# ruby_inside.scrape_away({
|
44
|
+
# title_text: 'h2.jam_headline',
|
45
|
+
# # company_text: 'h3 a.jam_link',
|
46
|
+
# location_text: 'div#c_address',
|
47
|
+
# type_text: 'div#c_jobtype',
|
48
|
+
# description_text: 'div#c_job_description'
|
49
|
+
# })
|
data/spec/proto_spec.rb
CHANGED
data/test/test_proto.rb
ADDED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: proto
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-11-
|
12
|
+
date: 2012-11-29 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
@@ -27,6 +27,22 @@ dependencies:
|
|
27
27
|
- - ! '>='
|
28
28
|
- !ruby/object:Gem::Version
|
29
29
|
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: minitest
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
30
46
|
- !ruby/object:Gem::Dependency
|
31
47
|
name: nokogiri
|
32
48
|
requirement: !ruby/object:Gem::Requirement
|
@@ -63,6 +79,7 @@ files:
|
|
63
79
|
- spec/proto_spec.rb
|
64
80
|
- spec/sample_pages/twitter.html
|
65
81
|
- spec/spec_helper.rb
|
82
|
+
- test/test_proto.rb
|
66
83
|
homepage: https://github.com/kcurtin/proto
|
67
84
|
licenses: []
|
68
85
|
post_install_message:
|
@@ -92,3 +109,5 @@ test_files:
|
|
92
109
|
- spec/proto_spec.rb
|
93
110
|
- spec/sample_pages/twitter.html
|
94
111
|
- spec/spec_helper.rb
|
112
|
+
- test/test_proto.rb
|
113
|
+
has_rdoc:
|