proto 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/proto/scraper.rb +37 -7
- data/lib/proto/version.rb +1 -1
- data/proto.gemspec +1 -0
- data/spec/proto/scraper_spec.rb +31 -48
- data/spec/proto_spec.rb +1 -1
- data/test/test_proto.rb +12 -0
- metadata +21 -2
data/lib/proto/scraper.rb
CHANGED
@@ -1,25 +1,55 @@
|
|
1
1
|
module Proto
|
2
2
|
class Scraper
|
3
|
-
attr_accessor :doc
|
3
|
+
attr_accessor :url, :doc, :url_collection
|
4
4
|
|
5
5
|
def initialize(url)
|
6
|
+
@url = url.chomp '/'
|
6
7
|
@doc = Nokogiri::HTML(open(url))
|
7
8
|
end
|
8
9
|
|
10
|
+
def collect_urls(selector)
|
11
|
+
@url_collection = doc.css(selector).map do |link|
|
12
|
+
"#{url}#{link['href']}"
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
9
16
|
def fetch(name='Type', args)
|
10
|
-
|
11
|
-
|
12
|
-
|
17
|
+
if url_collection
|
18
|
+
attributes = visit_urls_and_fetch(args)
|
19
|
+
protos = create_return_objects(name, attributes)
|
20
|
+
return protos
|
21
|
+
else
|
22
|
+
attributes = scrape_attribute_data(args)
|
23
|
+
protos = create_return_objects(name, attributes)
|
24
|
+
return protos
|
25
|
+
end
|
13
26
|
end
|
14
27
|
alias_method :fetch_and_create!, :fetch
|
15
28
|
|
16
29
|
private
|
17
|
-
def scrape_attribute_data(attributes)
|
18
|
-
length_of_scrape = @doc.css(attributes.first[1]).count
|
19
30
|
|
31
|
+
def visit_urls_and_fetch(attributes)
|
32
|
+
hash_array = []
|
33
|
+
final_array = url_collection.map do |url|
|
34
|
+
page = Nokogiri::HTML(open(url))
|
35
|
+
attrs_hash = gather_data(page, attributes)
|
36
|
+
hash_array << attrs_hash
|
37
|
+
end
|
38
|
+
return hash_array
|
39
|
+
end
|
40
|
+
|
41
|
+
def gather_data(page, attributes)
|
42
|
+
job_hash = attributes.each_with_object({}) do |(key, selector), attrs|
|
43
|
+
attrs[key] = page.css(selector).text.strip
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
def scrape_attribute_data(document=self.doc, attributes)
|
48
|
+
length_of_scrape = document.css(attributes.first[1]).count
|
49
|
+
|
20
50
|
final_array = length_of_scrape.times.map do |index|
|
21
51
|
attributes.inject(Hash.new) do |hash, (attr_name, selector)|
|
22
|
-
hash.merge(attr_name =>
|
52
|
+
hash.merge(attr_name => document.css(selector)[index].text.strip) if document.css(selector)[index]
|
23
53
|
end
|
24
54
|
end
|
25
55
|
|
data/lib/proto/version.rb
CHANGED
data/proto.gemspec
CHANGED
data/spec/proto/scraper_spec.rb
CHANGED
@@ -1,18 +1,12 @@
|
|
1
1
|
require File.dirname(__FILE__) + '/../spec_helper'
|
2
2
|
|
3
3
|
describe Proto::Scraper do
|
4
|
-
before(:each) do
|
5
|
-
# Nokogiri::HTML.stub!(:open).and_return("doc")
|
6
|
-
# Nokogiri::HTML::Document.stub!(:parse)
|
7
|
-
# @scrape = Proto::Scraper.new('http://example.com')
|
8
|
-
# @scrape.stub_chain(:doc, :css, :each).and_return('STUBBED OUT')
|
9
|
-
end
|
10
|
-
|
11
4
|
it 'returns my objects!' do
|
12
5
|
obj = Proto::Scraper.new('https://twitter.com/kcurtin')
|
13
6
|
obj_collection = obj.fetch('Tweet', { :name => 'strong.fullname',
|
14
|
-
|
15
|
-
|
7
|
+
:content => 'p.js-tweet-text',
|
8
|
+
:created_at => 'small.time' }
|
9
|
+
)
|
16
10
|
obj_collection.first.class.to_s.should == 'Proto::Tweet'
|
17
11
|
obj_collection.first.name.should == 'Kevin Curtin'
|
18
12
|
end
|
@@ -22,45 +16,34 @@ describe Proto::Scraper do
|
|
22
16
|
Proto::Scraper.new('blah_url')
|
23
17
|
}.to raise_error(Errno::ENOENT)
|
24
18
|
end
|
25
|
-
# context ".fetch" do
|
26
|
-
# it "the default class name is 'Proto::Type'" do
|
27
|
-
# our_obj = @scrape.fetch({})
|
28
|
-
# our_obj.class.to_s.should == 'Proto::Type'
|
29
|
-
# end
|
30
|
-
|
31
|
-
# it "accepts only a hash and sets default class name" do
|
32
|
-
# our_obj = @scrape.fetch({:name => 'default const'})
|
33
|
-
# our_obj.class.to_s.should == 'Proto::Type'
|
34
|
-
# end
|
35
|
-
|
36
|
-
# it "returns a Proto object with attributes set" do
|
37
|
-
# our_obj = @scrape.fetch('Sample', {:name => "Kevin", :title => "Developer"})
|
38
|
-
# our_obj.name.should == "STUBBED OUT"
|
39
|
-
# our_obj.title.should == "STUBBED OUT"
|
40
|
-
# our_obj.class.to_s.should == "Proto::Sample"
|
41
|
-
# end
|
42
|
-
# end
|
43
19
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
# end
|
50
|
-
|
51
|
-
# it "accepts a hash and name and sets custom attrs" do
|
52
|
-
# our_obj = @scrape.send(:create_return_objects, 'Test', [{:name => 'Kevin'},{:title => "Title"}])
|
53
|
-
# our_obj.first.name.should == 'Kevin'
|
54
|
-
# our_obj.last.title.should == 'Title'
|
55
|
-
# our_obj.length.should == 2
|
56
|
-
# end
|
57
|
-
# end
|
20
|
+
it 'can collect a bunch of urls' do
|
21
|
+
obj = Proto::Scraper.new('http://jobs.rubynow.com/')
|
22
|
+
obj.collect_urls('ul.jobs li h2 a:first')
|
23
|
+
obj.url_collection.first.should =~ /http:\/\/jobs/
|
24
|
+
end
|
58
25
|
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
#
|
64
|
-
#
|
65
|
-
|
26
|
+
it "should create the objects this way too" do
|
27
|
+
obj = Proto::Scraper.new('http://jobs.rubynow.com/')
|
28
|
+
obj.collect_urls('ul.jobs li h2 a:first')
|
29
|
+
jobs = obj.fetch({ :title => 'h2#headline',
|
30
|
+
:company => 'h2#headline a',
|
31
|
+
:location => 'h3#location',
|
32
|
+
:type => 'strong:last',
|
33
|
+
:description => 'div#info' }
|
34
|
+
)
|
35
|
+
jobs.first.class.to_s.should == 'Proto::Type'
|
36
|
+
jobs.first.title.should =~ /Ruby/
|
37
|
+
end
|
66
38
|
end
|
39
|
+
|
40
|
+
# ruby_inside = Scraper.new('Ruby Inside', 'http://ruby.jobamatic.com/a/jbb/find-jobs/',
|
41
|
+
# 'http://ruby.jobamatic.com', job_database)
|
42
|
+
# ruby_inside.compile_job_url_collection('tr.listing td.title a')
|
43
|
+
# ruby_inside.scrape_away({
|
44
|
+
# title_text: 'h2.jam_headline',
|
45
|
+
# # company_text: 'h3 a.jam_link',
|
46
|
+
# location_text: 'div#c_address',
|
47
|
+
# type_text: 'div#c_jobtype',
|
48
|
+
# description_text: 'div#c_job_description'
|
49
|
+
# })
|
data/spec/proto_spec.rb
CHANGED
data/test/test_proto.rb
ADDED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: proto
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-11-
|
12
|
+
date: 2012-11-29 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
@@ -27,6 +27,22 @@ dependencies:
|
|
27
27
|
- - ! '>='
|
28
28
|
- !ruby/object:Gem::Version
|
29
29
|
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: minitest
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
30
46
|
- !ruby/object:Gem::Dependency
|
31
47
|
name: nokogiri
|
32
48
|
requirement: !ruby/object:Gem::Requirement
|
@@ -63,6 +79,7 @@ files:
|
|
63
79
|
- spec/proto_spec.rb
|
64
80
|
- spec/sample_pages/twitter.html
|
65
81
|
- spec/spec_helper.rb
|
82
|
+
- test/test_proto.rb
|
66
83
|
homepage: https://github.com/kcurtin/proto
|
67
84
|
licenses: []
|
68
85
|
post_install_message:
|
@@ -92,3 +109,5 @@ test_files:
|
|
92
109
|
- spec/proto_spec.rb
|
93
110
|
- spec/sample_pages/twitter.html
|
94
111
|
- spec/spec_helper.rb
|
112
|
+
- test/test_proto.rb
|
113
|
+
has_rdoc:
|