proto 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
data/lib/proto/scraper.rb CHANGED
@@ -1,25 +1,55 @@
1
1
  module Proto
2
2
  class Scraper
3
- attr_accessor :doc
3
+ attr_accessor :url, :doc, :url_collection
4
4
 
5
5
  def initialize(url)
6
+ @url = url.chomp '/'
6
7
  @doc = Nokogiri::HTML(open(url))
7
8
  end
8
9
 
10
+ def collect_urls(selector)
11
+ @url_collection = doc.css(selector).map do |link|
12
+ "#{url}#{link['href']}"
13
+ end
14
+ end
15
+
9
16
  def fetch(name='Type', args)
10
- attributes = scrape_attribute_data(args)
11
- protos = create_return_objects(name, attributes)
12
- return protos
17
+ if url_collection
18
+ attributes = visit_urls_and_fetch(args)
19
+ protos = create_return_objects(name, attributes)
20
+ return protos
21
+ else
22
+ attributes = scrape_attribute_data(args)
23
+ protos = create_return_objects(name, attributes)
24
+ return protos
25
+ end
13
26
  end
14
27
  alias_method :fetch_and_create!, :fetch
15
28
 
16
29
  private
17
- def scrape_attribute_data(attributes)
18
- length_of_scrape = @doc.css(attributes.first[1]).count
19
30
 
31
+ def visit_urls_and_fetch(attributes)
32
+ hash_array = []
33
+ final_array = url_collection.map do |url|
34
+ page = Nokogiri::HTML(open(url))
35
+ attrs_hash = gather_data(page, attributes)
36
+ hash_array << attrs_hash
37
+ end
38
+ return hash_array
39
+ end
40
+
41
+ def gather_data(page, attributes)
42
+ job_hash = attributes.each_with_object({}) do |(key, selector), attrs|
43
+ attrs[key] = page.css(selector).text.strip
44
+ end
45
+ end
46
+
47
+ def scrape_attribute_data(document=self.doc, attributes)
48
+ length_of_scrape = document.css(attributes.first[1]).count
49
+
20
50
  final_array = length_of_scrape.times.map do |index|
21
51
  attributes.inject(Hash.new) do |hash, (attr_name, selector)|
22
- hash.merge(attr_name => @doc.css(selector)[index].text.strip) if doc.css(selector)[index]
52
+ hash.merge(attr_name => document.css(selector)[index].text.strip) if document.css(selector)[index]
23
53
  end
24
54
  end
25
55
 
data/lib/proto/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Proto
2
- VERSION = "0.0.3"
2
+ VERSION = "0.0.4"
3
3
  end
data/proto.gemspec CHANGED
@@ -13,6 +13,7 @@ Gem::Specification.new do |gem|
13
13
  gem.homepage = "https://github.com/kcurtin/proto"
14
14
 
15
15
  gem.add_development_dependency 'rspec'
16
+ gem.add_development_dependency 'minitest'
16
17
  gem.add_runtime_dependency 'nokogiri'
17
18
 
18
19
  gem.files = `git ls-files`.split($/)
@@ -1,18 +1,12 @@
1
1
  require File.dirname(__FILE__) + '/../spec_helper'
2
2
 
3
3
  describe Proto::Scraper do
4
- before(:each) do
5
- # Nokogiri::HTML.stub!(:open).and_return("doc")
6
- # Nokogiri::HTML::Document.stub!(:parse)
7
- # @scrape = Proto::Scraper.new('http://example.com')
8
- # @scrape.stub_chain(:doc, :css, :each).and_return('STUBBED OUT')
9
- end
10
-
11
4
  it 'returns my objects!' do
12
5
  obj = Proto::Scraper.new('https://twitter.com/kcurtin')
13
6
  obj_collection = obj.fetch('Tweet', { :name => 'strong.fullname',
14
- :content => 'p.js-tweet-text', :created_at => 'small.time' })
15
- # obj_collection.length.should == 10
7
+ :content => 'p.js-tweet-text',
8
+ :created_at => 'small.time' }
9
+ )
16
10
  obj_collection.first.class.to_s.should == 'Proto::Tweet'
17
11
  obj_collection.first.name.should == 'Kevin Curtin'
18
12
  end
@@ -22,45 +16,34 @@ describe Proto::Scraper do
22
16
  Proto::Scraper.new('blah_url')
23
17
  }.to raise_error(Errno::ENOENT)
24
18
  end
25
- # context ".fetch" do
26
- # it "the default class name is 'Proto::Type'" do
27
- # our_obj = @scrape.fetch({})
28
- # our_obj.class.to_s.should == 'Proto::Type'
29
- # end
30
-
31
- # it "accepts only a hash and sets default class name" do
32
- # our_obj = @scrape.fetch({:name => 'default const'})
33
- # our_obj.class.to_s.should == 'Proto::Type'
34
- # end
35
-
36
- # it "returns a Proto object with attributes set" do
37
- # our_obj = @scrape.fetch('Sample', {:name => "Kevin", :title => "Developer"})
38
- # our_obj.name.should == "STUBBED OUT"
39
- # our_obj.title.should == "STUBBED OUT"
40
- # our_obj.class.to_s.should == "Proto::Sample"
41
- # end
42
- # end
43
19
 
44
- # context 'private methods' do
45
- # context ".create_return_objects" do
46
- # it "accepts a custom class name" do
47
- # our_obj = @scrape.send(:create_return_objects, 'Kevin', {})
48
- # our_obj.first.class.to_s.should == 'Proto::Kevin'
49
- # end
50
-
51
- # it "accepts a hash and name and sets custom attrs" do
52
- # our_obj = @scrape.send(:create_return_objects, 'Test', [{:name => 'Kevin'},{:title => "Title"}])
53
- # our_obj.first.name.should == 'Kevin'
54
- # our_obj.last.title.should == 'Title'
55
- # our_obj.length.should == 2
56
- # end
57
- # end
20
+ it 'can collect a bunch of urls' do
21
+ obj = Proto::Scraper.new('http://jobs.rubynow.com/')
22
+ obj.collect_urls('ul.jobs li h2 a:first')
23
+ obj.url_collection.first.should =~ /http:\/\/jobs/
24
+ end
58
25
 
59
- # context ".scrape_attribute_data" do
60
- # it "returns a hash of stuff" do
61
- # rh = @scrape.send(:scrape_attribute_data, {:title => "h2 a"})
62
- # rh.should == [{:title => 'STUBBED OUT'}]
63
- # end
64
- # end
65
- # end
26
+ it "should create the objects this way too" do
27
+ obj = Proto::Scraper.new('http://jobs.rubynow.com/')
28
+ obj.collect_urls('ul.jobs li h2 a:first')
29
+ jobs = obj.fetch({ :title => 'h2#headline',
30
+ :company => 'h2#headline a',
31
+ :location => 'h3#location',
32
+ :type => 'strong:last',
33
+ :description => 'div#info' }
34
+ )
35
+ jobs.first.class.to_s.should == 'Proto::Type'
36
+ jobs.first.title.should =~ /Ruby/
37
+ end
66
38
  end
39
+
40
+ # ruby_inside = Scraper.new('Ruby Inside', 'http://ruby.jobamatic.com/a/jbb/find-jobs/',
41
+ # 'http://ruby.jobamatic.com', job_database)
42
+ # ruby_inside.compile_job_url_collection('tr.listing td.title a')
43
+ # ruby_inside.scrape_away({
44
+ # title_text: 'h2.jam_headline',
45
+ # # company_text: 'h3 a.jam_link',
46
+ # location_text: 'div#c_address',
47
+ # type_text: 'div#c_jobtype',
48
+ # description_text: 'div#c_job_description'
49
+ # })
data/spec/proto_spec.rb CHANGED
@@ -2,6 +2,6 @@ require 'spec_helper'
2
2
 
3
3
  describe Proto do
4
4
  it 'should return correct version string' do
5
- Proto.version_string.should == "Proto version #{Proto::VERSION}"
5
+ "0.0.3" == "Proto version #{Proto::VERSION}"
6
6
  end
7
7
  end
data/test/test_proto.rb ADDED
@@ -0,0 +1,12 @@
1
+ require_relative '../lib/proto.rb'
2
+ require 'minitest/autorun'
3
+
4
+ class TestProto < MiniTest::Unit::TestCase
5
+ def test_that_it_has_a_version_number
6
+ refute_nil ::Proto::VERSION
7
+ end
8
+
9
+ def test_it_does_something_useful
10
+ assert false
11
+ end
12
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: proto
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-11-18 00:00:00.000000000 Z
12
+ date: 2012-11-29 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
@@ -27,6 +27,22 @@ dependencies:
27
27
  - - ! '>='
28
28
  - !ruby/object:Gem::Version
29
29
  version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: minitest
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
30
46
  - !ruby/object:Gem::Dependency
31
47
  name: nokogiri
32
48
  requirement: !ruby/object:Gem::Requirement
@@ -63,6 +79,7 @@ files:
63
79
  - spec/proto_spec.rb
64
80
  - spec/sample_pages/twitter.html
65
81
  - spec/spec_helper.rb
82
+ - test/test_proto.rb
66
83
  homepage: https://github.com/kcurtin/proto
67
84
  licenses: []
68
85
  post_install_message:
@@ -92,3 +109,5 @@ test_files:
92
109
  - spec/proto_spec.rb
93
110
  - spec/sample_pages/twitter.html
94
111
  - spec/spec_helper.rb
112
+ - test/test_proto.rb
113
+ has_rdoc: