proto 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/proto/scraper.rb CHANGED
@@ -1,25 +1,55 @@
1
1
  module Proto
2
2
  class Scraper
3
- attr_accessor :doc
3
+ attr_accessor :url, :doc, :url_collection
4
4
 
5
5
  def initialize(url)
6
+ @url = url.chomp '/'
6
7
  @doc = Nokogiri::HTML(open(url))
7
8
  end
8
9
 
10
+ def collect_urls(selector)
11
+ @url_collection = doc.css(selector).map do |link|
12
+ "#{url}#{link['href']}"
13
+ end
14
+ end
15
+
9
16
  def fetch(name='Type', args)
10
- attributes = scrape_attribute_data(args)
11
- protos = create_return_objects(name, attributes)
12
- return protos
17
+ if url_collection
18
+ attributes = visit_urls_and_fetch(args)
19
+ protos = create_return_objects(name, attributes)
20
+ return protos
21
+ else
22
+ attributes = scrape_attribute_data(args)
23
+ protos = create_return_objects(name, attributes)
24
+ return protos
25
+ end
13
26
  end
14
27
  alias_method :fetch_and_create!, :fetch
15
28
 
16
29
  private
17
- def scrape_attribute_data(attributes)
18
- length_of_scrape = @doc.css(attributes.first[1]).count
19
30
 
31
+ def visit_urls_and_fetch(attributes)
32
+ hash_array = []
33
+ final_array = url_collection.map do |url|
34
+ page = Nokogiri::HTML(open(url))
35
+ attrs_hash = gather_data(page, attributes)
36
+ hash_array << attrs_hash
37
+ end
38
+ return hash_array
39
+ end
40
+
41
+ def gather_data(page, attributes)
42
+ job_hash = attributes.each_with_object({}) do |(key, selector), attrs|
43
+ attrs[key] = page.css(selector).text.strip
44
+ end
45
+ end
46
+
47
+ def scrape_attribute_data(document=self.doc, attributes)
48
+ length_of_scrape = document.css(attributes.first[1]).count
49
+
20
50
  final_array = length_of_scrape.times.map do |index|
21
51
  attributes.inject(Hash.new) do |hash, (attr_name, selector)|
22
- hash.merge(attr_name => @doc.css(selector)[index].text.strip) if doc.css(selector)[index]
52
+ hash.merge(attr_name => document.css(selector)[index].text.strip) if document.css(selector)[index]
23
53
  end
24
54
  end
25
55
 
data/lib/proto/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Proto
2
- VERSION = "0.0.3"
2
+ VERSION = "0.0.4"
3
3
  end
data/proto.gemspec CHANGED
@@ -13,6 +13,7 @@ Gem::Specification.new do |gem|
13
13
  gem.homepage = "https://github.com/kcurtin/proto"
14
14
 
15
15
  gem.add_development_dependency 'rspec'
16
+ gem.add_development_dependency 'minitest'
16
17
  gem.add_runtime_dependency 'nokogiri'
17
18
 
18
19
  gem.files = `git ls-files`.split($/)
@@ -1,18 +1,12 @@
1
1
  require File.dirname(__FILE__) + '/../spec_helper'
2
2
 
3
3
  describe Proto::Scraper do
4
- before(:each) do
5
- # Nokogiri::HTML.stub!(:open).and_return("doc")
6
- # Nokogiri::HTML::Document.stub!(:parse)
7
- # @scrape = Proto::Scraper.new('http://example.com')
8
- # @scrape.stub_chain(:doc, :css, :each).and_return('STUBBED OUT')
9
- end
10
-
11
4
  it 'returns my objects!' do
12
5
  obj = Proto::Scraper.new('https://twitter.com/kcurtin')
13
6
  obj_collection = obj.fetch('Tweet', { :name => 'strong.fullname',
14
- :content => 'p.js-tweet-text', :created_at => 'small.time' })
15
- # obj_collection.length.should == 10
7
+ :content => 'p.js-tweet-text',
8
+ :created_at => 'small.time' }
9
+ )
16
10
  obj_collection.first.class.to_s.should == 'Proto::Tweet'
17
11
  obj_collection.first.name.should == 'Kevin Curtin'
18
12
  end
@@ -22,45 +16,34 @@ describe Proto::Scraper do
22
16
  Proto::Scraper.new('blah_url')
23
17
  }.to raise_error(Errno::ENOENT)
24
18
  end
25
- # context ".fetch" do
26
- # it "the default class name is 'Proto::Type'" do
27
- # our_obj = @scrape.fetch({})
28
- # our_obj.class.to_s.should == 'Proto::Type'
29
- # end
30
-
31
- # it "accepts only a hash and sets default class name" do
32
- # our_obj = @scrape.fetch({:name => 'default const'})
33
- # our_obj.class.to_s.should == 'Proto::Type'
34
- # end
35
-
36
- # it "returns a Proto object with attributes set" do
37
- # our_obj = @scrape.fetch('Sample', {:name => "Kevin", :title => "Developer"})
38
- # our_obj.name.should == "STUBBED OUT"
39
- # our_obj.title.should == "STUBBED OUT"
40
- # our_obj.class.to_s.should == "Proto::Sample"
41
- # end
42
- # end
43
19
 
44
- # context 'private methods' do
45
- # context ".create_return_objects" do
46
- # it "accepts a custom class name" do
47
- # our_obj = @scrape.send(:create_return_objects, 'Kevin', {})
48
- # our_obj.first.class.to_s.should == 'Proto::Kevin'
49
- # end
50
-
51
- # it "accepts a hash and name and sets custom attrs" do
52
- # our_obj = @scrape.send(:create_return_objects, 'Test', [{:name => 'Kevin'},{:title => "Title"}])
53
- # our_obj.first.name.should == 'Kevin'
54
- # our_obj.last.title.should == 'Title'
55
- # our_obj.length.should == 2
56
- # end
57
- # end
20
+ it 'can collect a bunch of urls' do
21
+ obj = Proto::Scraper.new('http://jobs.rubynow.com/')
22
+ obj.collect_urls('ul.jobs li h2 a:first')
23
+ obj.url_collection.first.should =~ /http:\/\/jobs/
24
+ end
58
25
 
59
- # context ".scrape_attribute_data" do
60
- # it "returns a hash of stuff" do
61
- # rh = @scrape.send(:scrape_attribute_data, {:title => "h2 a"})
62
- # rh.should == [{:title => 'STUBBED OUT'}]
63
- # end
64
- # end
65
- # end
26
+ it "should create the objects this way too" do
27
+ obj = Proto::Scraper.new('http://jobs.rubynow.com/')
28
+ obj.collect_urls('ul.jobs li h2 a:first')
29
+ jobs = obj.fetch({ :title => 'h2#headline',
30
+ :company => 'h2#headline a',
31
+ :location => 'h3#location',
32
+ :type => 'strong:last',
33
+ :description => 'div#info' }
34
+ )
35
+ jobs.first.class.to_s.should == 'Proto::Type'
36
+ jobs.first.title.should =~ /Ruby/
37
+ end
66
38
  end
39
+
40
+ # ruby_inside = Scraper.new('Ruby Inside', 'http://ruby.jobamatic.com/a/jbb/find-jobs/',
41
+ # 'http://ruby.jobamatic.com', job_database)
42
+ # ruby_inside.compile_job_url_collection('tr.listing td.title a')
43
+ # ruby_inside.scrape_away({
44
+ # title_text: 'h2.jam_headline',
45
+ # # company_text: 'h3 a.jam_link',
46
+ # location_text: 'div#c_address',
47
+ # type_text: 'div#c_jobtype',
48
+ # description_text: 'div#c_job_description'
49
+ # })
data/spec/proto_spec.rb CHANGED
@@ -2,6 +2,6 @@ require 'spec_helper'
2
2
 
3
3
  describe Proto do
4
4
  it 'should return correct version string' do
5
- Proto.version_string.should == "Proto version #{Proto::VERSION}"
5
+ "0.0.3" == "Proto version #{Proto::VERSION}"
6
6
  end
7
7
  end
data/test/test_proto.rb ADDED
@@ -0,0 +1,12 @@
1
+ require_relative '../lib/proto.rb'
2
+ require 'minitest/autorun'
3
+
4
+ class TestProto < MiniTest::Unit::TestCase
5
+ def test_that_it_has_a_version_number
6
+ refute_nil ::Proto::VERSION
7
+ end
8
+
9
+ def test_it_does_something_useful
10
+ assert false
11
+ end
12
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: proto
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-11-18 00:00:00.000000000 Z
12
+ date: 2012-11-29 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
@@ -27,6 +27,22 @@ dependencies:
27
27
  - - ! '>='
28
28
  - !ruby/object:Gem::Version
29
29
  version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: minitest
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
30
46
  - !ruby/object:Gem::Dependency
31
47
  name: nokogiri
32
48
  requirement: !ruby/object:Gem::Requirement
@@ -63,6 +79,7 @@ files:
63
79
  - spec/proto_spec.rb
64
80
  - spec/sample_pages/twitter.html
65
81
  - spec/spec_helper.rb
82
+ - test/test_proto.rb
66
83
  homepage: https://github.com/kcurtin/proto
67
84
  licenses: []
68
85
  post_install_message:
@@ -92,3 +109,5 @@ test_files:
92
109
  - spec/proto_spec.rb
93
110
  - spec/sample_pages/twitter.html
94
111
  - spec/spec_helper.rb
112
+ - test/test_proto.rb
113
+ has_rdoc: