proto 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +43 -0
- data/Rakefile +4 -0
- data/lib/proto/scraper.rb +41 -0
- data/lib/proto/version.rb +3 -0
- data/lib/proto.rb +8 -0
- data/proto.gemspec +23 -0
- data/spec/proto/scraper_spec.rb +66 -0
- data/spec/proto_spec.rb +7 -0
- data/spec/sample_pages/twitter.html +4563 -0
- data/spec/spec_helper.rb +17 -0
- metadata +94 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 Kevin Curtin
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
# Proto
|
2
|
+
|
3
|
+
Proto lets you create highly malleable, disposable value objects from the contents of a web page. Create a `Proto::Scraper` with a URL, then call `fetch_and_create!` with the name of the class you want back and a hash that maps each attribute name to the CSS selector to scrape it from. The objects you get back are instances of an OpenStruct subclass defined under the `Proto` namespace, so they are very flexible.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'proto'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install proto
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
```ruby
|
22
|
+
|
23
|
+
proto = Proto::Scraper.new('http://twitter.com/kcurtin')
|
24
|
+
|
25
|
+
@tweets = proto.fetch_and_create!('Tweet', {:name => 'strong.fullname',
|
26
|
+
:content => 'p.js-tweet-text',
|
27
|
+
:created_at => 'small.time'})
|
28
|
+
|
29
|
+
# By default, Proto::Scraper returns at most 10 objects
|
30
|
+
|
31
|
+
@tweets.inspect
|
32
|
+
#<Proto::Tweet name="Kevin Curtin", content="@cawebs06 just a tad over my head... You guys are smart :)", created_at="11h">
|
33
|
+
#<Proto::Tweet name="Kevin Curtin", content="@garybernhardt awesome, thanks. any plans to be in nyc soon? @FlatironSchool would love to have you stop by. we love DAS", created_at="12h">...
|
34
|
+
|
35
|
+
```
|
36
|
+
|
37
|
+
## Contributing
|
38
|
+
|
39
|
+
1. Fork it
|
40
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
41
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
42
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
43
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
data/lib/proto/scraper.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
module Proto
  # Scrapes a parsed HTML document with CSS selectors and wraps the results in
  # OpenStruct-based value objects whose class is defined under the Proto
  # namespace.
  class Scraper
    attr_accessor :doc

    # url - location handed to Kernel#open; whatever it returns is parsed with
    #       Nokogiri::HTML and stored in #doc.
    #
    # NOTE(review): Kernel#open only fetches http(s) URLs when open-uri has
    # been loaded elsewhere -- confirm the gem's entry file requires it.
    # Kernel#open also treats a string starting with "|" as a command on Ruby
    # versions that support pipe-open, so never pass untrusted input here.
    def initialize(url)
      @doc = Nokogiri::HTML(open(url))
    end

    # Scrapes the document and builds one value object per scraped row.
    #
    # name - constant name for the generated class (defaults to 'Type'); it is
    #        defined as Proto::<name>, so it must be a valid constant name.
    # args - Hash mapping attribute names to CSS selectors.
    #
    # Returns an Array of Proto::<name> instances (empty when nothing matched).
    # Raises ArgumentError when Proto::<name> already exists and is not an
    # OpenStruct subclass, and IndexError when the selectors match different
    # numbers of elements.
    def fetch_and_create!(name='Type', args)
      create_return_objects(name, scrape_attribute_data(args))
    end

    private

    # Collects the stripped text of the matches for every selector and turns
    # the per-attribute columns into an Array of { attribute => text } hashes.
    #
    # For each selector the first match is skipped and at most the next ten
    # are kept. NOTE(review): the reason for skipping index 0 is not visible
    # here -- confirm it is intentional for every selector.
    def scrape_attribute_data(attributes)
      keys = attributes.keys

      columns = attributes.values.map do |selector|
        # slice returns nil when the start index lies past the end of the
        # match list; treat that as "no matches" instead of crashing on nil.
        matches = doc.css(selector).slice(1..10) || []
        matches.map { |el| el.text.strip }
      end

      # transpose raises IndexError when the columns differ in length.
      columns.transpose.map { |row| Hash[keys.zip(row)] }
    end

    # Instantiates the value class for +name+ once per attribute hash.
    def create_return_objects(name, attributes)
      klass = value_class_for(name)
      attributes.map { |hash| klass.new(hash) }
    end

    # Returns the OpenStruct subclass registered as Proto::<name>, defining it
    # on first use. Reusing the class avoids "already initialized constant"
    # warnings on repeated calls, and refusing foreign constants keeps a name
    # such as 'Scraper' from overwriting existing code.
    def value_class_for(name)
      unless Proto.const_defined?(name, false)
        return Proto.const_set(name, Class.new(OpenStruct))
      end

      existing = Proto.const_get(name, false)
      unless existing.is_a?(Class) && existing < OpenStruct
        raise ArgumentError, "Proto::#{name} is already defined and is not a scraped value class"
      end
      existing
    end
  end
end
|
data/lib/proto.rb
ADDED
data/proto.gemspec
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
# Gem specification for proto. The version is read from lib/proto/version.rb.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'proto/version'

Gem::Specification.new do |gem|
  gem.name          = "proto"
  gem.version       = Proto::VERSION
  gem.authors       = ["Kevin Curtin"]
  gem.email         = ["kevincurtin88@gmail.com"]
  gem.description   = %q{Highly malleable, disposable value objects}
  gem.summary       = %q{Highly malleable, disposable value objects}
  gem.homepage      = "https://github.com/kcurtin/proto"

  # Proto::Scraper calls Nokogiri::HTML when it is instantiated, so nokogiri
  # has to be installed with the gem, not only in development.
  gem.add_runtime_dependency "nokogiri"
  gem.add_development_dependency 'rspec'

  # The file list comes from git, so the gem must be built inside a checkout.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  # gem.test_files    = Dir.glob("spec/**/*.rb")
  gem.require_paths = ["lib"]
end
|
data/spec/proto/scraper_spec.rb
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../spec_helper'

# Specs for Proto::Scraper.
#
# NOTE(review): the first example builds a scraper for a real twitter.com URL,
# so it appears to depend on network access and on the live page markup.
# The stubbed examples below are commented out and kept for reference.
describe Proto::Scraper do
  before(:each) do
    # Nokogiri::HTML.stub!(:open).and_return("doc")
    # Nokogiri::HTML::Document.stub!(:parse)
    # @scrape = Proto::Scraper.new('http://example.com')
    # @scrape.stub_chain(:doc, :css, :each).and_return('STUBBED OUT')
  end

  it 'returns my objects!' do
    obj = Proto::Scraper.new('https://twitter.com/kcurtin')
    obj_collection = obj.fetch_and_create!('Tweet', { :name => 'strong.fullname',
                                              :content => 'p.js-tweet-text', :created_at => 'small.time' })
    obj_collection.length.should == 10
    obj_collection.first.class.to_s.should == 'Proto::Tweet'
    obj_collection.first.name.should == 'Kevin Curtin'
  end

  # 'blah_url' is not a URL, so it is opened as a local path that does not exist.
  it "raises Errno::ENOENT when given a path that does not exist" do
    expect {
      Proto::Scraper.new('blah_url')
    }.to raise_error(Errno::ENOENT)
  end
  # context ".fetch_and_create!" do
  #   it "the default class name is 'Proto::Type'" do
  #     our_obj = @scrape.fetch_and_create!({})
  #     our_obj.class.to_s.should == 'Proto::Type'
  #   end

  #   it "accepts only a hash and sets default class name" do
  #     our_obj = @scrape.fetch_and_create!({:name => 'default const'})
  #     our_obj.class.to_s.should == 'Proto::Type'
  #   end

  #   it "returns a Proto object with attributes set" do
  #     our_obj = @scrape.fetch_and_create!('Sample', {:name => "Kevin", :title => "Developer"})
  #     our_obj.name.should == "STUBBED OUT"
  #     our_obj.title.should == "STUBBED OUT"
  #     our_obj.class.to_s.should == "Proto::Sample"
  #   end
  # end

  # context 'private methods' do
  #   context ".create_return_objects" do
  #     it "accepts a custom class name" do
  #       our_obj = @scrape.send(:create_return_objects, 'Kevin', {})
  #       our_obj.first.class.to_s.should == 'Proto::Kevin'
  #     end

  #     it "accepts a hash and name and sets custom attrs" do
  #       our_obj = @scrape.send(:create_return_objects, 'Test', [{:name => 'Kevin'},{:title => "Title"}])
  #       our_obj.first.name.should == 'Kevin'
  #       our_obj.last.title.should == 'Title'
  #       our_obj.length.should == 2
  #     end
  #   end

  #   context ".scrape_attribute_data" do
  #     it "returns a hash of stuff" do
  #       rh = @scrape.send(:scrape_attribute_data, {:title => "h2 a"})
  #       rh.should == [{:title => 'STUBBED OUT'}]
  #     end
  #   end
  # end
end
|