proto 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +53 -18
- data/lib/proto/scraper.rb +8 -15
- data/lib/proto/version.rb +1 -1
- data/proto.gemspec +1 -1
- data/spec/proto/scraper_spec.rb +8 -8
- metadata +3 -3
data/README.md
CHANGED
@@ -1,6 +1,58 @@
|
|
1
1
|
# Proto
|
2
2
|
|
3
|
-
Proto
|
3
|
+
Proto is a Nokogiri wrapper that uses scraping patterns to return value objects with minimal work.
|
4
|
+
|
5
|
+
It is the evolution of [another project](https://github.com/kcurtin/scrape_source).
|
6
|
+
|
7
|
+
Proto is meant to be lightweight and flexible: the objects you get back inherit from OpenStruct. New attributes can be added to the objects dynamically, reading an attribute that was never set returns nil instead of raising NoMethodError, and you can access the data in several different ways. Check out the documentation for more info: [OpenStruct](http://www.ruby-doc.org/stdlib-1.9.3/libdoc/ostruct/rdoc/OpenStruct.html)
|
8
|
+
|
9
|
+
## Usage
|
10
|
+
|
11
|
+
Create a new Scraper object with the URL you want to scrape data from:
|
12
|
+
|
13
|
+
```ruby
|
14
|
+
proto = Proto::Scraper.new('http://twitter.com/kcurtin')
|
15
|
+
```
|
16
|
+
|
17
|
+
Initializing a scraper gives you an object that holds a Nokogiri document built from the URL you provided:
|
18
|
+
```ruby
|
19
|
+
proto.inspect
|
20
|
+
#=> #<Proto::Scraper:0x007fc6fb852860 @doc=#<Nokogiri::HTML::Document:0x3fe37d0b1634...>
|
21
|
+
```
|
22
|
+
|
23
|
+
Currently, the API is strict. There is a single public method you can call, ```fetch```, which accepts a constant name and a hash as arguments:
|
24
|
+
```ruby
|
25
|
+
tweets = proto.fetch('Tweet', {:name => 'strong.fullname',
|
26
|
+
:content => 'p.js-tweet-text',
|
27
|
+
:created_at => 'small.time'})
|
28
|
+
```
|
29
|
+
The string you pass in as a constant name will become the class name of the objects you get back. They will be namespaced, so passing in 'Tweet' returns objects of the class ```Proto::Tweet```. If you omit the constant name, the returned objects will be of class ```Proto::Type```.
|
30
|
+
|
31
|
+
The keys correspond to the getter/setter methods that will be available on the objects you get back, so they should describe the data you want. The values are CSS selectors that tell Proto where that data lives in the DOM of the page you are scraping. Proto depends on Nokogiri and supports the same CSS selectors. Currently, Proto calls ```.text``` on every element it scrapes, so the values it returns are always Strings.
|
32
|
+
|
33
|
+
```.fetch``` returns an array of objects that contain your data:
|
34
|
+
```ruby
|
35
|
+
tweets.inspect
|
36
|
+
#=> [#<Proto::Tweet name="Kevin Curtin", content="@cawebs06 just a tad over my head... You guys are smart :)", created_at="11h">,
|
37
|
+
#<Proto::Tweet name="Kevin Curtin", content="@garybernhardt awesome, thanks. any plans to be in nyc soon? @FlatironSchool would love to have you stop by. we love DAS", created_at="12h">...]
|
38
|
+
```
|
39
|
+
|
40
|
+
OpenStruct features:
|
41
|
+
|
42
|
+
```ruby
|
43
|
+
tweet = tweets.first
|
44
|
+
#=> #<Proto::Tweet name="Kevin Curtin", content="@cawebs06 just a tad over my head... You guys are smart :)", created_at="11h">
|
45
|
+
|
46
|
+
#flexible:
|
47
|
+
tweet.title
|
48
|
+
#=> nil
|
49
|
+
|
50
|
+
#dynamic:
|
51
|
+
tweet.username = 'kcurtin'
|
52
|
+
#=> 'kcurtin'
|
53
|
+
```
|
54
|
+
|
55
|
+
Enjoy!
|
4
56
|
|
5
57
|
## Installation
|
6
58
|
|
@@ -16,23 +68,6 @@ Or install it yourself as:
|
|
16
68
|
|
17
69
|
$ gem install proto
|
18
70
|
|
19
|
-
## Usage
|
20
|
-
|
21
|
-
```ruby
|
22
|
-
|
23
|
-
proto = Proto::Scraper.new('http://twitter.com/kcurtin')
|
24
|
-
|
25
|
-
@tweets = proto.fetch_and_create!('Tweet', {:name => 'strong.fullname',
|
26
|
-
:content => 'p.js-tweet-text',
|
27
|
-
:created_at => 'small.time'})
|
28
|
-
|
29
|
-
#by default, Proto::Scraper only returns 10 objects
|
30
|
-
|
31
|
-
@tweets.inspect
|
32
|
-
#<Proto::Tweet name="Kevin Curtin", content="@cawebs06 just a tad over my head... You guys are smart :)", created_at="11h">
|
33
|
-
#<Proto::Tweet name="Kevin Curtin", content="@garybernhardt awesome, thanks. any plans to be in nyc soon? @FlatironSchool would love to have you stop by. we love DAS", created_at="12h">...
|
34
|
-
|
35
|
-
```
|
36
71
|
|
37
72
|
## Contributing
|
38
73
|
|
data/lib/proto/scraper.rb
CHANGED
@@ -6,7 +6,7 @@ module Proto
|
|
6
6
|
@doc = Nokogiri::HTML(open(url))
|
7
7
|
end
|
8
8
|
|
9
|
-
def
|
9
|
+
def fetch(name='Type', args)
|
10
10
|
attributes = scrape_attribute_data(args)
|
11
11
|
protos = create_return_objects(name, attributes)
|
12
12
|
return protos
|
@@ -14,22 +14,15 @@ module Proto
|
|
14
14
|
|
15
15
|
private
|
16
16
|
def scrape_attribute_data(attributes)
|
17
|
-
|
18
|
-
final_array = []
|
19
|
-
keys = attributes.keys
|
17
|
+
length_of_scrape = @doc.css(attributes.first[1]).count
|
20
18
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
collection.transpose.each do |data|
|
26
|
-
hash = {}
|
27
|
-
data.each_with_index do |value, index|
|
28
|
-
hash[keys[index]] = value
|
19
|
+
final_array = length_of_scrape.times.map do |index|
|
20
|
+
attributes.inject(Hash.new) do |hash, (attr_name, selector)|
|
21
|
+
hash.merge(attr_name => @doc.css(selector)[index].text.strip) if doc.css(selector)[index]
|
29
22
|
end
|
30
|
-
final_array << hash
|
31
23
|
end
|
32
|
-
|
24
|
+
|
25
|
+
final_array.compact
|
33
26
|
end
|
34
27
|
|
35
28
|
def create_return_objects(name, attributes)
|
@@ -38,4 +31,4 @@ module Proto
|
|
38
31
|
attributes.map { |hash| Proto.const_get(name).new(hash) }
|
39
32
|
end
|
40
33
|
end
|
41
|
-
end
|
34
|
+
end
|
data/lib/proto/version.rb
CHANGED
data/proto.gemspec
CHANGED
@@ -13,7 +13,7 @@ Gem::Specification.new do |gem|
|
|
13
13
|
gem.homepage = "https://github.com/kcurtin/proto"
|
14
14
|
|
15
15
|
gem.add_development_dependency 'rspec'
|
16
|
-
gem.
|
16
|
+
gem.add_runtime_dependency 'nokogiri'
|
17
17
|
|
18
18
|
gem.files = `git ls-files`.split($/)
|
19
19
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
data/spec/proto/scraper_spec.rb
CHANGED
@@ -10,31 +10,31 @@ describe Proto::Scraper do
|
|
10
10
|
|
11
11
|
it 'returns my objects!' do
|
12
12
|
obj = Proto::Scraper.new('https://twitter.com/kcurtin')
|
13
|
-
obj_collection = obj.
|
13
|
+
obj_collection = obj.fetch('Tweet', { :name => 'strong.fullname',
|
14
14
|
:content => 'p.js-tweet-text', :created_at => 'small.time' })
|
15
|
-
obj_collection.length.should == 10
|
15
|
+
# obj_collection.length.should == 10
|
16
16
|
obj_collection.first.class.to_s.should == 'Proto::Tweet'
|
17
17
|
obj_collection.first.name.should == 'Kevin Curtin'
|
18
18
|
end
|
19
|
-
|
19
|
+
|
20
20
|
it "sets its doc attr to a nokogiri doc based on url" do
|
21
|
-
expect {
|
21
|
+
expect {
|
22
22
|
Proto::Scraper.new('blah_url')
|
23
23
|
}.to raise_error(Errno::ENOENT)
|
24
24
|
end
|
25
|
-
# context ".
|
25
|
+
# context ".fetch" do
|
26
26
|
# it "the default class name is 'Proto::Type'" do
|
27
|
-
# our_obj = @scrape.
|
27
|
+
# our_obj = @scrape.fetch({})
|
28
28
|
# our_obj.class.to_s.should == 'Proto::Type'
|
29
29
|
# end
|
30
30
|
|
31
31
|
# it "accepts only a hash and sets default class name" do
|
32
|
-
# our_obj = @scrape.
|
32
|
+
# our_obj = @scrape.fetch({:name => 'default const'})
|
33
33
|
# our_obj.class.to_s.should == 'Proto::Type'
|
34
34
|
# end
|
35
35
|
|
36
36
|
# it "returns a Proto object with attributes set" do
|
37
|
-
# our_obj = @scrape.
|
37
|
+
# our_obj = @scrape.fetch('Sample', {:name => "Kevin", :title => "Developer"})
|
38
38
|
# our_obj.name.should == "STUBBED OUT"
|
39
39
|
# our_obj.title.should == "STUBBED OUT"
|
40
40
|
# our_obj.class.to_s.should == "Proto::Sample"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: proto
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-11-
|
12
|
+
date: 2012-11-18 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
@@ -35,7 +35,7 @@ dependencies:
|
|
35
35
|
- - ! '>='
|
36
36
|
- !ruby/object:Gem::Version
|
37
37
|
version: '0'
|
38
|
-
type: :
|
38
|
+
type: :runtime
|
39
39
|
prerelease: false
|
40
40
|
version_requirements: !ruby/object:Gem::Requirement
|
41
41
|
none: false
|