graboid 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Christopher Burnett
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.mdown ADDED
@@ -0,0 +1,90 @@
1
+ ### Graboid ###
2
+
3
+ ![Graboid](https://github.com/twoism/graboid/raw/master/spec/fixtures/graboid.jpg "Graboid")
4
+
5
+ Simply awesome web scraping. Better docs later. See specs.
6
+
7
+
8
+ ### Usage ###
9
+
10
+
11
+ ##### Simple Extraction with clean markup #####
12
+
13
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
14
+ "http://www.w3.org/TR/html4/strict.dtd">
15
+
16
+ <html lang="en">
17
+ <head>
18
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
19
+ <title>posts</title>
20
+ <meta name="generator" content="TextMate http://macromates.com/">
21
+ <meta name="author" content="Posterous">
22
+ <!-- Date: 2010-06-10 -->
23
+ </head>
24
+ <body>
25
+
26
+ <div class="post" id="1">
27
+
28
+ <p class="title">Post 1</p>
29
+
30
+ <p class="body">Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor
31
+ incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation
32
+ ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit
33
+ in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat
34
+ non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
35
+ </p>
36
+ <span class="author">Someone Awesome (06/11/2010)</span>
37
+
38
+ </div>
39
+
40
+ <div class="post" id="2">
41
+
42
+ <p class="title">Post 2</p>
43
+
44
+ <p class="body">Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor
45
+ incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation
46
+ ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit
47
+ in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat
48
+ non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
49
+ </p>
50
+ <span class="author">Someone Awesome (06/11/2010)</span>
51
+
52
+ </div>
53
+
54
+ </body>
55
+ </html>
56
+
57
+ To extract the Posts use:
58
+
59
+ class Post
60
+ include Graboid::Entity
61
+
62
+ field :title
63
+ field :body
64
+ field :author
65
+ field :date, :selector => '.author', :processor => lambda {|frag| frag.text.match(/\((.*)\)/)[1] }
66
+ end
67
+
68
+ Post.source = 'The HTML string or URL to the document'
69
+
70
+ @post = Post.all.first
71
+
72
+ puts @post.date
73
+ => 06/11/2010
74
+
75
+ puts @post.title
76
+ => Post 1
77
+
78
+ ## Note on Patches/Pull Requests
79
+
80
+ * Fork the project.
81
+ * Make your feature addition or bug fix.
82
+ * Add tests for it. This is important so I don't break it in a
83
+ future version unintentionally.
84
+ * Commit, do not mess with rakefile, version, or history.
85
+ (If you want to have your own version, that is fine, but bump the version in a separate commit that I can ignore when I pull.)
86
+ * Send me a pull request. Bonus points for topic branches.
87
+
88
+ ## Copyright
89
+
90
+ Copyright (c) 2010 Christopher Burnett. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,47 @@
1
# Build, spec and documentation tasks for the graboid gem.
require 'rubygems'
require 'rake'

# Gem packaging tasks. Jeweler is optional: when it cannot be loaded the
# remaining tasks are still defined and only the packaging tasks are missing.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "graboid"
    gem.summary = %Q{web scraping made easy}
    gem.description = %Q{web scraping made easier}
    gem.email = "signalstatic@gmail.com"
    gem.homepage = "http://github.com/twoism/graboid"
    gem.authors = ["Christopher Burnett"]
    gem.add_development_dependency "rspec", ">= 1.2.9"
    gem.add_dependency "nokogiri"
    # The library is required as 'active_support' but is published as the
    # 'activesupport' gem.
    gem.add_dependency "activesupport"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# Runs every spec under spec/.
require 'spec/rake/spectask'
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
end

# Same spec run, with rcov coverage enabled.
Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

# :check_dependencies only exists when the Jeweler tasks above were loaded;
# depending on it unconditionally would make `rake spec` fail without Jeweler.
task :spec => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :spec

# Generates rdoc into ./rdoc, titled with the current gem version.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # strip: the VERSION file ends with a newline that should not end up in the title
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "graboid #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
data/lib/graboid.rb ADDED
@@ -0,0 +1,5 @@
1
# Entry point for the graboid library: loads the runtime dependencies in
# order, then the Entity mixin.
require 'rubygems'
require 'nokogiri'
require 'open-uri'
require 'active_support'

module Graboid
  require 'graboid/entity'
end
@@ -0,0 +1,97 @@
1
module Graboid
  # Mixin that turns a plain class into a scrapable entity.
  #
  # Including it extends the class with ClassMethods (the +field+ / +root+ DSL
  # plus the extraction entry points) and mixes in InstanceMethods (a
  # constructor that exposes every extracted value as a reader method).
  module Entity

    # Hook invoked when the module is included into +klass+.
    def self.included(klass)
      klass.extend ClassMethods
      klass.send :include, InstanceMethods
    end

    module ClassMethods

      # The configured source: an HTML string or an http(s) URL.
      def source
        @source
      end

      # Sets the HTML string or URL that documents are read from.
      def source=(src)
        @source = src
      end

      # Declares an attribute to extract from every fragment.
      #
      # name  - attribute name; also the default CSS class (".name").
      # opts  - :selector  => CSS selector (defaults to ".#{name}" when blank)
      #         :processor => callable that receives the first matching node
      # block - alternative way to supply the processor; wins over opts.
      #
      # Returns the options stored for the field. The caller's hash is copied,
      # never modified.
      def field(name, opts = {}, &block)
        settings = opts.dup
        selector = settings[:selector]
        settings[:selector] = ".#{name}" if !selector || selector.to_s.strip.empty?
        settings[:processor] = block if block_given?

        attribute_map[name] = settings
      end

      # Sets the CSS selector that matches one fragment per entity.
      def root(selector)
        @root_selector = selector
      end

      # The explicit root selector, or one inferred from the class name.
      def root_selector
        @root_selector || inferred_selector
      end

      # ".post" for a class named Post (uses ActiveSupport's String#underscore).
      def inferred_selector
        @inferred_selector ||= ".#{self.to_s.underscore}"
      end

      # Parses the source into a Nokogiri HTML document. Nothing is cached, so
      # every call reads and parses the source again.
      def doc
        Nokogiri::HTML read_source
      end

      # Field name => options hash for this class. A subclass starts from a
      # copy of its parent's map, so fields it adds do not leak upwards.
      def attribute_map
        @attribute_map ||=
          if respond_to?(:superclass) && superclass.respond_to?(:attribute_map)
            superclass.attribute_map.dup
          else
            {}
          end
      end

      # Builds one entity instance from a single fragment.
      def extract_instance(fragment)
        new(hash_map(fragment))
      end

      # Extracts every declared field from +fragment+ into a Hash.
      #
      # Without a processor the value is the text of the first node matching
      # the field's selector, or nil when nothing matches. A processor is
      # always called with that first node (which may be nil), so it can
      # supply its own default.
      def hash_map(fragment)
        attribute_map.inject({}) do |extracted, (name, settings)|
          node      = fragment.css(settings[:selector]).first
          processor = settings[:processor]
          extracted[name] = processor ? processor.call(node) : (node && node.text)
          extracted
        end
      end

      # All fragments of the document matching the root selector.
      def all_fragments
        doc.css root_selector
      end

      # One entity instance per fragment found in the source.
      def all
        all_fragments.map { |frag| extract_instance(frag) }
      end

      # Returns the markup to parse: the body fetched from the URL when the
      # source is an http(s) URL, the source itself when it is any other
      # String, and nil otherwise.
      def read_source
        case @source
        when %r{\Ahttps?://} # \A, not ^: only the very start of the string counts
          reader = lambda { |io| io.read }
          # Kernel#open stopped opening URLs in Ruby 3.0; prefer URI.open when
          # open-uri provides it. The block form closes the handle after reading.
          URI.respond_to?(:open) ? URI.open(@source, &reader) : open(@source, &reader)
        when String
          @source
        end
      end

    end # ClassMethods

    module InstanceMethods

      # Defines one reader method per key of +opts+ on this instance only,
      # each returning the corresponding value.
      def initialize(opts = {})
        singleton = class << self; self; end
        opts.each do |name, value|
          singleton.send(:define_method, name) { value }
        end
      end

      # The attribute map of the entity class.
      def attribute_map
        self.class.attribute_map
      end
    end # InstanceMethods
  end
end
Binary file
@@ -0,0 +1,43 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
2
+ "http://www.w3.org/TR/html4/strict.dtd">
3
+
4
+ <html lang="en">
5
+ <head>
6
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
7
+ <title>posts</title>
8
+ <meta name="generator" content="TextMate http://macromates.com/">
9
+ <meta name="author" content="Posterous">
10
+ <!-- Date: 2010-06-10 -->
11
+ </head>
12
+ <body>
13
+
14
+ <div class="post" id="1">
15
+
16
+ <p class="title">Post 1</p>
17
+
18
+ <p class="body">Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor
19
+ incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation
20
+ ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit
21
+ in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat
22
+ non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
23
+ </p>
24
+ <span class="author">Someone Awesome (06/11/2010)</span>
25
+
26
+ </div>
27
+
28
+ <div class="post" id="2">
29
+
30
+ <p class="title">Post 2</p>
31
+
32
+ <p class="body">Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor
33
+ incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation
34
+ ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit
35
+ in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat
36
+ non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
37
+ </p>
38
+ <span class="author">Someone Awesome (06/11/2010)</span>
39
+
40
+ </div>
41
+
42
+ </body>
43
+ </html>
@@ -0,0 +1,188 @@
1
# Specs for Graboid::Entity: source handling, selector inference, the field
# DSL and fragment/instance extraction.
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')

# Minimal entity with an explicit root selector, shared by the examples below.
class Post
  include Graboid::Entity

  root '.post'
end

describe Graboid::Entity do
  describe "#source" do
    describe "when url" do
      before(:each) do
        Post.source = 'http://foo.com/'
      end

      it "should set the source" do
        Post.source.should == 'http://foo.com/'
      end
    end
  end

  describe "#root_selector" do

    it "should be set" do
      Post.root_selector.should == '.post'
    end

    describe "when inferred from class" do

      before(:each) do
        class Phony; include Graboid::Entity; end
      end

      it "should infer .phony" do
        Phony.root_selector.should == '.phony'
      end
    end
  end

  describe "#doc" do

    # NOTE: this example fetches a live URL and therefore needs network access.
    describe "when supplied a url" do

      before(:each) do
        Post.source = 'http://google.com'
      end

      it "should set the doc source" do
        Post.doc.should be_a Nokogiri::HTML::Document
      end

    end

    describe "when supplied html" do

      before(:each) do
        Post.source = POSTS_HTML_STR
      end

      it "should set the doc source" do
        Post.doc.should be_a Nokogiri::HTML::Document
      end

    end

  end

  describe "#field" do
    describe "simple syntax" do

      before(:each) do
        Post.field :body
      end

      it "should be set in the attr map" do
        Post.attribute_map[:body].should be_a Hash
      end

      it "should set the selector" do
        Post.attribute_map[:body][:selector].should == '.body'
      end
    end

    describe "custom selector syntax" do
      before(:each) do
        Post.field :body, :selector => '.custom'
      end

      it "should set the selector" do
        Post.attribute_map[:body][:selector].should == '.custom'
      end
    end

    describe "custom selector syntax with a lambda" do

      before(:each) do
        Post.field :body, :selector => '.custom' do |item|
          "from lambda"
        end
      end

      it "should set the selector" do
        Post.attribute_map[:body][:selector].should == '.custom'
      end

      it "should set the processor" do
        Post.attribute_map[:body][:processor].should be_a Proc
      end

    end
  end

  describe "#all_fragments" do
    before(:each) do

      class WorkingPost
        include Graboid::Entity
        root '.post'
        field :body
      end

      WorkingPost.source = POSTS_HTML_STR
      @fragments = WorkingPost.all_fragments
    end

    it "should return the NodeSet" do
      @fragments.should be_a Nokogiri::XML::NodeSet
    end

    it "should have 2 results" do
      @fragments.count.should == 2
    end

  end

  describe "#extract_instance" do

    before(:each) do
      class WorkingPost
        include Graboid::Entity
        root '.post'
        field :title
        field :body
        field :author
        field :date, :selector => '.author', :processor => lambda {|frag| frag.text.match(/\((.*)\)/)[1] }
      end

      @instance = WorkingPost.extract_instance(POST_FRAGMENT)

    end

    it "should return a WorkingPost instance" do
      @instance.should be_a WorkingPost
    end

    it "should respond to attrs defined in the map" do
      WorkingPost.attribute_map.each { |k,v| @instance.should respond_to(k) }
    end

    it "should extract the date" do
      @instance.date.should == '06/11/2010'
    end

  end

  describe "#all" do
    before(:each) do
      class WorkingPost
        include Graboid::Entity
        root '.post'
        field :title
        field :body
        field :author
        field :date, :selector => '.author', :processor => lambda {|frag| frag.text.match(/\((.*)\)/)[1] }
      end

      WorkingPost.source = POSTS_HTML_STR

    end

    # No debug output here: printing the first record only added noise to the
    # spec run and triggered a second extraction pass.
    it "should return 2 WorkingPosts" do
      WorkingPost.all.length.should == 2
    end

  end

end
@@ -0,0 +1,5 @@
1
# Placeholder spec for the top-level Graboid namespace; it has no examples yet.
require File.expand_path('spec_helper', File.dirname(__FILE__))

describe "Graboid" do
end
data/spec/spec.opts ADDED
@@ -0,0 +1,3 @@
1
+ --colour
2
+ --format nested
3
+ --loadby mtime
@@ -0,0 +1,14 @@
1
# Shared spec setup: puts spec/ and lib/ on the load path, loads the library
# and RSpec, and exposes the HTML fixture to the examples.
$LOAD_PATH.unshift(File.dirname(__FILE__))
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
require 'graboid'
require 'spec'
require 'spec/autorun'

Spec::Runner.configure do |config|

end

file_path = File.expand_path(File.dirname(__FILE__)+'/fixtures/posts.html')

# Raw markup of the fixture page. File.read returns the contents directly and
# never yields, so no block is passed.
POSTS_HTML_STR = File.read(file_path)

# The first ".post" element of the fixture, re-parsed as a standalone fragment.
fixture_doc = Nokogiri::HTML(POSTS_HTML_STR)
POST_FRAGMENT = Nokogiri::HTML::fragment(fixture_doc.css('.post').first.to_html)
metadata ADDED
@@ -0,0 +1,126 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: graboid
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
11
+ platform: ruby
12
+ authors:
13
+ - Christopher Burnett
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-06-11 00:00:00 -07:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: rspec
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 13
30
+ segments:
31
+ - 1
32
+ - 2
33
+ - 9
34
+ version: 1.2.9
35
+ type: :development
36
+ version_requirements: *id001
37
+ - !ruby/object:Gem::Dependency
38
+ name: nokogiri
39
+ prerelease: false
40
+ requirement: &id002 !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ hash: 3
46
+ segments:
47
+ - 0
48
+ version: "0"
49
+ type: :runtime
50
+ version_requirements: *id002
51
+ - !ruby/object:Gem::Dependency
52
+ name: active_support
53
+ prerelease: false
54
+ requirement: &id003 !ruby/object:Gem::Requirement
55
+ none: false
56
+ requirements:
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ hash: 3
60
+ segments:
61
+ - 0
62
+ version: "0"
63
+ type: :runtime
64
+ version_requirements: *id003
65
+ description: web scraping made easier
66
+ email: signalstatic@gmail.com
67
+ executables: []
68
+
69
+ extensions: []
70
+
71
+ extra_rdoc_files:
72
+ - LICENSE
73
+ - README.mdown
74
+ files:
75
+ - .document
76
+ - .gitignore
77
+ - LICENSE
78
+ - README.mdown
79
+ - Rakefile
80
+ - VERSION
81
+ - lib/graboid.rb
82
+ - lib/graboid/entity.rb
83
+ - spec/fixtures/graboid.jpg
84
+ - spec/fixtures/posts.html
85
+ - spec/graboid/entity_spec.rb
86
+ - spec/graboid_spec.rb
87
+ - spec/spec.opts
88
+ - spec/spec_helper.rb
89
+ has_rdoc: true
90
+ homepage: http://github.com/twoism/graboid
91
+ licenses: []
92
+
93
+ post_install_message:
94
+ rdoc_options:
95
+ - --charset=UTF-8
96
+ require_paths:
97
+ - lib
98
+ required_ruby_version: !ruby/object:Gem::Requirement
99
+ none: false
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ hash: 3
104
+ segments:
105
+ - 0
106
+ version: "0"
107
+ required_rubygems_version: !ruby/object:Gem::Requirement
108
+ none: false
109
+ requirements:
110
+ - - ">="
111
+ - !ruby/object:Gem::Version
112
+ hash: 3
113
+ segments:
114
+ - 0
115
+ version: "0"
116
+ requirements: []
117
+
118
+ rubyforge_project:
119
+ rubygems_version: 1.3.7
120
+ signing_key:
121
+ specification_version: 3
122
+ summary: web scraping made easy
123
+ test_files:
124
+ - spec/graboid/entity_spec.rb
125
+ - spec/graboid_spec.rb
126
+ - spec/spec_helper.rb