graboid 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Christopher Burnett
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.mdown ADDED
@@ -0,0 +1,90 @@
1
+ ### Graboid ###
2
+
3
+ ![Graboid](https://github.com/twoism/graboid/raw/master/spec/fixtures/graboid.jpg "Graboid")
4
+
5
+ Simply awesome web scraping. Better docs are coming later; for now, see the specs for usage examples.
6
+
7
+
8
+ ### Usage ###
9
+
10
+
11
+ ##### Simple Extraction with clean markup #####
12
+
13
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
14
+ "http://www.w3.org/TR/html4/strict.dtd">
15
+
16
+ <html lang="en">
17
+ <head>
18
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
19
+ <title>posts</title>
20
+ <meta name="generator" content="TextMate http://macromates.com/">
21
+ <meta name="author" content="Posterous">
22
+ <!-- Date: 2010-06-10 -->
23
+ </head>
24
+ <body>
25
+
26
+ <div class="post" id="1">
27
+
28
+ <p class="title">Post 1</p>
29
+
30
+ <p class="body">Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor
31
+ incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation
32
+ ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit
33
+ in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat
34
+ non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
35
+ </p>
36
+ <span class="author">Someone Awesome (06/11/2010)</span>
37
+
38
+ </div>
39
+
40
+ <div class="post" id="2">
41
+
42
+ <p class="title">Post 2</p>
43
+
44
+ <p class="body">Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor
45
+ incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation
46
+ ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit
47
+ in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat
48
+ non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
49
+ </p>
50
+ <span class="author">Someone Awesome (06/11/2010)</span>
51
+
52
+ </div>
53
+
54
+ </body>
55
+ </html>
56
+
57
+ To extract the Posts use:
58
+
59
+ class Post
60
+ include Graboid::Entity
61
+
62
+ field :title
63
+ field :body
64
+ field :author
65
+ field :date, :selector => '.author', :processor => lambda {|frag| frag.text.match(/\((.*)\)/)[1] }
66
+ end
67
+
68
+ Post.source = 'The HTML string or URL to the document'
69
+
70
+ @post = Post.all.first
71
+
72
+ puts @post.date
73
+ => 06/11/2010
74
+
75
+ puts @post.title
76
+ => Post 1
77
+
78
+ ## Note on Patches/Pull Requests
79
+
80
+ * Fork the project.
81
+ * Make your feature addition or bug fix.
82
+ * Add tests for it. This is important so I don't break it in a
83
+ future version unintentionally.
84
+ * Commit, but do not touch the Rakefile, version, or history.
85
+ (If you want to have your own version, that is fine, but bump the version in a separate commit that I can ignore when I pull.)
86
+ * Send me a pull request. Bonus points for topic branches.
87
+
88
+ ## Copyright
89
+
90
+ Copyright (c) 2010 Christopher Burnett. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,47 @@
# Build script for the graboid gem: packaging via Jeweler, RSpec 1.x spec and
# rcov tasks, and RDoc generation.
require 'rubygems'
require 'rake'

# Packaging tasks are optional: when Jeweler is not installed we print a hint
# and carry on so the remaining tasks stay usable.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "graboid"
    gem.summary = %Q{web scraping made easy}
    gem.description = %Q{web scraping made easier}
    gem.email = "signalstatic@gmail.com"
    gem.homepage = "http://github.com/twoism/graboid"
    gem.authors = ["Christopher Burnett"]
    gem.add_development_dependency "rspec", ">= 1.2.9"
    gem.add_dependency "nokogiri"
    # The library does `require 'active_support'`, which is shipped by the
    # gem named "activesupport" (no underscore).
    gem.add_dependency "activesupport"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

require 'spec/rake/spectask'

# rake spec -- run every *_spec.rb under spec/.
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
end

# rake rcov -- same specs, run with rcov enabled.
Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

# :check_dependencies is only defined when the Jeweler block above succeeded;
# depending on it unconditionally would make `rake spec` fail with an unknown
# task error whenever Jeweler is missing.
task :spec => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :spec

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # strip: VERSION ends with a newline that should not end up in the title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "graboid #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
data/lib/graboid.rb ADDED
@@ -0,0 +1,5 @@
# Entry point for the graboid library: loads the third-party libraries it
# builds on, then the Entity mixin.
require 'rubygems'
require 'nokogiri'
require 'open-uri'
require 'active_support'

module Graboid
  require 'graboid/entity'
end
@@ -0,0 +1,97 @@
module Graboid
  # Mixin that lets a class declare how instances of itself are scraped out of
  # an HTML document. Including it extends the class with ClassMethods (the
  # declaration DSL plus the extraction helpers) and mixes in InstanceMethods.
  #
  #   class Post
  #     include Graboid::Entity
  #     root  '.post'
  #     field :title
  #   end
  #
  #   Post.source = html_or_url
  #   Post.all.first.title
  module Entity

    # Inclusion hook: wires ClassMethods / InstanceMethods into +klass+ and
    # makes sure it has an attribute map to register fields in.
    def self.included(klass)
      klass.class_eval do
        extend ClassMethods
        include InstanceMethods
        # Seed the map only when none exists yet, so including the module a
        # second time keeps the fields that were already declared.
        write_inheritable_attribute(:attribute_map, {}) if attribute_map.nil?
      end
    end

    module ClassMethods

      # Sources matching this pattern are fetched instead of parsed directly.
      # Anchored with \A (not ^) so that an HTML string which merely contains
      # a line starting with a URL is not mistaken for a URL.
      URL_PATTERN = %r{\Ahttps?://}

      # Returns whatever was last assigned through +source=+.
      def source
        @source
      end

      # Sets the document to scrape: either an HTML string or an
      # http(s) URL (see +read_source+).
      def source=(src)
        @source = src
      end

      # Declares a field to extract.
      #
      # name  - key under which the field is stored in the attribute map; it
      #         also becomes the reader name on extracted instances.
      # opts  - :selector  CSS selector relative to the root fragment
      #                    (defaults to ".<name>")
      #         :processor callable that receives the first matching node
      # block - alternative way to supply the processor; wins over
      #         opts[:processor] when both are given.
      #
      # Returns the options hash stored for the field.
      def field(name, opts = {}, &block)
        opts = opts.dup # work on a copy so the caller's hash is left untouched
        opts[:selector] = ".#{name}" unless opts[:selector].present?
        opts[:processor] = block if block_given?

        attribute_map[name] = opts
      end

      # Sets the CSS selector that matches one entity per node.
      def root(selector)
        @root_selector = selector
      end

      # The explicit root selector, or one inferred from the class name.
      def root_selector
        @root_selector || inferred_selector
      end

      # Derives ".class_name" from the class name. The namespace is dropped
      # first: underscoring "Foo::Bar" directly would give "foo/bar", which is
      # not a usable CSS class selector.
      def inferred_selector
        @inferred_selector ||= ".#{to_s.demodulize.underscore}"
      end

      # Parses the current source with Nokogiri. Not cached: every call
      # re-reads the source.
      def doc
        Nokogiri::HTML(read_source)
      end

      # Field name => options hash, as registered through +field+.
      def attribute_map
        read_inheritable_attribute(:attribute_map)
      end

      # Builds one entity from a fragment matched by the root selector.
      def extract_instance(fragment)
        new(hash_map(fragment))
      end

      # Extracts every declared field from +fragment+.
      #
      # Without a processor the value is the text of the first node matching
      # the field's selector, or nil when nothing matches. With a processor
      # the first matching node (possibly nil) is handed to it and its return
      # value is used.
      #
      # Returns a Hash of field name => extracted value.
      def hash_map(fragment)
        attribute_map.inject({}) do |extracted, (name, opts)|
          node      = fragment.css(opts[:selector]).first
          processor = opts[:processor]

          extracted[name] = processor ? processor.call(node) : (node && node.text)
          extracted
        end
      end

      # All nodes in the document matching the root selector.
      def all_fragments
        doc.css(root_selector)
      end

      # One entity per root fragment found in the source.
      def all
        all_fragments.map { |fragment| extract_instance(fragment) }
      end

      # Resolves the source into something Nokogiri can parse: an opened
      # stream for http(s) URLs, the string itself for raw HTML, and nil for
      # anything that is not a String.
      def read_source
        case @source
        when URL_PATTERN
          # Prefer URI.open where open-uri provides it; older versions only
          # patch Kernel#open.
          URI.respond_to?(:open) ? URI.open(@source) : open(@source)
        when String
          @source
        end
      end

    end # ClassMethods

    module InstanceMethods

      # Defines a reader for every key in +opts+ on this instance only (on
      # its singleton class), each returning the corresponding value.
      def initialize(opts = {})
        singleton = class << self; self; end
        opts.each do |name, value|
          singleton.send(:define_method, name) { value }
        end
      end

      # Convenience accessor for the class-level attribute map.
      def attribute_map
        self.class.attribute_map
      end
    end # InstanceMethods
  end
end
Binary file
@@ -0,0 +1,43 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
2
+ "http://www.w3.org/TR/html4/strict.dtd">
3
+
4
+ <html lang="en">
5
+ <head>
6
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
7
+ <title>posts</title>
8
+ <meta name="generator" content="TextMate http://macromates.com/">
9
+ <meta name="author" content="Posterous">
10
+ <!-- Date: 2010-06-10 -->
11
+ </head>
12
+ <body>
13
+
14
+ <div class="post" id="1">
15
+
16
+ <p class="title">Post 1</p>
17
+
18
+ <p class="body">Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor
19
+ incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation
20
+ ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit
21
+ in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat
22
+ non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
23
+ </p>
24
+ <span class="author">Someone Awesome (06/11/2010)</span>
25
+
26
+ </div>
27
+
28
+ <div class="post" id="2">
29
+
30
+ <p class="title">Post 2</p>
31
+
32
+ <p class="body">Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor
33
+ incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation
34
+ ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit
35
+ in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat
36
+ non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
37
+ </p>
38
+ <span class="author">Someone Awesome (06/11/2010)</span>
39
+
40
+ </div>
41
+
42
+ </body>
43
+ </html>
@@ -0,0 +1,188 @@
# Specs for Graboid::Entity: source handling, selector inference, the field
# DSL and extraction of entities from the posts fixture.
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')

# Minimal entity used by the source/selector/field specs below.
class Post
  include Graboid::Entity

  root '.post'
end

describe Graboid::Entity do
  describe "#source" do
    describe "when url" do
      before(:each) do
        Post.source = 'http://foo.com/'
      end

      it "should set the source" do
        Post.source.should == 'http://foo.com/'
      end
    end
  end

  describe "#root_selector" do

    it "should be set" do
      Post.root_selector.should == '.post'
    end

    describe "when inferred from class" do

      before(:each) do
        class Phony; include Graboid::Entity; end
      end

      it "should infer .phony" do
        Phony.root_selector.should == '.phony'
      end
    end
  end

  describe "#doc" do

    # NOTE: this example fetches a live URL and therefore needs network access.
    describe "when supplied a url" do

      before(:each) do
        Post.source = 'http://google.com'
      end

      it "should set the doc source" do
        Post.doc.should be_a Nokogiri::HTML::Document
      end

    end

    describe "when supplied html" do

      before(:each) do
        Post.source = POSTS_HTML_STR
      end

      it "should set the doc source" do
        Post.doc.should be_a Nokogiri::HTML::Document
      end

    end

  end

  describe "#field" do
    describe "simple syntax" do

      before(:each) do
        Post.field :body
      end

      it "should be set in the attr map" do
        Post.attribute_map[:body].should be_a Hash
      end

      it "should set the selector" do
        Post.attribute_map[:body][:selector].should == '.body'
      end
    end

    describe "custom selector syntax" do
      before(:each) do
        Post.field :body, :selector => '.custom'
      end

      it "should set the selector" do
        Post.attribute_map[:body][:selector].should == '.custom'
      end
    end

    describe "custom selector syntax with a lambda" do

      before(:each) do
        Post.field :body, :selector => '.custom' do |item|
          "from lambda"
        end
      end

      it "should set the selector" do
        Post.attribute_map[:body][:selector].should == '.custom'
      end

      it "should set the processor" do
        Post.attribute_map[:body][:processor].should be_a Proc
      end

    end
  end

  describe "#all_fragments" do
    before(:each) do

      class WorkingPost
        include Graboid::Entity
        root '.post'
        field :body
      end

      WorkingPost.source = POSTS_HTML_STR
      @fragments = WorkingPost.all_fragments
    end

    it "should return the NodeSet" do
      @fragments.should be_a Nokogiri::XML::NodeSet
    end

    it "should have 2 results" do
      @fragments.count.should == 2
    end

  end

  describe "#extract_instance" do

    before(:each) do
      class WorkingPost
        include Graboid::Entity
        root '.post'
        field :title
        field :body
        field :author
        field :date, :selector => '.author', :processor => lambda {|frag| frag.text.match(/\((.*)\)/)[1] }
      end

      @instance = WorkingPost.extract_instance(POST_FRAGMENT)

    end

    it "should return a WorkingPost instance" do
      @instance.should be_a WorkingPost
    end

    it "should respond to attrs defined in the map" do
      WorkingPost.attribute_map.each { |k,v| @instance.should respond_to(k) }
    end

    it "should extract the date" do
      @instance.date.should == '06/11/2010'
    end

  end

  describe "#all" do
    before(:each) do
      class WorkingPost
        include Graboid::Entity
        root '.post'
        field :title
        field :body
        field :author
        field :date, :selector => '.author', :processor => lambda {|frag| frag.text.match(/\((.*)\)/)[1] }
      end

      WorkingPost.source = POSTS_HTML_STR

    end

    it "should return 2 WorkingPosts" do
      WorkingPost.all.length.should == 2
    end

  end

end
@@ -0,0 +1,5 @@
require File.expand_path('spec_helper', File.dirname(__FILE__))

# Top-level example group for the Graboid namespace; it currently contains no
# examples.
describe "Graboid" do
end
data/spec/spec.opts ADDED
@@ -0,0 +1,3 @@
1
+ --colour
2
+ --format nested
3
+ --loadby mtime
@@ -0,0 +1,14 @@
# Spec bootstrap: puts lib/ and spec/ on the load path, loads the library and
# RSpec, and exposes the posts fixture to the specs as constants.
spec_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift(File.join(spec_dir, '..', 'lib'), spec_dir)

require 'graboid'
require 'spec'
require 'spec/autorun'

Spec::Runner.configure do |config|
end

# Raw HTML of the fixture page.
POSTS_HTML_STR = File.read(File.expand_path('fixtures/posts.html', spec_dir))

# The first ".post" element of the fixture, re-parsed as a standalone fragment.
POST_FRAGMENT = Nokogiri::HTML.fragment(Nokogiri::HTML(POSTS_HTML_STR).css('.post').first.to_html)
metadata ADDED
@@ -0,0 +1,126 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: graboid
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
11
+ platform: ruby
12
+ authors:
13
+ - Christopher Burnett
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-06-11 00:00:00 -07:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: rspec
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 13
30
+ segments:
31
+ - 1
32
+ - 2
33
+ - 9
34
+ version: 1.2.9
35
+ type: :development
36
+ version_requirements: *id001
37
+ - !ruby/object:Gem::Dependency
38
+ name: nokogiri
39
+ prerelease: false
40
+ requirement: &id002 !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ hash: 3
46
+ segments:
47
+ - 0
48
+ version: "0"
49
+ type: :runtime
50
+ version_requirements: *id002
51
+ - !ruby/object:Gem::Dependency
52
+ name: active_support
53
+ prerelease: false
54
+ requirement: &id003 !ruby/object:Gem::Requirement
55
+ none: false
56
+ requirements:
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ hash: 3
60
+ segments:
61
+ - 0
62
+ version: "0"
63
+ type: :runtime
64
+ version_requirements: *id003
65
+ description: web scraping made easier
66
+ email: signalstatic@gmail.com
67
+ executables: []
68
+
69
+ extensions: []
70
+
71
+ extra_rdoc_files:
72
+ - LICENSE
73
+ - README.mdown
74
+ files:
75
+ - .document
76
+ - .gitignore
77
+ - LICENSE
78
+ - README.mdown
79
+ - Rakefile
80
+ - VERSION
81
+ - lib/graboid.rb
82
+ - lib/graboid/entity.rb
83
+ - spec/fixtures/graboid.jpg
84
+ - spec/fixtures/posts.html
85
+ - spec/graboid/entity_spec.rb
86
+ - spec/graboid_spec.rb
87
+ - spec/spec.opts
88
+ - spec/spec_helper.rb
89
+ has_rdoc: true
90
+ homepage: http://github.com/twoism/graboid
91
+ licenses: []
92
+
93
+ post_install_message:
94
+ rdoc_options:
95
+ - --charset=UTF-8
96
+ require_paths:
97
+ - lib
98
+ required_ruby_version: !ruby/object:Gem::Requirement
99
+ none: false
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ hash: 3
104
+ segments:
105
+ - 0
106
+ version: "0"
107
+ required_rubygems_version: !ruby/object:Gem::Requirement
108
+ none: false
109
+ requirements:
110
+ - - ">="
111
+ - !ruby/object:Gem::Version
112
+ hash: 3
113
+ segments:
114
+ - 0
115
+ version: "0"
116
+ requirements: []
117
+
118
+ rubyforge_project:
119
+ rubygems_version: 1.3.7
120
+ signing_key:
121
+ specification_version: 3
122
+ summary: web scraping made easy
123
+ test_files:
124
+ - spec/graboid/entity_spec.rb
125
+ - spec/graboid_spec.rb
126
+ - spec/spec_helper.rb