scrapie 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/Gemfile ADDED
@@ -0,0 +1,14 @@
1
+ source "http://rubygems.org"
2
+
3
+ gem "mechanize"
4
+
5
+ # Add dependencies to develop your gem here.
6
+ # Include everything needed to run rake, tests, features, etc.
7
+ group :development do
8
+ gem "sham_rack", ">= 0"
9
+ gem "sinatra" # For sham_rack magic!
10
+ gem "rspec", ">= 0"
11
+ gem "bundler", "~> 1.0.0"
12
+ gem "jeweler", "~> 1.6.4"
13
+ gem "rcov", ">= 0"
14
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,51 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ diff-lcs (1.1.3)
5
+ git (1.2.5)
6
+ jeweler (1.6.4)
7
+ bundler (~> 1.0)
8
+ git (>= 1.2.5)
9
+ rake
10
+ mechanize (2.0.1)
11
+ net-http-digest_auth (~> 1.1, >= 1.1.1)
12
+ net-http-persistent (~> 1.8)
13
+ nokogiri (~> 1.4)
14
+ webrobots (~> 0.0, >= 0.0.9)
15
+ net-http-digest_auth (1.1.1)
16
+ net-http-persistent (1.9)
17
+ nokogiri (1.5.0)
18
+ rack (1.3.5)
19
+ rack-protection (1.1.4)
20
+ rack
21
+ rake (0.9.2.2)
22
+ rcov (0.9.11)
23
+ rspec (2.7.0)
24
+ rspec-core (~> 2.7.0)
25
+ rspec-expectations (~> 2.7.0)
26
+ rspec-mocks (~> 2.7.0)
27
+ rspec-core (2.7.1)
28
+ rspec-expectations (2.7.0)
29
+ diff-lcs (~> 1.1.2)
30
+ rspec-mocks (2.7.0)
31
+ sham_rack (1.3.3)
32
+ rack
33
+ sinatra (1.3.1)
34
+ rack (~> 1.3, >= 1.3.4)
35
+ rack-protection (~> 1.1, >= 1.1.2)
36
+ tilt (~> 1.3, >= 1.3.3)
37
+ tilt (1.3.3)
38
+ webrobots (0.0.12)
39
+ nokogiri (>= 1.4.4)
40
+
41
+ PLATFORMS
42
+ ruby
43
+
44
+ DEPENDENCIES
45
+ bundler (~> 1.0.0)
46
+ jeweler (~> 1.6.4)
47
+ mechanize
48
+ rcov
49
+ rspec
50
+ sham_rack
51
+ sinatra
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2011 Adrian Pike
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,55 @@
1
+ = scrapie
2
+
3
+ Hey, it's Scrapie! It's 2011, we should be able to scrape sites for their juicy data in a delicious fashion instead of having to hack something together every time.
4
+
5
+ It's basically a tool that allows you to really simply and quickly fab up a class that translates CSS selectors into attributes, and lets you specify your own translations on query params.
6
+
7
+ == Example
8
+
9
+ class Airplane < Scrapie
10
+ url 'http://registry.faa.gov/aircraftinquiry/NNum_Results.aspx'
11
+ params({
12
+ :n_number => 'NNumbertxt'
13
+ })
14
+ attributes({
15
+ 'serial_number' => 'div#serial_number',
16
+ 'classname' => '.class_name'
17
+ })
18
+ before_fetch do |agent|
19
+ # Do stuff with my agent, like log in or hax the gibson
20
+ end
21
+ after_fetch do |agent|
22
+ # Do more neatu stuff with my agent
23
+ end
24
+
25
+
26
+ # Other possibilities
27
+ http_method :get
28
+ agent_options { :options_to_send_to_my_new_mechanize_agent => 'BE COOL MAN' }
29
+ end
30
+
31
+ a = Airplane.find(:n_number => '12345') # => Fetches http://registry.faa.gov/aircraftinquiry/NNum_Results.aspx?NNumbertxt=12345
32
+ a.serial_number # => 'a cool serial number'
33
+
34
+ == Todo
35
+
36
+ * Set up the callbacks
37
+ * Sanitize
38
+ * Refactor
39
+ * Make it cooler!
40
+
41
+ == Contributing to scrapie
42
+
43
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
44
+ * Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it
45
+ * Fork the project
46
+ * Start a feature/bugfix branch
47
+ * Commit and push until you are happy with your contribution
48
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
49
+ * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate the change to its own commit so I can cherry-pick around it.
50
+
51
+ == Copyright
52
+
53
+ Copyright (c) 2011 Adrian Pike. See LICENSE.txt for
54
+ further details.
55
+
data/Rakefile ADDED
@@ -0,0 +1,54 @@
1
# encoding: utf-8

# Build script for the scrapie gem: packaging (Jeweler), specs (RSpec),
# coverage (rcov) and API docs (RDoc).

require 'rubygems'
require 'bundler'
begin
  # Activate both the runtime and the development gem groups from the Gemfile.
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "scrapie"
  gem.homepage = "http://github.com/adrianpike/scrapie"
  gem.license = "MIT"
  gem.summary = %Q{Scrapie scrapes things for great justice.}
  gem.description = %Q{Scrapie is a tool that allows you to really simply and quickly fab up a class that translates CSS selectors into attributes, and lets you specify your own translations on query params. }
  gem.email = "adrian@pikeapps.com"
  gem.authors = ["Adrian Pike"]
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

require 'rspec'
require 'rspec/core/rake_task'
RSpec::Core::RakeTask.new('spec') do |t|
  t.pattern = 'spec/*_spec.rb'
  # t.rspec_opts = ["--backtrace"]
end

require 'rcov/rcovtask'
# FIXME: this task still looks for Test::Unit-style files under test/, while
# the suite in this Rakefile is configured under spec/ -- confirm and repoint.
Rcov::RcovTask.new do |test|
  test.libs << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
  test.rcov_opts << '--exclude "gems/*"'
end

# The suite is run by the 'spec' task defined above; no 'test' task is defined
# in this Rakefile, so the default must point at :spec for a bare `rake` to work.
task :default => :spec

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  version = File.exist?('VERSION') ? File.read('VERSION') : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "scrapie #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
data/lib/scrapie.rb ADDED
@@ -0,0 +1,55 @@
1
+ require 'mechanize'
2
+
3
# Declarative page scraper built on Mechanize.
#
# Subclasses describe a page with the class-level macros below (+url+,
# +params+, +http_method+, +attributes+) and then call +find+, which fetches
# the page and returns an instance whose accessors hold the inner HTML of the
# nodes matched by each declared selector.
class Scrapie
  # Root of all Scrapie errors. Derives from StandardError so that a bare
  # `rescue` in calling code catches it.
  class ScrapieException < StandardError; end

  # Raised by +find+ when the class declared no attributes.
  class NoAttributesException < ScrapieException; end

  # Sets the URL that +find+ requests.
  def self.url(url)
    @url = url
  end

  # Declares the mapping from local option names (the keys given to +find+)
  # to the parameter names sent with the request.
  def self.params(params)
    @params = params
  end

  # Sets the name of the agent method used to fetch the page (e.g. :get or
  # :post). +find+ falls back to :get when this is never called.
  def self.http_method(method)
    @http_method = method
  end

  # Declares the scraped attributes as a Hash of attribute name => selector
  # and defines a reader/writer pair on the class for every name.
  def self.attributes(attributes)
    @attributes = attributes
    attributes.each_key { |name| attr_accessor name }
  end

  # Fetches the configured URL and builds an instance from the response.
  #
  #   find
  #   find(:foo => bar)
  #   find(:foo => bar, :baz => bizzle)
  #
  # opts:: Hash of local option name => value. Keys without an entry in the
  #        +params+ mapping are ignored.
  #
  # Returns a new instance with every declared attribute set to the inner
  # HTML of the nodes matching its selector.
  # Raises NoAttributesException when no attributes were declared.
  def self.find(opts = {})
    raise NoAttributesException unless @attributes && !@attributes.empty?

    agent = Mechanize.new
    page = agent.send(@http_method || :get, @url, translate_params(opts))

    record = new
    @attributes.each do |name, page_selector|
      # Interpolation (rather than String#+) lets attribute names be Symbols too.
      record.send("#{name}=", page.search(page_selector).inner_html)
    end
    record
  end

  # Translates the caller's option keys into request parameter names using
  # the +params+ mapping; unmapped keys are dropped. Built with a plain loop
  # because Hash[] over a collect that yields nil for unmapped keys raises
  # ArgumentError on current Rubies.
  def self.translate_params(opts)
    mapping = @params || {}
    query = {}
    opts.each do |key, value|
      remote_name = mapping[key]
      query[remote_name] = value if remote_name
    end
    query
  end
  private_class_method :translate_params

  # Callbacks # TODO: not implemented yet; these are placeholders and any
  # block passed to them is currently ignored.

  def self.before_fetch
  end

  def self.after_fetch
  end
end
data/spec/helper.rb ADDED
@@ -0,0 +1,15 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+ begin
4
+ Bundler.setup(:default, :development)
5
+ rescue Bundler::BundlerError => e
6
+ $stderr.puts e.message
7
+ $stderr.puts "Run `bundle install` to install missing gems"
8
+ exit e.status_code
9
+ end
10
+ require 'rspec'
11
+ require 'sham_rack'
12
+
13
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
14
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
15
+ require 'scrapie'
@@ -0,0 +1,98 @@
1
+ require 'helper'
2
+
3
+ ShamRack.at("scrapietest").sinatra do
4
+ get "/test1" do
5
+ "No attributes here chief"
6
+ end
7
+ get "/test_with_params" do
8
+ "<div id='param'>#{params[:test_param_for_getting]}</div><div id='param_upcased'>#{params[:test_param_for_getting].upcase}</div>"
9
+ end
10
+ get "/test" do
11
+ "<div class='foo'>example</div>"
12
+ end
13
+ get '/500' do
14
+ DERP
15
+ end
16
+ post '/post' do
17
+ "<div id='post_param'>#{params[:le_post]}</div>"
18
+ end
19
+ end
20
+
21
+ class NoAttributeScrapie < Scrapie
22
+ url 'http://scrapietest/test1'
23
+ end
24
+
25
+ class BasicScrapie < Scrapie
26
+ url 'http://scrapietest/test'
27
+ attributes({ 'foo' => '.foo' })
28
+ end
29
+
30
+ class ParamsScrapie < Scrapie
31
+ url 'http://scrapietest/test_with_params'
32
+ params({ :test_param => 'test_param_for_getting' })
33
+ attributes({
34
+ 'param' => 'div#param',
35
+ 'param_upcased' => 'div#param_upcased'
36
+ })
37
+ end
38
+
39
+ class FourOhFourScrapie < Scrapie
40
+ url 'http://scrapietest/ends_of_the_earth'
41
+ attributes({
42
+ 'results' => 'div#post_param'
43
+ })
44
+ end
45
+
46
+ class FiveHundredScrapie < Scrapie
47
+ url 'http://scrapietest/500'
48
+ attributes({
49
+ 'results' => 'div#post_param'
50
+ })
51
+ end
52
+
53
+ class PostScrapie < Scrapie
54
+ url 'http://scrapietest/post'
55
+ http_method :post
56
+
57
+ params({ :search => 'le_post' })
58
+ attributes({
59
+ 'results' => 'div#post_param'
60
+ })
61
+ end
62
+
63
+ describe Scrapie do
64
+
65
+ it 'whines if you don\'t specify any attributes' do
66
+ lambda { nas = NoAttributeScrapie.find(:har => 'heh') }.should raise_error(Scrapie::NoAttributesException)
67
+ end
68
+
69
+ it 'does a basic fetch sans params' do
70
+ basic = BasicScrapie.find
71
+ basic.foo.should == 'example'
72
+ end
73
+
74
+ it 'handles params' do
75
+ test_string = 'sdkfjhdsafjkladhfklzxcv123' # todo: random string
76
+
77
+ paramtest = ParamsScrapie.find(:test_param => test_string)
78
+ paramtest.param.should == test_string
79
+ paramtest.param_upcased.should == test_string.upcase
80
+ end
81
+
82
+ it 'handles 404s' do
83
+ lambda { nas = FourOhFourScrapie.find(:har => 'heh') }.should raise_error(Mechanize::ResponseCodeError)
84
+ end
85
+ it 'handles 500s' do
86
+ lambda { nas = FiveHundredScrapie.find(:har => 'heh') }.should raise_error(Mechanize::ResponseCodeError)
87
+ end
88
+ it 'uses different HTTP methods' do
89
+ post = PostScrapie.find(:search => 'le_search')
90
+
91
+ post.results.should == 'le_search'
92
+ end
93
+
94
+ it 'uses a before_fetch'
95
+ it 'uses an after_fetch'
96
+ it 'sets agent options'
97
+
98
+ end
metadata ADDED
@@ -0,0 +1,139 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: scrapie
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Adrian Pike
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-10-26 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: mechanize
16
+ requirement: &2152794940 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *2152794940
25
+ - !ruby/object:Gem::Dependency
26
+ name: sham_rack
27
+ requirement: &2152794360 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: *2152794360
36
+ - !ruby/object:Gem::Dependency
37
+ name: sinatra
38
+ requirement: &2152793740 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ type: :development
45
+ prerelease: false
46
+ version_requirements: *2152793740
47
+ - !ruby/object:Gem::Dependency
48
+ name: rspec
49
+ requirement: &2152791580 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ type: :development
56
+ prerelease: false
57
+ version_requirements: *2152791580
58
+ - !ruby/object:Gem::Dependency
59
+ name: bundler
60
+ requirement: &2152790880 !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ~>
64
+ - !ruby/object:Gem::Version
65
+ version: 1.0.0
66
+ type: :development
67
+ prerelease: false
68
+ version_requirements: *2152790880
69
+ - !ruby/object:Gem::Dependency
70
+ name: jeweler
71
+ requirement: &2152790220 !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ~>
75
+ - !ruby/object:Gem::Version
76
+ version: 1.6.4
77
+ type: :development
78
+ prerelease: false
79
+ version_requirements: *2152790220
80
+ - !ruby/object:Gem::Dependency
81
+ name: rcov
82
+ requirement: &2152789620 !ruby/object:Gem::Requirement
83
+ none: false
84
+ requirements:
85
+ - - ! '>='
86
+ - !ruby/object:Gem::Version
87
+ version: '0'
88
+ type: :development
89
+ prerelease: false
90
+ version_requirements: *2152789620
91
+ description: ! 'Scrapie is a tool that allows you to really simply and quickly fab
92
+ up a class that translates CSS selectors into attributes, and lets you specify your
93
+ own translations on query params. '
94
+ email: adrian@pikeapps.com
95
+ executables: []
96
+ extensions: []
97
+ extra_rdoc_files:
98
+ - LICENSE.txt
99
+ - README.rdoc
100
+ files:
101
+ - .document
102
+ - Gemfile
103
+ - Gemfile.lock
104
+ - LICENSE.txt
105
+ - README.rdoc
106
+ - Rakefile
107
+ - VERSION
108
+ - lib/scrapie.rb
109
+ - spec/helper.rb
110
+ - spec/scrapie_spec.rb
111
+ homepage: http://github.com/adrianpike/scrapie
112
+ licenses:
113
+ - MIT
114
+ post_install_message:
115
+ rdoc_options: []
116
+ require_paths:
117
+ - lib
118
+ required_ruby_version: !ruby/object:Gem::Requirement
119
+ none: false
120
+ requirements:
121
+ - - ! '>='
122
+ - !ruby/object:Gem::Version
123
+ version: '0'
124
+ segments:
125
+ - 0
126
+ hash: 4604187453020345507
127
+ required_rubygems_version: !ruby/object:Gem::Requirement
128
+ none: false
129
+ requirements:
130
+ - - ! '>='
131
+ - !ruby/object:Gem::Version
132
+ version: '0'
133
+ requirements: []
134
+ rubyforge_project:
135
+ rubygems_version: 1.8.10
136
+ signing_key:
137
+ specification_version: 3
138
+ summary: Scrapie scrapes things for great justice.
139
+ test_files: []