active_scraper 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ ODhmOGY2MGY0MWNjZmExNGM5MzA5ZGQwZmFhNjg0ZDQxZTMzOGQwNw==
5
+ data.tar.gz: !binary |-
6
+ NWQ5ZDQwYzZlNTg5MjY2YjdmZjFkN2Q4MmEzMjdmOGYwNjE0OWQ2NQ==
7
+ SHA512:
8
+ metadata.gz: !binary |-
9
+ NjY1NWFjOTg5OGI1NWFlOTgzZjNjZGNiNjk2OGJkZDIyMTRjMmQwNWU3MjY3
10
+ MTIyMDNlZDJiZWMwZjNhZDM4NTk1Njk5MjdkNmRiYTNlZDEzZDZmNjdhYjRl
11
+ ZjFjNmVlMTVjYjI5MDE0MGYyNTdlNTEyZmEzMmI3N2QwYTNhNGY=
12
+ data.tar.gz: !binary |-
13
+ YzIyNWE4M2JhODc0MmY3OWQ1ZDFkNmYwNjA1M2M5MDZiNDAyMGVlYmU2MTVl
14
+ NDk5Y2ZiNTg1OGFkZmZkYmJhOGQyYjUyODA4NTg2ZDliNjhhMmQyZTNhZmEx
15
+ YzQwYWU1NzdlZWE2MjRiMTM2YjUwNzM3NjM5YWM2NDk3YmU4NDk=
data/MIT-LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright 2013 YOURNAME
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,23 @@
# Rake entry point for the ActiveScraper engine: loads Bundler, defines the
# RDoc task, pulls in the Rails engine tasks and installs the gem tasks.

# Bundler is optional at load time; when it is missing we only print a hint.
begin
  require 'bundler/setup'
rescue LoadError
  puts 'You must `gem install bundler` and `bundle install` to run rake tasks'
end

require 'rdoc/task'

# `rake rdoc` documents the README plus every Ruby file under lib/.
RDoc::Task.new(:rdoc) do |task|
  task.rdoc_dir = 'rdoc'
  task.title = 'ActiveScraper'
  task.options.push('--line-numbers')
  task.rdoc_files.include('README.rdoc', 'lib/**/*.rb')
end

# Absolute path of the dummy application's Rakefile under spec/dummy.
# NOTE(review): presumably consumed by rails/tasks/engine.rake -- confirm.
APP_RAKEFILE = File.expand_path("spec/dummy/Rakefile", File.dirname(__FILE__))
load 'rails/tasks/engine.rake'

Bundler::GemHelper.install_tasks
@@ -0,0 +1,15 @@
1
+ /*
2
+ * This is a manifest file that'll be compiled into application.css, which will include all the files
3
+ * listed below.
4
+ *
5
+ * Any CSS and SCSS file within this directory, lib/assets/stylesheets, vendor/assets/stylesheets,
6
+ * or vendor/assets/stylesheets of plugins, if any, can be referenced here using a relative path.
7
+ *
8
+ * You're free to add application-wide styles to this file and they'll appear at the bottom of the
9
+ * compiled file so the styles you add here take precedence over styles defined in the other
10
+ * CSS/SCSS files in this directory. It is generally better to create a new
11
+ * file per style scope.
12
+ *
13
+ *= require_tree .
14
+ *= require_self
15
+ */
@@ -0,0 +1,4 @@
module ActiveScraper
  # Base controller for the engine. It adds nothing of its own on top of
  # ActionController::Base.
  class ApplicationController < ActionController::Base
  end
end
@@ -0,0 +1,4 @@
module ActiveScraper
  # View-helper namespace for the engine; currently defines no helpers.
  module ApplicationHelper
  end
end
@@ -0,0 +1,47 @@
module ActiveScraper
  # Wraps one of several response-like objects behind a single interface.
  #
  # The normalized values are available both through the readers (+body+,
  # +headers+, +content_type+, +code+) and, because the object delegates to a
  # Hash holding the same four entries, through hash access such as
  # +obj[:body]+.
  class AgnosticResponseObject < SimpleDelegator

    attr_reader :code, :headers, :body, :content_type

    # obj - an HTTParty::Response, a Net::HTTPResponse, an
    #       ActiveScraper::Request, or any other object (stored as its
    #       +to_s+ with empty headers and nil content type / code).
    def initialize(obj)
      # HTTParty wraps the Net::HTTPResponse; work with the wrapped object.
      # The exact-class comparison is kept on purpose.
      # NOTE(review): presumably HTTParty::Response does not answer +is_a?+
      # reliably -- confirm before changing.
      obj = obj.response if obj.class == HTTParty::Response

      if obj.is_a?(Net::HTTPResponse)
        @body = obj.body
        @content_type = obj.content_type
        @headers = obj.each_header.each_with_object({}) do |(name, value), collected|
          collected[name] = value
        end
        @code = obj.code.to_i
      elsif obj.is_a?(ActiveScraper::Request)
        # NOTE(review): this branch expects body/content_type/headers/code on
        # the record -- confirm whether a Response record was intended here.
        @body = obj.body
        @content_type = obj.content_type
        @headers = obj.headers
        @code = obj.code.to_i
      else
        # Fallback for anything else; probably not used in practice.
        @body = obj.to_s
        @headers = {}
        @content_type = nil
        @code = nil
      end

      # Delegate to a Hash that mirrors the readers, so hash-style consumers
      # see the same values.
      super(body: @body, headers: @headers, content_type: @content_type, code: @code)
    end

  end
end
@@ -0,0 +1,98 @@
require 'addressable/uri'

module ActiveScraper
  # ActiveRecord model describing one scrape target, stored as the normalized
  # scheme, host, path and query of its URL. Selected query values can be
  # obfuscated before they are stored.
  class Request < ActiveRecord::Base
    # The responses table names its key column +active_scraper_request_id+,
    # which is not the +request_id+ Rails would infer for this model.
    has_many :responses, :dependent => :destroy, :foreign_key => :active_scraper_request_id
    validates_uniqueness_of :path, scope: [:host, :query, :scheme]

    # Records matching the normalized parts of +u+. Pass the same +opts+ that
    # were used when the record was built (e.g. +:obfuscate_query+) so the
    # lookup compares against the query as it is stored.
    scope :with_url, ->(u, opts = {}) {
      where(Request.build_validating_params(u, opts))
    }

    # True only when the is_obfuscated flag is exactly +true+.
    def obfuscated?
      is_obfuscated == true
    end

    # Rebuilds an Addressable::URI from the stored scheme/host/path/query.
    def uri
      Addressable::URI.new(
        attributes.symbolize_keys.slice(:scheme, :host, :path, :query)
      )
    end

    # The subset of build_request_params that identifies a request:
    # scheme, host, path and query.
    def self.build_validating_params(uri, opts = {})
      build_request_params(uri, opts).slice(:scheme, :host, :path, :query)
    end

    # Returns a Hash with symbolized keys: :scheme, :host, :path, :query,
    # :extname and :is_obfuscated.
    #
    # opts[:obfuscate_query] - a key, a list of keys, or a list of
    #   [key, chars_to_keep] pairs. The value of each named query parameter is
    #   replaced by "__OMIT__" followed by its last chars_to_keep characters
    #   (none by default). +opts+ itself is not modified.
    def self.build_request_params(uri, opts = {})
      u = Addressable::URI.parse(uri)
      hsh = {
        scheme: u.normalized_scheme,
        host: u.normalized_host,
        path: u.normalized_path,
        query: u.normalized_query,
        extname: u.extname
      }

      ob_keys = opts[:obfuscate_query]
      if ob_keys
        hsh[:query] = obfuscate_query_values(hsh[:query], ob_keys)
        hsh[:is_obfuscated] = true
      else
        hsh[:is_obfuscated] = false
      end

      hsh
    end

    # Builds (without saving) a Request from +uri+.
    def self.build_from_uri(uri, opts = {})
      Request.new(build_request_params(uri, opts))
    end

    # First stored Request matching +uri+ (after applying +opts+), or a new
    # unsaved one when none exists.
    def self.find_or_build_from_uri(uri, opts = {})
      with_url(uri, opts).first || build_from_uri(uri, opts)
    end

    # Builds a Request from +uri+ and attempts to save it. The record is
    # returned whether or not the save succeeded.
    def self.create_from_uri(uri, opts = {})
      req = build_from_uri(uri, opts)
      req.save

      req
    end

    # Finds or builds the Request for +uri+, fetches a response through
    # +fetcher+ (a new Fetcher by default), attaches it to the request and
    # saves. Returns the request.
    def self.create_and_fetch_response(uri, opts = {}, fetcher = nil)
      request = find_or_build_from_uri(uri, opts)
      fetcher ||= Fetcher.new

      # A request without an id is new, so go straight to a fresh fetch;
      # otherwise let the fetcher consult its cache first.
      fetch_opts = request.id.nil? ? { fresh: true } : {}
      resp = fetcher.fetch(request, fetch_opts)

      # theoretically, the built response is saved together with the request
      request.responses.build(resp)
      request.save

      request
    end

    # Replaces the value of each named parameter in +query+ with an
    # "__OMIT__" marker. Parameter names are matched only at the start of the
    # query or right after "&", so "key" does not match inside "otherkey".
    # Returns +query+ unchanged when it is nil.
    def self.obfuscate_query_values(query, keys)
      return query if query.nil?

      Array(keys).inject(query.dup) do |obfuscated, key|
        name, keep = Array(key)
        keep = keep.to_i
        matcher = /(\A|&)(#{Regexp.escape(name.to_s)}=)([^&]*)/

        obfuscated.gsub(matcher) do
          lead, pair_key, value = $1, $2, $3
          tail = keep > 0 ? value[-keep, keep] : ''
          "#{lead}#{pair_key}__OMIT__#{tail}"
        end
      end
    end
    private_class_method :obfuscate_query_values

  end
end
@@ -0,0 +1,26 @@
require 'zlib'

module ActiveScraper
  # ActiveRecord model holding one fetched response (body, headers, content
  # type, status code) for a Request, plus a checksum of the body.
  class Response < ActiveRecord::Base
    serialize :headers, Hash
    # The migration names the key column +active_scraper_request_id+, which
    # is not the +request_id+ Rails would infer from the association name.
    belongs_to :request, :foreign_key => :active_scraper_request_id
    before_save :set_checksum

    ############## class methods

    # Builds (without saving) a Response by copying body, headers,
    # content_type and code from +resp+, which must respond to those four
    # methods.
    def self.build_from_response_object(resp)
      response = new
      [:body, :headers, :content_type, :code].each do |att|
        response[att] = resp.send(att)
      end

      response
    end

    private

    # Stores a checksum of the body before every save.
    #
    # String#hash is seeded per Ruby process, so it cannot be compared across
    # runs. CRC32 is stable, and masking it to 31 bits keeps the value inside
    # a signed 4-byte integer column.
    def set_checksum
      self.checksum = Zlib.crc32(body.to_s) & 0x7fffffff

      # never halt the save from this callback
      true
    end
  end
end
@@ -0,0 +1,14 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <title>ActiveScraper</title>
5
+ <%= stylesheet_link_tag "active_scraper/application", media: "all" %>
6
+ <%= javascript_include_tag "active_scraper/application" %>
7
+ <%= csrf_meta_tags %>
8
+ </head>
9
+ <body>
10
+
11
+ <%= yield %>
12
+
13
+ </body>
14
+ </html>
data/config/routes.rb ADDED
@@ -0,0 +1,2 @@
# Route definitions for the ActiveScraper engine; none are declared.
ActiveScraper::Engine.routes.draw {}
@@ -0,0 +1,16 @@
# Creates the table backing ActiveScraper::Request.
class CreateActiveScraperRequests < ActiveRecord::Migration
  def change
    create_table :active_scraper_requests do |t|
      # scheme and extname are written by Request.build_request_params, and
      # scheme is part of the model's uniqueness scope, so both need columns.
      t.string :scheme
      t.string :host
      t.text :query
      t.string :path
      t.string :extname
      t.string :meta_tag
      t.boolean :is_obfuscated

      t.timestamps
    end

    add_index :active_scraper_requests, [:host, :path]

  end
end
@@ -0,0 +1,17 @@
# Creates the table backing ActiveScraper::Response, with two composite
# indexes keyed on the owning request.
class CreateActiveScraperResponses < ActiveRecord::Migration
  def change
    table = :active_scraper_responses

    create_table table do |t|
      t.text :body, limit: 4_294_967_295
      t.integer :code
      t.text :headers
      t.string :content_type
      t.integer :checksum
      t.integer :active_scraper_request_id

      t.timestamps
    end

    add_index table, [:active_scraper_request_id, :created_at], name: 'index_request_id_and_created_at'
    add_index table, [:active_scraper_request_id, :checksum], name: 'index_request_id_and_checksum'
  end
end
@@ -0,0 +1,10 @@
module ActiveScraper
  # Rails engine for ActiveScraper, isolated under its own namespace.
  class Engine < ::Rails::Engine
    isolate_namespace ActiveScraper

    # Generator defaults: RSpec without fixtures, no asset or helper files.
    config.generators do |generators|
      generators.test_framework :rspec, fixture: false
      generators.assets false
      generators.helper false
    end
  end
end
@@ -0,0 +1,65 @@
require 'httparty'
require 'addressable/uri'

module ActiveScraper
  # Retrieves a response for a URL, consulting the cache first unless a fresh
  # fetch is forced, and normalizes the result into an AgnosticResponseObject.
  class Fetcher

    # HTTParty class-level request methods that fetch_fresh may invoke.
    HTTP_VERBS = %w[get post put patch delete head options move copy].freeze

    # u    - a Request, a String or an Addressable::URI.
    # opts - :fresh => true skips the cache lookup; the remaining options are
    #        handed to fetch_from_cache / fetch_fresh. +opts+ is not modified.
    #
    # Returns an AgnosticResponseObject.
    def fetch(u, opts={})
      url = convert_uri_object(u)
      opts = opts.dup
      force_fresh = opts.delete(:fresh)

      # only an explicit `true` forces a fresh fetch
      cached = fetch_from_cache(url, opts) unless force_fresh == true

      build_response_object(cached || fetch_fresh(url, opts))
    end


    # Performs a live HTTP request through HTTParty.
    #
    # opts - 'verb' / :verb selects the HTTP method (default 'get').
    #
    # Returns whatever HTTParty returns for that verb.
    # Raises ArgumentError when the verb is not one of HTTP_VERBS, so a
    # caller-supplied value can never invoke an arbitrary method.
    def fetch_fresh(url, opts={})
      opts = opts.stringify_keys
      verb = opts.fetch('verb') { 'get' }.to_s.downcase

      unless HTTP_VERBS.include?(verb)
        raise ArgumentError, "unsupported HTTP verb: #{verb.inspect}"
      end

      HTTParty.public_send(verb, url.to_s)
    end


    # Intended to return a single cached record when a valid
    # ActiveScraper::Request exists for +uri+.
    # Not implemented yet: always returns nil, so fetch falls through to
    # fetch_fresh.
    def fetch_from_cache(uri, opts={})

    end

    # Intended to answer whether an ActiveScraper::Request with these
    # parameters exists.
    # Not implemented yet: always returns nil.
    def has_cache?(uri, opts={})

    end


    # u can either be a Request object, a String, or Addressable::URI
    # returns an Addressable::URI
    def convert_uri_object(u)
      u.is_a?(ActiveScraper::Request) ? u.uri : Addressable::URI.parse(u)
    end

    # Instance-level shortcut for Fetcher.build_response_object.
    def build_response_object(obj)
      self.class.build_response_object(obj)
    end

    # returns an AgnosticResponseObject wrapping obj
    def self.build_response_object(obj)
      AgnosticResponseObject.new(obj)
    end

  end
end
@@ -0,0 +1,3 @@
module ActiveScraper
  # Version string of the active_scraper gem.
  VERSION = '0.0.1'
end
@@ -0,0 +1,5 @@
# Library entry point: loads the engine definition and the fetcher.
require 'active_scraper/engine'
require 'active_scraper/fetcher'

# Top-level namespace of the gem.
module ActiveScraper
end
@@ -0,0 +1,4 @@
1
+ # desc "Explaining what the task does"
2
+ # task :active_scraper do
3
+ # # Task goes here
4
+ # end
metadata ADDED
@@ -0,0 +1,243 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: active_scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Dan Nguyen
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-01-03 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rails
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: 4.1.0.beta1
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: 4.1.0.beta1
27
+ - !ruby/object:Gem::Dependency
28
+ name: httparty
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ! '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ! '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ! '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: addressable
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ! '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: minitest
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ! '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ! '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: database_cleaner
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - '='
88
+ - !ruby/object:Gem::Version
89
+ version: 1.0.1
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - '='
95
+ - !ruby/object:Gem::Version
96
+ version: 1.0.1
97
+ - !ruby/object:Gem::Dependency
98
+ name: sqlite3
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ! '>='
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ! '>='
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: webmock
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ! '>='
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ! '>='
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: pry
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ! '>='
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ! '>='
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
+ - !ruby/object:Gem::Dependency
140
+ name: pry-rails
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ! '>='
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - ! '>='
151
+ - !ruby/object:Gem::Version
152
+ version: '0'
153
+ - !ruby/object:Gem::Dependency
154
+ name: vcr
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - ! '>='
158
+ - !ruby/object:Gem::Version
159
+ version: '0'
160
+ type: :development
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - ! '>='
165
+ - !ruby/object:Gem::Version
166
+ version: '0'
167
+ - !ruby/object:Gem::Dependency
168
+ name: rspec
169
+ requirement: !ruby/object:Gem::Requirement
170
+ requirements:
171
+ - - ! '>='
172
+ - !ruby/object:Gem::Version
173
+ version: '0'
174
+ type: :development
175
+ prerelease: false
176
+ version_requirements: !ruby/object:Gem::Requirement
177
+ requirements:
178
+ - - ! '>='
179
+ - !ruby/object:Gem::Version
180
+ version: '0'
181
+ - !ruby/object:Gem::Dependency
182
+ name: rspec-rails
183
+ requirement: !ruby/object:Gem::Requirement
184
+ requirements:
185
+ - - ! '>='
186
+ - !ruby/object:Gem::Version
187
+ version: '0'
188
+ type: :development
189
+ prerelease: false
190
+ version_requirements: !ruby/object:Gem::Requirement
191
+ requirements:
192
+ - - ! '>='
193
+ - !ruby/object:Gem::Version
194
+ version: '0'
195
+ description: A Rails Engine using ActiveRecord to cache results of HTTP scrapes
196
+ email:
197
+ - dansonguyen@gmail.com
198
+ executables: []
199
+ extensions: []
200
+ extra_rdoc_files: []
201
+ files:
202
+ - app/assets/stylesheets/active_scraper/application.css
203
+ - app/controllers/active_scraper/application_controller.rb
204
+ - app/helpers/active_scraper/application_helper.rb
205
+ - app/models/active_scraper/agnostic_response_object.rb
206
+ - app/models/active_scraper/request.rb
207
+ - app/models/active_scraper/response.rb
208
+ - app/views/layouts/active_scraper/application.html.erb
209
+ - config/routes.rb
210
+ - db/migrate/20131229024155_create_active_scraper_requests.rb
211
+ - db/migrate/20131229033843_create_active_scraper_responses.rb
212
+ - lib/active_scraper/engine.rb
213
+ - lib/active_scraper/fetcher.rb
214
+ - lib/active_scraper/version.rb
215
+ - lib/active_scraper.rb
216
+ - lib/tasks/active_scraper_tasks.rake
217
+ - MIT-LICENSE
218
+ - Rakefile
219
+ homepage: http://github.com/dannguyen/active_scraper
220
+ licenses:
221
+ - MIT
222
+ metadata: {}
223
+ post_install_message:
224
+ rdoc_options: []
225
+ require_paths:
226
+ - lib
227
+ required_ruby_version: !ruby/object:Gem::Requirement
228
+ requirements:
229
+ - - ! '>='
230
+ - !ruby/object:Gem::Version
231
+ version: '0'
232
+ required_rubygems_version: !ruby/object:Gem::Requirement
233
+ requirements:
234
+ - - ! '>='
235
+ - !ruby/object:Gem::Version
236
+ version: '0'
237
+ requirements: []
238
+ rubyforge_project:
239
+ rubygems_version: 2.1.11
240
+ signing_key:
241
+ specification_version: 4
242
+ summary: A Rails Engine using ActiveRecord to cache results of HTTP scrapes
243
+ test_files: []