active_scraper 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- ODhmOGY2MGY0MWNjZmExNGM5MzA5ZGQwZmFhNjg0ZDQxZTMzOGQwNw==
4
+ NWEwOTJjZGViNjkwYzZiMjYwY2YzMGE5YjRkMTdlNzJhMTMwZmJmMw==
5
5
  data.tar.gz: !binary |-
6
- NWQ5ZDQwYzZlNTg5MjY2YjdmZjFkN2Q4MmEzMjdmOGYwNjE0OWQ2NQ==
6
+ OGMyNmI5MDA3MmM5YWQ5MjIxZjlhYjZlYTliMDZhZWY0YjlmYjQ5ZQ==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- NjY1NWFjOTg5OGI1NWFlOTgzZjNjZGNiNjk2OGJkZDIyMTRjMmQwNWU3MjY3
10
- MTIyMDNlZDJiZWMwZjNhZDM4NTk1Njk5MjdkNmRiYTNlZDEzZDZmNjdhYjRl
11
- ZjFjNmVlMTVjYjI5MDE0MGYyNTdlNTEyZmEzMmI3N2QwYTNhNGY=
9
+ YTc0Mzk5YzdhNWZlM2YyYzJlNmZiZjMxOTg4OTg0YTU0MzhkMWMxMDVlNDJl
10
+ Y2UzNDVmNWE1NDhkODJlZTMyYWY1OTFjNTU3ZGYyNGIwNzA4NDEzZDMxMzg5
11
+ YWZkZWVmMTc1MjFjNGNhZDIzYTU0NGU1MTg4ZGYyYWNhNGQ3YTI=
12
12
  data.tar.gz: !binary |-
13
- YzIyNWE4M2JhODc0MmY3OWQ1ZDFkNmYwNjA1M2M5MDZiNDAyMGVlYmU2MTVl
14
- NDk5Y2ZiNTg1OGFkZmZkYmJhOGQyYjUyODA4NTg2ZDliNjhhMmQyZTNhZmEx
15
- YzQwYWU1NzdlZWE2MjRiMTM2YjUwNzM3NjM5YWM2NDk3YmU4NDk=
13
+ ZWMyYTc5MDJhZDU5NGY4ZDVlMjFkYTI0YzNkMDM2NDFkNjgyY2JhNDE3MGI2
14
+ M2Y5MTMzMTMzMGE3ZmFmYzk5OWNiNjQyNzRmM2E4ZjJmYjk5MWY0MzUzMjU0
15
+ YmMxNDQ3YWU1MTY4YTFmNDZjMjRlM2YyM2E4Nzc1NzA1MDQ3N2I=
@@ -0,0 +1,163 @@
1
+ require 'httparty'
2
+ require 'addressable/uri'
3
+ require 'hashie/mash'
4
+
5
+ module ActiveScraper
6
+ class CachedRequest < ActiveRecord::Base
7
+ has_many :responses, :dependent => :destroy, class_name: 'CachedResponse', foreign_key: 'cached_request_id'
8
+ has_one :latest_response, ->{ order('created_at DESC') }, class_name: 'ActiveScraper::CachedResponse', foreign_key: 'cached_request_id'
9
+ validates_uniqueness_of :path, scope: [:host, :query, :scheme]
10
+
11
+ attr_accessor :unobfuscated_query
12
+
13
+ delegate :to_s, :to => :uri
14
+
15
+ # problematic
16
+ scope :with_url, ->(u){
17
+ matching_request(u)
18
+ }
19
+
20
+ scope :matching_request, ->(req, opts={}){
21
+ if req.is_a?(CachedRequest)
22
+ req = req.to_uri
23
+ end
24
+ params = CachedRequest.build_validating_params(req, opts)
25
+
26
+ where(params)
27
+ }
28
+
29
+ scope :last_fetched_before, ->(some_time){
30
+ some_time = Time.parse(some_time) if some_time.is_a?(String)
31
+
32
+ where("last_fetched_at < ?", some_time)
33
+ }
34
+
35
+ def latest_response_fetched_after(time)
36
+ if latest_response.present?
37
+ return latest_response if latest_response.created_at > time
38
+ end
39
+
40
+ nil
41
+ end
42
+
43
+ def to_fake_party_hash
44
+ h = Hashie::Mash.new(self.attributes.symbolize_keys.slice(:scheme, :host, :path, :query))
45
+ h[:uri] = self.standard_uri
46
+ h[:options] ||= {}
47
+ h[:headers] ||= {}
48
+
49
+ return h
50
+ end
51
+
52
+
53
+
54
+ def obfuscated?
55
+ is_obfuscated == true
56
+ end
57
+
58
+ # to follow HTTParty conventions
59
+ def standard_uri
60
+ URI.parse(uri)
61
+ end
62
+
63
+ def uri
64
+ to_uri
65
+ end
66
+
67
+ # during a fresh query, we need to actually use the unobfuscated_query
68
+ def to_uri
69
+ h = self.attributes.symbolize_keys.slice(:scheme, :host, :path)
70
+ h[:query] = self.unobfuscated_query || self.query
71
+
72
+ return Addressable::URI.new(h)
73
+ end
74
+
75
+ def self.build_validating_params(uri, opts={})
76
+ h = build_request_params(uri, opts)
77
+
78
+ h.slice(:scheme, :host, :path, :query)
79
+ end
80
+
81
+ #########################################################
82
+ ############ class methods
83
+
84
+
85
+
86
+ # Returns a Hash with symbolized keys
87
+ def self.build_request_params(uri, opts={})
88
+ u = Addressable::URI.parse(uri)
89
+ hsh = {scheme: u.normalized_scheme, host: u.normalized_host, path: u.normalized_path, query: u.normalized_query , extname: u.extname}
90
+ # deal with query separately
91
+ unless opts[:normalize_query] == false
92
+ hsh[:query] = normalize_query_params(hsh[:query])
93
+ end
94
+
95
+ hsh[:unobfuscated_query] = hsh[:query]
96
+ if ob_keys = opts[:obfuscate_query]
97
+ hsh[:query] = obfuscate_query_params(hsh[:query], ob_keys)
98
+ hsh[:is_obfuscated] = true
99
+ else
100
+ hsh[:is_obfuscated] = false
101
+ end
102
+
103
+ return hsh
104
+ end
105
+
106
+ def self.build_from_uri(uri, opts={})
107
+ request_params = build_request_params(uri, opts)
108
+ request_obj = CachedRequest.new(request_params)
109
+
110
+ return request_obj
111
+ end
112
+
113
+ def self.find_or_build_from_uri(uri, opts={})
114
+ self.matching_request(uri, opts).first || self.build_from_uri(uri, opts)
115
+ end
116
+
117
+ 1
118
+ def self.create_from_uri(uri, opts={})
119
+ req = build_from_uri(uri, opts)
120
+ req.save
121
+
122
+ return req
123
+ end
124
+
125
+
126
+
127
+ QUERY_NORMALIZER = HTTParty::Request::NON_RAILS_QUERY_STRING_NORMALIZER
128
+ # :q is a query String or Hash
129
+ # e.g. 'z=hello&b=world&a=dog'
130
+ # or: {z: ['hello', 'world'], a: 'dog'}
131
+ #
132
+ # returns: (String) "a=dog&z=hello&z=world"
133
+ def self.normalize_query_params(q)
134
+ return q if q.blank?
135
+
136
+ params_hash = CGI.parse(q)
137
+ params_str = QUERY_NORMALIZER[params_hash]
138
+
139
+ return params_str
140
+ end
141
+
142
+
143
+ private
144
+
145
+ def self.obfuscate_query_params(q, ob_keys)
146
+ string = q.dup
147
+ Array(ob_keys).each do |key|
148
+ a = Array(key)
149
+
150
+ key_to_omit = Regexp.escape(a[0].to_s)
151
+ char_num = a[1] || 0
152
+ if val_to_omit = string.match(/(?<=#{key_to_omit}=)(.*?)(?=&|$)/)
153
+ val = val_to_omit[1]
154
+ string.sub!( val, "__OMIT__#{val[-char_num, char_num]}")
155
+ end
156
+ end
157
+
158
+ return string
159
+ end
160
+
161
+
162
+ end
163
+ end
@@ -0,0 +1,135 @@
1
+ require 'nokogiri'
2
+ module ActiveScraper
3
+ class CachedResponse < ActiveRecord::Base
4
+ serialize :headers, Hash
5
+ belongs_to :request, touch: true, class_name: 'CachedRequest', foreign_key: 'cached_request_id'
6
+ before_create :encode_body_for_create
7
+ before_save :set_checksum
8
+
9
+ after_create :touch_request_fetched_at
10
+
11
+ def to_fake_party_hash
12
+ [:body, :headers, :content_type, :code].inject(Hashie::Mash.new) do |hsh, att|
13
+ hsh[att] = self.send(att)
14
+
15
+ hsh
16
+ end
17
+ end
18
+
19
+
20
+ def binary?
21
+ content_type =~ /pdf|image/ || !text?
22
+ end
23
+
24
+ def json?
25
+ content_type =~ /json/
26
+ end
27
+
28
+ def html?
29
+ content_type =~ /html/
30
+ end
31
+
32
+ def xml?
33
+ html? || content_type =~ /xml/
34
+ end
35
+
36
+ def text?
37
+ content_type =~ /text/ || xml? || json?
38
+ end
39
+
40
+ def body_changed?
41
+ self.changed_attributes.keys.include?('body')
42
+ end
43
+
44
+ def body
45
+ b = read_attribute(:body)
46
+ if b.present? && binary? && !body_changed?
47
+ return Base64.decode64(b)
48
+ else
49
+ return b
50
+ end
51
+ end
52
+
53
+ def parsed_body
54
+ @_parsedbody ||= if xml?
55
+ Nokogiri::HTML(body)
56
+ elsif json?
57
+ JSON.parse(body)
58
+ else
59
+ body
60
+ end
61
+ end
62
+
63
+ def to_s
64
+ body
65
+ end
66
+
67
+
68
+
69
+ private
70
+ def set_checksum
71
+ self.checksum = body.hash
72
+
73
+ true
74
+ end
75
+
76
+
77
+ def touch_request_fetched_at
78
+ if request && !request.new_record?
79
+ request.update_attributes(last_fetched_at: self.created_at) if self == request.latest_response
80
+ end
81
+
82
+ true
83
+ end
84
+
85
+
86
+ # expects @body to be populated
87
+ # returns string: e.g. 'utf-8', 'windows-1251'
88
+ def detect_encoding
89
+ if xml?
90
+ parsed_body.encoding
91
+ else
92
+ body.encoding
93
+ end
94
+ end
95
+
96
+ # converts @body to utf-8 if not already
97
+ def encode_body_for_create
98
+ if self.body.present?
99
+ if binary?
100
+ self.body = Base64.encode64(self.body)
101
+ elsif
102
+ denc = detect_encoding
103
+ self.body = self.body.encode('utf-8', denc)
104
+ end
105
+ end
106
+
107
+ true
108
+ end
109
+
110
+
111
+
112
+
113
+
114
+ ############## class methods
115
+ def self.find_cache_for_cached_request(cached_request, opts={})
116
+ time = opts[:fetched_after] || Time.at(0)
117
+ # smell: just goes back to CachedRequest
118
+ cached_request.latest_response_fetched_after(time)
119
+ end
120
+
121
+ def self.find_cache_for_request(req, opts)
122
+ # TODO
123
+ end
124
+
125
+ # has one side-effect: :body is properly encoded
126
+ def self.build_from_response_object(resp)
127
+ response = self.new
128
+ [:body, :headers, :content_type, :code].each do |att|
129
+ response.send :write_attribute, att, resp.send(att)
130
+ end
131
+
132
+ return response
133
+ end
134
+ end
135
+ end
@@ -0,0 +1,19 @@
1
+ class CreateActiveScraperCachedRequests < ActiveRecord::Migration
2
+ def change
3
+ create_table :active_scraper_cached_requests do |t|
4
+ t.string "scheme"
5
+ t.string "host"
6
+ t.text "query"
7
+ t.string "path"
8
+ t.string "meta_tag"
9
+ t.string "extname"
10
+ t.boolean "is_obfuscated"
11
+ t.datetime "created_at"
12
+ t.datetime "updated_at"
13
+ t.datetime "last_fetched_at"
14
+ end
15
+
16
+ add_index "active_scraper_cached_requests", ["host", "path"], name: "index_as_requests_on_host_and_path"
17
+
18
+ end
19
+ end
@@ -0,0 +1,16 @@
1
+ class CreateActiveScraperCachedResponses < ActiveRecord::Migration
2
+ def change
3
+ create_table :active_scraper_cached_responses do |t|
4
+ t.text "body", limit: 4294967295
5
+ t.integer "code"
6
+ t.text "headers"
7
+ t.string "content_type"
8
+ t.integer "checksum"
9
+ t.integer "cached_request_id"
10
+ t.timestamps
11
+ end
12
+
13
+ add_index :active_scraper_cached_responses, [:cached_request_id, :created_at], name: 'index_request_id_and_created_at'
14
+ add_index :active_scraper_cached_responses, [:cached_request_id, :checksum], name: 'index_request_id_and_checksum'
15
+ end
16
+ end
@@ -1,5 +1,86 @@
1
+ # encoding: UTF-8
2
+
1
3
  require "active_scraper/engine"
2
- require "active_scraper/fetcher"
4
+ require 'active_scraper/fake_http_party_response'
5
+ require 'active_scraper/response_object'
3
6
 
4
7
  module ActiveScraper
8
+
9
+
10
+ # returns a ActiveScraper::CachedResponse
11
+ def self.get(uri, options={})
12
+ o = create_request_and_fetch_response(uri, options)
13
+
14
+ return o.response
15
+ end
16
+
17
+
18
+
19
+ # delegates to CachedRequest::find_or_build_from_uri
20
+ # req (URI or String). If CachedRequest, is idempotent
21
+ #
22
+ # returns a new or existing CachedRequest
23
+ def self.find_or_build_request(req, opts={})
24
+ CachedRequest.find_or_build_from_uri(req, opts)
25
+ end
26
+
27
+ ## cached_request (CachedRequest) => the request to find a response for
28
+ ##
29
+ ## returns a new or existing CachedResponse
30
+
31
+ def self.find_or_build_response(cached_request, opts={})
32
+ raise ArgumentError, "Only accepted CachedRequest, but was passed in a #{cached_request.class}" unless cached_request.is_a?(CachedRequest)
33
+ opts = normalize_hash(opts)
34
+
35
+ response = CachedResponse.find_cache_for_cached_request(cached_request, opts)
36
+
37
+ if response.blank?
38
+ fetched_obj = fetch_fresh(cached_request.uri, opts)
39
+ response = CachedResponse.build_from_response_object(fetched_obj)
40
+ end
41
+
42
+ return response
43
+ end
44
+
45
+
46
+ def self.create_request_and_fetch_response(uri, opts={})
47
+ opts = normalize_hash(opts)
48
+ # first, find or build the request
49
+ request = find_or_build_request(uri, opts)
50
+ # then find or build a matching response
51
+ response = find_or_build_response(request, opts)
52
+ # associate and save the two
53
+ request.responses << response
54
+ request.save
55
+
56
+ obj = Hashie::Mash.new(request: request, response: response)
57
+
58
+ return obj
59
+ end
60
+
61
+ # Returns an object compatible with HTTParty, i.e. an ActiveScraper::FakeHTTPartyResponse
62
+ # to be deprecated
63
+ def self.build_usable_response(request, response)
64
+ ActiveScraper::FakeHTTPartyResponse.new(request, response)
65
+ end
66
+
67
+
68
+
69
+ def self.fetch_fresh(url, opts={})
70
+ resp = HTTParty.get(url, opts)
71
+
72
+ return ActiveScraper::ResponseObject.factory(resp)
73
+ end
74
+
75
+
76
+
77
+
78
+ def self.normalize_hash(hsh)
79
+ unless hsh.is_a?(HashWithIndifferentAccess)
80
+ hsh = HashWithIndifferentAccess.new(hsh)
81
+ end
82
+
83
+ return hsh
84
+ end
85
+
5
86
  end
@@ -1,6 +1,17 @@
1
1
  module ActiveScraper
2
2
  class Engine < ::Rails::Engine
3
3
  isolate_namespace ActiveScraper
4
+
5
+ # monkey patch via: http://pivotallabs.com/leave-your-migrations-in-your-rails-engines/
6
+ initializer :append_migrations do |app|
7
+ unless app.root.to_s.match root.to_s
8
+ config.paths["db/migrate"].expanded.each do |expanded_path|
9
+ app.config.paths["db/migrate"] << expanded_path
10
+ end
11
+ end
12
+ end
13
+
14
+
4
15
  config.generators do |g|
5
16
  g.test_framework :rspec, :fixture => false
6
17
  g.assets false
@@ -0,0 +1,21 @@
1
+ require 'httparty'
2
+ require 'nokogiri'
3
+ module ActiveScraper
4
+ class FakeHTTPartyResponse < SimpleDelegator
5
+
6
+
7
+ def initialize(request, response, parsed_block=nil, options={})
8
+ request = request.to_fake_party_hash if request.is_a?(CachedRequest)
9
+ response = response.to_fake_party_hash if response.is_a?(CachedResponse)
10
+
11
+ ## making HTTParty happy...
12
+
13
+ parsed_block ||= ->(){ response.body }
14
+
15
+ super(HTTParty::Response.new request, response, parsed_block, options)
16
+ end
17
+
18
+
19
+
20
+ end
21
+ end
@@ -0,0 +1,13 @@
1
+ require 'active_scraper/response_object/basic'
2
+
3
+ module ActiveScraper
4
+ module ResponseObject
5
+
6
+
7
+ def self.factory(obj)
8
+ ActiveScraper::ResponseObject::Basic.new(obj)
9
+ end
10
+
11
+
12
+ end
13
+ end
@@ -0,0 +1,67 @@
1
+ module ActiveScraper
2
+ module ResponseObject
3
+ class Basic < SimpleDelegator
4
+ # I don't really know what to name this but this is passed
5
+ # between the various classes, including the Fetcher,
6
+ # and is expected to behave the same in those interactions
7
+
8
+ attr_reader :code, :headers, :body, :content_type
9
+
10
+ def initialize(obj)
11
+ if obj.class == (HTTParty::Response)
12
+ # use the Net::HTTPResponse instead
13
+ obj = obj.response
14
+ end
15
+
16
+ response_obj = if obj.is_a?(Net::HTTPResponse)
17
+ @body = obj.body
18
+ @content_type = obj.content_type
19
+ @headers = obj.each_header.inject({}){|h, (k, v)| h[k] = v; h }
20
+ @code = obj.code.to_i
21
+ elsif obj.is_a?(ActiveScraper::CachedResponse)
22
+ @body = obj.body
23
+ @content_type = obj.content_type
24
+ @headers = obj.headers
25
+ @code = obj.code.to_i
26
+ elsif obj.is_a?(StringIO) && obj.respond_to?(:meta) # OpenURI.open
27
+ @body = obj.read
28
+ @content_type = obj.content_type
29
+ @headers = obj.meta
30
+ @code = obj.status[0].to_i
31
+ elsif obj.nil?
32
+ # just do nothing
33
+ else
34
+ # other types have to raise an Error
35
+ raise ArgumentError, 'Improper class type'
36
+ end
37
+
38
+ super(ActiveSupport::HashWithIndifferentAccess.new() )
39
+
40
+ def empty?
41
+ @body.empty?
42
+ end
43
+
44
+ def nil?
45
+ @body.nil?
46
+ end
47
+
48
+
49
+ # now set its values
50
+ [:body, :headers, :content_type, :code].each do |a|
51
+ self[a] = self.send(a)
52
+ end
53
+ end
54
+
55
+ # def [](k)
56
+ # @values[k.to_sym]
57
+ # end
58
+
59
+ # def [](k,v)
60
+ # send(:"#{k}=", v)
61
+ # end
62
+
63
+ end
64
+ end
65
+ end
66
+
67
+
@@ -1,3 +1,3 @@
1
1
  module ActiveScraper
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: active_scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dan Nguyen
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-01-03 00:00:00.000000000 Z
11
+ date: 2014-02-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rails
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - ! '>='
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: hashie
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ! '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ! '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: minitest
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -192,6 +206,20 @@ dependencies:
192
206
  - - ! '>='
193
207
  - !ruby/object:Gem::Version
194
208
  version: '0'
209
+ - !ruby/object:Gem::Dependency
210
+ name: timecop
211
+ requirement: !ruby/object:Gem::Requirement
212
+ requirements:
213
+ - - ! '>='
214
+ - !ruby/object:Gem::Version
215
+ version: '0'
216
+ type: :development
217
+ prerelease: false
218
+ version_requirements: !ruby/object:Gem::Requirement
219
+ requirements:
220
+ - - ! '>='
221
+ - !ruby/object:Gem::Version
222
+ version: '0'
195
223
  description: A Rails Engine using ActiveRecord to cache results of HTTP scrapes
196
224
  email:
197
225
  - dansonguyen@gmail.com
@@ -202,15 +230,16 @@ files:
202
230
  - app/assets/stylesheets/active_scraper/application.css
203
231
  - app/controllers/active_scraper/application_controller.rb
204
232
  - app/helpers/active_scraper/application_helper.rb
205
- - app/models/active_scraper/agnostic_response_object.rb
206
- - app/models/active_scraper/request.rb
207
- - app/models/active_scraper/response.rb
233
+ - app/models/active_scraper/cached_request.rb
234
+ - app/models/active_scraper/cached_response.rb
208
235
  - app/views/layouts/active_scraper/application.html.erb
209
236
  - config/routes.rb
210
- - db/migrate/20131229024155_create_active_scraper_requests.rb
211
- - db/migrate/20131229033843_create_active_scraper_responses.rb
237
+ - db/migrate/20131229024155_create_active_scraper_cached_requests.rb
238
+ - db/migrate/20131229033843_create_active_scraper_cached_responses.rb
212
239
  - lib/active_scraper/engine.rb
213
- - lib/active_scraper/fetcher.rb
240
+ - lib/active_scraper/fake_http_party_response.rb
241
+ - lib/active_scraper/response_object/basic.rb
242
+ - lib/active_scraper/response_object.rb
214
243
  - lib/active_scraper/version.rb
215
244
  - lib/active_scraper.rb
216
245
  - lib/tasks/active_scraper_tasks.rake
@@ -1,47 +0,0 @@
1
- module ActiveScraper
2
- class AgnosticResponseObject < SimpleDelegator
3
-
4
- attr_reader :code, :headers, :body, :content_type
5
-
6
- def initialize(obj)
7
- if obj.class == (HTTParty::Response)
8
- # use the Net::HTTPResponse instead
9
- obj = obj.response
10
- end
11
-
12
- response_obj = if obj.is_a?(Net::HTTPResponse)
13
- @body = obj.body
14
- @content_type = obj.content_type
15
- @headers = obj.each_header.inject({}){|h, (k, v)| h[k] = v; h }
16
- @code = obj.code.to_i
17
- elsif obj.is_a?(ActiveScraper::Request)
18
- @body = obj.body
19
- @content_type = obj.content_type
20
- @headers = obj.headers
21
- @code = obj.code.to_i
22
- else
23
- # this is probably not used
24
- @body = obj.to_s
25
- @headers = {}
26
- @content_type = nil
27
- @code = nil
28
- end
29
-
30
- super({})
31
-
32
- # now set its values
33
- [:body, :headers, :content_type, :code].each do |a|
34
- self[a] = self.send(a)
35
- end
36
- end
37
-
38
- # def [](k)
39
- # @values[k.to_sym]
40
- # end
41
-
42
- # def [](k,v)
43
- # send(:"#{k}=", v)
44
- # end
45
-
46
- end
47
- end
@@ -1,98 +0,0 @@
1
- require 'addressable/uri'
2
- module ActiveScraper
3
- class Request < ActiveRecord::Base
4
- has_many :responses, :dependent => :destroy
5
- validates_uniqueness_of :path, scope: [:host, :query, :scheme]
6
-
7
-
8
- scope :with_url, ->(u){
9
- params = Request.build_validating_params(u)
10
- where(params)
11
- }
12
-
13
-
14
- def obfuscated?
15
- is_obfuscated == true
16
- end
17
-
18
- def uri
19
- Addressable::URI.new(
20
- self.attributes.symbolize_keys.slice(:scheme, :host, :path, :query)
21
- )
22
- end
23
-
24
- def self.build_validating_params(uri, opts={})
25
- h = build_request_params(uri, opts)
26
-
27
- h.slice(:scheme, :host, :path, :query)
28
- end
29
-
30
- # Returns a Hash with symbolized keys
31
- def self.build_request_params(uri, opts={})
32
- u = Addressable::URI.parse(uri)
33
- hsh = {scheme: u.normalized_scheme, host: u.normalized_host, path: u.normalized_path, query: u.normalized_query, extname: u.extname}
34
-
35
- if ob_keys = opts.delete(:obfuscate_query)
36
- Array(ob_keys).each do |key|
37
- a = Array(key)
38
-
39
- key_to_omit = Regexp.escape(a[0].to_s)
40
- char_num = a[1] || 0
41
-
42
- if val_to_omit = hsh[:query].match(/(?<=#{key_to_omit}=)(.*?)(?=&|$)/)
43
- val = val_to_omit[1]
44
- hsh[:query].sub!( val, "__OMIT__#{val[-char_num, char_num]}")
45
- end
46
- end
47
-
48
- hsh[:is_obfuscated] = true
49
- else
50
- hsh[:is_obfuscated] = false
51
- end
52
-
53
- return hsh
54
- end
55
-
56
- def self.build_from_uri(uri, opts={})
57
- request_params = build_request_params(uri, opts)
58
- request_obj = Request.new(request_params)
59
-
60
- return request_obj
61
- end
62
-
63
- def self.find_or_build_from_uri(uri, opts={})
64
- self.with_url(uri).first || self.build_from_uri(uri, opts)
65
- end
66
-
67
-
68
- def self.create_from_uri(uri, opts={})
69
- req = build_from_uri(uri, opts)
70
- req.save
71
-
72
- return req
73
- end
74
-
75
-
76
- def self.create_and_fetch_response(uri, opts={}, fetcher = nil)
77
- request = find_or_build_from_uri(uri, opts)
78
- fetcher = fetcher || Fetcher.new
79
-
80
- if request.id.nil?
81
- # this request is new
82
- # so skip to the fresh
83
- resp = fetcher.fetch request, fresh: true
84
- else
85
- # will check the cache and the fresh
86
- resp = fetcher.fetch request
87
- end
88
-
89
- # build the response
90
- response = request.responses.build(resp)
91
- # theoretically, response will be saved too
92
- request.save
93
-
94
- return request
95
- end
96
-
97
- end
98
- end
@@ -1,26 +0,0 @@
1
- module ActiveScraper
2
- class Response < ActiveRecord::Base
3
- serialize :headers, Hash
4
- belongs_to :request
5
- before_save :set_checksum
6
-
7
-
8
-
9
- private
10
- def set_checksum
11
- self.checksum = body.hash
12
-
13
- true
14
- end
15
-
16
- ############## class methods
17
- def self.build_from_response_object(resp)
18
- response = self.new
19
- [:body, :headers, :content_type, :code].each do |att|
20
- response.send :write_attribute, att, resp.send(att)
21
- end
22
-
23
- return response
24
- end
25
- end
26
- end
@@ -1,16 +0,0 @@
1
- class CreateActiveScraperRequests < ActiveRecord::Migration
2
- def change
3
- create_table :active_scraper_requests do |t|
4
- t.string :host
5
- t.text :query
6
- t.string :path
7
- t.string :meta_tag
8
- t.boolean :is_obfuscated
9
-
10
- t.timestamps
11
- end
12
-
13
- add_index :active_scraper_requests, [:host, :path]
14
-
15
- end
16
- end
@@ -1,17 +0,0 @@
1
- class CreateActiveScraperResponses < ActiveRecord::Migration
2
- def change
3
- create_table :active_scraper_responses do |t|
4
- t.text :body, :limit => 4294967295
5
- t.integer :code
6
- t.text :headers
7
- t.string :content_type
8
- t.integer :checksum
9
- t.integer :active_scraper_request_id
10
-
11
- t.timestamps
12
- end
13
-
14
- add_index :active_scraper_responses, [:active_scraper_request_id, :created_at], name: 'index_request_id_and_created_at'
15
- add_index :active_scraper_responses, [:active_scraper_request_id, :checksum], name: 'index_request_id_and_checksum'
16
- end
17
- end
@@ -1,65 +0,0 @@
1
- require 'httparty'
2
-
3
- module ActiveScraper
4
- class Fetcher
5
-
6
- def fetch(u, opts={})
7
- url = convert_uri_object(u)
8
- force_fresh = opts.delete :fresh
9
-
10
- if force_fresh != true && (record = fetch_from_cache(url, opts))
11
- resp_obj = record
12
- else
13
- resp_obj = fetch_fresh(url, opts)
14
- end
15
-
16
- build_response_object(resp_obj)
17
- end
18
-
19
-
20
- def fetch_fresh(url, opts={})
21
- opts = opts.stringify_keys
22
-
23
- url = url.to_s
24
- verb = opts.fetch('verb'){ 'get' }
25
-
26
- resp = HTTParty.send(verb, url)
27
- end
28
-
29
-
30
- # returns:
31
- # single ScrapeCache if a valid ActiveScraper::Request exists
32
- #
33
- def fetch_from_cache(uri, opts={})
34
-
35
- end
36
-
37
- # true or false if ActiveScraper::Request with these parameters exist
38
- def has_cache?(uri, opts={})
39
-
40
- end
41
-
42
-
43
- # u can either be a Request object, a String, or Addressable::URI
44
- # returns an Addressable::URI
45
- def convert_uri_object(u)
46
- if u.is_a?(ActiveScraper::Request)
47
- x = u.uri
48
- else
49
- x = Addressable::URI.parse(u)
50
- end
51
-
52
- return x
53
- end
54
-
55
- def build_response_object(obj)
56
- self.class.build_response_object(obj)
57
- end
58
-
59
- # returns an OpenStruct that Response can use
60
- def self.build_response_object(obj)
61
- return AgnosticResponseObject.new(obj)
62
- end
63
-
64
- end
65
- end