active_scraper 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- ODhmOGY2MGY0MWNjZmExNGM5MzA5ZGQwZmFhNjg0ZDQxZTMzOGQwNw==
4
+ NWEwOTJjZGViNjkwYzZiMjYwY2YzMGE5YjRkMTdlNzJhMTMwZmJmMw==
5
5
  data.tar.gz: !binary |-
6
- NWQ5ZDQwYzZlNTg5MjY2YjdmZjFkN2Q4MmEzMjdmOGYwNjE0OWQ2NQ==
6
+ OGMyNmI5MDA3MmM5YWQ5MjIxZjlhYjZlYTliMDZhZWY0YjlmYjQ5ZQ==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- NjY1NWFjOTg5OGI1NWFlOTgzZjNjZGNiNjk2OGJkZDIyMTRjMmQwNWU3MjY3
10
- MTIyMDNlZDJiZWMwZjNhZDM4NTk1Njk5MjdkNmRiYTNlZDEzZDZmNjdhYjRl
11
- ZjFjNmVlMTVjYjI5MDE0MGYyNTdlNTEyZmEzMmI3N2QwYTNhNGY=
9
+ YTc0Mzk5YzdhNWZlM2YyYzJlNmZiZjMxOTg4OTg0YTU0MzhkMWMxMDVlNDJl
10
+ Y2UzNDVmNWE1NDhkODJlZTMyYWY1OTFjNTU3ZGYyNGIwNzA4NDEzZDMxMzg5
11
+ YWZkZWVmMTc1MjFjNGNhZDIzYTU0NGU1MTg4ZGYyYWNhNGQ3YTI=
12
12
  data.tar.gz: !binary |-
13
- YzIyNWE4M2JhODc0MmY3OWQ1ZDFkNmYwNjA1M2M5MDZiNDAyMGVlYmU2MTVl
14
- NDk5Y2ZiNTg1OGFkZmZkYmJhOGQyYjUyODA4NTg2ZDliNjhhMmQyZTNhZmEx
15
- YzQwYWU1NzdlZWE2MjRiMTM2YjUwNzM3NjM5YWM2NDk3YmU4NDk=
13
+ ZWMyYTc5MDJhZDU5NGY4ZDVlMjFkYTI0YzNkMDM2NDFkNjgyY2JhNDE3MGI2
14
+ M2Y5MTMzMTMzMGE3ZmFmYzk5OWNiNjQyNzRmM2E4ZjJmYjk5MWY0MzUzMjU0
15
+ YmMxNDQ3YWU1MTY4YTFmNDZjMjRlM2YyM2E4Nzc1NzA1MDQ3N2I=
@@ -0,0 +1,163 @@
1
+ require 'httparty'
2
+ require 'addressable/uri'
3
+ require 'hashie/mash'
4
+
5
+ module ActiveScraper
6
+ class CachedRequest < ActiveRecord::Base
7
# Every response stored for this request; they are destroyed with it.
has_many :responses, dependent: :destroy, class_name: 'CachedResponse', foreign_key: 'cached_request_id'

# The single newest response, picked by ordering on created_at descending.
has_one :latest_response, -> { order('created_at DESC') }, class_name: 'ActiveScraper::CachedResponse', foreign_key: 'cached_request_id'

# A request is unique per scheme + host + path + query combination.
validates_uniqueness_of :path, scope: [:host, :query, :scheme]

# Plain Ruby accessor for the readable query string; #to_uri prefers it
# over the :query attribute whenever it is set.
attr_accessor :unobfuscated_query

# String conversion is handed to the value returned by #uri.
delegate :to_s, to: :uri
14
+
15
# problematic
# Shorthand for the matching_request scope; no options are forwarded.
scope :with_url, lambda { |u| matching_request(u) }

# Rows whose scheme/host/path/query equal the values that
# CachedRequest.build_validating_params derives from +req+ and +opts+.
# A CachedRequest argument is converted with #to_uri first.
scope :matching_request, lambda { |req, opts = {}|
  target = req.is_a?(CachedRequest) ? req.to_uri : req

  where(CachedRequest.build_validating_params(target, opts))
}

# Requests whose last_fetched_at is strictly earlier than +some_time+.
# A String argument is converted with Time.parse before comparing.
scope :last_fetched_before, lambda { |some_time|
  cutoff = some_time.is_a?(String) ? Time.parse(some_time) : some_time

  where("last_fetched_at < ?", cutoff)
}
34
+
35
# Returns latest_response when it was created strictly after +time+.
# Returns nil when there is no latest response or it is not newer.
def latest_response_fetched_after(time)
  newest = latest_response
  return nil unless newest.present?

  newest.created_at > time ? newest : nil
end
42
+
43
# Builds a Hashie::Mash holding this record's scheme, host, path and
# query, plus :uri (from #standard_uri) and empty :options / :headers
# hashes when those keys are not already present.
def to_fake_party_hash
  request_parts = attributes.symbolize_keys.slice(:scheme, :host, :path, :query)

  Hashie::Mash.new(request_parts).tap do |mash|
    mash[:uri] = standard_uri
    mash[:options] ||= {}
    mash[:headers] ||= {}
  end
end
51
+
52
+
53
+
54
# True only when the is_obfuscated attribute is exactly +true+;
# nil and false both give false.
def obfuscated?
  true == is_obfuscated
end
57
+
58
# to follow HTTParty conventions
# Hands the value of #uri to Ruby's standard URI.parse and returns the
# result.
def standard_uri
  URI.parse(uri)
end

# Same value as #to_uri.
def uri
  to_uri
end

# during a fresh query, we need to actually use the unobfuscated_query
# Assembles an Addressable::URI from the stored scheme, host and path.
# The query part is unobfuscated_query when set, else the :query attribute.
def to_uri
  components = attributes.symbolize_keys.slice(:scheme, :host, :path)

  Addressable::URI.new(components.merge(query: unobfuscated_query || query))
end
74
+
75
# Reduces the Hash from build_request_params to the four keys that
# identify a request: :scheme, :host, :path and :query.
def self.build_validating_params(uri, opts={})
  build_request_params(uri, opts).slice(:scheme, :host, :path, :query)
end
80
+
81
+ #########################################################
82
+ ############ class methods
83
+
84
+
85
+
86
# Returns a Hash with symbolized keys
# Keys: :scheme, :host, :path, :query, :extname, :unobfuscated_query
# and :is_obfuscated.
#
# opts[:normalize_query] - pass false to skip normalize_query_params.
# opts[:obfuscate_query] - key(s) handed to obfuscate_query_params; when
#                          given, :query holds the masked string and
#                          :is_obfuscated is true.
def self.build_request_params(uri, opts={})
  parsed = Addressable::URI.parse(uri)
  params = {
    scheme:  parsed.normalized_scheme,
    host:    parsed.normalized_host,
    path:    parsed.normalized_path,
    query:   parsed.normalized_query,
    extname: parsed.extname
  }

  # deal with query separately
  params[:query] = normalize_query_params(params[:query]) unless opts[:normalize_query] == false

  # keep the readable query before any masking is applied
  params[:unobfuscated_query] = params[:query]

  keys_to_mask = opts[:obfuscate_query]
  params[:query] = obfuscate_query_params(params[:query], keys_to_mask) if keys_to_mask
  params[:is_obfuscated] = keys_to_mask ? true : false

  params
end
105
+
106
# Instantiates (without saving) a CachedRequest from the attributes
# that build_request_params derives from +uri+ and +opts+.
def self.build_from_uri(uri, opts={})
  CachedRequest.new(build_request_params(uri, opts))
end
112
+
113
# Returns the first row of the matching_request scope for +uri+, or a
# new unsaved record from build_from_uri when no row matches.
def self.find_or_build_from_uri(uri, opts={})
  existing = matching_request(uri, opts).first

  existing || build_from_uri(uri, opts)
end
116
+
117
# Builds a CachedRequest for +uri+ via build_from_uri and calls #save.
#
# The result of #save is not inspected here, so the record is returned
# whether or not it was persisted; callers that care should check it.
#
# (A stray literal `1` statement that preceded this method was removed;
# it had no effect.)
def self.create_from_uri(uri, opts={})
  req = build_from_uri(uri, opts)
  req.save

  req
end
124
+
125
+
126
+
127
QUERY_NORMALIZER = HTTParty::Request::NON_RAILS_QUERY_STRING_NORMALIZER
# :q is a query String or Hash
# e.g. 'z=hello&b=world&a=dog'
# or: {z: ['hello', 'world'], a: 'dog'}
#
# returns: (String) "a=dog&z=hello&z=world"
#
# Blank input (nil, "", {}) is returned unchanged.
def self.normalize_query_params(q)
  return q if q.blank?

  # CGI.parse only understands Strings. A Hash is passed to the
  # normalizer as-is, which makes the documented Hash form usable.
  params_hash = q.is_a?(Hash) ? q : CGI.parse(q)

  QUERY_NORMALIZER[params_hash]
end
141
+
142
+
143
+ private
144
+
145
# Masks the values of selected parameters in a query string.
#
# q       - query String such as "a=1&key=secret"; nil is returned as nil.
# ob_keys - one key, or a list whose entries are either a key or a
#           [key, keep] pair. +keep+ is how many trailing characters of
#           the value stay visible (default 0). A Hash of key => keep
#           also works, because Array() turns it into such pairs.
#
# Returns a new String in which the first occurrence of each listed
# parameter has its value replaced by "__OMIT__" plus the kept tail.
# +q+ itself is never modified.
#
# Differences from the previous implementation:
# * a nil query no longer raises;
# * the replacement happens at the matched parameter, not at the first
#   place the value text occurs anywhere in the string;
# * a key only matches a whole parameter name (at the start of the
#   string or right after "&"), so "key" no longer matches "apikey=".
def self.obfuscate_query_params(q, ob_keys)
  return q if q.nil?

  Array(ob_keys).inject(q.dup) do |query, key_spec|
    name, keep = Array(key_spec)
    keep = keep || 0
    pattern = /(\A|&)(#{Regexp.escape(name.to_s)}=)([^&]*)/

    query.sub(pattern) do
      found = Regexp.last_match
      # String#[] yields nil when +keep+ exceeds the value length; that
      # interpolates as "", i.e. nothing is revealed.
      "#{found[1]}#{found[2]}__OMIT__#{found[3][-keep, keep]}"
    end
  end
end
160
+
161
+
162
+ end
163
+ end
@@ -0,0 +1,135 @@
1
require 'zlib'
require 'nokogiri'
2
+ module ActiveScraper
3
+ class CachedResponse < ActiveRecord::Base
4
# :headers is stored serialized and read back as a Hash.
serialize :headers, Hash

# Owning request, linked through cached_request_id; declared with touch: true.
belongs_to :request, touch: true, class_name: 'CachedRequest', foreign_key: 'cached_request_id'

# Body is Base64- or UTF-8-encoded once, just before the first insert.
before_create :encode_body_for_create
# Checksum is recomputed on every save.
before_save :set_checksum

# After the first insert, the owning request's last_fetched_at may be updated.
after_create :touch_request_fetched_at
10
+
11
# Copies body, headers, content_type and code into a new Hashie::Mash,
# reading each value through its public reader (so #body is decoded).
def to_fake_party_hash
  [:body, :headers, :content_type, :code].each_with_object(Hashie::Mash.new) do |att, mash|
    mash[att] = send(att)
  end
end
18
+
19
+
20
# Content-type predicates. Each returns a truthy/falsy value rather than
# a strict boolean: a regexp hit yields the match index, a miss yields nil.

# Truthy when the content type mentions pdf or image, or when the
# response is not text (which includes a missing content type).
def binary?
  names_binary_format = content_type =~ /pdf|image/

  names_binary_format || !text?
end

# Truthy when the content type mentions json.
def json?
  content_type =~ /json/
end

# Truthy when the content type mentions html.
def html?
  content_type =~ /html/
end

# Truthy for html as well as anything mentioning xml.
def xml?
  html_hit = html?

  html_hit || content_type =~ /xml/
end

# Truthy when the content type mentions text, or is xml/html/json.
def text?
  plain_hit = content_type =~ /text/

  plain_hit || xml? || json?
end
39
+
40
# True when 'body' is among the keys of changed_attributes.
def body_changed?
  changed_attributes.key?('body')
end
43
+
44
+ def body
45
+ b = read_attribute(:body)
46
+ if b.present? && binary? && !body_changed?
47
+ return Base64.decode64(b)
48
+ else
49
+ return b
50
+ end
51
+ end
52
+
53
+ def parsed_body
54
+ @_parsedbody ||= if xml?
55
+ Nokogiri::HTML(body)
56
+ elsif json?
57
+ JSON.parse(body)
58
+ else
59
+ body
60
+ end
61
+ end
62
+
63
+ def to_s
64
+ body
65
+ end
66
+
67
+
68
+
69
+ private
70
# before_save callback: stores a checksum of the current body.
#
# The previous implementation used String#hash. That value is salted
# per Ruby process, so the same body produced different checksums on
# different runs, and it is a platform-width Integer that can exceed a
# 4-byte integer column. CRC32 is reproducible. Masking to 31 bits keeps
# the result inside the signed 32-bit range.
#
# A nil body is checksummed as the empty string.
# Always returns true so the callback chain is never halted.
def set_checksum
  self.checksum = Zlib.crc32(body.to_s) & 0x7FFFFFFF

  true
end
75
+
76
+
77
# after_create callback: copies this response's created_at into the
# owning request's last_fetched_at, but only when the request exists,
# is already persisted, and this response is its latest_response.
#
# Always returns true.
def touch_request_fetched_at
  owner = request

  if owner && !owner.new_record? && self == owner.latest_response
    owner.update_attributes(last_fetched_at: created_at)
  end

  true
end
84
+
85
+
86
# expects the body to be populated
# Returns whatever #encoding yields on the relevant object: the parsed
# document for xml?/html responses, otherwise the body itself (for a
# String body that is an Encoding object, not a name string).
def detect_encoding
  source = xml? ? parsed_body : body

  source.encoding
end
95
+
96
# before_create callback: prepares the body for storage.
#
# * blank body: left alone;
# * binary? response: body is replaced by its Base64 encoding;
# * otherwise: body is transcoded to utf-8 from detect_encoding's
#   result, and skipped when no encoding is detected.
#
# The original had a bare `elsif` followed by the assignment on the next
# line, which Ruby reads as the condition; it is written explicitly here.
#
# Always returns true.
def encode_body_for_create
  return true unless body.present?

  if binary?
    self.body = Base64.encode64(body)
  elsif (source_encoding = detect_encoding)
    self.body = body.encode('utf-8', source_encoding)
  end

  true
end
109
+
110
+
111
+
112
+
113
+
114
+ ############## class methods
115
# Asks +cached_request+ for its latest response created after
# opts[:fetched_after]; the cutoff defaults to Time.at(0) when absent.
# smell: just goes back to CachedRequest
def self.find_cache_for_cached_request(cached_request, opts={})
  cutoff = opts[:fetched_after] || Time.at(0)

  cached_request.latest_response_fetched_after(cutoff)
end
120
+
121
# TODO
# Unimplemented placeholder: ignores both arguments and returns nil.
def self.find_cache_for_request(req, opts)
  nil
end
124
+
125
# Builds a new, unsaved record and writes body, headers, content_type
# and code straight into its attributes via write_attribute, reading
# each value from +resp+.
#
# NOTE(review): the original note says ":body is properly encoded" as a
# side effect. Nothing here encodes it; presumably that refers to the
# before_create callback. Confirm.
def self.build_from_response_object(resp)
  new.tap do |record|
    [:body, :headers, :content_type, :code].each do |att|
      record.send(:write_attribute, att, resp.send(att))
    end
  end
end
134
+ end
135
+ end
@@ -0,0 +1,19 @@
1
# Creates active_scraper_cached_requests (presumably the table behind
# ActiveScraper::CachedRequest) plus a composite index on host + path.
class CreateActiveScraperCachedRequests < ActiveRecord::Migration
  def change
    create_table :active_scraper_cached_requests do |t|
      # URL components
      t.string   :scheme
      t.string   :host
      t.text     :query
      t.string   :path
      t.string   :meta_tag
      t.string   :extname
      t.boolean  :is_obfuscated
      # timestamps are declared explicitly rather than through t.timestamps
      t.datetime :created_at
      t.datetime :updated_at
      t.datetime :last_fetched_at
    end

    add_index :active_scraper_cached_requests, [:host, :path], name: 'index_as_requests_on_host_and_path'
  end
end
@@ -0,0 +1,16 @@
1
# Creates active_scraper_cached_responses (presumably the table behind
# ActiveScraper::CachedResponse) with two composite indexes that both
# lead with cached_request_id.
class CreateActiveScraperCachedResponses < ActiveRecord::Migration
  def change
    create_table :active_scraper_cached_responses do |t|
      # very large limit so that big bodies can be stored
      t.text    :body, limit: 4294967295
      t.integer :code
      t.text    :headers
      t.string  :content_type
      t.integer :checksum
      t.integer :cached_request_id
      t.timestamps
    end

    add_index :active_scraper_cached_responses, [:cached_request_id, :created_at], name: 'index_request_id_and_created_at'
    add_index :active_scraper_cached_responses, [:cached_request_id, :checksum], name: 'index_request_id_and_checksum'
  end
end
@@ -1,5 +1,86 @@
1
+ # encoding: UTF-8
2
+
1
3
  require "active_scraper/engine"
2
- require "active_scraper/fetcher"
4
+ require 'active_scraper/fake_http_party_response'
5
+ require 'active_scraper/response_object'
3
6
 
4
7
  module ActiveScraper
8
+
9
+
10
# returns a ActiveScraper::CachedResponse
# Runs create_request_and_fetch_response and hands back only the
# :response part of its result.
def self.get(uri, options={})
  create_request_and_fetch_response(uri, options).response
end
16
+
17
+
18
+
19
# delegates to CachedRequest::find_or_build_from_uri
#
# req  - URI or String identifying the request
# opts - options Hash passed through unchanged
#
# returns a new or existing CachedRequest
def self.find_or_build_request(req, opts={})
  CachedRequest.find_or_build_from_uri(req, opts)
end
26
+
27
## cached_request (CachedRequest) => the request to find a response for
##
## Looks for a cached response first (honouring opts such as
## :fetched_after); when none is found, fetches the request's uri
## afresh and builds an unsaved CachedResponse from the result.
##
## raises ArgumentError unless cached_request is a CachedRequest
## returns a new or existing CachedResponse
def self.find_or_build_response(cached_request, opts={})
  unless cached_request.is_a?(CachedRequest)
    raise ArgumentError, "Only accepted CachedRequest, but was passed in a #{cached_request.class}"
  end

  opts = normalize_hash(opts)
  cached = CachedResponse.find_cache_for_cached_request(cached_request, opts)
  return cached unless cached.blank?

  fresh = fetch_fresh(cached_request.uri, opts)
  CachedResponse.build_from_response_object(fresh)
end
44
+
45
+
46
# Full round trip for one URI: find or build the request, find or build
# a response for it, attach the response to the request and save the
# request.
#
# Returns a Hashie::Mash with :request and :response entries. The result
# of #save is not checked.
def self.create_request_and_fetch_response(uri, opts={})
  opts = normalize_hash(opts)

  cached_request  = find_or_build_request(uri, opts)
  cached_response = find_or_build_response(cached_request, opts)

  # associate and save the two
  cached_request.responses << cached_response
  cached_request.save

  Hashie::Mash.new(request: cached_request, response: cached_response)
end
60
+
61
# Returns an object compatible with HTTParty, i.e. an ActiveScraper::FakeHTTPartyResponse
# to be deprecated
#
# request, response - passed straight to FakeHTTPartyResponse.new
def self.build_usable_response(request, response)
  ActiveScraper::FakeHTTPartyResponse.new(request, response)
end
66
+
67
+
68
+
69
# Performs a live HTTParty.get for +url+ (all of +opts+ is forwarded)
# and wraps the result with ActiveScraper::ResponseObject.factory.
def self.fetch_fresh(url, opts={})
  ActiveScraper::ResponseObject.factory(HTTParty.get(url, opts))
end
74
+
75
+
76
+
77
+
78
# Returns +hsh+ itself when it already is a HashWithIndifferentAccess,
# otherwise a new HashWithIndifferentAccess built from it.
def self.normalize_hash(hsh)
  hsh.is_a?(HashWithIndifferentAccess) ? hsh : HashWithIndifferentAccess.new(hsh)
end
85
+
5
86
  end
@@ -1,6 +1,17 @@
1
1
  module ActiveScraper
2
2
  class Engine < ::Rails::Engine
3
3
  isolate_namespace ActiveScraper
4
+
5
# monkey patch via: http://pivotallabs.com/leave-your-migrations-in-your-rails-engines/
#
# Appends each of this engine's expanded db/migrate paths to the host
# application's db/migrate paths. Skipped when the application root path
# contains the engine root path, presumably so an app living inside the
# engine's own directory does not register the same migrations twice.
initializer :append_migrations do |app|
  # Plain substring test. String#match would compile the engine path as
  # a regular expression, so metacharacters in a directory name
  # (".", "+", "(" ...) could give the wrong answer or raise.
  unless app.root.to_s.include?(root.to_s)
    config.paths["db/migrate"].expanded.each do |expanded_path|
      app.config.paths["db/migrate"] << expanded_path
    end
  end
end
13
+
14
+
4
15
  config.generators do |g|
5
16
  g.test_framework :rspec, :fixture => false
6
17
  g.assets false
@@ -0,0 +1,21 @@
1
+ require 'httparty'
2
+ require 'nokogiri'
3
+ module ActiveScraper
4
# SimpleDelegator around an HTTParty::Response that is assembled from
# cached records instead of a live HTTP exchange.
class FakeHTTPartyResponse < SimpleDelegator
  # request      - a CachedRequest (converted with to_fake_party_hash) or
  #                any object passed through unchanged
  # response     - a CachedResponse (converted the same way) or any object
  #                passed through unchanged
  # parsed_block - callable given to HTTParty::Response; defaults to a
  #                lambda returning the response's body
  # options      - forwarded to HTTParty::Response.new unchanged
  def initialize(request, response, parsed_block=nil, options={})
    request_hash  = request.is_a?(CachedRequest) ? request.to_fake_party_hash : request
    response_hash = response.is_a?(CachedResponse) ? response.to_fake_party_hash : response

    ## making HTTParty happy...
    body_parser = parsed_block || lambda { response_hash.body }

    super(HTTParty::Response.new(request_hash, response_hash, body_parser, options))
  end
end
21
+ end
@@ -0,0 +1,13 @@
1
+ require 'active_scraper/response_object/basic'
2
+
3
+ module ActiveScraper
4
+ module ResponseObject
5
+
6
+
7
# Wraps +obj+ in an ActiveScraper::ResponseObject::Basic. Currently the
# only response-object type this factory produces.
def self.factory(obj)
  ActiveScraper::ResponseObject::Basic.new(obj)
end
10
+
11
+
12
+ end
13
+ end
@@ -0,0 +1,67 @@
1
+ module ActiveScraper
2
+ module ResponseObject
3
# Uniform view over the different objects a fetch can yield. It exposes
# code, headers, body and content_type as readers, and also as entries
# of the wrapped HashWithIndifferentAccess (obj[:body] etc.).
class Basic < SimpleDelegator
  # I don't really know what to name this but this is passed
  # between the various classes, including the Fetcher,
  # and is expected to behave the same in those interactions

  attr_reader :code, :headers, :body, :content_type

  # obj - one of:
  #       * HTTParty::Response (exactly that class), whose underlying
  #         #response is used instead
  #       * Net::HTTPResponse
  #       * ActiveScraper::CachedResponse
  #       * a StringIO responding to #meta (what OpenURI's open returns)
  #       * nil, which leaves every field nil
  #
  # Raises ArgumentError ('Improper class type') for anything else.
  def initialize(obj)
    # use the Net::HTTPResponse instead
    obj = obj.response if obj.class == (HTTParty::Response)

    if obj.is_a?(Net::HTTPResponse)
      @body = obj.body
      @content_type = obj.content_type
      @headers = obj.each_header.each_with_object({}) { |(k, v), h| h[k] = v }
      @code = obj.code.to_i
    elsif obj.is_a?(ActiveScraper::CachedResponse)
      @body = obj.body
      @content_type = obj.content_type
      @headers = obj.headers
      @code = obj.code.to_i
    elsif obj.is_a?(StringIO) && obj.respond_to?(:meta) # OpenURI.open
      @body = obj.read
      @content_type = obj.content_type
      @headers = obj.meta
      @code = obj.status[0].to_i
    elsif obj.nil?
      # just do nothing
    else
      # other types have to raise an Error
      raise ArgumentError, 'Improper class type'
    end

    super(ActiveSupport::HashWithIndifferentAccess.new)

    # now set its values
    [:body, :headers, :content_type, :code].each do |a|
      self[a] = send(a)
    end
  end

  # These two were previously defined inside #initialize, which
  # redefined them on the class at every instantiation. They are
  # ordinary instance methods now.

  # True when there is no body or the body is empty. (Previously a nil
  # body raised NoMethodError here.)
  def empty?
    @body.nil? || @body.empty?
  end

  # Deliberately overrides Object#nil?: true when there is no body.
  def nil?
    @body.nil?
  end

  # def [](k)
  # @values[k.to_sym]
  # end

  # def [](k,v)
  # send(:"#{k}=", v)
  # end
end
64
+ end
65
+ end
66
+
67
+
@@ -1,3 +1,3 @@
1
1
  module ActiveScraper
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: active_scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dan Nguyen
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-01-03 00:00:00.000000000 Z
11
+ date: 2014-02-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rails
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - ! '>='
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: hashie
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ! '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ! '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: minitest
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -192,6 +206,20 @@ dependencies:
192
206
  - - ! '>='
193
207
  - !ruby/object:Gem::Version
194
208
  version: '0'
209
+ - !ruby/object:Gem::Dependency
210
+ name: timecop
211
+ requirement: !ruby/object:Gem::Requirement
212
+ requirements:
213
+ - - ! '>='
214
+ - !ruby/object:Gem::Version
215
+ version: '0'
216
+ type: :development
217
+ prerelease: false
218
+ version_requirements: !ruby/object:Gem::Requirement
219
+ requirements:
220
+ - - ! '>='
221
+ - !ruby/object:Gem::Version
222
+ version: '0'
195
223
  description: A Rails Engine using ActiveRecord to cache results of HTTP scrapes
196
224
  email:
197
225
  - dansonguyen@gmail.com
@@ -202,15 +230,16 @@ files:
202
230
  - app/assets/stylesheets/active_scraper/application.css
203
231
  - app/controllers/active_scraper/application_controller.rb
204
232
  - app/helpers/active_scraper/application_helper.rb
205
- - app/models/active_scraper/agnostic_response_object.rb
206
- - app/models/active_scraper/request.rb
207
- - app/models/active_scraper/response.rb
233
+ - app/models/active_scraper/cached_request.rb
234
+ - app/models/active_scraper/cached_response.rb
208
235
  - app/views/layouts/active_scraper/application.html.erb
209
236
  - config/routes.rb
210
- - db/migrate/20131229024155_create_active_scraper_requests.rb
211
- - db/migrate/20131229033843_create_active_scraper_responses.rb
237
+ - db/migrate/20131229024155_create_active_scraper_cached_requests.rb
238
+ - db/migrate/20131229033843_create_active_scraper_cached_responses.rb
212
239
  - lib/active_scraper/engine.rb
213
- - lib/active_scraper/fetcher.rb
240
+ - lib/active_scraper/fake_http_party_response.rb
241
+ - lib/active_scraper/response_object/basic.rb
242
+ - lib/active_scraper/response_object.rb
214
243
  - lib/active_scraper/version.rb
215
244
  - lib/active_scraper.rb
216
245
  - lib/tasks/active_scraper_tasks.rake
@@ -1,47 +0,0 @@
1
- module ActiveScraper
2
- class AgnosticResponseObject < SimpleDelegator
3
-
4
- attr_reader :code, :headers, :body, :content_type
5
-
6
- def initialize(obj)
7
- if obj.class == (HTTParty::Response)
8
- # use the Net::HTTPResponse instead
9
- obj = obj.response
10
- end
11
-
12
- response_obj = if obj.is_a?(Net::HTTPResponse)
13
- @body = obj.body
14
- @content_type = obj.content_type
15
- @headers = obj.each_header.inject({}){|h, (k, v)| h[k] = v; h }
16
- @code = obj.code.to_i
17
- elsif obj.is_a?(ActiveScraper::Request)
18
- @body = obj.body
19
- @content_type = obj.content_type
20
- @headers = obj.headers
21
- @code = obj.code.to_i
22
- else
23
- # this is probably not used
24
- @body = obj.to_s
25
- @headers = {}
26
- @content_type = nil
27
- @code = nil
28
- end
29
-
30
- super({})
31
-
32
- # now set its values
33
- [:body, :headers, :content_type, :code].each do |a|
34
- self[a] = self.send(a)
35
- end
36
- end
37
-
38
- # def [](k)
39
- # @values[k.to_sym]
40
- # end
41
-
42
- # def [](k,v)
43
- # send(:"#{k}=", v)
44
- # end
45
-
46
- end
47
- end
@@ -1,98 +0,0 @@
1
- require 'addressable/uri'
2
- module ActiveScraper
3
- class Request < ActiveRecord::Base
4
- has_many :responses, :dependent => :destroy
5
- validates_uniqueness_of :path, scope: [:host, :query, :scheme]
6
-
7
-
8
- scope :with_url, ->(u){
9
- params = Request.build_validating_params(u)
10
- where(params)
11
- }
12
-
13
-
14
- def obfuscated?
15
- is_obfuscated == true
16
- end
17
-
18
- def uri
19
- Addressable::URI.new(
20
- self.attributes.symbolize_keys.slice(:scheme, :host, :path, :query)
21
- )
22
- end
23
-
24
- def self.build_validating_params(uri, opts={})
25
- h = build_request_params(uri, opts)
26
-
27
- h.slice(:scheme, :host, :path, :query)
28
- end
29
-
30
- # Returns a Hash with symbolized keys
31
- def self.build_request_params(uri, opts={})
32
- u = Addressable::URI.parse(uri)
33
- hsh = {scheme: u.normalized_scheme, host: u.normalized_host, path: u.normalized_path, query: u.normalized_query, extname: u.extname}
34
-
35
- if ob_keys = opts.delete(:obfuscate_query)
36
- Array(ob_keys).each do |key|
37
- a = Array(key)
38
-
39
- key_to_omit = Regexp.escape(a[0].to_s)
40
- char_num = a[1] || 0
41
-
42
- if val_to_omit = hsh[:query].match(/(?<=#{key_to_omit}=)(.*?)(?=&|$)/)
43
- val = val_to_omit[1]
44
- hsh[:query].sub!( val, "__OMIT__#{val[-char_num, char_num]}")
45
- end
46
- end
47
-
48
- hsh[:is_obfuscated] = true
49
- else
50
- hsh[:is_obfuscated] = false
51
- end
52
-
53
- return hsh
54
- end
55
-
56
- def self.build_from_uri(uri, opts={})
57
- request_params = build_request_params(uri, opts)
58
- request_obj = Request.new(request_params)
59
-
60
- return request_obj
61
- end
62
-
63
- def self.find_or_build_from_uri(uri, opts={})
64
- self.with_url(uri).first || self.build_from_uri(uri, opts)
65
- end
66
-
67
-
68
- def self.create_from_uri(uri, opts={})
69
- req = build_from_uri(uri, opts)
70
- req.save
71
-
72
- return req
73
- end
74
-
75
-
76
- def self.create_and_fetch_response(uri, opts={}, fetcher = nil)
77
- request = find_or_build_from_uri(uri, opts)
78
- fetcher = fetcher || Fetcher.new
79
-
80
- if request.id.nil?
81
- # this request is new
82
- # so skip to the fresh
83
- resp = fetcher.fetch request, fresh: true
84
- else
85
- # will check the cache and the fresh
86
- resp = fetcher.fetch request
87
- end
88
-
89
- # build the response
90
- response = request.responses.build(resp)
91
- # theoretically, response will be saved too
92
- request.save
93
-
94
- return request
95
- end
96
-
97
- end
98
- end
@@ -1,26 +0,0 @@
1
- module ActiveScraper
2
- class Response < ActiveRecord::Base
3
- serialize :headers, Hash
4
- belongs_to :request
5
- before_save :set_checksum
6
-
7
-
8
-
9
- private
10
- def set_checksum
11
- self.checksum = body.hash
12
-
13
- true
14
- end
15
-
16
- ############## class methods
17
- def self.build_from_response_object(resp)
18
- response = self.new
19
- [:body, :headers, :content_type, :code].each do |att|
20
- response.send :write_attribute, att, resp.send(att)
21
- end
22
-
23
- return response
24
- end
25
- end
26
- end
@@ -1,16 +0,0 @@
1
- class CreateActiveScraperRequests < ActiveRecord::Migration
2
- def change
3
- create_table :active_scraper_requests do |t|
4
- t.string :host
5
- t.text :query
6
- t.string :path
7
- t.string :meta_tag
8
- t.boolean :is_obfuscated
9
-
10
- t.timestamps
11
- end
12
-
13
- add_index :active_scraper_requests, [:host, :path]
14
-
15
- end
16
- end
@@ -1,17 +0,0 @@
1
- class CreateActiveScraperResponses < ActiveRecord::Migration
2
- def change
3
- create_table :active_scraper_responses do |t|
4
- t.text :body, :limit => 4294967295
5
- t.integer :code
6
- t.text :headers
7
- t.string :content_type
8
- t.integer :checksum
9
- t.integer :active_scraper_request_id
10
-
11
- t.timestamps
12
- end
13
-
14
- add_index :active_scraper_responses, [:active_scraper_request_id, :created_at], name: 'index_request_id_and_created_at'
15
- add_index :active_scraper_responses, [:active_scraper_request_id, :checksum], name: 'index_request_id_and_checksum'
16
- end
17
- end
@@ -1,65 +0,0 @@
1
- require 'httparty'
2
-
3
- module ActiveScraper
4
- class Fetcher
5
-
6
- def fetch(u, opts={})
7
- url = convert_uri_object(u)
8
- force_fresh = opts.delete :fresh
9
-
10
- if force_fresh != true && (record = fetch_from_cache(url, opts))
11
- resp_obj = record
12
- else
13
- resp_obj = fetch_fresh(url, opts)
14
- end
15
-
16
- build_response_object(resp_obj)
17
- end
18
-
19
-
20
- def fetch_fresh(url, opts={})
21
- opts = opts.stringify_keys
22
-
23
- url = url.to_s
24
- verb = opts.fetch('verb'){ 'get' }
25
-
26
- resp = HTTParty.send(verb, url)
27
- end
28
-
29
-
30
- # returns:
31
- # single ScrapeCache if a valid ActiveScraper::Request exists
32
- #
33
- def fetch_from_cache(uri, opts={})
34
-
35
- end
36
-
37
- # true or false if ActiveScraper::Request with these parameters exist
38
- def has_cache?(uri, opts={})
39
-
40
- end
41
-
42
-
43
- # u can either be a Request object, a String, or Addressable::URI
44
- # returns an Addressable::URI
45
- def convert_uri_object(u)
46
- if u.is_a?(ActiveScraper::Request)
47
- x = u.uri
48
- else
49
- x = Addressable::URI.parse(u)
50
- end
51
-
52
- return x
53
- end
54
-
55
- def build_response_object(obj)
56
- self.class.build_response_object(obj)
57
- end
58
-
59
- # returns an OpenStruct that Response can use
60
- def self.build_response_object(obj)
61
- return AgnosticResponseObject.new(obj)
62
- end
63
-
64
- end
65
- end