active_scraper 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/app/models/active_scraper/cached_request.rb +163 -0
- data/app/models/active_scraper/cached_response.rb +135 -0
- data/db/migrate/20131229024155_create_active_scraper_cached_requests.rb +19 -0
- data/db/migrate/20131229033843_create_active_scraper_cached_responses.rb +16 -0
- data/lib/active_scraper.rb +82 -1
- data/lib/active_scraper/engine.rb +11 -0
- data/lib/active_scraper/fake_http_party_response.rb +21 -0
- data/lib/active_scraper/response_object.rb +13 -0
- data/lib/active_scraper/response_object/basic.rb +67 -0
- data/lib/active_scraper/version.rb +1 -1
- metadata +37 -8
- data/app/models/active_scraper/agnostic_response_object.rb +0 -47
- data/app/models/active_scraper/request.rb +0 -98
- data/app/models/active_scraper/response.rb +0 -26
- data/db/migrate/20131229024155_create_active_scraper_requests.rb +0 -16
- data/db/migrate/20131229033843_create_active_scraper_responses.rb +0 -17
- data/lib/active_scraper/fetcher.rb +0 -65
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
NWEwOTJjZGViNjkwYzZiMjYwY2YzMGE5YjRkMTdlNzJhMTMwZmJmMw==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
OGMyNmI5MDA3MmM5YWQ5MjIxZjlhYjZlYTliMDZhZWY0YjlmYjQ5ZQ==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
YTc0Mzk5YzdhNWZlM2YyYzJlNmZiZjMxOTg4OTg0YTU0MzhkMWMxMDVlNDJl
|
10
|
+
Y2UzNDVmNWE1NDhkODJlZTMyYWY1OTFjNTU3ZGYyNGIwNzA4NDEzZDMxMzg5
|
11
|
+
YWZkZWVmMTc1MjFjNGNhZDIzYTU0NGU1MTg4ZGYyYWNhNGQ3YTI=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
ZWMyYTc5MDJhZDU5NGY4ZDVlMjFkYTI0YzNkMDM2NDFkNjgyY2JhNDE3MGI2
|
14
|
+
M2Y5MTMzMTMzMGE3ZmFmYzk5OWNiNjQyNzRmM2E4ZjJmYjk5MWY0MzUzMjU0
|
15
|
+
YmMxNDQ3YWU1MTY4YTFmNDZjMjRlM2YyM2E4Nzc1NzA1MDQ3N2I=
|
@@ -0,0 +1,163 @@
|
|
1
|
+
require 'httparty'
|
2
|
+
require 'addressable/uri'
|
3
|
+
require 'hashie/mash'
|
4
|
+
|
5
|
+
module ActiveScraper
|
6
|
+
class CachedRequest < ActiveRecord::Base
|
7
|
+
has_many :responses, :dependent => :destroy, class_name: 'CachedResponse', foreign_key: 'cached_request_id'
|
8
|
+
has_one :latest_response, ->{ order('created_at DESC') }, class_name: 'ActiveScraper::CachedResponse', foreign_key: 'cached_request_id'
|
9
|
+
validates_uniqueness_of :path, scope: [:host, :query, :scheme]
|
10
|
+
|
11
|
+
attr_accessor :unobfuscated_query
|
12
|
+
|
13
|
+
delegate :to_s, :to => :uri
|
14
|
+
|
15
|
+
# problematic
|
16
|
+
scope :with_url, ->(u){
|
17
|
+
matching_request(u)
|
18
|
+
}
|
19
|
+
|
20
|
+
scope :matching_request, ->(req, opts={}){
|
21
|
+
if req.is_a?(CachedRequest)
|
22
|
+
req = req.to_uri
|
23
|
+
end
|
24
|
+
params = CachedRequest.build_validating_params(req, opts)
|
25
|
+
|
26
|
+
where(params)
|
27
|
+
}
|
28
|
+
|
29
|
+
scope :last_fetched_before, ->(some_time){
|
30
|
+
some_time = Time.parse(some_time) if some_time.is_a?(String)
|
31
|
+
|
32
|
+
where("last_fetched_at < ?", some_time)
|
33
|
+
}
|
34
|
+
|
35
|
+
def latest_response_fetched_after(time)
|
36
|
+
if latest_response.present?
|
37
|
+
return latest_response if latest_response.created_at > time
|
38
|
+
end
|
39
|
+
|
40
|
+
nil
|
41
|
+
end
|
42
|
+
|
43
|
+
def to_fake_party_hash
|
44
|
+
h = Hashie::Mash.new(self.attributes.symbolize_keys.slice(:scheme, :host, :path, :query))
|
45
|
+
h[:uri] = self.standard_uri
|
46
|
+
h[:options] ||= {}
|
47
|
+
h[:headers] ||= {}
|
48
|
+
|
49
|
+
return h
|
50
|
+
end
|
51
|
+
|
52
|
+
|
53
|
+
|
54
|
+
def obfuscated?
|
55
|
+
is_obfuscated == true
|
56
|
+
end
|
57
|
+
|
58
|
+
# to follow HTTParty conventions
|
59
|
+
def standard_uri
|
60
|
+
URI.parse(uri)
|
61
|
+
end
|
62
|
+
|
63
|
+
def uri
|
64
|
+
to_uri
|
65
|
+
end
|
66
|
+
|
67
|
+
# during a fresh query, we need to actually use the unobfuscated_query
|
68
|
+
def to_uri
|
69
|
+
h = self.attributes.symbolize_keys.slice(:scheme, :host, :path)
|
70
|
+
h[:query] = self.unobfuscated_query || self.query
|
71
|
+
|
72
|
+
return Addressable::URI.new(h)
|
73
|
+
end
|
74
|
+
|
75
|
+
def self.build_validating_params(uri, opts={})
|
76
|
+
h = build_request_params(uri, opts)
|
77
|
+
|
78
|
+
h.slice(:scheme, :host, :path, :query)
|
79
|
+
end
|
80
|
+
|
81
|
+
#########################################################
|
82
|
+
############ class methods
|
83
|
+
|
84
|
+
|
85
|
+
|
86
|
+
# Returns a Hash with symbolized keys
|
87
|
+
def self.build_request_params(uri, opts={})
|
88
|
+
u = Addressable::URI.parse(uri)
|
89
|
+
hsh = {scheme: u.normalized_scheme, host: u.normalized_host, path: u.normalized_path, query: u.normalized_query , extname: u.extname}
|
90
|
+
# deal with query separately
|
91
|
+
unless opts[:normalize_query] == false
|
92
|
+
hsh[:query] = normalize_query_params(hsh[:query])
|
93
|
+
end
|
94
|
+
|
95
|
+
hsh[:unobfuscated_query] = hsh[:query]
|
96
|
+
if ob_keys = opts[:obfuscate_query]
|
97
|
+
hsh[:query] = obfuscate_query_params(hsh[:query], ob_keys)
|
98
|
+
hsh[:is_obfuscated] = true
|
99
|
+
else
|
100
|
+
hsh[:is_obfuscated] = false
|
101
|
+
end
|
102
|
+
|
103
|
+
return hsh
|
104
|
+
end
|
105
|
+
|
106
|
+
def self.build_from_uri(uri, opts={})
|
107
|
+
request_params = build_request_params(uri, opts)
|
108
|
+
request_obj = CachedRequest.new(request_params)
|
109
|
+
|
110
|
+
return request_obj
|
111
|
+
end
|
112
|
+
|
113
|
+
def self.find_or_build_from_uri(uri, opts={})
|
114
|
+
self.matching_request(uri, opts).first || self.build_from_uri(uri, opts)
|
115
|
+
end
|
116
|
+
|
117
|
+
1
|
118
|
+
def self.create_from_uri(uri, opts={})
|
119
|
+
req = build_from_uri(uri, opts)
|
120
|
+
req.save
|
121
|
+
|
122
|
+
return req
|
123
|
+
end
|
124
|
+
|
125
|
+
|
126
|
+
|
127
|
+
QUERY_NORMALIZER = HTTParty::Request::NON_RAILS_QUERY_STRING_NORMALIZER
|
128
|
+
# :q is a query String or Hash
|
129
|
+
# e.g. 'z=hello&b=world&a=dog'
|
130
|
+
# or: {z: ['hello', 'world'], a: 'dog'}
|
131
|
+
#
|
132
|
+
# returns: (String) "a=dog&z=hello&z=world"
|
133
|
+
def self.normalize_query_params(q)
|
134
|
+
return q if q.blank?
|
135
|
+
|
136
|
+
params_hash = CGI.parse(q)
|
137
|
+
params_str = QUERY_NORMALIZER[params_hash]
|
138
|
+
|
139
|
+
return params_str
|
140
|
+
end
|
141
|
+
|
142
|
+
|
143
|
+
private
|
144
|
+
|
145
|
+
def self.obfuscate_query_params(q, ob_keys)
|
146
|
+
string = q.dup
|
147
|
+
Array(ob_keys).each do |key|
|
148
|
+
a = Array(key)
|
149
|
+
|
150
|
+
key_to_omit = Regexp.escape(a[0].to_s)
|
151
|
+
char_num = a[1] || 0
|
152
|
+
if val_to_omit = string.match(/(?<=#{key_to_omit}=)(.*?)(?=&|$)/)
|
153
|
+
val = val_to_omit[1]
|
154
|
+
string.sub!( val, "__OMIT__#{val[-char_num, char_num]}")
|
155
|
+
end
|
156
|
+
end
|
157
|
+
|
158
|
+
return string
|
159
|
+
end
|
160
|
+
|
161
|
+
|
162
|
+
end
|
163
|
+
end
|
@@ -0,0 +1,135 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
module ActiveScraper
|
3
|
+
class CachedResponse < ActiveRecord::Base
|
4
|
+
serialize :headers, Hash
|
5
|
+
belongs_to :request, touch: true, class_name: 'CachedRequest', foreign_key: 'cached_request_id'
|
6
|
+
before_create :encode_body_for_create
|
7
|
+
before_save :set_checksum
|
8
|
+
|
9
|
+
after_create :touch_request_fetched_at
|
10
|
+
|
11
|
+
def to_fake_party_hash
|
12
|
+
[:body, :headers, :content_type, :code].inject(Hashie::Mash.new) do |hsh, att|
|
13
|
+
hsh[att] = self.send(att)
|
14
|
+
|
15
|
+
hsh
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
|
20
|
+
def binary?
|
21
|
+
content_type =~ /pdf|image/ || !text?
|
22
|
+
end
|
23
|
+
|
24
|
+
def json?
|
25
|
+
content_type =~ /json/
|
26
|
+
end
|
27
|
+
|
28
|
+
def html?
|
29
|
+
content_type =~ /html/
|
30
|
+
end
|
31
|
+
|
32
|
+
def xml?
|
33
|
+
html? || content_type =~ /xml/
|
34
|
+
end
|
35
|
+
|
36
|
+
def text?
|
37
|
+
content_type =~ /text/ || xml? || json?
|
38
|
+
end
|
39
|
+
|
40
|
+
def body_changed?
|
41
|
+
self.changed_attributes.keys.include?('body')
|
42
|
+
end
|
43
|
+
|
44
|
+
def body
|
45
|
+
b = read_attribute(:body)
|
46
|
+
if b.present? && binary? && !body_changed?
|
47
|
+
return Base64.decode64(b)
|
48
|
+
else
|
49
|
+
return b
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
def parsed_body
|
54
|
+
@_parsedbody ||= if xml?
|
55
|
+
Nokogiri::HTML(body)
|
56
|
+
elsif json?
|
57
|
+
JSON.parse(body)
|
58
|
+
else
|
59
|
+
body
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
def to_s
|
64
|
+
body
|
65
|
+
end
|
66
|
+
|
67
|
+
|
68
|
+
|
69
|
+
private
|
70
|
+
def set_checksum
|
71
|
+
self.checksum = body.hash
|
72
|
+
|
73
|
+
true
|
74
|
+
end
|
75
|
+
|
76
|
+
|
77
|
+
def touch_request_fetched_at
|
78
|
+
if request && !request.new_record?
|
79
|
+
request.update_attributes(last_fetched_at: self.created_at) if self == request.latest_response
|
80
|
+
end
|
81
|
+
|
82
|
+
true
|
83
|
+
end
|
84
|
+
|
85
|
+
|
86
|
+
# expects @body to be populated
|
87
|
+
# returns string: e.g. 'utf-8', 'windows-1251'
|
88
|
+
def detect_encoding
|
89
|
+
if xml?
|
90
|
+
parsed_body.encoding
|
91
|
+
else
|
92
|
+
body.encoding
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
# converts @body to utf-8 if not already
|
97
|
+
def encode_body_for_create
|
98
|
+
if self.body.present?
|
99
|
+
if binary?
|
100
|
+
self.body = Base64.encode64(self.body)
|
101
|
+
elsif
|
102
|
+
denc = detect_encoding
|
103
|
+
self.body = self.body.encode('utf-8', denc)
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
107
|
+
true
|
108
|
+
end
|
109
|
+
|
110
|
+
|
111
|
+
|
112
|
+
|
113
|
+
|
114
|
+
############## class methods
|
115
|
+
def self.find_cache_for_cached_request(cached_request, opts={})
|
116
|
+
time = opts[:fetched_after] || Time.at(0)
|
117
|
+
# smell: just goes back to CachedRequest
|
118
|
+
cached_request.latest_response_fetched_after(time)
|
119
|
+
end
|
120
|
+
|
121
|
+
def self.find_cache_for_request(req, opts)
|
122
|
+
# TODO
|
123
|
+
end
|
124
|
+
|
125
|
+
# has one side-effect: :body is properly encoded
|
126
|
+
def self.build_from_response_object(resp)
|
127
|
+
response = self.new
|
128
|
+
[:body, :headers, :content_type, :code].each do |att|
|
129
|
+
response.send :write_attribute, att, resp.send(att)
|
130
|
+
end
|
131
|
+
|
132
|
+
return response
|
133
|
+
end
|
134
|
+
end
|
135
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
class CreateActiveScraperCachedRequests < ActiveRecord::Migration
|
2
|
+
def change
|
3
|
+
create_table :active_scraper_cached_requests do |t|
|
4
|
+
t.string "scheme"
|
5
|
+
t.string "host"
|
6
|
+
t.text "query"
|
7
|
+
t.string "path"
|
8
|
+
t.string "meta_tag"
|
9
|
+
t.string "extname"
|
10
|
+
t.boolean "is_obfuscated"
|
11
|
+
t.datetime "created_at"
|
12
|
+
t.datetime "updated_at"
|
13
|
+
t.datetime "last_fetched_at"
|
14
|
+
end
|
15
|
+
|
16
|
+
add_index "active_scraper_cached_requests", ["host", "path"], name: "index_as_requests_on_host_and_path"
|
17
|
+
|
18
|
+
end
|
19
|
+
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
class CreateActiveScraperCachedResponses < ActiveRecord::Migration
|
2
|
+
def change
|
3
|
+
create_table :active_scraper_cached_responses do |t|
|
4
|
+
t.text "body", limit: 4294967295
|
5
|
+
t.integer "code"
|
6
|
+
t.text "headers"
|
7
|
+
t.string "content_type"
|
8
|
+
t.integer "checksum"
|
9
|
+
t.integer "cached_request_id"
|
10
|
+
t.timestamps
|
11
|
+
end
|
12
|
+
|
13
|
+
add_index :active_scraper_cached_responses, [:cached_request_id, :created_at], name: 'index_request_id_and_created_at'
|
14
|
+
add_index :active_scraper_cached_responses, [:cached_request_id, :checksum], name: 'index_request_id_and_checksum'
|
15
|
+
end
|
16
|
+
end
|
data/lib/active_scraper.rb
CHANGED
@@ -1,5 +1,86 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
1
3
|
require "active_scraper/engine"
|
2
|
-
require
|
4
|
+
require 'active_scraper/fake_http_party_response'
|
5
|
+
require 'active_scraper/response_object'
|
3
6
|
|
4
7
|
module ActiveScraper
|
8
|
+
|
9
|
+
|
10
|
+
# returns a ActiveScraper::CachedResponse
|
11
|
+
def self.get(uri, options={})
|
12
|
+
o = create_request_and_fetch_response(uri, options)
|
13
|
+
|
14
|
+
return o.response
|
15
|
+
end
|
16
|
+
|
17
|
+
|
18
|
+
|
19
|
+
# delegates to CachedRequest::find_or_build_from_uri
|
20
|
+
# req (URI or String). If CachedRequest, is idempotent
|
21
|
+
#
|
22
|
+
# returns a new or existing CachedRequest
|
23
|
+
def self.find_or_build_request(req, opts={})
|
24
|
+
CachedRequest.find_or_build_from_uri(req, opts)
|
25
|
+
end
|
26
|
+
|
27
|
+
## cached_request (CachedRequest) => the request to find a response for
|
28
|
+
##
|
29
|
+
## returns a new or existing CachedResponse
|
30
|
+
|
31
|
+
def self.find_or_build_response(cached_request, opts={})
|
32
|
+
raise ArgumentError, "Only accepted CachedRequest, but was passed in a #{cached_request.class}" unless cached_request.is_a?(CachedRequest)
|
33
|
+
opts = normalize_hash(opts)
|
34
|
+
|
35
|
+
response = CachedResponse.find_cache_for_cached_request(cached_request, opts)
|
36
|
+
|
37
|
+
if response.blank?
|
38
|
+
fetched_obj = fetch_fresh(cached_request.uri, opts)
|
39
|
+
response = CachedResponse.build_from_response_object(fetched_obj)
|
40
|
+
end
|
41
|
+
|
42
|
+
return response
|
43
|
+
end
|
44
|
+
|
45
|
+
|
46
|
+
def self.create_request_and_fetch_response(uri, opts={})
|
47
|
+
opts = normalize_hash(opts)
|
48
|
+
# first, find or build the request
|
49
|
+
request = find_or_build_request(uri, opts)
|
50
|
+
# then find or build a matching response
|
51
|
+
response = find_or_build_response(request, opts)
|
52
|
+
# associate and save the two
|
53
|
+
request.responses << response
|
54
|
+
request.save
|
55
|
+
|
56
|
+
obj = Hashie::Mash.new(request: request, response: response)
|
57
|
+
|
58
|
+
return obj
|
59
|
+
end
|
60
|
+
|
61
|
+
# Returns an object compatible with HTTParty, i.e. an ActiveScraper::FakeHTTPartyResponse
|
62
|
+
# to be deprecated
|
63
|
+
def self.build_usable_response(request, response)
|
64
|
+
ActiveScraper::FakeHTTPartyResponse.new(request, response)
|
65
|
+
end
|
66
|
+
|
67
|
+
|
68
|
+
|
69
|
+
def self.fetch_fresh(url, opts={})
|
70
|
+
resp = HTTParty.get(url, opts)
|
71
|
+
|
72
|
+
return ActiveScraper::ResponseObject.factory(resp)
|
73
|
+
end
|
74
|
+
|
75
|
+
|
76
|
+
|
77
|
+
|
78
|
+
def self.normalize_hash(hsh)
|
79
|
+
unless hsh.is_a?(HashWithIndifferentAccess)
|
80
|
+
hsh = HashWithIndifferentAccess.new(hsh)
|
81
|
+
end
|
82
|
+
|
83
|
+
return hsh
|
84
|
+
end
|
85
|
+
|
5
86
|
end
|
@@ -1,6 +1,17 @@
|
|
1
1
|
module ActiveScraper
|
2
2
|
class Engine < ::Rails::Engine
|
3
3
|
isolate_namespace ActiveScraper
|
4
|
+
|
5
|
+
# monkey patch via: http://pivotallabs.com/leave-your-migrations-in-your-rails-engines/
|
6
|
+
initializer :append_migrations do |app|
|
7
|
+
unless app.root.to_s.match root.to_s
|
8
|
+
config.paths["db/migrate"].expanded.each do |expanded_path|
|
9
|
+
app.config.paths["db/migrate"] << expanded_path
|
10
|
+
end
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
|
4
15
|
config.generators do |g|
|
5
16
|
g.test_framework :rspec, :fixture => false
|
6
17
|
g.assets false
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'httparty'
|
2
|
+
require 'nokogiri'
|
3
|
+
module ActiveScraper
|
4
|
+
class FakeHTTPartyResponse < SimpleDelegator
|
5
|
+
|
6
|
+
|
7
|
+
def initialize(request, response, parsed_block=nil, options={})
|
8
|
+
request = request.to_fake_party_hash if request.is_a?(CachedRequest)
|
9
|
+
response = response.to_fake_party_hash if response.is_a?(CachedResponse)
|
10
|
+
|
11
|
+
## making HTTParty happy...
|
12
|
+
|
13
|
+
parsed_block ||= ->(){ response.body }
|
14
|
+
|
15
|
+
super(HTTParty::Response.new request, response, parsed_block, options)
|
16
|
+
end
|
17
|
+
|
18
|
+
|
19
|
+
|
20
|
+
end
|
21
|
+
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
module ActiveScraper
|
2
|
+
module ResponseObject
|
3
|
+
class Basic < SimpleDelegator
|
4
|
+
# I don't really know what to name this but this is passed
|
5
|
+
# between the various classes, including the Fetcher,
|
6
|
+
# and is expected to behave the same in those interactions
|
7
|
+
|
8
|
+
attr_reader :code, :headers, :body, :content_type
|
9
|
+
|
10
|
+
def initialize(obj)
|
11
|
+
if obj.class == (HTTParty::Response)
|
12
|
+
# use the Net::HTTPResponse instead
|
13
|
+
obj = obj.response
|
14
|
+
end
|
15
|
+
|
16
|
+
response_obj = if obj.is_a?(Net::HTTPResponse)
|
17
|
+
@body = obj.body
|
18
|
+
@content_type = obj.content_type
|
19
|
+
@headers = obj.each_header.inject({}){|h, (k, v)| h[k] = v; h }
|
20
|
+
@code = obj.code.to_i
|
21
|
+
elsif obj.is_a?(ActiveScraper::CachedResponse)
|
22
|
+
@body = obj.body
|
23
|
+
@content_type = obj.content_type
|
24
|
+
@headers = obj.headers
|
25
|
+
@code = obj.code.to_i
|
26
|
+
elsif obj.is_a?(StringIO) && obj.respond_to?(:meta) # OpenURI.open
|
27
|
+
@body = obj.read
|
28
|
+
@content_type = obj.content_type
|
29
|
+
@headers = obj.meta
|
30
|
+
@code = obj.status[0].to_i
|
31
|
+
elsif obj.nil?
|
32
|
+
# just do nothing
|
33
|
+
else
|
34
|
+
# other types have to raise an Error
|
35
|
+
raise ArgumentError, 'Improper class type'
|
36
|
+
end
|
37
|
+
|
38
|
+
super(ActiveSupport::HashWithIndifferentAccess.new() )
|
39
|
+
|
40
|
+
def empty?
|
41
|
+
@body.empty?
|
42
|
+
end
|
43
|
+
|
44
|
+
def nil?
|
45
|
+
@body.nil?
|
46
|
+
end
|
47
|
+
|
48
|
+
|
49
|
+
# now set its values
|
50
|
+
[:body, :headers, :content_type, :code].each do |a|
|
51
|
+
self[a] = self.send(a)
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
# def [](k)
|
56
|
+
# @values[k.to_sym]
|
57
|
+
# end
|
58
|
+
|
59
|
+
# def [](k,v)
|
60
|
+
# send(:"#{k}=", v)
|
61
|
+
# end
|
62
|
+
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: active_scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dan Nguyen
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-02-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rails
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - ! '>='
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: hashie
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ! '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ! '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: minitest
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -192,6 +206,20 @@ dependencies:
|
|
192
206
|
- - ! '>='
|
193
207
|
- !ruby/object:Gem::Version
|
194
208
|
version: '0'
|
209
|
+
- !ruby/object:Gem::Dependency
|
210
|
+
name: timecop
|
211
|
+
requirement: !ruby/object:Gem::Requirement
|
212
|
+
requirements:
|
213
|
+
- - ! '>='
|
214
|
+
- !ruby/object:Gem::Version
|
215
|
+
version: '0'
|
216
|
+
type: :development
|
217
|
+
prerelease: false
|
218
|
+
version_requirements: !ruby/object:Gem::Requirement
|
219
|
+
requirements:
|
220
|
+
- - ! '>='
|
221
|
+
- !ruby/object:Gem::Version
|
222
|
+
version: '0'
|
195
223
|
description: A Rails Engine using ActiveRecord to cache results of HTTP scrapes
|
196
224
|
email:
|
197
225
|
- dansonguyen@gmail.com
|
@@ -202,15 +230,16 @@ files:
|
|
202
230
|
- app/assets/stylesheets/active_scraper/application.css
|
203
231
|
- app/controllers/active_scraper/application_controller.rb
|
204
232
|
- app/helpers/active_scraper/application_helper.rb
|
205
|
-
- app/models/active_scraper/agnostic_response_object.rb
|
206
|
-
- app/models/active_scraper/request.rb
|
207
|
-
- app/models/active_scraper/response.rb
|
233
|
+
- app/models/active_scraper/cached_request.rb
|
234
|
+
- app/models/active_scraper/cached_response.rb
|
208
235
|
- app/views/layouts/active_scraper/application.html.erb
|
209
236
|
- config/routes.rb
|
210
|
-
- db/migrate/20131229024155_create_active_scraper_requests.rb
|
211
|
-
- db/migrate/20131229033843_create_active_scraper_responses.rb
|
237
|
+
- db/migrate/20131229024155_create_active_scraper_cached_requests.rb
|
238
|
+
- db/migrate/20131229033843_create_active_scraper_cached_responses.rb
|
212
239
|
- lib/active_scraper/engine.rb
|
213
|
-
- lib/active_scraper/fetcher.rb
|
240
|
+
- lib/active_scraper/fake_http_party_response.rb
|
241
|
+
- lib/active_scraper/response_object/basic.rb
|
242
|
+
- lib/active_scraper/response_object.rb
|
214
243
|
- lib/active_scraper/version.rb
|
215
244
|
- lib/active_scraper.rb
|
216
245
|
- lib/tasks/active_scraper_tasks.rake
|
@@ -1,47 +0,0 @@
|
|
1
|
-
module ActiveScraper
|
2
|
-
class AgnosticResponseObject < SimpleDelegator
|
3
|
-
|
4
|
-
attr_reader :code, :headers, :body, :content_type
|
5
|
-
|
6
|
-
def initialize(obj)
|
7
|
-
if obj.class == (HTTParty::Response)
|
8
|
-
# use the Net::HTTPResponse instead
|
9
|
-
obj = obj.response
|
10
|
-
end
|
11
|
-
|
12
|
-
response_obj = if obj.is_a?(Net::HTTPResponse)
|
13
|
-
@body = obj.body
|
14
|
-
@content_type = obj.content_type
|
15
|
-
@headers = obj.each_header.inject({}){|h, (k, v)| h[k] = v; h }
|
16
|
-
@code = obj.code.to_i
|
17
|
-
elsif obj.is_a?(ActiveScraper::Request)
|
18
|
-
@body = obj.body
|
19
|
-
@content_type = obj.content_type
|
20
|
-
@headers = obj.headers
|
21
|
-
@code = obj.code.to_i
|
22
|
-
else
|
23
|
-
# this is probably not used
|
24
|
-
@body = obj.to_s
|
25
|
-
@headers = {}
|
26
|
-
@content_type = nil
|
27
|
-
@code = nil
|
28
|
-
end
|
29
|
-
|
30
|
-
super({})
|
31
|
-
|
32
|
-
# now set its values
|
33
|
-
[:body, :headers, :content_type, :code].each do |a|
|
34
|
-
self[a] = self.send(a)
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
# def [](k)
|
39
|
-
# @values[k.to_sym]
|
40
|
-
# end
|
41
|
-
|
42
|
-
# def [](k,v)
|
43
|
-
# send(:"#{k}=", v)
|
44
|
-
# end
|
45
|
-
|
46
|
-
end
|
47
|
-
end
|
@@ -1,98 +0,0 @@
|
|
1
|
-
require 'addressable/uri'
|
2
|
-
module ActiveScraper
|
3
|
-
class Request < ActiveRecord::Base
|
4
|
-
has_many :responses, :dependent => :destroy
|
5
|
-
validates_uniqueness_of :path, scope: [:host, :query, :scheme]
|
6
|
-
|
7
|
-
|
8
|
-
scope :with_url, ->(u){
|
9
|
-
params = Request.build_validating_params(u)
|
10
|
-
where(params)
|
11
|
-
}
|
12
|
-
|
13
|
-
|
14
|
-
def obfuscated?
|
15
|
-
is_obfuscated == true
|
16
|
-
end
|
17
|
-
|
18
|
-
def uri
|
19
|
-
Addressable::URI.new(
|
20
|
-
self.attributes.symbolize_keys.slice(:scheme, :host, :path, :query)
|
21
|
-
)
|
22
|
-
end
|
23
|
-
|
24
|
-
def self.build_validating_params(uri, opts={})
|
25
|
-
h = build_request_params(uri, opts)
|
26
|
-
|
27
|
-
h.slice(:scheme, :host, :path, :query)
|
28
|
-
end
|
29
|
-
|
30
|
-
# Returns a Hash with symbolized keys
|
31
|
-
def self.build_request_params(uri, opts={})
|
32
|
-
u = Addressable::URI.parse(uri)
|
33
|
-
hsh = {scheme: u.normalized_scheme, host: u.normalized_host, path: u.normalized_path, query: u.normalized_query, extname: u.extname}
|
34
|
-
|
35
|
-
if ob_keys = opts.delete(:obfuscate_query)
|
36
|
-
Array(ob_keys).each do |key|
|
37
|
-
a = Array(key)
|
38
|
-
|
39
|
-
key_to_omit = Regexp.escape(a[0].to_s)
|
40
|
-
char_num = a[1] || 0
|
41
|
-
|
42
|
-
if val_to_omit = hsh[:query].match(/(?<=#{key_to_omit}=)(.*?)(?=&|$)/)
|
43
|
-
val = val_to_omit[1]
|
44
|
-
hsh[:query].sub!( val, "__OMIT__#{val[-char_num, char_num]}")
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
|
-
hsh[:is_obfuscated] = true
|
49
|
-
else
|
50
|
-
hsh[:is_obfuscated] = false
|
51
|
-
end
|
52
|
-
|
53
|
-
return hsh
|
54
|
-
end
|
55
|
-
|
56
|
-
def self.build_from_uri(uri, opts={})
|
57
|
-
request_params = build_request_params(uri, opts)
|
58
|
-
request_obj = Request.new(request_params)
|
59
|
-
|
60
|
-
return request_obj
|
61
|
-
end
|
62
|
-
|
63
|
-
def self.find_or_build_from_uri(uri, opts={})
|
64
|
-
self.with_url(uri).first || self.build_from_uri(uri, opts)
|
65
|
-
end
|
66
|
-
|
67
|
-
|
68
|
-
def self.create_from_uri(uri, opts={})
|
69
|
-
req = build_from_uri(uri, opts)
|
70
|
-
req.save
|
71
|
-
|
72
|
-
return req
|
73
|
-
end
|
74
|
-
|
75
|
-
|
76
|
-
def self.create_and_fetch_response(uri, opts={}, fetcher = nil)
|
77
|
-
request = find_or_build_from_uri(uri, opts)
|
78
|
-
fetcher = fetcher || Fetcher.new
|
79
|
-
|
80
|
-
if request.id.nil?
|
81
|
-
# this request is new
|
82
|
-
# so skip to the fresh
|
83
|
-
resp = fetcher.fetch request, fresh: true
|
84
|
-
else
|
85
|
-
# will check the cache and the fresh
|
86
|
-
resp = fetcher.fetch request
|
87
|
-
end
|
88
|
-
|
89
|
-
# build the response
|
90
|
-
response = request.responses.build(resp)
|
91
|
-
# theoretically, response will be saved too
|
92
|
-
request.save
|
93
|
-
|
94
|
-
return request
|
95
|
-
end
|
96
|
-
|
97
|
-
end
|
98
|
-
end
|
@@ -1,26 +0,0 @@
|
|
1
|
-
module ActiveScraper
|
2
|
-
class Response < ActiveRecord::Base
|
3
|
-
serialize :headers, Hash
|
4
|
-
belongs_to :request
|
5
|
-
before_save :set_checksum
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
private
|
10
|
-
def set_checksum
|
11
|
-
self.checksum = body.hash
|
12
|
-
|
13
|
-
true
|
14
|
-
end
|
15
|
-
|
16
|
-
############## class methods
|
17
|
-
def self.build_from_response_object(resp)
|
18
|
-
response = self.new
|
19
|
-
[:body, :headers, :content_type, :code].each do |att|
|
20
|
-
response.send :write_attribute, att, resp.send(att)
|
21
|
-
end
|
22
|
-
|
23
|
-
return response
|
24
|
-
end
|
25
|
-
end
|
26
|
-
end
|
@@ -1,16 +0,0 @@
|
|
1
|
-
class CreateActiveScraperRequests < ActiveRecord::Migration
|
2
|
-
def change
|
3
|
-
create_table :active_scraper_requests do |t|
|
4
|
-
t.string :host
|
5
|
-
t.text :query
|
6
|
-
t.string :path
|
7
|
-
t.string :meta_tag
|
8
|
-
t.boolean :is_obfuscated
|
9
|
-
|
10
|
-
t.timestamps
|
11
|
-
end
|
12
|
-
|
13
|
-
add_index :active_scraper_requests, [:host, :path]
|
14
|
-
|
15
|
-
end
|
16
|
-
end
|
@@ -1,17 +0,0 @@
|
|
1
|
-
class CreateActiveScraperResponses < ActiveRecord::Migration
|
2
|
-
def change
|
3
|
-
create_table :active_scraper_responses do |t|
|
4
|
-
t.text :body, :limit => 4294967295
|
5
|
-
t.integer :code
|
6
|
-
t.text :headers
|
7
|
-
t.string :content_type
|
8
|
-
t.integer :checksum
|
9
|
-
t.integer :active_scraper_request_id
|
10
|
-
|
11
|
-
t.timestamps
|
12
|
-
end
|
13
|
-
|
14
|
-
add_index :active_scraper_responses, [:active_scraper_request_id, :created_at], name: 'index_request_id_and_created_at'
|
15
|
-
add_index :active_scraper_responses, [:active_scraper_request_id, :checksum], name: 'index_request_id_and_checksum'
|
16
|
-
end
|
17
|
-
end
|
@@ -1,65 +0,0 @@
|
|
1
|
-
require 'httparty'
|
2
|
-
|
3
|
-
module ActiveScraper
|
4
|
-
class Fetcher
|
5
|
-
|
6
|
-
def fetch(u, opts={})
|
7
|
-
url = convert_uri_object(u)
|
8
|
-
force_fresh = opts.delete :fresh
|
9
|
-
|
10
|
-
if force_fresh != true && (record = fetch_from_cache(url, opts))
|
11
|
-
resp_obj = record
|
12
|
-
else
|
13
|
-
resp_obj = fetch_fresh(url, opts)
|
14
|
-
end
|
15
|
-
|
16
|
-
build_response_object(resp_obj)
|
17
|
-
end
|
18
|
-
|
19
|
-
|
20
|
-
def fetch_fresh(url, opts={})
|
21
|
-
opts = opts.stringify_keys
|
22
|
-
|
23
|
-
url = url.to_s
|
24
|
-
verb = opts.fetch('verb'){ 'get' }
|
25
|
-
|
26
|
-
resp = HTTParty.send(verb, url)
|
27
|
-
end
|
28
|
-
|
29
|
-
|
30
|
-
# returns:
|
31
|
-
# single ScrapeCache if a valid ActiveScraper::Request exists
|
32
|
-
#
|
33
|
-
def fetch_from_cache(uri, opts={})
|
34
|
-
|
35
|
-
end
|
36
|
-
|
37
|
-
# true or false if ActiveScraper::Request with these parameters exist
|
38
|
-
def has_cache?(uri, opts={})
|
39
|
-
|
40
|
-
end
|
41
|
-
|
42
|
-
|
43
|
-
# u can either be a Request object, a String, or Addressable::URI
|
44
|
-
# returns an Addressable::URI
|
45
|
-
def convert_uri_object(u)
|
46
|
-
if u.is_a?(ActiveScraper::Request)
|
47
|
-
x = u.uri
|
48
|
-
else
|
49
|
-
x = Addressable::URI.parse(u)
|
50
|
-
end
|
51
|
-
|
52
|
-
return x
|
53
|
-
end
|
54
|
-
|
55
|
-
def build_response_object(obj)
|
56
|
-
self.class.build_response_object(obj)
|
57
|
-
end
|
58
|
-
|
59
|
-
# returns an OpenStruct that Response can use
|
60
|
-
def self.build_response_object(obj)
|
61
|
-
return AgnosticResponseObject.new(obj)
|
62
|
-
end
|
63
|
-
|
64
|
-
end
|
65
|
-
end
|