active_scraper 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/app/models/active_scraper/cached_request.rb +163 -0
- data/app/models/active_scraper/cached_response.rb +135 -0
- data/db/migrate/20131229024155_create_active_scraper_cached_requests.rb +19 -0
- data/db/migrate/20131229033843_create_active_scraper_cached_responses.rb +16 -0
- data/lib/active_scraper.rb +82 -1
- data/lib/active_scraper/engine.rb +11 -0
- data/lib/active_scraper/fake_http_party_response.rb +21 -0
- data/lib/active_scraper/response_object.rb +13 -0
- data/lib/active_scraper/response_object/basic.rb +67 -0
- data/lib/active_scraper/version.rb +1 -1
- metadata +37 -8
- data/app/models/active_scraper/agnostic_response_object.rb +0 -47
- data/app/models/active_scraper/request.rb +0 -98
- data/app/models/active_scraper/response.rb +0 -26
- data/db/migrate/20131229024155_create_active_scraper_requests.rb +0 -16
- data/db/migrate/20131229033843_create_active_scraper_responses.rb +0 -17
- data/lib/active_scraper/fetcher.rb +0 -65
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
NWEwOTJjZGViNjkwYzZiMjYwY2YzMGE5YjRkMTdlNzJhMTMwZmJmMw==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
OGMyNmI5MDA3MmM5YWQ5MjIxZjlhYjZlYTliMDZhZWY0YjlmYjQ5ZQ==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
YTc0Mzk5YzdhNWZlM2YyYzJlNmZiZjMxOTg4OTg0YTU0MzhkMWMxMDVlNDJl
|
10
|
+
Y2UzNDVmNWE1NDhkODJlZTMyYWY1OTFjNTU3ZGYyNGIwNzA4NDEzZDMxMzg5
|
11
|
+
YWZkZWVmMTc1MjFjNGNhZDIzYTU0NGU1MTg4ZGYyYWNhNGQ3YTI=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
ZWMyYTc5MDJhZDU5NGY4ZDVlMjFkYTI0YzNkMDM2NDFkNjgyY2JhNDE3MGI2
|
14
|
+
M2Y5MTMzMTMzMGE3ZmFmYzk5OWNiNjQyNzRmM2E4ZjJmYjk5MWY0MzUzMjU0
|
15
|
+
YmMxNDQ3YWU1MTY4YTFmNDZjMjRlM2YyM2E4Nzc1NzA1MDQ3N2I=
|
@@ -0,0 +1,163 @@
|
|
1
|
+
require 'httparty'
|
2
|
+
require 'addressable/uri'
|
3
|
+
require 'hashie/mash'
|
4
|
+
|
5
|
+
module ActiveScraper
|
6
|
+
class CachedRequest < ActiveRecord::Base
|
7
|
+
has_many :responses, :dependent => :destroy, class_name: 'CachedResponse', foreign_key: 'cached_request_id'
|
8
|
+
has_one :latest_response, ->{ order('created_at DESC') }, class_name: 'ActiveScraper::CachedResponse', foreign_key: 'cached_request_id'
|
9
|
+
validates_uniqueness_of :path, scope: [:host, :query, :scheme]
|
10
|
+
|
11
|
+
attr_accessor :unobfuscated_query
|
12
|
+
|
13
|
+
delegate :to_s, :to => :uri
|
14
|
+
|
15
|
+
# problematic
|
16
|
+
scope :with_url, ->(u){
|
17
|
+
matching_request(u)
|
18
|
+
}
|
19
|
+
|
20
|
+
scope :matching_request, ->(req, opts={}){
|
21
|
+
if req.is_a?(CachedRequest)
|
22
|
+
req = req.to_uri
|
23
|
+
end
|
24
|
+
params = CachedRequest.build_validating_params(req, opts)
|
25
|
+
|
26
|
+
where(params)
|
27
|
+
}
|
28
|
+
|
29
|
+
scope :last_fetched_before, ->(some_time){
|
30
|
+
some_time = Time.parse(some_time) if some_time.is_a?(String)
|
31
|
+
|
32
|
+
where("last_fetched_at < ?", some_time)
|
33
|
+
}
|
34
|
+
|
35
|
+
def latest_response_fetched_after(time)
|
36
|
+
if latest_response.present?
|
37
|
+
return latest_response if latest_response.created_at > time
|
38
|
+
end
|
39
|
+
|
40
|
+
nil
|
41
|
+
end
|
42
|
+
|
43
|
+
def to_fake_party_hash
|
44
|
+
h = Hashie::Mash.new(self.attributes.symbolize_keys.slice(:scheme, :host, :path, :query))
|
45
|
+
h[:uri] = self.standard_uri
|
46
|
+
h[:options] ||= {}
|
47
|
+
h[:headers] ||= {}
|
48
|
+
|
49
|
+
return h
|
50
|
+
end
|
51
|
+
|
52
|
+
|
53
|
+
|
54
|
+
def obfuscated?
|
55
|
+
is_obfuscated == true
|
56
|
+
end
|
57
|
+
|
58
|
+
# to follow HTTParty conventions
|
59
|
+
def standard_uri
|
60
|
+
URI.parse(uri)
|
61
|
+
end
|
62
|
+
|
63
|
+
def uri
|
64
|
+
to_uri
|
65
|
+
end
|
66
|
+
|
67
|
+
# during a fresh query, we need to actually use the unobfuscated_query
|
68
|
+
def to_uri
|
69
|
+
h = self.attributes.symbolize_keys.slice(:scheme, :host, :path)
|
70
|
+
h[:query] = self.unobfuscated_query || self.query
|
71
|
+
|
72
|
+
return Addressable::URI.new(h)
|
73
|
+
end
|
74
|
+
|
75
|
+
def self.build_validating_params(uri, opts={})
|
76
|
+
h = build_request_params(uri, opts)
|
77
|
+
|
78
|
+
h.slice(:scheme, :host, :path, :query)
|
79
|
+
end
|
80
|
+
|
81
|
+
#########################################################
|
82
|
+
############ class methods
|
83
|
+
|
84
|
+
|
85
|
+
|
86
|
+
# Returns a Hash with symbolized keys
|
87
|
+
def self.build_request_params(uri, opts={})
|
88
|
+
u = Addressable::URI.parse(uri)
|
89
|
+
hsh = {scheme: u.normalized_scheme, host: u.normalized_host, path: u.normalized_path, query: u.normalized_query , extname: u.extname}
|
90
|
+
# deal with query separately
|
91
|
+
unless opts[:normalize_query] == false
|
92
|
+
hsh[:query] = normalize_query_params(hsh[:query])
|
93
|
+
end
|
94
|
+
|
95
|
+
hsh[:unobfuscated_query] = hsh[:query]
|
96
|
+
if ob_keys = opts[:obfuscate_query]
|
97
|
+
hsh[:query] = obfuscate_query_params(hsh[:query], ob_keys)
|
98
|
+
hsh[:is_obfuscated] = true
|
99
|
+
else
|
100
|
+
hsh[:is_obfuscated] = false
|
101
|
+
end
|
102
|
+
|
103
|
+
return hsh
|
104
|
+
end
|
105
|
+
|
106
|
+
def self.build_from_uri(uri, opts={})
|
107
|
+
request_params = build_request_params(uri, opts)
|
108
|
+
request_obj = CachedRequest.new(request_params)
|
109
|
+
|
110
|
+
return request_obj
|
111
|
+
end
|
112
|
+
|
113
|
+
def self.find_or_build_from_uri(uri, opts={})
|
114
|
+
self.matching_request(uri, opts).first || self.build_from_uri(uri, opts)
|
115
|
+
end
|
116
|
+
|
117
|
+
1
|
118
|
+
def self.create_from_uri(uri, opts={})
|
119
|
+
req = build_from_uri(uri, opts)
|
120
|
+
req.save
|
121
|
+
|
122
|
+
return req
|
123
|
+
end
|
124
|
+
|
125
|
+
|
126
|
+
|
127
|
+
QUERY_NORMALIZER = HTTParty::Request::NON_RAILS_QUERY_STRING_NORMALIZER
|
128
|
+
# :q is a query String or Hash
|
129
|
+
# e.g. 'z=hello&b=world&a=dog'
|
130
|
+
# or: {z: ['hello', 'world'], a: 'dog'}
|
131
|
+
#
|
132
|
+
# returns: (String) "a=dog&z=hello&z=world"
|
133
|
+
def self.normalize_query_params(q)
|
134
|
+
return q if q.blank?
|
135
|
+
|
136
|
+
params_hash = CGI.parse(q)
|
137
|
+
params_str = QUERY_NORMALIZER[params_hash]
|
138
|
+
|
139
|
+
return params_str
|
140
|
+
end
|
141
|
+
|
142
|
+
|
143
|
+
private
|
144
|
+
|
145
|
+
def self.obfuscate_query_params(q, ob_keys)
|
146
|
+
string = q.dup
|
147
|
+
Array(ob_keys).each do |key|
|
148
|
+
a = Array(key)
|
149
|
+
|
150
|
+
key_to_omit = Regexp.escape(a[0].to_s)
|
151
|
+
char_num = a[1] || 0
|
152
|
+
if val_to_omit = string.match(/(?<=#{key_to_omit}=)(.*?)(?=&|$)/)
|
153
|
+
val = val_to_omit[1]
|
154
|
+
string.sub!( val, "__OMIT__#{val[-char_num, char_num]}")
|
155
|
+
end
|
156
|
+
end
|
157
|
+
|
158
|
+
return string
|
159
|
+
end
|
160
|
+
|
161
|
+
|
162
|
+
end
|
163
|
+
end
|
@@ -0,0 +1,135 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
module ActiveScraper
|
3
|
+
class CachedResponse < ActiveRecord::Base
|
4
|
+
serialize :headers, Hash
|
5
|
+
belongs_to :request, touch: true, class_name: 'CachedRequest', foreign_key: 'cached_request_id'
|
6
|
+
before_create :encode_body_for_create
|
7
|
+
before_save :set_checksum
|
8
|
+
|
9
|
+
after_create :touch_request_fetched_at
|
10
|
+
|
11
|
+
def to_fake_party_hash
|
12
|
+
[:body, :headers, :content_type, :code].inject(Hashie::Mash.new) do |hsh, att|
|
13
|
+
hsh[att] = self.send(att)
|
14
|
+
|
15
|
+
hsh
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
|
20
|
+
def binary?
|
21
|
+
content_type =~ /pdf|image/ || !text?
|
22
|
+
end
|
23
|
+
|
24
|
+
def json?
|
25
|
+
content_type =~ /json/
|
26
|
+
end
|
27
|
+
|
28
|
+
def html?
|
29
|
+
content_type =~ /html/
|
30
|
+
end
|
31
|
+
|
32
|
+
def xml?
|
33
|
+
html? || content_type =~ /xml/
|
34
|
+
end
|
35
|
+
|
36
|
+
def text?
|
37
|
+
content_type =~ /text/ || xml? || json?
|
38
|
+
end
|
39
|
+
|
40
|
+
def body_changed?
|
41
|
+
self.changed_attributes.keys.include?('body')
|
42
|
+
end
|
43
|
+
|
44
|
+
def body
|
45
|
+
b = read_attribute(:body)
|
46
|
+
if b.present? && binary? && !body_changed?
|
47
|
+
return Base64.decode64(b)
|
48
|
+
else
|
49
|
+
return b
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
def parsed_body
|
54
|
+
@_parsedbody ||= if xml?
|
55
|
+
Nokogiri::HTML(body)
|
56
|
+
elsif json?
|
57
|
+
JSON.parse(body)
|
58
|
+
else
|
59
|
+
body
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
def to_s
|
64
|
+
body
|
65
|
+
end
|
66
|
+
|
67
|
+
|
68
|
+
|
69
|
+
private
|
70
|
+
def set_checksum
|
71
|
+
self.checksum = body.hash
|
72
|
+
|
73
|
+
true
|
74
|
+
end
|
75
|
+
|
76
|
+
|
77
|
+
def touch_request_fetched_at
|
78
|
+
if request && !request.new_record?
|
79
|
+
request.update_attributes(last_fetched_at: self.created_at) if self == request.latest_response
|
80
|
+
end
|
81
|
+
|
82
|
+
true
|
83
|
+
end
|
84
|
+
|
85
|
+
|
86
|
+
# expects @body to be populated
|
87
|
+
# returns string: e.g. 'utf-8', 'windows-1251'
|
88
|
+
def detect_encoding
|
89
|
+
if xml?
|
90
|
+
parsed_body.encoding
|
91
|
+
else
|
92
|
+
body.encoding
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
# converts @body to utf-8 if not already
|
97
|
+
def encode_body_for_create
|
98
|
+
if self.body.present?
|
99
|
+
if binary?
|
100
|
+
self.body = Base64.encode64(self.body)
|
101
|
+
elsif
|
102
|
+
denc = detect_encoding
|
103
|
+
self.body = self.body.encode('utf-8', denc)
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
107
|
+
true
|
108
|
+
end
|
109
|
+
|
110
|
+
|
111
|
+
|
112
|
+
|
113
|
+
|
114
|
+
############## class methods
|
115
|
+
def self.find_cache_for_cached_request(cached_request, opts={})
|
116
|
+
time = opts[:fetched_after] || Time.at(0)
|
117
|
+
# smell: just goes back to CachedRequest
|
118
|
+
cached_request.latest_response_fetched_after(time)
|
119
|
+
end
|
120
|
+
|
121
|
+
def self.find_cache_for_request(req, opts)
|
122
|
+
# TODO
|
123
|
+
end
|
124
|
+
|
125
|
+
# has one side-effect: :body is properly encoded
|
126
|
+
def self.build_from_response_object(resp)
|
127
|
+
response = self.new
|
128
|
+
[:body, :headers, :content_type, :code].each do |att|
|
129
|
+
response.send :write_attribute, att, resp.send(att)
|
130
|
+
end
|
131
|
+
|
132
|
+
return response
|
133
|
+
end
|
134
|
+
end
|
135
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
class CreateActiveScraperCachedRequests < ActiveRecord::Migration
|
2
|
+
def change
|
3
|
+
create_table :active_scraper_cached_requests do |t|
|
4
|
+
t.string "scheme"
|
5
|
+
t.string "host"
|
6
|
+
t.text "query"
|
7
|
+
t.string "path"
|
8
|
+
t.string "meta_tag"
|
9
|
+
t.string "extname"
|
10
|
+
t.boolean "is_obfuscated"
|
11
|
+
t.datetime "created_at"
|
12
|
+
t.datetime "updated_at"
|
13
|
+
t.datetime "last_fetched_at"
|
14
|
+
end
|
15
|
+
|
16
|
+
add_index "active_scraper_cached_requests", ["host", "path"], name: "index_as_requests_on_host_and_path"
|
17
|
+
|
18
|
+
end
|
19
|
+
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
class CreateActiveScraperCachedResponses < ActiveRecord::Migration
|
2
|
+
def change
|
3
|
+
create_table :active_scraper_cached_responses do |t|
|
4
|
+
t.text "body", limit: 4294967295
|
5
|
+
t.integer "code"
|
6
|
+
t.text "headers"
|
7
|
+
t.string "content_type"
|
8
|
+
t.integer "checksum"
|
9
|
+
t.integer "cached_request_id"
|
10
|
+
t.timestamps
|
11
|
+
end
|
12
|
+
|
13
|
+
add_index :active_scraper_cached_responses, [:cached_request_id, :created_at], name: 'index_request_id_and_created_at'
|
14
|
+
add_index :active_scraper_cached_responses, [:cached_request_id, :checksum], name: 'index_request_id_and_checksum'
|
15
|
+
end
|
16
|
+
end
|
data/lib/active_scraper.rb
CHANGED
@@ -1,5 +1,86 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
1
3
|
require "active_scraper/engine"
|
2
|
-
require
|
4
|
+
require 'active_scraper/fake_http_party_response'
|
5
|
+
require 'active_scraper/response_object'
|
3
6
|
|
4
7
|
module ActiveScraper
|
8
|
+
|
9
|
+
|
10
|
+
# returns a ActiveScraper::CachedResponse
|
11
|
+
def self.get(uri, options={})
|
12
|
+
o = create_request_and_fetch_response(uri, options)
|
13
|
+
|
14
|
+
return o.response
|
15
|
+
end
|
16
|
+
|
17
|
+
|
18
|
+
|
19
|
+
# delegates to CachedRequest::find_or_build_from_uri
|
20
|
+
# req (URI or String). If CachedRequest, is idempotent
|
21
|
+
#
|
22
|
+
# returns a new or existing CachedRequest
|
23
|
+
def self.find_or_build_request(req, opts={})
|
24
|
+
CachedRequest.find_or_build_from_uri(req, opts)
|
25
|
+
end
|
26
|
+
|
27
|
+
## cached_request (CachedRequest) => the request to find a response for
|
28
|
+
##
|
29
|
+
## returns a new or existing CachedResponse
|
30
|
+
|
31
|
+
def self.find_or_build_response(cached_request, opts={})
|
32
|
+
raise ArgumentError, "Only accepted CachedRequest, but was passed in a #{cached_request.class}" unless cached_request.is_a?(CachedRequest)
|
33
|
+
opts = normalize_hash(opts)
|
34
|
+
|
35
|
+
response = CachedResponse.find_cache_for_cached_request(cached_request, opts)
|
36
|
+
|
37
|
+
if response.blank?
|
38
|
+
fetched_obj = fetch_fresh(cached_request.uri, opts)
|
39
|
+
response = CachedResponse.build_from_response_object(fetched_obj)
|
40
|
+
end
|
41
|
+
|
42
|
+
return response
|
43
|
+
end
|
44
|
+
|
45
|
+
|
46
|
+
def self.create_request_and_fetch_response(uri, opts={})
|
47
|
+
opts = normalize_hash(opts)
|
48
|
+
# first, find or build the request
|
49
|
+
request = find_or_build_request(uri, opts)
|
50
|
+
# then find or build a matching response
|
51
|
+
response = find_or_build_response(request, opts)
|
52
|
+
# associate and save the two
|
53
|
+
request.responses << response
|
54
|
+
request.save
|
55
|
+
|
56
|
+
obj = Hashie::Mash.new(request: request, response: response)
|
57
|
+
|
58
|
+
return obj
|
59
|
+
end
|
60
|
+
|
61
|
+
# Returns an object compatible with HTTParty, i.e. an ActiveScraper::FakeHTTPartyResponse
|
62
|
+
# to be deprecated
|
63
|
+
def self.build_usable_response(request, response)
|
64
|
+
ActiveScraper::FakeHTTPartyResponse.new(request, response)
|
65
|
+
end
|
66
|
+
|
67
|
+
|
68
|
+
|
69
|
+
def self.fetch_fresh(url, opts={})
|
70
|
+
resp = HTTParty.get(url, opts)
|
71
|
+
|
72
|
+
return ActiveScraper::ResponseObject.factory(resp)
|
73
|
+
end
|
74
|
+
|
75
|
+
|
76
|
+
|
77
|
+
|
78
|
+
def self.normalize_hash(hsh)
|
79
|
+
unless hsh.is_a?(HashWithIndifferentAccess)
|
80
|
+
hsh = HashWithIndifferentAccess.new(hsh)
|
81
|
+
end
|
82
|
+
|
83
|
+
return hsh
|
84
|
+
end
|
85
|
+
|
5
86
|
end
|
@@ -1,6 +1,17 @@
|
|
1
1
|
module ActiveScraper
|
2
2
|
class Engine < ::Rails::Engine
|
3
3
|
isolate_namespace ActiveScraper
|
4
|
+
|
5
|
+
# monkey patch via: http://pivotallabs.com/leave-your-migrations-in-your-rails-engines/
|
6
|
+
initializer :append_migrations do |app|
|
7
|
+
unless app.root.to_s.match root.to_s
|
8
|
+
config.paths["db/migrate"].expanded.each do |expanded_path|
|
9
|
+
app.config.paths["db/migrate"] << expanded_path
|
10
|
+
end
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
|
4
15
|
config.generators do |g|
|
5
16
|
g.test_framework :rspec, :fixture => false
|
6
17
|
g.assets false
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'httparty'
|
2
|
+
require 'nokogiri'
|
3
|
+
module ActiveScraper
|
4
|
+
class FakeHTTPartyResponse < SimpleDelegator
|
5
|
+
|
6
|
+
|
7
|
+
def initialize(request, response, parsed_block=nil, options={})
|
8
|
+
request = request.to_fake_party_hash if request.is_a?(CachedRequest)
|
9
|
+
response = response.to_fake_party_hash if response.is_a?(CachedResponse)
|
10
|
+
|
11
|
+
## making HTTParty happy...
|
12
|
+
|
13
|
+
parsed_block ||= ->(){ response.body }
|
14
|
+
|
15
|
+
super(HTTParty::Response.new request, response, parsed_block, options)
|
16
|
+
end
|
17
|
+
|
18
|
+
|
19
|
+
|
20
|
+
end
|
21
|
+
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
module ActiveScraper
|
2
|
+
module ResponseObject
|
3
|
+
class Basic < SimpleDelegator
|
4
|
+
# I don't really know what to name this but this is passed
|
5
|
+
# between the various classes, including the Fetcher,
|
6
|
+
# and is expected to behave the same in those interactions
|
7
|
+
|
8
|
+
attr_reader :code, :headers, :body, :content_type
|
9
|
+
|
10
|
+
def initialize(obj)
|
11
|
+
if obj.class == (HTTParty::Response)
|
12
|
+
# use the Net::HTTPResponse instead
|
13
|
+
obj = obj.response
|
14
|
+
end
|
15
|
+
|
16
|
+
response_obj = if obj.is_a?(Net::HTTPResponse)
|
17
|
+
@body = obj.body
|
18
|
+
@content_type = obj.content_type
|
19
|
+
@headers = obj.each_header.inject({}){|h, (k, v)| h[k] = v; h }
|
20
|
+
@code = obj.code.to_i
|
21
|
+
elsif obj.is_a?(ActiveScraper::CachedResponse)
|
22
|
+
@body = obj.body
|
23
|
+
@content_type = obj.content_type
|
24
|
+
@headers = obj.headers
|
25
|
+
@code = obj.code.to_i
|
26
|
+
elsif obj.is_a?(StringIO) && obj.respond_to?(:meta) # OpenURI.open
|
27
|
+
@body = obj.read
|
28
|
+
@content_type = obj.content_type
|
29
|
+
@headers = obj.meta
|
30
|
+
@code = obj.status[0].to_i
|
31
|
+
elsif obj.nil?
|
32
|
+
# just do nothing
|
33
|
+
else
|
34
|
+
# other types have to raise an Error
|
35
|
+
raise ArgumentError, 'Improper class type'
|
36
|
+
end
|
37
|
+
|
38
|
+
super(ActiveSupport::HashWithIndifferentAccess.new() )
|
39
|
+
|
40
|
+
def empty?
|
41
|
+
@body.empty?
|
42
|
+
end
|
43
|
+
|
44
|
+
def nil?
|
45
|
+
@body.nil?
|
46
|
+
end
|
47
|
+
|
48
|
+
|
49
|
+
# now set its values
|
50
|
+
[:body, :headers, :content_type, :code].each do |a|
|
51
|
+
self[a] = self.send(a)
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
# def [](k)
|
56
|
+
# @values[k.to_sym]
|
57
|
+
# end
|
58
|
+
|
59
|
+
# def [](k,v)
|
60
|
+
# send(:"#{k}=", v)
|
61
|
+
# end
|
62
|
+
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: active_scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dan Nguyen
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-02-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rails
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - ! '>='
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: hashie
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ! '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ! '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: minitest
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -192,6 +206,20 @@ dependencies:
|
|
192
206
|
- - ! '>='
|
193
207
|
- !ruby/object:Gem::Version
|
194
208
|
version: '0'
|
209
|
+
- !ruby/object:Gem::Dependency
|
210
|
+
name: timecop
|
211
|
+
requirement: !ruby/object:Gem::Requirement
|
212
|
+
requirements:
|
213
|
+
- - ! '>='
|
214
|
+
- !ruby/object:Gem::Version
|
215
|
+
version: '0'
|
216
|
+
type: :development
|
217
|
+
prerelease: false
|
218
|
+
version_requirements: !ruby/object:Gem::Requirement
|
219
|
+
requirements:
|
220
|
+
- - ! '>='
|
221
|
+
- !ruby/object:Gem::Version
|
222
|
+
version: '0'
|
195
223
|
description: A Rails Engine using ActiveRecord to cache results of HTTP scrapes
|
196
224
|
email:
|
197
225
|
- dansonguyen@gmail.com
|
@@ -202,15 +230,16 @@ files:
|
|
202
230
|
- app/assets/stylesheets/active_scraper/application.css
|
203
231
|
- app/controllers/active_scraper/application_controller.rb
|
204
232
|
- app/helpers/active_scraper/application_helper.rb
|
205
|
-
- app/models/active_scraper/agnostic_response_object.rb
|
206
|
-
- app/models/active_scraper/request.rb
|
207
|
-
- app/models/active_scraper/response.rb
|
233
|
+
- app/models/active_scraper/cached_request.rb
|
234
|
+
- app/models/active_scraper/cached_response.rb
|
208
235
|
- app/views/layouts/active_scraper/application.html.erb
|
209
236
|
- config/routes.rb
|
210
|
-
- db/migrate/20131229024155_create_active_scraper_requests.rb
|
211
|
-
- db/migrate/20131229033843_create_active_scraper_responses.rb
|
237
|
+
- db/migrate/20131229024155_create_active_scraper_cached_requests.rb
|
238
|
+
- db/migrate/20131229033843_create_active_scraper_cached_responses.rb
|
212
239
|
- lib/active_scraper/engine.rb
|
213
|
-
- lib/active_scraper/fetcher.rb
|
240
|
+
- lib/active_scraper/fake_http_party_response.rb
|
241
|
+
- lib/active_scraper/response_object/basic.rb
|
242
|
+
- lib/active_scraper/response_object.rb
|
214
243
|
- lib/active_scraper/version.rb
|
215
244
|
- lib/active_scraper.rb
|
216
245
|
- lib/tasks/active_scraper_tasks.rake
|
@@ -1,47 +0,0 @@
|
|
1
|
-
module ActiveScraper
|
2
|
-
class AgnosticResponseObject < SimpleDelegator
|
3
|
-
|
4
|
-
attr_reader :code, :headers, :body, :content_type
|
5
|
-
|
6
|
-
def initialize(obj)
|
7
|
-
if obj.class == (HTTParty::Response)
|
8
|
-
# use the Net::HTTPResponse instead
|
9
|
-
obj = obj.response
|
10
|
-
end
|
11
|
-
|
12
|
-
response_obj = if obj.is_a?(Net::HTTPResponse)
|
13
|
-
@body = obj.body
|
14
|
-
@content_type = obj.content_type
|
15
|
-
@headers = obj.each_header.inject({}){|h, (k, v)| h[k] = v; h }
|
16
|
-
@code = obj.code.to_i
|
17
|
-
elsif obj.is_a?(ActiveScraper::Request)
|
18
|
-
@body = obj.body
|
19
|
-
@content_type = obj.content_type
|
20
|
-
@headers = obj.headers
|
21
|
-
@code = obj.code.to_i
|
22
|
-
else
|
23
|
-
# this is probably not used
|
24
|
-
@body = obj.to_s
|
25
|
-
@headers = {}
|
26
|
-
@content_type = nil
|
27
|
-
@code = nil
|
28
|
-
end
|
29
|
-
|
30
|
-
super({})
|
31
|
-
|
32
|
-
# now set its values
|
33
|
-
[:body, :headers, :content_type, :code].each do |a|
|
34
|
-
self[a] = self.send(a)
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
# def [](k)
|
39
|
-
# @values[k.to_sym]
|
40
|
-
# end
|
41
|
-
|
42
|
-
# def [](k,v)
|
43
|
-
# send(:"#{k}=", v)
|
44
|
-
# end
|
45
|
-
|
46
|
-
end
|
47
|
-
end
|
@@ -1,98 +0,0 @@
|
|
1
|
-
require 'addressable/uri'
|
2
|
-
module ActiveScraper
|
3
|
-
class Request < ActiveRecord::Base
|
4
|
-
has_many :responses, :dependent => :destroy
|
5
|
-
validates_uniqueness_of :path, scope: [:host, :query, :scheme]
|
6
|
-
|
7
|
-
|
8
|
-
scope :with_url, ->(u){
|
9
|
-
params = Request.build_validating_params(u)
|
10
|
-
where(params)
|
11
|
-
}
|
12
|
-
|
13
|
-
|
14
|
-
def obfuscated?
|
15
|
-
is_obfuscated == true
|
16
|
-
end
|
17
|
-
|
18
|
-
def uri
|
19
|
-
Addressable::URI.new(
|
20
|
-
self.attributes.symbolize_keys.slice(:scheme, :host, :path, :query)
|
21
|
-
)
|
22
|
-
end
|
23
|
-
|
24
|
-
def self.build_validating_params(uri, opts={})
|
25
|
-
h = build_request_params(uri, opts)
|
26
|
-
|
27
|
-
h.slice(:scheme, :host, :path, :query)
|
28
|
-
end
|
29
|
-
|
30
|
-
# Returns a Hash with symbolized keys
|
31
|
-
def self.build_request_params(uri, opts={})
|
32
|
-
u = Addressable::URI.parse(uri)
|
33
|
-
hsh = {scheme: u.normalized_scheme, host: u.normalized_host, path: u.normalized_path, query: u.normalized_query, extname: u.extname}
|
34
|
-
|
35
|
-
if ob_keys = opts.delete(:obfuscate_query)
|
36
|
-
Array(ob_keys).each do |key|
|
37
|
-
a = Array(key)
|
38
|
-
|
39
|
-
key_to_omit = Regexp.escape(a[0].to_s)
|
40
|
-
char_num = a[1] || 0
|
41
|
-
|
42
|
-
if val_to_omit = hsh[:query].match(/(?<=#{key_to_omit}=)(.*?)(?=&|$)/)
|
43
|
-
val = val_to_omit[1]
|
44
|
-
hsh[:query].sub!( val, "__OMIT__#{val[-char_num, char_num]}")
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
|
-
hsh[:is_obfuscated] = true
|
49
|
-
else
|
50
|
-
hsh[:is_obfuscated] = false
|
51
|
-
end
|
52
|
-
|
53
|
-
return hsh
|
54
|
-
end
|
55
|
-
|
56
|
-
def self.build_from_uri(uri, opts={})
|
57
|
-
request_params = build_request_params(uri, opts)
|
58
|
-
request_obj = Request.new(request_params)
|
59
|
-
|
60
|
-
return request_obj
|
61
|
-
end
|
62
|
-
|
63
|
-
def self.find_or_build_from_uri(uri, opts={})
|
64
|
-
self.with_url(uri).first || self.build_from_uri(uri, opts)
|
65
|
-
end
|
66
|
-
|
67
|
-
|
68
|
-
def self.create_from_uri(uri, opts={})
|
69
|
-
req = build_from_uri(uri, opts)
|
70
|
-
req.save
|
71
|
-
|
72
|
-
return req
|
73
|
-
end
|
74
|
-
|
75
|
-
|
76
|
-
def self.create_and_fetch_response(uri, opts={}, fetcher = nil)
|
77
|
-
request = find_or_build_from_uri(uri, opts)
|
78
|
-
fetcher = fetcher || Fetcher.new
|
79
|
-
|
80
|
-
if request.id.nil?
|
81
|
-
# this request is new
|
82
|
-
# so skip to the fresh
|
83
|
-
resp = fetcher.fetch request, fresh: true
|
84
|
-
else
|
85
|
-
# will check the cache and the fresh
|
86
|
-
resp = fetcher.fetch request
|
87
|
-
end
|
88
|
-
|
89
|
-
# build the response
|
90
|
-
response = request.responses.build(resp)
|
91
|
-
# theoretically, response will be saved too
|
92
|
-
request.save
|
93
|
-
|
94
|
-
return request
|
95
|
-
end
|
96
|
-
|
97
|
-
end
|
98
|
-
end
|
@@ -1,26 +0,0 @@
|
|
1
|
-
module ActiveScraper
|
2
|
-
class Response < ActiveRecord::Base
|
3
|
-
serialize :headers, Hash
|
4
|
-
belongs_to :request
|
5
|
-
before_save :set_checksum
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
private
|
10
|
-
def set_checksum
|
11
|
-
self.checksum = body.hash
|
12
|
-
|
13
|
-
true
|
14
|
-
end
|
15
|
-
|
16
|
-
############## class methods
|
17
|
-
def self.build_from_response_object(resp)
|
18
|
-
response = self.new
|
19
|
-
[:body, :headers, :content_type, :code].each do |att|
|
20
|
-
response.send :write_attribute, att, resp.send(att)
|
21
|
-
end
|
22
|
-
|
23
|
-
return response
|
24
|
-
end
|
25
|
-
end
|
26
|
-
end
|
@@ -1,16 +0,0 @@
|
|
1
|
-
class CreateActiveScraperRequests < ActiveRecord::Migration
|
2
|
-
def change
|
3
|
-
create_table :active_scraper_requests do |t|
|
4
|
-
t.string :host
|
5
|
-
t.text :query
|
6
|
-
t.string :path
|
7
|
-
t.string :meta_tag
|
8
|
-
t.boolean :is_obfuscated
|
9
|
-
|
10
|
-
t.timestamps
|
11
|
-
end
|
12
|
-
|
13
|
-
add_index :active_scraper_requests, [:host, :path]
|
14
|
-
|
15
|
-
end
|
16
|
-
end
|
@@ -1,17 +0,0 @@
|
|
1
|
-
class CreateActiveScraperResponses < ActiveRecord::Migration
|
2
|
-
def change
|
3
|
-
create_table :active_scraper_responses do |t|
|
4
|
-
t.text :body, :limit => 4294967295
|
5
|
-
t.integer :code
|
6
|
-
t.text :headers
|
7
|
-
t.string :content_type
|
8
|
-
t.integer :checksum
|
9
|
-
t.integer :active_scraper_request_id
|
10
|
-
|
11
|
-
t.timestamps
|
12
|
-
end
|
13
|
-
|
14
|
-
add_index :active_scraper_responses, [:active_scraper_request_id, :created_at], name: 'index_request_id_and_created_at'
|
15
|
-
add_index :active_scraper_responses, [:active_scraper_request_id, :checksum], name: 'index_request_id_and_checksum'
|
16
|
-
end
|
17
|
-
end
|
@@ -1,65 +0,0 @@
|
|
1
|
-
require 'httparty'
|
2
|
-
|
3
|
-
module ActiveScraper
|
4
|
-
class Fetcher
|
5
|
-
|
6
|
-
def fetch(u, opts={})
|
7
|
-
url = convert_uri_object(u)
|
8
|
-
force_fresh = opts.delete :fresh
|
9
|
-
|
10
|
-
if force_fresh != true && (record = fetch_from_cache(url, opts))
|
11
|
-
resp_obj = record
|
12
|
-
else
|
13
|
-
resp_obj = fetch_fresh(url, opts)
|
14
|
-
end
|
15
|
-
|
16
|
-
build_response_object(resp_obj)
|
17
|
-
end
|
18
|
-
|
19
|
-
|
20
|
-
def fetch_fresh(url, opts={})
|
21
|
-
opts = opts.stringify_keys
|
22
|
-
|
23
|
-
url = url.to_s
|
24
|
-
verb = opts.fetch('verb'){ 'get' }
|
25
|
-
|
26
|
-
resp = HTTParty.send(verb, url)
|
27
|
-
end
|
28
|
-
|
29
|
-
|
30
|
-
# returns:
|
31
|
-
# single ScrapeCache if a valid ActiveScraper::Request exists
|
32
|
-
#
|
33
|
-
def fetch_from_cache(uri, opts={})
|
34
|
-
|
35
|
-
end
|
36
|
-
|
37
|
-
# true or false if ActiveScraper::Request with these parameters exist
|
38
|
-
def has_cache?(uri, opts={})
|
39
|
-
|
40
|
-
end
|
41
|
-
|
42
|
-
|
43
|
-
# u can either be a Request object, a String, or Addressable::URI
|
44
|
-
# returns an Addressable::URI
|
45
|
-
def convert_uri_object(u)
|
46
|
-
if u.is_a?(ActiveScraper::Request)
|
47
|
-
x = u.uri
|
48
|
-
else
|
49
|
-
x = Addressable::URI.parse(u)
|
50
|
-
end
|
51
|
-
|
52
|
-
return x
|
53
|
-
end
|
54
|
-
|
55
|
-
def build_response_object(obj)
|
56
|
-
self.class.build_response_object(obj)
|
57
|
-
end
|
58
|
-
|
59
|
-
# returns an OpenStruct that Response can use
|
60
|
-
def self.build_response_object(obj)
|
61
|
-
return AgnosticResponseObject.new(obj)
|
62
|
-
end
|
63
|
-
|
64
|
-
end
|
65
|
-
end
|