metainspector 4.6.1 → 4.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +9 -0
- data/lib/meta_inspector/document.rb +3 -1
- data/lib/meta_inspector/request.rb +7 -0
- data/lib/meta_inspector/url.rb +5 -2
- data/lib/meta_inspector/version.rb +1 -1
- data/meta_inspector.gemspec +1 -0
- data/spec/meta_inspector/meta_inspector_spec.rb +11 -0
- data/spec/url_spec.rb +16 -0
- metadata +17 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 329fa61a1fb5c278adb8c07c58536cc84e774f31
|
4
|
+
data.tar.gz: 8c40e805a960cc5591bee0317307dad27fa46719
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0535544d927c37e963b567404764ff7af352c00afbfb896c44604e85d6e844155575d26b24a2234f478015f6f4aff9b75724d933c6cac9c3cceda6180545aa99
|
7
|
+
data.tar.gz: 9f7fa08ed92dcf775df62d97ab3a9d989c04ad4ff9ffe0cbaf6f83baf3c5bd052072d61e2db6b66ebef2643ca5de9de79e6aa78575a87bf432a21a3cb676a093
|
data/README.md
CHANGED
@@ -374,6 +374,15 @@ If you want to disable this, you can specify it like this:
|
|
374
374
|
page = MetaInspector.new('http://example.com', download_images: false)
|
375
375
|
```
|
376
376
|
|
377
|
+
### Caching responses
|
378
|
+
|
379
|
+
MetaInspector can be configured to use [Faraday::HttpCache](https://github.com/plataformatec/faraday-http-cache) to cache page responses. For that you should pass the `faraday_http_cache` option with at least the `:store` key, for example:
|
380
|
+
|
381
|
+
```ruby
|
382
|
+
cache = ActiveSupport::Cache.lookup_store(:file_store, '/tmp/cache')
|
383
|
+
page = MetaInspector.new('http://example.com', faraday_http_cache: { store: cache })
|
384
|
+
```
|
385
|
+
|
377
386
|
## Exception Handling
|
378
387
|
|
379
388
|
By default, MetaInspector will raise the exceptions found. We think that this is the safest default: in case the URL you're trying to scrape is unreachable, you should clearly be notified, and treat the exception as needed in your app.
|
@@ -33,6 +33,7 @@ module MetaInspector
|
|
33
33
|
@exception_log = options[:exception_log] || MetaInspector::ExceptionLog.new(warn_level: warn_level)
|
34
34
|
@normalize_url = options[:normalize_url]
|
35
35
|
@faraday_options = options[:faraday_options]
|
36
|
+
@faraday_http_cache = options[:faraday_http_cache]
|
36
37
|
@url = MetaInspector::URL.new(initial_url, exception_log: @exception_log,
|
37
38
|
normalize: @normalize_url)
|
38
39
|
@request = MetaInspector::Request.new(@url, allow_redirections: @allow_redirections,
|
@@ -41,7 +42,8 @@ module MetaInspector
|
|
41
42
|
retries: @retries,
|
42
43
|
exception_log: @exception_log,
|
43
44
|
headers: @headers,
|
44
|
-
faraday_options: @faraday_options
|
45
|
+
faraday_options: @faraday_options,
|
46
|
+
faraday_http_cache: @faraday_http_cache) unless @document
|
45
47
|
@parser = MetaInspector::Parser.new(self, exception_log: @exception_log,
|
46
48
|
download_images: @download_images)
|
47
49
|
end
|
@@ -1,6 +1,7 @@
|
|
1
1
|
require 'faraday'
|
2
2
|
require 'faraday_middleware'
|
3
3
|
require 'faraday-cookie_jar'
|
4
|
+
require 'faraday-http-cache'
|
4
5
|
|
5
6
|
module MetaInspector
|
6
7
|
|
@@ -18,6 +19,7 @@ module MetaInspector
|
|
18
19
|
@exception_log = options[:exception_log]
|
19
20
|
@headers = options[:headers]
|
20
21
|
@faraday_options = options[:faraday_options] || {}
|
22
|
+
@faraday_http_cache = options[:faraday_http_cache]
|
21
23
|
|
22
24
|
response # request early so we can fail early
|
23
25
|
end
|
@@ -55,6 +57,11 @@ module MetaInspector
|
|
55
57
|
faraday.use :cookie_jar
|
56
58
|
end
|
57
59
|
|
60
|
+
if @faraday_http_cache.is_a?(Hash)
|
61
|
+
@faraday_http_cache[:serializer] ||= Marshal
|
62
|
+
faraday.use Faraday::HttpCache, @faraday_http_cache
|
63
|
+
end
|
64
|
+
|
58
65
|
faraday.headers.merge!(@headers || {})
|
59
66
|
faraday.adapter :net_http
|
60
67
|
end
|
data/lib/meta_inspector/url.rb
CHANGED
@@ -31,18 +31,21 @@ module MetaInspector
|
|
31
31
|
|
32
32
|
def tracked?
|
33
33
|
u = parsed(url)
|
34
|
+
return false unless u.query_values
|
34
35
|
found_tracking_params = WELL_KNOWN_TRACKING_PARAMS & u.query_values.keys
|
35
36
|
return found_tracking_params.any?
|
36
37
|
end
|
37
38
|
|
38
39
|
def untracked_url
|
39
40
|
u = parsed(url)
|
40
|
-
|
41
|
+
return url unless u.query_values
|
42
|
+
query_values = u.query_values.delete_if { |key, _| WELL_KNOWN_TRACKING_PARAMS.include? key }
|
43
|
+
u.query_values = query_values.length > 0 ? query_values : nil
|
41
44
|
u.to_s
|
42
45
|
end
|
43
46
|
|
44
47
|
def untrack!
|
45
|
-
self.url = untracked_url
|
48
|
+
self.url = untracked_url if tracked?
|
46
49
|
end
|
47
50
|
|
48
51
|
def url=(new_url)
|
data/meta_inspector.gemspec
CHANGED
@@ -18,6 +18,7 @@ Gem::Specification.new do |gem|
|
|
18
18
|
gem.add_dependency 'faraday', '~> 0.9.0'
|
19
19
|
gem.add_dependency 'faraday_middleware', '~> 0.10'
|
20
20
|
gem.add_dependency 'faraday-cookie_jar', '~> 0.0.6'
|
21
|
+
gem.add_dependency 'faraday-http-cache', '~> 1.2.2'
|
21
22
|
gem.add_dependency 'addressable', '~> 2.3.5'
|
22
23
|
gem.add_dependency 'fastimage'
|
23
24
|
|
@@ -4,4 +4,15 @@ describe MetaInspector do
|
|
4
4
|
it "returns a Document" do
|
5
5
|
expect(MetaInspector.new('http://example.com').class).to eq(MetaInspector::Document)
|
6
6
|
end
|
7
|
+
|
8
|
+
it "cache request" do
|
9
|
+
# Creates a memory cache (a Hash that responds to #read, #write and #delete)
|
10
|
+
cache = Hash.new
|
11
|
+
def cache.read(k) self[k]; end
|
12
|
+
def cache.write(k, v) self[k] = v; end
|
13
|
+
|
14
|
+
expect(MetaInspector.new('http://example.com', warn_level: :store, faraday_http_cache: { store: cache })).to be_ok
|
15
|
+
|
16
|
+
expect(cache.keys).not_to be_empty
|
17
|
+
end
|
7
18
|
end
|
data/spec/url_spec.rb
CHANGED
@@ -43,6 +43,8 @@ describe MetaInspector::URL do
|
|
43
43
|
expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&utm_content=1234').untracked_url).to eq('http://example.com/foo?not_utm_thing=bar')
|
44
44
|
expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&utm_campaign=1234').untracked_url).to eq('http://example.com/foo?not_utm_thing=bar')
|
45
45
|
expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&utm_source=1234&utm_medium=5678&utm_term=4321&utm_content=9876&utm_campaign=5436').untracked_url).to eq('http://example.com/foo?not_utm_thing=bar')
|
46
|
+
expect(MetaInspector::URL.new('http://example.com/foo?utm_source=1234&utm_medium=5678&utm_term=4321&utm_content=9876&utm_campaign=5436').untracked_url).to eq('http://example.com/foo')
|
47
|
+
expect(MetaInspector::URL.new('http://example.com/foo').untracked_url).to eq('http://example.com/foo')
|
46
48
|
end
|
47
49
|
|
48
50
|
it "should remove tracking parameters from url" do
|
@@ -62,6 +64,18 @@ describe MetaInspector::URL do
|
|
62
64
|
end
|
63
65
|
end
|
64
66
|
|
67
|
+
it "should remove all query values when untrack url" do
|
68
|
+
url = MetaInspector::URL.new('http://example.com/foo?utm_campaign=1234')
|
69
|
+
url.untrack!
|
70
|
+
expect(url.url).to eq('http://example.com/foo')
|
71
|
+
end
|
72
|
+
|
73
|
+
it "should untrack untracked url" do
|
74
|
+
url = MetaInspector::URL.new('http://example.com/foo')
|
75
|
+
url.untrack!
|
76
|
+
expect(url.url).to eq('http://example.com/foo')
|
77
|
+
end
|
78
|
+
|
65
79
|
it "should say if the url is tracked" do
|
66
80
|
expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&utm_source=1234').tracked?).to be true
|
67
81
|
expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&utm_medium=1234').tracked?).to be true
|
@@ -75,6 +89,8 @@ describe MetaInspector::URL do
|
|
75
89
|
expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&not_utm_term=1234').tracked?).to be false
|
76
90
|
expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&not_utm_content=1234').tracked?).to be false
|
77
91
|
expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&not_utm_campaign=1234').tracked?).to be false
|
92
|
+
|
93
|
+
expect(MetaInspector::URL.new('http://example.com/foo').tracked?).to be false
|
78
94
|
end
|
79
95
|
|
80
96
|
describe "url=" do
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 4.6.1
|
4
|
+
version: 4.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jaime Iniesta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-10-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: 0.0.6
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: faraday-http-cache
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: 1.2.2
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 1.2.2
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: addressable
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -339,7 +353,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
339
353
|
version: '0'
|
340
354
|
requirements: []
|
341
355
|
rubyforge_project:
|
342
|
-
rubygems_version: 2.4.
|
356
|
+
rubygems_version: 2.4.8
|
343
357
|
signing_key:
|
344
358
|
specification_version: 4
|
345
359
|
summary: MetaInspector is a ruby gem for web scraping purposes, that returns metadata
|