metainspector 4.6.1 → 4.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +9 -0
- data/lib/meta_inspector/document.rb +3 -1
- data/lib/meta_inspector/request.rb +7 -0
- data/lib/meta_inspector/url.rb +5 -2
- data/lib/meta_inspector/version.rb +1 -1
- data/meta_inspector.gemspec +1 -0
- data/spec/meta_inspector/meta_inspector_spec.rb +11 -0
- data/spec/url_spec.rb +16 -0
- metadata +17 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 329fa61a1fb5c278adb8c07c58536cc84e774f31
|
4
|
+
data.tar.gz: 8c40e805a960cc5591bee0317307dad27fa46719
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0535544d927c37e963b567404764ff7af352c00afbfb896c44604e85d6e844155575d26b24a2234f478015f6f4aff9b75724d933c6cac9c3cceda6180545aa99
|
7
|
+
data.tar.gz: 9f7fa08ed92dcf775df62d97ab3a9d989c04ad4ff9ffe0cbaf6f83baf3c5bd052072d61e2db6b66ebef2643ca5de9de79e6aa78575a87bf432a21a3cb676a093
|
data/README.md
CHANGED
@@ -374,6 +374,15 @@ If you want to disable this, you can specify it like this:
|
|
374
374
|
page = MetaInspector.new('http://example.com', download_images: false)
|
375
375
|
```
|
376
376
|
|
377
|
+
### Caching responses
|
378
|
+
|
379
|
+
MetaInspector can be configured to use [Faraday::HttpCache](https://github.com/plataformatec/faraday-http-cache) to cache page responses. For that you should pass the `faraday_http_cache` option with at least the `:store` key, for example:
|
380
|
+
|
381
|
+
```ruby
|
382
|
+
cache = ActiveSupport::Cache.lookup_store(:file_store, '/tmp/cache')
|
383
|
+
page = MetaInspector.new('http://example.com', faraday_http_cache: { store: cache })
|
384
|
+
```
|
385
|
+
|
377
386
|
## Exception Handling
|
378
387
|
|
379
388
|
By default, MetaInspector will raise the exceptions found. We think that this is the safest default: in case the URL you're trying to scrape is unreachable, you should clearly be notified, and treat the exception as needed in your app.
|
@@ -33,6 +33,7 @@ module MetaInspector
|
|
33
33
|
@exception_log = options[:exception_log] || MetaInspector::ExceptionLog.new(warn_level: warn_level)
|
34
34
|
@normalize_url = options[:normalize_url]
|
35
35
|
@faraday_options = options[:faraday_options]
|
36
|
+
@faraday_http_cache = options[:faraday_http_cache]
|
36
37
|
@url = MetaInspector::URL.new(initial_url, exception_log: @exception_log,
|
37
38
|
normalize: @normalize_url)
|
38
39
|
@request = MetaInspector::Request.new(@url, allow_redirections: @allow_redirections,
|
@@ -41,7 +42,8 @@ module MetaInspector
|
|
41
42
|
retries: @retries,
|
42
43
|
exception_log: @exception_log,
|
43
44
|
headers: @headers,
|
44
|
-
faraday_options: @faraday_options
|
45
|
+
faraday_options: @faraday_options,
|
46
|
+
faraday_http_cache: @faraday_http_cache) unless @document
|
45
47
|
@parser = MetaInspector::Parser.new(self, exception_log: @exception_log,
|
46
48
|
download_images: @download_images)
|
47
49
|
end
|
@@ -1,6 +1,7 @@
|
|
1
1
|
require 'faraday'
|
2
2
|
require 'faraday_middleware'
|
3
3
|
require 'faraday-cookie_jar'
|
4
|
+
require 'faraday-http-cache'
|
4
5
|
|
5
6
|
module MetaInspector
|
6
7
|
|
@@ -18,6 +19,7 @@ module MetaInspector
|
|
18
19
|
@exception_log = options[:exception_log]
|
19
20
|
@headers = options[:headers]
|
20
21
|
@faraday_options = options[:faraday_options] || {}
|
22
|
+
@faraday_http_cache = options[:faraday_http_cache]
|
21
23
|
|
22
24
|
response # request early so we can fail early
|
23
25
|
end
|
@@ -55,6 +57,11 @@ module MetaInspector
|
|
55
57
|
faraday.use :cookie_jar
|
56
58
|
end
|
57
59
|
|
60
|
+
if @faraday_http_cache.is_a?(Hash)
|
61
|
+
@faraday_http_cache[:serializer] ||= Marshal
|
62
|
+
faraday.use Faraday::HttpCache, @faraday_http_cache
|
63
|
+
end
|
64
|
+
|
58
65
|
faraday.headers.merge!(@headers || {})
|
59
66
|
faraday.adapter :net_http
|
60
67
|
end
|
data/lib/meta_inspector/url.rb
CHANGED
@@ -31,18 +31,21 @@ module MetaInspector
|
|
31
31
|
|
32
32
|
def tracked?
|
33
33
|
u = parsed(url)
|
34
|
+
return false unless u.query_values
|
34
35
|
found_tracking_params = WELL_KNOWN_TRACKING_PARAMS & u.query_values.keys
|
35
36
|
return found_tracking_params.any?
|
36
37
|
end
|
37
38
|
|
38
39
|
def untracked_url
|
39
40
|
u = parsed(url)
|
40
|
-
|
41
|
+
return url unless u.query_values
|
42
|
+
query_values = u.query_values.delete_if { |key, _| WELL_KNOWN_TRACKING_PARAMS.include? key }
|
43
|
+
u.query_values = query_values.length > 0 ? query_values : nil
|
41
44
|
u.to_s
|
42
45
|
end
|
43
46
|
|
44
47
|
def untrack!
|
45
|
-
self.url = untracked_url
|
48
|
+
self.url = untracked_url if tracked?
|
46
49
|
end
|
47
50
|
|
48
51
|
def url=(new_url)
|
data/meta_inspector.gemspec
CHANGED
@@ -18,6 +18,7 @@ Gem::Specification.new do |gem|
|
|
18
18
|
gem.add_dependency 'faraday', '~> 0.9.0'
|
19
19
|
gem.add_dependency 'faraday_middleware', '~> 0.10'
|
20
20
|
gem.add_dependency 'faraday-cookie_jar', '~> 0.0.6'
|
21
|
+
gem.add_dependency 'faraday-http-cache', '~> 1.2.2'
|
21
22
|
gem.add_dependency 'addressable', '~> 2.3.5'
|
22
23
|
gem.add_dependency 'fastimage'
|
23
24
|
|
@@ -4,4 +4,15 @@ describe MetaInspector do
|
|
4
4
|
it "returns a Document" do
|
5
5
|
expect(MetaInspector.new('http://example.com').class).to eq(MetaInspector::Document)
|
6
6
|
end
|
7
|
+
|
8
|
+
it "cache request" do
|
9
|
+
# Creates a memory cache (a Hash that responds to #read, #write and #delete)
|
10
|
+
cache = Hash.new
|
11
|
+
def cache.read(k) self[k]; end
|
12
|
+
def cache.write(k, v) self[k] = v; end
|
13
|
+
|
14
|
+
expect(MetaInspector.new('http://example.com', warn_level: :store, faraday_http_cache: { store: cache })).to be_ok
|
15
|
+
|
16
|
+
expect(cache.keys).not_to be_empty
|
17
|
+
end
|
7
18
|
end
|
data/spec/url_spec.rb
CHANGED
@@ -43,6 +43,8 @@ describe MetaInspector::URL do
|
|
43
43
|
expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&utm_content=1234').untracked_url).to eq('http://example.com/foo?not_utm_thing=bar')
|
44
44
|
expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&utm_campaign=1234').untracked_url).to eq('http://example.com/foo?not_utm_thing=bar')
|
45
45
|
expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&utm_source=1234&utm_medium=5678&utm_term=4321&utm_content=9876&utm_campaign=5436').untracked_url).to eq('http://example.com/foo?not_utm_thing=bar')
|
46
|
+
expect(MetaInspector::URL.new('http://example.com/foo?utm_source=1234&utm_medium=5678&utm_term=4321&utm_content=9876&utm_campaign=5436').untracked_url).to eq('http://example.com/foo')
|
47
|
+
expect(MetaInspector::URL.new('http://example.com/foo').untracked_url).to eq('http://example.com/foo')
|
46
48
|
end
|
47
49
|
|
48
50
|
it "should remove tracking parameters from url" do
|
@@ -62,6 +64,18 @@ describe MetaInspector::URL do
|
|
62
64
|
end
|
63
65
|
end
|
64
66
|
|
67
|
+
it "should remove all query values when untrack url" do
|
68
|
+
url = MetaInspector::URL.new('http://example.com/foo?utm_campaign=1234')
|
69
|
+
url.untrack!
|
70
|
+
expect(url.url).to eq('http://example.com/foo')
|
71
|
+
end
|
72
|
+
|
73
|
+
it "should untrack untracked url" do
|
74
|
+
url = MetaInspector::URL.new('http://example.com/foo')
|
75
|
+
url.untrack!
|
76
|
+
expect(url.url).to eq('http://example.com/foo')
|
77
|
+
end
|
78
|
+
|
65
79
|
it "should say if the url is tracked" do
|
66
80
|
expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&utm_source=1234').tracked?).to be true
|
67
81
|
expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&utm_medium=1234').tracked?).to be true
|
@@ -75,6 +89,8 @@ describe MetaInspector::URL do
|
|
75
89
|
expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&not_utm_term=1234').tracked?).to be false
|
76
90
|
expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&not_utm_content=1234').tracked?).to be false
|
77
91
|
expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&not_utm_campaign=1234').tracked?).to be false
|
92
|
+
|
93
|
+
expect(MetaInspector::URL.new('http://example.com/foo').tracked?).to be false
|
78
94
|
end
|
79
95
|
|
80
96
|
describe "url=" do
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 4.6.1
|
4
|
+
version: 4.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jaime Iniesta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-10-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: 0.0.6
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: faraday-http-cache
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: 1.2.2
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 1.2.2
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: addressable
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -339,7 +353,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
339
353
|
version: '0'
|
340
354
|
requirements: []
|
341
355
|
rubyforge_project:
|
342
|
-
rubygems_version: 2.4.
|
356
|
+
rubygems_version: 2.4.8
|
343
357
|
signing_key:
|
344
358
|
specification_version: 4
|
345
359
|
summary: MetaInspector is a ruby gem for web scraping purposes, that returns metadata
|