metainspector 4.6.1 → 4.7.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7cca73a64f172ba9906a1db7ba212780ffed0cdd
4
- data.tar.gz: 0b2feb3de6af1562ce3fed0bfa35d3002015ed40
3
+ metadata.gz: 329fa61a1fb5c278adb8c07c58536cc84e774f31
4
+ data.tar.gz: 8c40e805a960cc5591bee0317307dad27fa46719
5
5
  SHA512:
6
- metadata.gz: 5723c1dda401000aa4a54d517e4230a5f1559c9c099d0663c303896fa5ae00795eacd0dc686810dbd60ca4c64e278fbc8c0c5a1e1da6e51249ad450cd37aab46
7
- data.tar.gz: c3f48b0a18962b9777d4e0ec81b80cd63b9b5be619b91a0485ce8fcd6656e76f66867aa50e78027973c47239c02bf359923cf4ce5f4224034ad1eb185364ac0f
6
+ metadata.gz: 0535544d927c37e963b567404764ff7af352c00afbfb896c44604e85d6e844155575d26b24a2234f478015f6f4aff9b75724d933c6cac9c3cceda6180545aa99
7
+ data.tar.gz: 9f7fa08ed92dcf775df62d97ab3a9d989c04ad4ff9ffe0cbaf6f83baf3c5bd052072d61e2db6b66ebef2643ca5de9de79e6aa78575a87bf432a21a3cb676a093
data/README.md CHANGED
@@ -374,6 +374,15 @@ If you want to disable this, you can specify it like this:
374
374
  page = MetaInspector.new('http://example.com', download_images: false)
375
375
  ```
376
376
 
377
+ ### Caching responses
378
+
379
+ MetaInspector can be configured to use [Faraday::HttpCache](https://github.com/plataformatec/faraday-http-cache) to cache page responses. To enable this, pass the `faraday_http_cache` option with at least the `:store` key, for example:
380
+
381
+ ```ruby
382
+ cache = ActiveSupport::Cache.lookup_store(:file_store, '/tmp/cache')
383
+ page = MetaInspector.new('http://example.com', faraday_http_cache: { store: cache })
384
+ ```
385
+
377
386
  ## Exception Handling
378
387
 
379
388
  By default, MetaInspector will raise the exceptions found. We think that this is the safest default: in case the URL you're trying to scrape is unreachable, you should clearly be notified, and treat the exception as needed in your app.
@@ -33,6 +33,7 @@ module MetaInspector
33
33
  @exception_log = options[:exception_log] || MetaInspector::ExceptionLog.new(warn_level: warn_level)
34
34
  @normalize_url = options[:normalize_url]
35
35
  @faraday_options = options[:faraday_options]
36
+ @faraday_http_cache = options[:faraday_http_cache]
36
37
  @url = MetaInspector::URL.new(initial_url, exception_log: @exception_log,
37
38
  normalize: @normalize_url)
38
39
  @request = MetaInspector::Request.new(@url, allow_redirections: @allow_redirections,
@@ -41,7 +42,8 @@ module MetaInspector
41
42
  retries: @retries,
42
43
  exception_log: @exception_log,
43
44
  headers: @headers,
44
- faraday_options: @faraday_options) unless @document
45
+ faraday_options: @faraday_options,
46
+ faraday_http_cache: @faraday_http_cache) unless @document
45
47
  @parser = MetaInspector::Parser.new(self, exception_log: @exception_log,
46
48
  download_images: @download_images)
47
49
  end
@@ -1,6 +1,7 @@
1
1
  require 'faraday'
2
2
  require 'faraday_middleware'
3
3
  require 'faraday-cookie_jar'
4
+ require 'faraday-http-cache'
4
5
 
5
6
  module MetaInspector
6
7
 
@@ -18,6 +19,7 @@ module MetaInspector
18
19
  @exception_log = options[:exception_log]
19
20
  @headers = options[:headers]
20
21
  @faraday_options = options[:faraday_options] || {}
22
+ @faraday_http_cache = options[:faraday_http_cache]
21
23
 
22
24
  response # request early so we can fail early
23
25
  end
@@ -55,6 +57,11 @@ module MetaInspector
55
57
  faraday.use :cookie_jar
56
58
  end
57
59
 
60
+ if @faraday_http_cache.is_a?(Hash)
61
+ @faraday_http_cache[:serializer] ||= Marshal
62
+ faraday.use Faraday::HttpCache, @faraday_http_cache
63
+ end
64
+
58
65
  faraday.headers.merge!(@headers || {})
59
66
  faraday.adapter :net_http
60
67
  end
@@ -31,18 +31,21 @@ module MetaInspector
31
31
 
32
32
  def tracked?
33
33
  u = parsed(url)
34
+ return false unless u.query_values
34
35
  found_tracking_params = WELL_KNOWN_TRACKING_PARAMS & u.query_values.keys
35
36
  return found_tracking_params.any?
36
37
  end
37
38
 
38
39
  def untracked_url
39
40
  u = parsed(url)
40
- u.query_values = u.query_values.delete_if { |key, _| WELL_KNOWN_TRACKING_PARAMS.include? key }
41
+ return url unless u.query_values
42
+ query_values = u.query_values.delete_if { |key, _| WELL_KNOWN_TRACKING_PARAMS.include? key }
43
+ u.query_values = query_values.length > 0 ? query_values : nil
41
44
  u.to_s
42
45
  end
43
46
 
44
47
  def untrack!
45
- self.url = untracked_url
48
+ self.url = untracked_url if tracked?
46
49
  end
47
50
 
48
51
  def url=(new_url)
@@ -1,3 +1,3 @@
1
1
  module MetaInspector
2
- VERSION = '4.6.1'
2
+ VERSION = '4.7.0'
3
3
  end
@@ -18,6 +18,7 @@ Gem::Specification.new do |gem|
18
18
  gem.add_dependency 'faraday', '~> 0.9.0'
19
19
  gem.add_dependency 'faraday_middleware', '~> 0.10'
20
20
  gem.add_dependency 'faraday-cookie_jar', '~> 0.0.6'
21
+ gem.add_dependency 'faraday-http-cache', '~> 1.2.2'
21
22
  gem.add_dependency 'addressable', '~> 2.3.5'
22
23
  gem.add_dependency 'fastimage'
23
24
 
@@ -4,4 +4,15 @@ describe MetaInspector do
4
4
  it "returns a Document" do
5
5
  expect(MetaInspector.new('http://example.com').class).to eq(MetaInspector::Document)
6
6
  end
7
+
8
+ it "cache request" do
9
+ # Creates a memory cache (a Hash that responds to #read, #write and #delete)
10
+ cache = Hash.new
11
+ def cache.read(k) self[k]; end
12
+ def cache.write(k, v) self[k] = v; end
13
+
14
+ expect(MetaInspector.new('http://example.com', warn_level: :store, faraday_http_cache: { store: cache })).to be_ok
15
+
16
+ expect(cache.keys).not_to be_empty
17
+ end
7
18
  end
data/spec/url_spec.rb CHANGED
@@ -43,6 +43,8 @@ describe MetaInspector::URL do
43
43
  expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&utm_content=1234').untracked_url).to eq('http://example.com/foo?not_utm_thing=bar')
44
44
  expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&utm_campaign=1234').untracked_url).to eq('http://example.com/foo?not_utm_thing=bar')
45
45
  expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&utm_source=1234&utm_medium=5678&utm_term=4321&utm_content=9876&utm_campaign=5436').untracked_url).to eq('http://example.com/foo?not_utm_thing=bar')
46
+ expect(MetaInspector::URL.new('http://example.com/foo?utm_source=1234&utm_medium=5678&utm_term=4321&utm_content=9876&utm_campaign=5436').untracked_url).to eq('http://example.com/foo')
47
+ expect(MetaInspector::URL.new('http://example.com/foo').untracked_url).to eq('http://example.com/foo')
46
48
  end
47
49
 
48
50
  it "should remove tracking parameters from url" do
@@ -62,6 +64,18 @@ describe MetaInspector::URL do
62
64
  end
63
65
  end
64
66
 
67
+ it "should remove all query values when untrack url" do
68
+ url = MetaInspector::URL.new('http://example.com/foo?utm_campaign=1234')
69
+ url.untrack!
70
+ expect(url.url).to eq('http://example.com/foo')
71
+ end
72
+
73
+ it "should untrack untracked url" do
74
+ url = MetaInspector::URL.new('http://example.com/foo')
75
+ url.untrack!
76
+ expect(url.url).to eq('http://example.com/foo')
77
+ end
78
+
65
79
  it "should say if the url is tracked" do
66
80
  expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&utm_source=1234').tracked?).to be true
67
81
  expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&utm_medium=1234').tracked?).to be true
@@ -75,6 +89,8 @@ describe MetaInspector::URL do
75
89
  expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&not_utm_term=1234').tracked?).to be false
76
90
  expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&not_utm_content=1234').tracked?).to be false
77
91
  expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&not_utm_campaign=1234').tracked?).to be false
92
+
93
+ expect(MetaInspector::URL.new('http://example.com/foo').tracked?).to be false
78
94
  end
79
95
 
80
96
  describe "url=" do
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.6.1
4
+ version: 4.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jaime Iniesta
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-08-31 00:00:00.000000000 Z
11
+ date: 2015-10-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: 0.0.6
69
+ - !ruby/object:Gem::Dependency
70
+ name: faraday-http-cache
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: 1.2.2
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: 1.2.2
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: addressable
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -339,7 +353,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
339
353
  version: '0'
340
354
  requirements: []
341
355
  rubyforge_project:
342
- rubygems_version: 2.4.5
356
+ rubygems_version: 2.4.8
343
357
  signing_key:
344
358
  specification_version: 4
345
359
  summary: MetaInspector is a ruby gem for web scraping purposes, that returns metadata