metainspector 4.6.1 → 4.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7cca73a64f172ba9906a1db7ba212780ffed0cdd
4
- data.tar.gz: 0b2feb3de6af1562ce3fed0bfa35d3002015ed40
3
+ metadata.gz: 329fa61a1fb5c278adb8c07c58536cc84e774f31
4
+ data.tar.gz: 8c40e805a960cc5591bee0317307dad27fa46719
5
5
  SHA512:
6
- metadata.gz: 5723c1dda401000aa4a54d517e4230a5f1559c9c099d0663c303896fa5ae00795eacd0dc686810dbd60ca4c64e278fbc8c0c5a1e1da6e51249ad450cd37aab46
7
- data.tar.gz: c3f48b0a18962b9777d4e0ec81b80cd63b9b5be619b91a0485ce8fcd6656e76f66867aa50e78027973c47239c02bf359923cf4ce5f4224034ad1eb185364ac0f
6
+ metadata.gz: 0535544d927c37e963b567404764ff7af352c00afbfb896c44604e85d6e844155575d26b24a2234f478015f6f4aff9b75724d933c6cac9c3cceda6180545aa99
7
+ data.tar.gz: 9f7fa08ed92dcf775df62d97ab3a9d989c04ad4ff9ffe0cbaf6f83baf3c5bd052072d61e2db6b66ebef2643ca5de9de79e6aa78575a87bf432a21a3cb676a093
data/README.md CHANGED
@@ -374,6 +374,15 @@ If you want to disable this, you can specify it like this:
374
374
  page = MetaInspector.new('http://example.com', download_images: false)
375
375
  ```
376
376
 
377
+ ### Caching responses
378
+
379
+ MetaInspector can be configured to use [Faraday::HttpCache](https://github.com/plataformatec/faraday-http-cache) to cache page responses. To enable it, pass the `faraday_http_cache` option with at least the `:store` key, for example:
380
+
381
+ ```ruby
382
+ cache = ActiveSupport::Cache.lookup_store(:file_store, '/tmp/cache')
383
+ page = MetaInspector.new('http://example.com', faraday_http_cache: { store: cache })
384
+ ```
385
+
377
386
  ## Exception Handling
378
387
 
379
388
  By default, MetaInspector will raise the exceptions found. We think that this is the safest default: in case the URL you're trying to scrape is unreachable, you should clearly be notified, and treat the exception as needed in your app.
@@ -33,6 +33,7 @@ module MetaInspector
33
33
  @exception_log = options[:exception_log] || MetaInspector::ExceptionLog.new(warn_level: warn_level)
34
34
  @normalize_url = options[:normalize_url]
35
35
  @faraday_options = options[:faraday_options]
36
+ @faraday_http_cache = options[:faraday_http_cache]
36
37
  @url = MetaInspector::URL.new(initial_url, exception_log: @exception_log,
37
38
  normalize: @normalize_url)
38
39
  @request = MetaInspector::Request.new(@url, allow_redirections: @allow_redirections,
@@ -41,7 +42,8 @@ module MetaInspector
41
42
  retries: @retries,
42
43
  exception_log: @exception_log,
43
44
  headers: @headers,
44
- faraday_options: @faraday_options) unless @document
45
+ faraday_options: @faraday_options,
46
+ faraday_http_cache: @faraday_http_cache) unless @document
45
47
  @parser = MetaInspector::Parser.new(self, exception_log: @exception_log,
46
48
  download_images: @download_images)
47
49
  end
@@ -1,6 +1,7 @@
1
1
  require 'faraday'
2
2
  require 'faraday_middleware'
3
3
  require 'faraday-cookie_jar'
4
+ require 'faraday-http-cache'
4
5
 
5
6
  module MetaInspector
6
7
 
@@ -18,6 +19,7 @@ module MetaInspector
18
19
  @exception_log = options[:exception_log]
19
20
  @headers = options[:headers]
20
21
  @faraday_options = options[:faraday_options] || {}
22
+ @faraday_http_cache = options[:faraday_http_cache]
21
23
 
22
24
  response # request early so we can fail early
23
25
  end
@@ -55,6 +57,11 @@ module MetaInspector
55
57
  faraday.use :cookie_jar
56
58
  end
57
59
 
60
+ if @faraday_http_cache.is_a?(Hash)
61
+ @faraday_http_cache[:serializer] ||= Marshal
62
+ faraday.use Faraday::HttpCache, @faraday_http_cache
63
+ end
64
+
58
65
  faraday.headers.merge!(@headers || {})
59
66
  faraday.adapter :net_http
60
67
  end
@@ -31,18 +31,21 @@ module MetaInspector
31
31
 
32
32
  def tracked?
33
33
  u = parsed(url)
34
+ return false unless u.query_values
34
35
  found_tracking_params = WELL_KNOWN_TRACKING_PARAMS & u.query_values.keys
35
36
  return found_tracking_params.any?
36
37
  end
37
38
 
38
39
  def untracked_url
39
40
  u = parsed(url)
40
- u.query_values = u.query_values.delete_if { |key, _| WELL_KNOWN_TRACKING_PARAMS.include? key }
41
+ return url unless u.query_values
42
+ query_values = u.query_values.delete_if { |key, _| WELL_KNOWN_TRACKING_PARAMS.include? key }
43
+ u.query_values = query_values.length > 0 ? query_values : nil
41
44
  u.to_s
42
45
  end
43
46
 
44
47
  def untrack!
45
- self.url = untracked_url
48
+ self.url = untracked_url if tracked?
46
49
  end
47
50
 
48
51
  def url=(new_url)
@@ -1,3 +1,3 @@
1
1
  module MetaInspector
2
- VERSION = '4.6.1'
2
+ VERSION = '4.7.0'
3
3
  end
@@ -18,6 +18,7 @@ Gem::Specification.new do |gem|
18
18
  gem.add_dependency 'faraday', '~> 0.9.0'
19
19
  gem.add_dependency 'faraday_middleware', '~> 0.10'
20
20
  gem.add_dependency 'faraday-cookie_jar', '~> 0.0.6'
21
+ gem.add_dependency 'faraday-http-cache', '~> 1.2.2'
21
22
  gem.add_dependency 'addressable', '~> 2.3.5'
22
23
  gem.add_dependency 'fastimage'
23
24
 
@@ -4,4 +4,15 @@ describe MetaInspector do
4
4
  it "returns a Document" do
5
5
  expect(MetaInspector.new('http://example.com').class).to eq(MetaInspector::Document)
6
6
  end
7
+
8
+ it "cache request" do
9
+ # Creates a memory cache (a Hash that responds to #read, #write and #delete)
10
+ cache = Hash.new
11
+ def cache.read(k) self[k]; end
12
+ def cache.write(k, v) self[k] = v; end
13
+
14
+ expect(MetaInspector.new('http://example.com', warn_level: :store, faraday_http_cache: { store: cache })).to be_ok
15
+
16
+ expect(cache.keys).not_to be_empty
17
+ end
7
18
  end
data/spec/url_spec.rb CHANGED
@@ -43,6 +43,8 @@ describe MetaInspector::URL do
43
43
  expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&utm_content=1234').untracked_url).to eq('http://example.com/foo?not_utm_thing=bar')
44
44
  expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&utm_campaign=1234').untracked_url).to eq('http://example.com/foo?not_utm_thing=bar')
45
45
  expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&utm_source=1234&utm_medium=5678&utm_term=4321&utm_content=9876&utm_campaign=5436').untracked_url).to eq('http://example.com/foo?not_utm_thing=bar')
46
+ expect(MetaInspector::URL.new('http://example.com/foo?utm_source=1234&utm_medium=5678&utm_term=4321&utm_content=9876&utm_campaign=5436').untracked_url).to eq('http://example.com/foo')
47
+ expect(MetaInspector::URL.new('http://example.com/foo').untracked_url).to eq('http://example.com/foo')
46
48
  end
47
49
 
48
50
  it "should remove tracking parameters from url" do
@@ -62,6 +64,18 @@ describe MetaInspector::URL do
62
64
  end
63
65
  end
64
66
 
67
+ it "should remove all query values when untrack url" do
68
+ url = MetaInspector::URL.new('http://example.com/foo?utm_campaign=1234')
69
+ url.untrack!
70
+ expect(url.url).to eq('http://example.com/foo')
71
+ end
72
+
73
+ it "should untrack untracked url" do
74
+ url = MetaInspector::URL.new('http://example.com/foo')
75
+ url.untrack!
76
+ expect(url.url).to eq('http://example.com/foo')
77
+ end
78
+
65
79
  it "should say if the url is tracked" do
66
80
  expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&utm_source=1234').tracked?).to be true
67
81
  expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&utm_medium=1234').tracked?).to be true
@@ -75,6 +89,8 @@ describe MetaInspector::URL do
75
89
  expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&not_utm_term=1234').tracked?).to be false
76
90
  expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&not_utm_content=1234').tracked?).to be false
77
91
  expect(MetaInspector::URL.new('http://example.com/foo?not_utm_thing=bar&not_utm_campaign=1234').tracked?).to be false
92
+
93
+ expect(MetaInspector::URL.new('http://example.com/foo').tracked?).to be false
78
94
  end
79
95
 
80
96
  describe "url=" do
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.6.1
4
+ version: 4.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jaime Iniesta
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-08-31 00:00:00.000000000 Z
11
+ date: 2015-10-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: 0.0.6
69
+ - !ruby/object:Gem::Dependency
70
+ name: faraday-http-cache
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: 1.2.2
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: 1.2.2
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: addressable
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -339,7 +353,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
339
353
  version: '0'
340
354
  requirements: []
341
355
  rubyforge_project:
342
- rubygems_version: 2.4.5
356
+ rubygems_version: 2.4.8
343
357
  signing_key:
344
358
  specification_version: 4
345
359
  summary: MetaInspector is a ruby gem for web scraping purposes, that returns metadata