metainspector 5.14.0 → 5.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 71467ae16d368978ce8ee13d605c95a12be2fc207a7d7c28e5243658f70a0837
4
- data.tar.gz: f7774783e9c4e7282ce5c10683fdcd521ee483f0becb9c8bdce45d276920af30
3
+ metadata.gz: 8311486a8156f619d20a7cc93283e57ae055dd9fdcb222e14d3900adfae6d6a2
4
+ data.tar.gz: 9f1288adf02bc224d5d5e6915a0c24aaab342cabe3d6a85e07989f553966324a
5
5
  SHA512:
6
- metadata.gz: b623421b6f265833566eba954bb1d3bae0a00032720950a0a97316fd2e493ec98b5de46b4a48c9c0a7b28158101e8aecd4991e0043ca2601e75d387df97a769e
7
- data.tar.gz: 84cb6be32451c083adfa5ccb01e1746485a67e55fa9a1d56b1b2d496fec6154bfbf60f947c72a8c9ec8ea48e616641dcc30dc3b3579d1ef953475cc85354e8da
6
+ metadata.gz: 0ef843e07a8af813a4ed27f18b53eab03f5bfeaa60af5014fb3474346f3d02081891b94828ea034ffd7c5734f40c4fcb0e4028a2ccfe682152408d8f69820aec
7
+ data.tar.gz: c8da85989e7c11ad7bccff3d9f53c94daf75962492c4fa0d54550db2a02bee7928747b36354c89d47f0875c459310ea404bea9154f7e3cbd414b489fb45110e3
data/.circleci/config.yml CHANGED
@@ -2,36 +2,27 @@ version: 2.1
2
2
  orbs:
3
3
  ruby: circleci/ruby@1.0.4
4
4
  jobs:
5
- test_2_6:
6
- docker:
7
- - image: cimg/ruby:2.6.10
8
- steps:
9
- - checkout
10
- - ruby/install-deps
11
- - run:
12
- name: Run tests
13
- command: bundle exec rake
14
- test_2_7:
5
+ test_3_1:
15
6
  docker:
16
- - image: cimg/ruby:2.7.6
7
+ - image: cimg/ruby:3.1.7
17
8
  steps:
18
9
  - checkout
19
10
  - ruby/install-deps
20
11
  - run:
21
12
  name: Run tests
22
13
  command: bundle exec rake
23
- test_3_0:
14
+ test_3_2:
24
15
  docker:
25
- - image: cimg/ruby:3.0.4
16
+ - image: cimg/ruby:3.2.6
26
17
  steps:
27
18
  - checkout
28
19
  - ruby/install-deps
29
20
  - run:
30
21
  name: Run tests
31
22
  command: bundle exec rake
32
- test_3_1:
23
+ test_3_3:
33
24
  docker:
34
- - image: cimg/ruby:3.1.2
25
+ - image: cimg/ruby:3.3.8
35
26
  steps:
36
27
  - checkout
37
28
  - ruby/install-deps
@@ -42,7 +33,6 @@ workflows:
42
33
  version: 2
43
34
  deploy:
44
35
  jobs:
45
- - test_2_6
46
- - test_2_7
47
- - test_3_0
48
- - test_3_1
36
+ - test_3_1
37
+ - test_3_2
38
+ - test_3_3
data/CHANGELOG.md CHANGED
@@ -1,15 +1,28 @@
1
- # MetaInpector Changelog
1
+ # MetaInspector Changelog
2
2
 
3
- ## [Changes in 5.13.0](https://github.com/metainspector/metainspector/compare/v5.12.1...v5.13.0)
3
+ ## [Changes in 5.16.0](https://github.com/jaimeiniesta/metainspector/compare/v5.15.0...v5.16.0)
4
+
5
+ * Upgraded dependencies and supported Ruby versions.
6
+
7
+ ## [Changes in 5.15.0](https://github.com/jaimeiniesta/metainspector/compare/v5.14.0...v5.15.0)
8
+
9
+ * Added mechanism to use all available options in the `FollowRedirects` Faraday middleware,
10
+ https://github.com/jaimeiniesta/metainspector/pull/355 thanks to @bruno-b-martins and @miguelrod
11
+
12
+ ## [Changes in 5.14.0](https://github.com/jaimeiniesta/metainspector/compare/v5.13.0...v5.14.0)
13
+
14
+ * Several dependency updates, including Addressable 2.8.1, which fixes an invalid_byte_sequence exception.
15
+
16
+ ## [Changes in 5.13.0](https://github.com/jaimeiniesta/metainspector/compare/v5.12.1...v5.13.0)
4
17
 
5
18
  * Remove support for #feed that was deprecated in 5.9
6
19
  * Add support for Ruby 3.1
7
20
 
8
- ## [Changes in 5.12.1](https://github.com/metainspector/metainspector/compare/v5.12.0...v5.12.1)
21
+ ## [Changes in 5.12.1](https://github.com/jaimeiniesta/metainspector/compare/v5.12.0...v5.12.1)
9
22
 
10
23
  * Update dependencies: rubocop, nokogiri
11
24
 
12
- ## [Changes in 5.12.0](https://github.com/metainspector/metainspector/compare/v5.11.2...v5.12.0)
25
+ ## [Changes in 5.12.0](https://github.com/jaimeiniesta/metainspector/compare/v5.11.2...v5.12.0)
13
26
 
14
27
  * Support Ruby 3.0
15
28
 
data/Gemfile.lock CHANGED
@@ -1,23 +1,23 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- metainspector (5.14.0)
5
- addressable (~> 2.8)
4
+ metainspector (5.16.0)
5
+ addressable (~> 2.8.4)
6
6
  faraday (~> 2.5)
7
7
  faraday-cookie_jar (~> 0.0)
8
8
  faraday-encoding (~> 0.0)
9
9
  faraday-follow_redirects (~> 0.3)
10
- faraday-gzip (>= 0.1, < 2.0)
11
- faraday-http-cache (~> 2.4)
10
+ faraday-gzip (>= 0.1, < 3.0)
11
+ faraday-http-cache (~> 2.5)
12
12
  faraday-retry (~> 2.0)
13
13
  fastimage (~> 2.2)
14
14
  nesty (~> 1.0)
15
- nokogiri (~> 1.13)
15
+ nokogiri (~> 1.18.8)
16
16
 
17
17
  GEM
18
18
  remote: http://rubygems.org/
19
19
  specs:
20
- addressable (2.8.1)
20
+ addressable (2.8.5)
21
21
  public_suffix (>= 2.0.2, < 6.0)
22
22
  ast (2.4.2)
23
23
  awesome_print (1.9.2)
@@ -25,49 +25,64 @@ GEM
25
25
  crack (0.4.5)
26
26
  rexml
27
27
  diff-lcs (1.5.0)
28
- domain_name (0.5.20190701)
29
- unf (>= 0.0.5, < 1.0.0)
30
- faraday (2.7.4)
31
- faraday-net_http (>= 2.0, < 3.1)
32
- ruby2_keywords (>= 0.0.4)
28
+ domain_name (0.6.20240107)
29
+ faraday (2.13.1)
30
+ faraday-net_http (>= 2.0, < 3.5)
31
+ json
32
+ logger
33
33
  faraday-cookie_jar (0.0.7)
34
34
  faraday (>= 0.8.0)
35
35
  http-cookie (~> 1.0.0)
36
- faraday-encoding (0.0.5)
36
+ faraday-encoding (0.0.6)
37
37
  faraday
38
38
  faraday-follow_redirects (0.3.0)
39
39
  faraday (>= 1, < 3)
40
- faraday-gzip (1.0.0)
40
+ faraday-gzip (2.0.1)
41
41
  faraday (>= 1.0)
42
- zlib (~> 2.1)
43
- faraday-http-cache (2.4.1)
42
+ zlib (~> 3.0)
43
+ faraday-http-cache (2.5.1)
44
44
  faraday (>= 0.8)
45
- faraday-net_http (3.0.2)
46
- faraday-retry (2.0.0)
45
+ faraday-net_http (3.4.0)
46
+ net-http (>= 0.5.0)
47
+ faraday-retry (2.3.1)
47
48
  faraday (~> 2.0)
48
- fastimage (2.2.6)
49
+ fastimage (2.4.0)
49
50
  hashdiff (1.0.1)
50
- http-cookie (1.0.5)
51
+ http-cookie (1.0.8)
51
52
  domain_name (~> 0.5)
52
- json (2.6.3)
53
+ json (2.7.1)
54
+ language_server-protocol (3.17.0.3)
55
+ logger (1.7.0)
53
56
  method_source (1.0.0)
54
- mini_portile2 (2.8.1)
57
+ mustermann (3.0.0)
58
+ ruby2_keywords (~> 0.0.1)
55
59
  nesty (1.0.2)
56
- nokogiri (1.14.2)
57
- mini_portile2 (~> 2.8.0)
60
+ net-http (0.6.0)
61
+ uri
62
+ nio4r (2.5.9)
63
+ nokogiri (1.18.8-arm64-darwin)
58
64
  racc (~> 1.4)
59
- parallel (1.22.1)
60
- parser (3.2.1.0)
65
+ nokogiri (1.18.8-x86_64-linux-gnu)
66
+ racc (~> 1.4)
67
+ parallel (1.24.0)
68
+ parser (3.3.0.5)
61
69
  ast (~> 2.4.1)
70
+ racc
62
71
  pry (0.14.2)
63
72
  coderay (~> 1.1)
64
73
  method_source (~> 1.0)
65
- public_suffix (5.0.1)
66
- racc (1.6.2)
74
+ public_suffix (5.0.3)
75
+ puma (6.4.0)
76
+ nio4r (~> 2.0)
77
+ racc (1.8.1)
78
+ rack (2.2.14)
79
+ rack-protection (3.0.6)
80
+ rack
67
81
  rainbow (3.1.1)
68
- rake (13.0.6)
69
- regexp_parser (2.7.0)
70
- rexml (3.2.5)
82
+ rake (13.1.0)
83
+ regexp_parser (2.9.0)
84
+ resolv (0.2.2)
85
+ rexml (3.2.6)
71
86
  rspec (3.12.0)
72
87
  rspec-core (~> 3.12.0)
73
88
  rspec-expectations (~> 3.12.0)
@@ -81,40 +96,50 @@ GEM
81
96
  diff-lcs (>= 1.2.0, < 2.0)
82
97
  rspec-support (~> 3.12.0)
83
98
  rspec-support (3.12.0)
84
- rubocop (1.46.0)
99
+ rubocop (1.62.0)
85
100
  json (~> 2.3)
101
+ language_server-protocol (>= 3.17.0)
86
102
  parallel (~> 1.10)
87
- parser (>= 3.2.0.0)
103
+ parser (>= 3.3.0.2)
88
104
  rainbow (>= 2.2.2, < 4.0)
89
105
  regexp_parser (>= 1.8, < 3.0)
90
106
  rexml (>= 3.2.5, < 4.0)
91
- rubocop-ast (>= 1.26.0, < 2.0)
107
+ rubocop-ast (>= 1.31.1, < 2.0)
92
108
  ruby-progressbar (~> 1.7)
93
109
  unicode-display_width (>= 2.4.0, < 3.0)
94
- rubocop-ast (1.26.0)
95
- parser (>= 3.2.1.0)
96
- ruby-progressbar (1.11.0)
110
+ rubocop-ast (1.31.1)
111
+ parser (>= 3.3.0.4)
112
+ ruby-progressbar (1.13.0)
97
113
  ruby2_keywords (0.0.5)
98
- unf (0.1.4)
99
- unf_ext
100
- unf_ext (0.0.8.2)
101
- unicode-display_width (2.4.2)
114
+ sinatra (3.0.6)
115
+ mustermann (~> 3.0)
116
+ rack (~> 2.2, >= 2.2.4)
117
+ rack-protection (= 3.0.6)
118
+ tilt (~> 2.0)
119
+ tilt (2.1.0)
120
+ unicode-display_width (2.5.0)
121
+ uri (1.0.3)
102
122
  webmock (3.18.1)
103
123
  addressable (>= 2.8.0)
104
124
  crack (>= 0.3.2)
105
125
  hashdiff (>= 0.4.0, < 2.0.0)
106
- zlib (2.1.1)
126
+ zlib (3.2.1)
107
127
 
108
128
  PLATFORMS
109
- ruby
129
+ arm64-darwin-22
130
+ arm64-darwin-24
131
+ x86_64-linux
110
132
 
111
133
  DEPENDENCIES
112
134
  awesome_print (~> 1.9)
113
135
  metainspector!
114
136
  pry (~> 0.14)
137
+ puma (~> 6.4.0)
115
138
  rake (~> 13.0)
139
+ resolv (~> 0.2.2)
116
140
  rspec (~> 3.11)
117
141
  rubocop (~> 1.34)
142
+ sinatra (~> 3.0.6)
118
143
  webmock (~> 3.17)
119
144
 
120
145
  BUNDLED WITH
data/README.md CHANGED
@@ -1,14 +1,10 @@
1
1
  # MetaInspector
2
- [![Gem Version](https://badge.fury.io/rb/metainspector.svg)](http://badge.fury.io/rb/metainspector) [![CircleCI](https://circleci.com/gh/metainspector/metainspector.svg?style=svg)](https://circleci.com/gh/metainspector/metainspector) [![Code Climate](https://codeclimate.com/github/jaimeiniesta/metainspector/badges/gpa.svg)](https://codeclimate.com/github/jaimeiniesta/metainspector) [![Mentioned in Awesome Ruby](https://awesome.re/mentioned-badge.svg)](https://github.com/markets/awesome-ruby)
2
+ [![Gem Version](https://badge.fury.io/rb/metainspector.svg)](http://badge.fury.io/rb/metainspector) [![CircleCI](https://circleci.com/gh/jaimeiniesta/metainspector.svg?style=svg)](https://circleci.com/gh/jaimeiniesta/metainspector) [![Code Climate](https://codeclimate.com/github/jaimeiniesta/metainspector/badges/gpa.svg)](https://codeclimate.com/github/jaimeiniesta/metainspector) [![Mentioned in Awesome Ruby](https://awesome.re/mentioned-badge.svg)](https://github.com/markets/awesome-ruby)
3
3
 
4
4
  MetaInspector is a gem for web scraping purposes.
5
5
 
6
6
  You give it a URL, and it lets you easily get its title, links, images, charset, description, keywords, meta tags...
7
7
 
8
- ## See it in action!
9
-
10
- You can try MetaInspector using this little demo: [https://github.com/metainspector/metainspectordemo](https://github.com/metainspector/metainspectordemo)
11
-
12
8
  ## Installation
13
9
 
14
10
  Install the gem from RubyGems:
@@ -23,7 +19,7 @@ If you're using it on a Rails application, just add it to your Gemfile and run `
23
19
  gem 'metainspector'
24
20
  ```
25
21
 
26
- Supported Ruby versions are defined in [`.travis.yml`](.travis.yml).
22
+ Supported Ruby versions are defined in [`.circleci/config.yml`](.circleci/config.yml).
27
23
 
28
24
  ## Usage
29
25
 
@@ -309,6 +305,27 @@ If you want to disallow redirects, you can do it like this:
309
305
  page = MetaInspector.new('facebook.com', :allow_redirections => false)
310
306
  ```
311
307
 
308
+ You can also customize how many redirects you wish to allow:
309
+
310
+ ```ruby
311
+ page = MetaInspector.new('facebook.com', :faraday_options => { redirect: { limit: 5 } })
312
+ ```
313
+
314
+ And even customize what to do in between each redirect:
315
+
316
+ ```ruby
317
+ callback = proc do |previous_response, next_request|
318
+   ip_address = Resolv.getaddress(next_request.url.host)
319
+ raise 'Invalid address' if IPAddr.new(ip_address).private?
320
+ end
321
+
322
+ page = MetaInspector.new(url, faraday_options: { redirect: { callback: callback } })
323
+ ```
324
+
325
+
326
+ The `faraday_options[:redirect]` hash is passed to the `FollowRedirects` middleware used by `Faraday`, so that we can use all available options.
327
+ Check them [here](https://github.com/lostisland/faraday_middleware/blob/main/lib/faraday_middleware/response/follow_redirects.rb#L44).
328
+
312
329
  ### Headers
313
330
 
314
331
  By default, the following headers are set:
@@ -0,0 +1,42 @@
1
+ # A MetaInspector example that runs a callback in between redirects.
2
+ # The callback raises an exception if the redirection points to a URL that resolves into a private IP address.
3
+ # Following such a redirection is one way of triggering a known security exploit called server-side request forgery (SSRF), so the callback blocks it.
4
+ #
5
+ # To properly run this example you need a server which redirects to a service like nip.io.
6
+ # The easiest way to achieve that is running the examples/redirect_web_server.rb server in one terminal window,
7
+ # and calling its address with this example in another terminal window.
8
+ #
9
+ # Usage example (run both commands from the examples/ directory):
10
+ # In terminal #1:
11
+ # ruby redirect_web_server.rb
12
+ #
13
+ # In terminal #2:
14
+ # ruby faraday_redirect_options.rb http://127.0.0.1:4567
15
+
16
+ require 'resolv'
17
+ require '../lib/metainspector'
18
+ puts "Using MetaInspector #{MetaInspector::VERSION}"
19
+
20
+ # Get the starting URL
21
+ url = ARGV[0] || (puts "Enter an url"; gets.strip)
22
+
23
+ # redirect options to be passed along to Faraday::FollowRedirects::Middleware
24
+ redirects_opts = {
25
+ limit: 5,
26
+ callback: proc do |_old_response, new_response|
27
+ ip_address = Resolv.getaddress(new_response.url.host)
28
+ raise 'Invalid address' if IPAddr.new(ip_address).private?
29
+ end
30
+ }
31
+
32
+ begin
33
+ page = MetaInspector.new(url, faraday_options: { redirect: redirects_opts })
34
+ rescue StandardError => e
35
+ puts e.message
36
+ else
37
+ puts "\nScraping #{page.url} returned these results:"
38
+ puts "\nTITLE: #{page.title}"
39
+
40
+ puts "\nto_hash..."
41
+ puts page.to_hash
42
+ end
@@ -0,0 +1,5 @@
1
+ require 'sinatra'
2
+
3
+ get '/' do
4
+ redirect 'http://10.0.0.0.nip.io/'
5
+ end
@@ -58,6 +58,7 @@ module MetaInspector
58
58
  def fetch
59
59
  Timeout::timeout(fatal_timeout) do
60
60
  @faraday_options.merge!(:url => url)
61
+ follow_redirects_options = @faraday_options.delete(:redirect) || {}
61
62
 
62
63
  session = Faraday.new(@faraday_options) do |faraday|
63
64
  faraday.request :retry, max: @retries
@@ -65,7 +66,8 @@ module MetaInspector
65
66
  faraday.request :gzip
66
67
 
67
68
  if @allow_redirections
68
- faraday.use Faraday::FollowRedirects::Middleware, limit: 10
69
+ follow_redirects_options[:limit] ||= 10
70
+ faraday.use Faraday::FollowRedirects::Middleware, **follow_redirects_options
69
71
  faraday.use :cookie_jar
70
72
  end
71
73
 
@@ -84,7 +86,9 @@ module MetaInspector
84
86
  req.options.open_timeout = @read_timeout
85
87
  end
86
88
 
87
- @url.url = response.env.url.to_s
89
+ if @allow_redirections
90
+ @url.url = response.env.url.to_s
91
+ end
88
92
 
89
93
  response
90
94
  end
@@ -1,3 +1,3 @@
1
1
  module MetaInspector
2
- VERSION = '5.14.0'
2
+ VERSION = '5.16.0'
3
3
  end
@@ -5,7 +5,7 @@ Gem::Specification.new do |gem|
5
5
  gem.email = "jaimeiniesta@gmail.com"
6
6
  gem.description = %q{MetaInspector lets you scrape a web page and get its links, images, texts, meta tags...}
7
7
  gem.summary = %q{MetaInspector is a ruby gem for web scraping purposes, that returns metadata from a given URL}
8
- gem.homepage = "https://github.com/metainspector/metainspector"
8
+ gem.homepage = "https://github.com/jaimeiniesta/metainspector"
9
9
  gem.license = "MIT"
10
10
 
11
11
  gem.files = `git ls-files`.split("\n")
@@ -14,15 +14,15 @@ Gem::Specification.new do |gem|
14
14
  gem.require_paths = ["lib"]
15
15
  gem.version = MetaInspector::VERSION
16
16
 
17
- gem.add_dependency 'nokogiri', '~> 1.13'
17
+ gem.add_dependency 'nokogiri', '~> 1.18.8'
18
18
  gem.add_dependency 'faraday', '~> 2.5'
19
19
  gem.add_dependency 'faraday-cookie_jar', '~> 0.0'
20
20
  gem.add_dependency 'faraday-encoding', '~> 0.0'
21
21
  gem.add_dependency 'faraday-follow_redirects', '~> 0.3'
22
- gem.add_dependency 'faraday-gzip', '>= 0.1', '< 2.0'
23
- gem.add_dependency 'faraday-http-cache', '~> 2.4'
22
+ gem.add_dependency 'faraday-gzip', '>= 0.1', '< 3.0'
23
+ gem.add_dependency 'faraday-http-cache', '~> 2.5'
24
24
  gem.add_dependency 'faraday-retry', '~> 2.0'
25
- gem.add_dependency 'addressable', '~> 2.8'
25
+ gem.add_dependency 'addressable', '~> 2.8.4'
26
26
  gem.add_dependency 'fastimage', '~> 2.2'
27
27
  gem.add_dependency 'nesty', '~> 1.0'
28
28
 
@@ -31,5 +31,8 @@ Gem::Specification.new do |gem|
31
31
  gem.add_development_dependency 'awesome_print', '~> 1.9'
32
32
  gem.add_development_dependency 'rake', '~> 13.0'
33
33
  gem.add_development_dependency 'pry', '~> 0.14'
34
+ gem.add_development_dependency 'puma', '~> 6.4.0'
34
35
  gem.add_development_dependency 'rubocop', '~> 1.34'
36
+ gem.add_development_dependency 'resolv', '~> 0.2.2'
37
+ gem.add_development_dependency 'sinatra', '~> 3.0.6'
35
38
  end
@@ -130,11 +130,11 @@ describe MetaInspector::Document do
130
130
 
131
131
  describe 'url normalization' do
132
132
  it 'should normalize by default' do
133
- expect(MetaInspector.new('http://example.com/%EF%BD%9E').url).to eq('http://example.com/~')
133
+ expect(MetaInspector.new('http://example.com?name=joe martins', allow_redirections: false).url).to eq('http://example.com/?name=joe%20martins')
134
134
  end
135
135
 
136
136
  it 'should not normalize if the normalize_url option is false' do
137
- expect(MetaInspector.new('http://example.com/%EF%BD%9E', normalize_url: false).url).to eq('http://example.com/%EF%BD%9E')
137
+ expect(MetaInspector.new('http://example.com?name=joe martins', normalize_url: false, allow_redirections: false).url).to eq('http://example.com?name=joe martins')
138
138
  end
139
139
  end
140
140
 
@@ -52,29 +52,35 @@ describe MetaInspector do
52
52
  it "should get correct absolute links, encoding the URLs as needed" do
53
53
  m = MetaInspector.new('http://international.com')
54
54
 
55
- expect(m.links.internal).to eq([ "http://international.com/espa%C3%B1a.asp",
56
- "http://international.com/roman%C3%A9e",
57
- "http://international.com/faqs#cami%C3%B3n",
58
- "http://international.com/search?q=cami%C3%B3n",
59
- "http://international.com/search?q=espa%C3%B1a#top",
60
- "http://international.com/index.php?q=espa%C3%B1a&url=aHR0zZQ==&cntnt01pageid=21"])
61
-
62
- expect(m.links.external).to eq([ "http://example.com/espa%C3%B1a.asp",
63
- "http://example.com/roman%C3%A9e",
64
- "http://example.com/faqs#cami%C3%B3n",
65
- "http://example.com/search?q=cami%C3%B3n",
66
- "http://example.com/search?q=espa%C3%B1a#top"])
55
+ expect(m.links.internal).to eq([
56
+ "http://international.com/espa%C3%83%C2%B1a.asp",
57
+ "http://international.com/roman%C3%83%C2%A9e",
58
+ "http://international.com/faqs#cami%C3%83%C2%B3n",
59
+ "http://international.com/search?q=cami%C3%83%C2%B3n",
60
+ "http://international.com/search?q=espa%C3%83%C2%B1a#top",
61
+ "http://international.com/index.php?q=espa%C3%83%C2%B1a&url=aHR0zZQ==&cntnt01pageid=21"
62
+ ])
63
+
64
+ expect(m.links.external).to eq([
65
+ "http://example.com/espa%C3%83%C2%B1a.asp",
66
+ "http://example.com/roman%C3%83%C2%A9e",
67
+ "http://example.com/faqs#cami%C3%83%C2%B3n",
68
+ "http://example.com/search?q=cami%C3%83%C2%B3n",
69
+ "http://example.com/search?q=espa%C3%83%C2%B1a#top"])
67
70
  end
68
71
 
69
72
  describe "internal links" do
70
73
  it "should get correct internal links, encoding the URLs as needed but respecting # and ?" do
71
74
  m = MetaInspector.new('http://international.com')
72
- expect(m.links.internal).to eq([ "http://international.com/espa%C3%B1a.asp",
73
- "http://international.com/roman%C3%A9e",
74
- "http://international.com/faqs#cami%C3%B3n",
75
- "http://international.com/search?q=cami%C3%B3n",
76
- "http://international.com/search?q=espa%C3%B1a#top",
77
- "http://international.com/index.php?q=espa%C3%B1a&url=aHR0zZQ==&cntnt01pageid=21"])
75
+
76
+ expect(m.links.internal).to eq([
77
+ "http://international.com/espa%C3%83%C2%B1a.asp",
78
+ "http://international.com/roman%C3%83%C2%A9e",
79
+ "http://international.com/faqs#cami%C3%83%C2%B3n",
80
+ "http://international.com/search?q=cami%C3%83%C2%B3n",
81
+ "http://international.com/search?q=espa%C3%83%C2%B1a#top",
82
+ "http://international.com/index.php?q=espa%C3%83%C2%B1a&url=aHR0zZQ==&cntnt01pageid=21"
83
+ ])
78
84
  end
79
85
 
80
86
  it "should not crash when processing malformed hrefs" do
@@ -86,11 +92,14 @@ describe MetaInspector do
86
92
  describe "external links" do
87
93
  it "should get correct external links, encoding the URLs as needed but respecting # and ?" do
88
94
  m = MetaInspector.new('http://international.com')
89
- expect(m.links.external).to eq([ "http://example.com/espa%C3%B1a.asp",
90
- "http://example.com/roman%C3%A9e",
91
- "http://example.com/faqs#cami%C3%B3n",
92
- "http://example.com/search?q=cami%C3%B3n",
93
- "http://example.com/search?q=espa%C3%B1a#top"])
95
+
96
+ expect(m.links.external).to eq([
97
+ "http://example.com/espa%C3%83%C2%B1a.asp",
98
+ "http://example.com/roman%C3%83%C2%A9e",
99
+ "http://example.com/faqs#cami%C3%83%C2%B3n",
100
+ "http://example.com/search?q=cami%C3%83%C2%B3n",
101
+ "http://example.com/search?q=espa%C3%83%C2%B1a#top"
102
+ ])
94
103
  end
95
104
 
96
105
  it "should not crash when processing malformed hrefs" do
@@ -108,7 +117,7 @@ describe MetaInspector do
108
117
 
109
118
  it "should handle links that have an invalid byte sequence" do
110
119
  m = MetaInspector.new('http://example.com/invalid_byte_seq')
111
- expect(m.links.all).to eq(["http://pagerankalert.posterous.com/", "http://element%B3wgarderoby.com/", "http://twitter.com/pagerankalert"])
120
+ expect(m.links.all).to eq(["http://pagerankalert.posterous.com/", "http://twitter.com/pagerankalert"])
112
121
  end
113
122
 
114
123
  end
@@ -1,4 +1,7 @@
1
1
  require 'spec_helper'
2
+ require 'resolv'
3
+
4
+ class PrivateIPAddressError < StandardError; end
2
5
 
3
6
  describe MetaInspector do
4
7
  describe "redirections" do
@@ -47,6 +50,25 @@ describe MetaInspector do
47
50
  expect(page.url).to eq("http://blogs.clarionledger.com/dechols/2014/03/24/digital-medicine/?nclick_check=1")
48
51
  end
49
52
  end
53
+
54
+ context "when there is a callback to be ran between redirects that blocks redirections to private IP addresses" do
55
+ it "raises an exception" do
56
+ stub_request(:get, "https://www.facebook.com/")
57
+ .to_return(:status => 302,
58
+ :headers => { "Location" => "http://10.0.0.0/" })
59
+
60
+ redirect_options = {
61
+ callback: proc do |_previous_response, next_request|
62
+ ip_address = Resolv.getaddress(next_request.url.host)
63
+ raise PrivateIPAddressError if IPAddr.new(ip_address).private?
64
+ end
65
+ }
66
+
67
+ expect {
68
+ MetaInspector.new("https://www.facebook.com/", faraday_options: { redirect: redirect_options })
69
+ }.to raise_error PrivateIPAddressError
70
+ end
71
+ end
50
72
  end
51
73
 
52
74
  private
data/spec/spec_helper.rb CHANGED
@@ -50,7 +50,8 @@ RSpec.configure do |config|
50
50
  stub_request(:get, "http://example.com/author_in_body").to_return(fixture_file("author_in_body.response"))
51
51
  stub_request(:get, "http://example.com/author_in_link").to_return(fixture_file("author_in_link.response"))
52
52
  stub_request(:get, "http://example.com/author_in_twitter").to_return(fixture_file("author_in_twitter.response"))
53
- stub_request(:get, "http://example.com/~").to_return(fixture_file("example.response"))
53
+ stub_request(:get, "http://example.com/?name=joe martins").to_return(fixture_file("example.response"))
54
+ stub_request(:get, "http://example.com/?name=joe+martins").to_return(fixture_file("example.response"))
54
55
  stub_request(:get, "http://facebook.com/").to_return(fixture_file("facebook.com.response"))
55
56
  stub_request(:get, "http://international.com").to_return(fixture_file("international.response"))
56
57
  stub_request(:get, "http://pagerankalert-shortcut-and-icon.com").to_return(fixture_file("pagerankalert-shortcut-and-icon.com.response"))
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- version: 5.14.0
4
+ version: 5.16.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jaime Iniesta
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-02-27 00:00:00.000000000 Z
11
+ date: 2025-05-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.13'
19
+ version: 1.18.8
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '1.13'
26
+ version: 1.18.8
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: faraday
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -89,7 +89,7 @@ dependencies:
89
89
  version: '0.1'
90
90
  - - "<"
91
91
  - !ruby/object:Gem::Version
92
- version: '2.0'
92
+ version: '3.0'
93
93
  type: :runtime
94
94
  prerelease: false
95
95
  version_requirements: !ruby/object:Gem::Requirement
@@ -99,21 +99,21 @@ dependencies:
99
99
  version: '0.1'
100
100
  - - "<"
101
101
  - !ruby/object:Gem::Version
102
- version: '2.0'
102
+ version: '3.0'
103
103
  - !ruby/object:Gem::Dependency
104
104
  name: faraday-http-cache
105
105
  requirement: !ruby/object:Gem::Requirement
106
106
  requirements:
107
107
  - - "~>"
108
108
  - !ruby/object:Gem::Version
109
- version: '2.4'
109
+ version: '2.5'
110
110
  type: :runtime
111
111
  prerelease: false
112
112
  version_requirements: !ruby/object:Gem::Requirement
113
113
  requirements:
114
114
  - - "~>"
115
115
  - !ruby/object:Gem::Version
116
- version: '2.4'
116
+ version: '2.5'
117
117
  - !ruby/object:Gem::Dependency
118
118
  name: faraday-retry
119
119
  requirement: !ruby/object:Gem::Requirement
@@ -134,14 +134,14 @@ dependencies:
134
134
  requirements:
135
135
  - - "~>"
136
136
  - !ruby/object:Gem::Version
137
- version: '2.8'
137
+ version: 2.8.4
138
138
  type: :runtime
139
139
  prerelease: false
140
140
  version_requirements: !ruby/object:Gem::Requirement
141
141
  requirements:
142
142
  - - "~>"
143
143
  - !ruby/object:Gem::Version
144
- version: '2.8'
144
+ version: 2.8.4
145
145
  - !ruby/object:Gem::Dependency
146
146
  name: fastimage
147
147
  requirement: !ruby/object:Gem::Requirement
@@ -240,6 +240,20 @@ dependencies:
240
240
  - - "~>"
241
241
  - !ruby/object:Gem::Version
242
242
  version: '0.14'
243
+ - !ruby/object:Gem::Dependency
244
+ name: puma
245
+ requirement: !ruby/object:Gem::Requirement
246
+ requirements:
247
+ - - "~>"
248
+ - !ruby/object:Gem::Version
249
+ version: 6.4.0
250
+ type: :development
251
+ prerelease: false
252
+ version_requirements: !ruby/object:Gem::Requirement
253
+ requirements:
254
+ - - "~>"
255
+ - !ruby/object:Gem::Version
256
+ version: 6.4.0
243
257
  - !ruby/object:Gem::Dependency
244
258
  name: rubocop
245
259
  requirement: !ruby/object:Gem::Requirement
@@ -254,6 +268,34 @@ dependencies:
254
268
  - - "~>"
255
269
  - !ruby/object:Gem::Version
256
270
  version: '1.34'
271
+ - !ruby/object:Gem::Dependency
272
+ name: resolv
273
+ requirement: !ruby/object:Gem::Requirement
274
+ requirements:
275
+ - - "~>"
276
+ - !ruby/object:Gem::Version
277
+ version: 0.2.2
278
+ type: :development
279
+ prerelease: false
280
+ version_requirements: !ruby/object:Gem::Requirement
281
+ requirements:
282
+ - - "~>"
283
+ - !ruby/object:Gem::Version
284
+ version: 0.2.2
285
+ - !ruby/object:Gem::Dependency
286
+ name: sinatra
287
+ requirement: !ruby/object:Gem::Requirement
288
+ requirements:
289
+ - - "~>"
290
+ - !ruby/object:Gem::Version
291
+ version: 3.0.6
292
+ type: :development
293
+ prerelease: false
294
+ version_requirements: !ruby/object:Gem::Requirement
295
+ requirements:
296
+ - - "~>"
297
+ - !ruby/object:Gem::Version
298
+ version: 3.0.6
257
299
  description: MetaInspector lets you scrape a web page and get its links, images, texts,
258
300
  meta tags...
259
301
  email: jaimeiniesta@gmail.com
@@ -275,7 +317,9 @@ files:
275
317
  - Rakefile
276
318
  - bin/console
277
319
  - examples/basic_scraping.rb
320
+ - examples/faraday_redirect_options.rb
278
321
  - examples/link_checker.rb
322
+ - examples/redirect_web_server.rb
279
323
  - examples/spider.rb
280
324
  - lib/meta_inspector.rb
281
325
  - lib/meta_inspector/document.rb
@@ -369,7 +413,7 @@ files:
369
413
  - spec/request_spec.rb
370
414
  - spec/spec_helper.rb
371
415
  - spec/url_spec.rb
372
- homepage: https://github.com/metainspector/metainspector
416
+ homepage: https://github.com/jaimeiniesta/metainspector
373
417
  licenses:
374
418
  - MIT
375
419
  metadata: {}
@@ -388,7 +432,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
388
432
  - !ruby/object:Gem::Version
389
433
  version: '0'
390
434
  requirements: []
391
- rubygems_version: 3.3.7
435
+ rubygems_version: 3.5.22
392
436
  signing_key:
393
437
  specification_version: 4
394
438
  summary: MetaInspector is a ruby gem for web scraping purposes, that returns metadata