metainspector 5.14.0 → 5.16.0
This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.circleci/config.yml +9 -19
- data/CHANGELOG.md +17 -4
- data/Gemfile.lock +67 -42
- data/README.md +23 -6
- data/examples/faraday_redirect_options.rb +42 -0
- data/examples/redirect_web_server.rb +5 -0
- data/lib/meta_inspector/request.rb +6 -2
- data/lib/meta_inspector/version.rb +1 -1
- data/meta_inspector.gemspec +8 -5
- data/spec/document_spec.rb +2 -2
- data/spec/meta_inspector/links_spec.rb +33 -24
- data/spec/meta_inspector/redirections_spec.rb +22 -0
- data/spec/spec_helper.rb +2 -1
- metadata +56 -12
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8311486a8156f619d20a7cc93283e57ae055dd9fdcb222e14d3900adfae6d6a2
|
4
|
+
data.tar.gz: 9f1288adf02bc224d5d5e6915a0c24aaab342cabe3d6a85e07989f553966324a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0ef843e07a8af813a4ed27f18b53eab03f5bfeaa60af5014fb3474346f3d02081891b94828ea034ffd7c5734f40c4fcb0e4028a2ccfe682152408d8f69820aec
|
7
|
+
data.tar.gz: c8da85989e7c11ad7bccff3d9f53c94daf75962492c4fa0d54550db2a02bee7928747b36354c89d47f0875c459310ea404bea9154f7e3cbd414b489fb45110e3
|
data/.circleci/config.yml
CHANGED
@@ -2,36 +2,27 @@ version: 2.1
|
|
2
2
|
orbs:
|
3
3
|
ruby: circleci/ruby@1.0.4
|
4
4
|
jobs:
|
5
|
-
|
6
|
-
docker:
|
7
|
-
- image: cimg/ruby:2.6.10
|
8
|
-
steps:
|
9
|
-
- checkout
|
10
|
-
- ruby/install-deps
|
11
|
-
- run:
|
12
|
-
name: Run tests
|
13
|
-
command: bundle exec rake
|
14
|
-
test_2_7:
|
5
|
+
test_3_1:
|
15
6
|
docker:
|
16
|
-
- image: cimg/ruby:
|
7
|
+
- image: cimg/ruby:3.1.7
|
17
8
|
steps:
|
18
9
|
- checkout
|
19
10
|
- ruby/install-deps
|
20
11
|
- run:
|
21
12
|
name: Run tests
|
22
13
|
command: bundle exec rake
|
23
|
-
|
14
|
+
test_3_2:
|
24
15
|
docker:
|
25
|
-
- image: cimg/ruby:3.
|
16
|
+
- image: cimg/ruby:3.2.6
|
26
17
|
steps:
|
27
18
|
- checkout
|
28
19
|
- ruby/install-deps
|
29
20
|
- run:
|
30
21
|
name: Run tests
|
31
22
|
command: bundle exec rake
|
32
|
-
|
23
|
+
test_3_3:
|
33
24
|
docker:
|
34
|
-
- image: cimg/ruby:3.
|
25
|
+
- image: cimg/ruby:3.3.8
|
35
26
|
steps:
|
36
27
|
- checkout
|
37
28
|
- ruby/install-deps
|
@@ -42,7 +33,6 @@ workflows:
|
|
42
33
|
version: 2
|
43
34
|
deploy:
|
44
35
|
jobs:
|
45
|
-
-
|
46
|
-
-
|
47
|
-
-
|
48
|
-
- test_3_1
|
36
|
+
- test_3_1
|
37
|
+
- test_3_2
|
38
|
+
- test_3_3
|
data/CHANGELOG.md
CHANGED
@@ -1,15 +1,28 @@
|
|
1
|
-
#
|
1
|
+
# MetaInspector Changelog
|
2
2
|
|
3
|
-
## [Changes in 5.
|
3
|
+
## [Changes in 5.16.0](https://github.com/jaimeiniesta/metainspector/compare/v5.15.0...v5.16.0)
|
4
|
+
|
5
|
+
* Upgraded dependencies and supported Ruby versions.
|
6
|
+
|
7
|
+
## [Changes in 5.15.0](https://github.com/jaimeiniesta/metainspector/compare/v5.14.0...v5.15.0)
|
8
|
+
|
9
|
+
* Added mechanism to use all available options in the `FollowRedirects` Faraday middleware,
|
10
|
+
https://github.com/jaimeiniesta/metainspector/pull/355 thanks to @bruno-b-martins and @miguelrod
|
11
|
+
|
12
|
+
## [Changes in 5.14.0](https://github.com/jaimeiniesta/metainspector/compare/v5.13.0...v5.14.0)
|
13
|
+
|
14
|
+
* Several dependency updates, including Addressable 2.8.1 which fixes invalid_byte_sequence exception.
|
15
|
+
|
16
|
+
## [Changes in 5.13.0](https://github.com/jaimeiniesta/metainspector/compare/v5.12.1...v5.13.0)
|
4
17
|
|
5
18
|
* Remove support for #feed that was deprecated in 5.9
|
6
19
|
* Add support for Ruby 3.1
|
7
20
|
|
8
|
-
## [Changes in 5.12.1](https://github.com/
|
21
|
+
## [Changes in 5.12.1](https://github.com/jaimeiniesta/metainspector/compare/v5.12.0...v5.12.1)
|
9
22
|
|
10
23
|
* Update dependencies: rubocop, nokogiri
|
11
24
|
|
12
|
-
## [Changes in 5.12.0](https://github.com/
|
25
|
+
## [Changes in 5.12.0](https://github.com/jaimeiniesta/metainspector/compare/v5.11.2...v5.12.0)
|
13
26
|
|
14
27
|
* Support Ruby 3.0
|
15
28
|
|
data/Gemfile.lock
CHANGED
@@ -1,23 +1,23 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
metainspector (5.
|
5
|
-
addressable (~> 2.8)
|
4
|
+
metainspector (5.16.0)
|
5
|
+
addressable (~> 2.8.4)
|
6
6
|
faraday (~> 2.5)
|
7
7
|
faraday-cookie_jar (~> 0.0)
|
8
8
|
faraday-encoding (~> 0.0)
|
9
9
|
faraday-follow_redirects (~> 0.3)
|
10
|
-
faraday-gzip (>= 0.1, <
|
11
|
-
faraday-http-cache (~> 2.
|
10
|
+
faraday-gzip (>= 0.1, < 3.0)
|
11
|
+
faraday-http-cache (~> 2.5)
|
12
12
|
faraday-retry (~> 2.0)
|
13
13
|
fastimage (~> 2.2)
|
14
14
|
nesty (~> 1.0)
|
15
|
-
nokogiri (~> 1.
|
15
|
+
nokogiri (~> 1.18.8)
|
16
16
|
|
17
17
|
GEM
|
18
18
|
remote: http://rubygems.org/
|
19
19
|
specs:
|
20
|
-
addressable (2.8.
|
20
|
+
addressable (2.8.5)
|
21
21
|
public_suffix (>= 2.0.2, < 6.0)
|
22
22
|
ast (2.4.2)
|
23
23
|
awesome_print (1.9.2)
|
@@ -25,49 +25,64 @@ GEM
|
|
25
25
|
crack (0.4.5)
|
26
26
|
rexml
|
27
27
|
diff-lcs (1.5.0)
|
28
|
-
domain_name (0.
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
28
|
+
domain_name (0.6.20240107)
|
29
|
+
faraday (2.13.1)
|
30
|
+
faraday-net_http (>= 2.0, < 3.5)
|
31
|
+
json
|
32
|
+
logger
|
33
33
|
faraday-cookie_jar (0.0.7)
|
34
34
|
faraday (>= 0.8.0)
|
35
35
|
http-cookie (~> 1.0.0)
|
36
|
-
faraday-encoding (0.0.
|
36
|
+
faraday-encoding (0.0.6)
|
37
37
|
faraday
|
38
38
|
faraday-follow_redirects (0.3.0)
|
39
39
|
faraday (>= 1, < 3)
|
40
|
-
faraday-gzip (
|
40
|
+
faraday-gzip (2.0.1)
|
41
41
|
faraday (>= 1.0)
|
42
|
-
zlib (~>
|
43
|
-
faraday-http-cache (2.
|
42
|
+
zlib (~> 3.0)
|
43
|
+
faraday-http-cache (2.5.1)
|
44
44
|
faraday (>= 0.8)
|
45
|
-
faraday-net_http (3.0
|
46
|
-
|
45
|
+
faraday-net_http (3.4.0)
|
46
|
+
net-http (>= 0.5.0)
|
47
|
+
faraday-retry (2.3.1)
|
47
48
|
faraday (~> 2.0)
|
48
|
-
fastimage (2.
|
49
|
+
fastimage (2.4.0)
|
49
50
|
hashdiff (1.0.1)
|
50
|
-
http-cookie (1.0.
|
51
|
+
http-cookie (1.0.8)
|
51
52
|
domain_name (~> 0.5)
|
52
|
-
json (2.
|
53
|
+
json (2.7.1)
|
54
|
+
language_server-protocol (3.17.0.3)
|
55
|
+
logger (1.7.0)
|
53
56
|
method_source (1.0.0)
|
54
|
-
|
57
|
+
mustermann (3.0.0)
|
58
|
+
ruby2_keywords (~> 0.0.1)
|
55
59
|
nesty (1.0.2)
|
56
|
-
|
57
|
-
|
60
|
+
net-http (0.6.0)
|
61
|
+
uri
|
62
|
+
nio4r (2.5.9)
|
63
|
+
nokogiri (1.18.8-arm64-darwin)
|
58
64
|
racc (~> 1.4)
|
59
|
-
|
60
|
-
|
65
|
+
nokogiri (1.18.8-x86_64-linux-gnu)
|
66
|
+
racc (~> 1.4)
|
67
|
+
parallel (1.24.0)
|
68
|
+
parser (3.3.0.5)
|
61
69
|
ast (~> 2.4.1)
|
70
|
+
racc
|
62
71
|
pry (0.14.2)
|
63
72
|
coderay (~> 1.1)
|
64
73
|
method_source (~> 1.0)
|
65
|
-
public_suffix (5.0.
|
66
|
-
|
74
|
+
public_suffix (5.0.3)
|
75
|
+
puma (6.4.0)
|
76
|
+
nio4r (~> 2.0)
|
77
|
+
racc (1.8.1)
|
78
|
+
rack (2.2.14)
|
79
|
+
rack-protection (3.0.6)
|
80
|
+
rack
|
67
81
|
rainbow (3.1.1)
|
68
|
-
rake (13.0
|
69
|
-
regexp_parser (2.
|
70
|
-
|
82
|
+
rake (13.1.0)
|
83
|
+
regexp_parser (2.9.0)
|
84
|
+
resolv (0.2.2)
|
85
|
+
rexml (3.2.6)
|
71
86
|
rspec (3.12.0)
|
72
87
|
rspec-core (~> 3.12.0)
|
73
88
|
rspec-expectations (~> 3.12.0)
|
@@ -81,40 +96,50 @@ GEM
|
|
81
96
|
diff-lcs (>= 1.2.0, < 2.0)
|
82
97
|
rspec-support (~> 3.12.0)
|
83
98
|
rspec-support (3.12.0)
|
84
|
-
rubocop (1.
|
99
|
+
rubocop (1.62.0)
|
85
100
|
json (~> 2.3)
|
101
|
+
language_server-protocol (>= 3.17.0)
|
86
102
|
parallel (~> 1.10)
|
87
|
-
parser (>= 3.
|
103
|
+
parser (>= 3.3.0.2)
|
88
104
|
rainbow (>= 2.2.2, < 4.0)
|
89
105
|
regexp_parser (>= 1.8, < 3.0)
|
90
106
|
rexml (>= 3.2.5, < 4.0)
|
91
|
-
rubocop-ast (>= 1.
|
107
|
+
rubocop-ast (>= 1.31.1, < 2.0)
|
92
108
|
ruby-progressbar (~> 1.7)
|
93
109
|
unicode-display_width (>= 2.4.0, < 3.0)
|
94
|
-
rubocop-ast (1.
|
95
|
-
parser (>= 3.
|
96
|
-
ruby-progressbar (1.
|
110
|
+
rubocop-ast (1.31.1)
|
111
|
+
parser (>= 3.3.0.4)
|
112
|
+
ruby-progressbar (1.13.0)
|
97
113
|
ruby2_keywords (0.0.5)
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
114
|
+
sinatra (3.0.6)
|
115
|
+
mustermann (~> 3.0)
|
116
|
+
rack (~> 2.2, >= 2.2.4)
|
117
|
+
rack-protection (= 3.0.6)
|
118
|
+
tilt (~> 2.0)
|
119
|
+
tilt (2.1.0)
|
120
|
+
unicode-display_width (2.5.0)
|
121
|
+
uri (1.0.3)
|
102
122
|
webmock (3.18.1)
|
103
123
|
addressable (>= 2.8.0)
|
104
124
|
crack (>= 0.3.2)
|
105
125
|
hashdiff (>= 0.4.0, < 2.0.0)
|
106
|
-
zlib (2.1
|
126
|
+
zlib (3.2.1)
|
107
127
|
|
108
128
|
PLATFORMS
|
109
|
-
|
129
|
+
arm64-darwin-22
|
130
|
+
arm64-darwin-24
|
131
|
+
x86_64-linux
|
110
132
|
|
111
133
|
DEPENDENCIES
|
112
134
|
awesome_print (~> 1.9)
|
113
135
|
metainspector!
|
114
136
|
pry (~> 0.14)
|
137
|
+
puma (~> 6.4.0)
|
115
138
|
rake (~> 13.0)
|
139
|
+
resolv (~> 0.2.2)
|
116
140
|
rspec (~> 3.11)
|
117
141
|
rubocop (~> 1.34)
|
142
|
+
sinatra (~> 3.0.6)
|
118
143
|
webmock (~> 3.17)
|
119
144
|
|
120
145
|
BUNDLED WITH
|
data/README.md
CHANGED
@@ -1,14 +1,10 @@
|
|
1
1
|
# MetaInspector
|
2
|
-
[](http://badge.fury.io/rb/metainspector) [](http://badge.fury.io/rb/metainspector) [](https://circleci.com/gh/jaimeiniesta/metainspector) [](https://codeclimate.com/github/jaimeiniesta/metainspector) [](https://github.com/markets/awesome-ruby)
|
3
3
|
|
4
4
|
MetaInspector is a gem for web scraping purposes.
|
5
5
|
|
6
6
|
You give it a URL, and it lets you easily get its title, links, images, charset, description, keywords, meta tags...
|
7
7
|
|
8
|
-
## See it in action!
|
9
|
-
|
10
|
-
You can try MetaInspector using this little demo: [https://github.com/metainspector/metainspectordemo](https://github.com/metainspector/metainspectordemo)
|
11
|
-
|
12
8
|
## Installation
|
13
9
|
|
14
10
|
Install the gem from RubyGems:
|
@@ -23,7 +19,7 @@ If you're using it on a Rails application, just add it to your Gemfile and run `
|
|
23
19
|
gem 'metainspector'
|
24
20
|
```
|
25
21
|
|
26
|
-
Supported Ruby versions are defined in [`.
|
22
|
+
Supported Ruby versions are defined in [`.circleci/config.yml`](.circleci/config.yml).
|
27
23
|
|
28
24
|
## Usage
|
29
25
|
|
@@ -309,6 +305,27 @@ If you want to disallow redirects, you can do it like this:
|
|
309
305
|
page = MetaInspector.new('facebook.com', :allow_redirections => false)
|
310
306
|
```
|
311
307
|
|
308
|
+
You can also customize how many redirects you wish to allow:
|
309
|
+
|
310
|
+
```ruby
|
311
|
+
page = MetaInspector.new('facebook.com', :faraday_options => { redirect: { limit: 5 } })
|
312
|
+
```
|
313
|
+
|
314
|
+
And even customize what to do in between each redirect:
|
315
|
+
|
316
|
+
```ruby
|
317
|
+
callback = proc do |previous_response, next_request|
|
318
|
+
ip_address = Resolv.getaddress(next_request.url.host)
|
319
|
+
raise 'Invalid address' if IPAddr.new(ip_address).private?
|
320
|
+
end
|
321
|
+
|
322
|
+
page = MetaInspector.new(url, faraday_options: { redirect: { callback: callback } })
|
323
|
+
```
|
324
|
+
|
325
|
+
|
326
|
+
The `faraday_options[:redirect]` hash is passed to the `FollowRedirects` middleware used by `Faraday`, so that we can use all available options.
|
327
|
+
Check them in the [faraday-follow_redirects documentation](https://github.com/tisba/faraday-follow-redirects).
|
328
|
+
|
312
329
|
### Headers
|
313
330
|
|
314
331
|
By default, the following headers are set:
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# A MetaInspector example that runs a callback in between redirects.
|
2
|
+
# The callback raises an exception if the redirection points to a URL that resolves into a private IP address.
|
3
|
+
# Such a redirection is one way of triggering a known security exploit called server-side request forgery (SSRF).
|
4
|
+
#
|
5
|
+
# To properly run this example you need a server which redirects to a service like nip.io.
|
6
|
+
# The easiest way to achieve that is running the examples/redirect_web_server.rb server in one terminal window,
|
7
|
+
# and calling its address with this example in another terminal window.
|
8
|
+
#
|
9
|
+
# Usage example:
|
10
|
+
# In terminal #1:
|
11
|
+
# ruby redirect_web_server.rb
|
12
|
+
#
|
13
|
+
# In terminal #2:
|
14
|
+
# ruby faraday_redirect_options.rb http://127.0.0.1:4567
|
15
|
+
|
16
|
+
require 'resolv'
|
17
|
+
require '../lib/metainspector'
|
18
|
+
puts "Using MetaInspector #{MetaInspector::VERSION}"
|
19
|
+
|
20
|
+
# Get the starting URL
|
21
|
+
url = ARGV[0] || (puts "Enter an url"; gets.strip)
|
22
|
+
|
23
|
+
# redirect options to be passed along to Faraday::FollowRedirects::Middleware
|
24
|
+
redirects_opts = {
|
25
|
+
limit: 5,
|
26
|
+
callback: proc do |_old_response, new_response|
|
27
|
+
ip_address = Resolv.getaddress(new_response.url.host)
|
28
|
+
raise 'Invalid address' if IPAddr.new(ip_address).private?
|
29
|
+
end
|
30
|
+
}
|
31
|
+
|
32
|
+
begin
|
33
|
+
page = MetaInspector.new(url, faraday_options: { redirect: redirects_opts })
|
34
|
+
rescue StandardError => e
|
35
|
+
puts e.message
|
36
|
+
else
|
37
|
+
puts "\nScraping #{page.url} returned these results:"
|
38
|
+
puts "\nTITLE: #{page.title}"
|
39
|
+
|
40
|
+
puts "\nto_hash..."
|
41
|
+
puts page.to_hash
|
42
|
+
end
|
@@ -58,6 +58,7 @@ module MetaInspector
|
|
58
58
|
def fetch
|
59
59
|
Timeout::timeout(fatal_timeout) do
|
60
60
|
@faraday_options.merge!(:url => url)
|
61
|
+
follow_redirects_options = @faraday_options.delete(:redirect) || {}
|
61
62
|
|
62
63
|
session = Faraday.new(@faraday_options) do |faraday|
|
63
64
|
faraday.request :retry, max: @retries
|
@@ -65,7 +66,8 @@ module MetaInspector
|
|
65
66
|
faraday.request :gzip
|
66
67
|
|
67
68
|
if @allow_redirections
|
68
|
-
|
69
|
+
follow_redirects_options[:limit] ||= 10
|
70
|
+
faraday.use Faraday::FollowRedirects::Middleware, **follow_redirects_options
|
69
71
|
faraday.use :cookie_jar
|
70
72
|
end
|
71
73
|
|
@@ -84,7 +86,9 @@ module MetaInspector
|
|
84
86
|
req.options.open_timeout = @read_timeout
|
85
87
|
end
|
86
88
|
|
87
|
-
@
|
89
|
+
if @allow_redirections
|
90
|
+
@url.url = response.env.url.to_s
|
91
|
+
end
|
88
92
|
|
89
93
|
response
|
90
94
|
end
|
data/meta_inspector.gemspec
CHANGED
@@ -5,7 +5,7 @@ Gem::Specification.new do |gem|
|
|
5
5
|
gem.email = "jaimeiniesta@gmail.com"
|
6
6
|
gem.description = %q{MetaInspector lets you scrape a web page and get its links, images, texts, meta tags...}
|
7
7
|
gem.summary = %q{MetaInspector is a ruby gem for web scraping purposes, that returns metadata from a given URL}
|
8
|
-
gem.homepage = "https://github.com/
|
8
|
+
gem.homepage = "https://github.com/jaimeiniesta/metainspector"
|
9
9
|
gem.license = "MIT"
|
10
10
|
|
11
11
|
gem.files = `git ls-files`.split("\n")
|
@@ -14,15 +14,15 @@ Gem::Specification.new do |gem|
|
|
14
14
|
gem.require_paths = ["lib"]
|
15
15
|
gem.version = MetaInspector::VERSION
|
16
16
|
|
17
|
-
gem.add_dependency 'nokogiri', '~> 1.
|
17
|
+
gem.add_dependency 'nokogiri', '~> 1.18.8'
|
18
18
|
gem.add_dependency 'faraday', '~> 2.5'
|
19
19
|
gem.add_dependency 'faraday-cookie_jar', '~> 0.0'
|
20
20
|
gem.add_dependency 'faraday-encoding', '~> 0.0'
|
21
21
|
gem.add_dependency 'faraday-follow_redirects', '~> 0.3'
|
22
|
-
gem.add_dependency 'faraday-gzip', '>= 0.1', '<
|
23
|
-
gem.add_dependency 'faraday-http-cache', '~> 2.
|
22
|
+
gem.add_dependency 'faraday-gzip', '>= 0.1', '< 3.0'
|
23
|
+
gem.add_dependency 'faraday-http-cache', '~> 2.5'
|
24
24
|
gem.add_dependency 'faraday-retry', '~> 2.0'
|
25
|
-
gem.add_dependency 'addressable', '~> 2.8'
|
25
|
+
gem.add_dependency 'addressable', '~> 2.8.4'
|
26
26
|
gem.add_dependency 'fastimage', '~> 2.2'
|
27
27
|
gem.add_dependency 'nesty', '~> 1.0'
|
28
28
|
|
@@ -31,5 +31,8 @@ Gem::Specification.new do |gem|
|
|
31
31
|
gem.add_development_dependency 'awesome_print', '~> 1.9'
|
32
32
|
gem.add_development_dependency 'rake', '~> 13.0'
|
33
33
|
gem.add_development_dependency 'pry', '~> 0.14'
|
34
|
+
gem.add_development_dependency 'puma', '~> 6.4.0'
|
34
35
|
gem.add_development_dependency 'rubocop', '~> 1.34'
|
36
|
+
gem.add_development_dependency 'resolv', '~> 0.2.2'
|
37
|
+
gem.add_development_dependency 'sinatra', '~> 3.0.6'
|
35
38
|
end
|
data/spec/document_spec.rb
CHANGED
@@ -130,11 +130,11 @@ describe MetaInspector::Document do
|
|
130
130
|
|
131
131
|
describe 'url normalization' do
|
132
132
|
it 'should normalize by default' do
|
133
|
-
expect(MetaInspector.new('http://example.com
|
133
|
+
expect(MetaInspector.new('http://example.com?name=joe martins', allow_redirections: false).url).to eq('http://example.com/?name=joe%20martins')
|
134
134
|
end
|
135
135
|
|
136
136
|
it 'should not normalize if the normalize_url option is false' do
|
137
|
-
expect(MetaInspector.new('http://example.com
|
137
|
+
expect(MetaInspector.new('http://example.com?name=joe martins', normalize_url: false, allow_redirections: false).url).to eq('http://example.com?name=joe martins')
|
138
138
|
end
|
139
139
|
end
|
140
140
|
|
@@ -52,29 +52,35 @@ describe MetaInspector do
|
|
52
52
|
it "should get correct absolute links, encoding the URLs as needed" do
|
53
53
|
m = MetaInspector.new('http://international.com')
|
54
54
|
|
55
|
-
expect(m.links.internal).to eq([
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
55
|
+
expect(m.links.internal).to eq([
|
56
|
+
"http://international.com/espa%C3%83%C2%B1a.asp",
|
57
|
+
"http://international.com/roman%C3%83%C2%A9e",
|
58
|
+
"http://international.com/faqs#cami%C3%83%C2%B3n",
|
59
|
+
"http://international.com/search?q=cami%C3%83%C2%B3n",
|
60
|
+
"http://international.com/search?q=espa%C3%83%C2%B1a#top",
|
61
|
+
"http://international.com/index.php?q=espa%C3%83%C2%B1a&url=aHR0zZQ==&cntnt01pageid=21"
|
62
|
+
])
|
63
|
+
|
64
|
+
expect(m.links.external).to eq([
|
65
|
+
"http://example.com/espa%C3%83%C2%B1a.asp",
|
66
|
+
"http://example.com/roman%C3%83%C2%A9e",
|
67
|
+
"http://example.com/faqs#cami%C3%83%C2%B3n",
|
68
|
+
"http://example.com/search?q=cami%C3%83%C2%B3n",
|
69
|
+
"http://example.com/search?q=espa%C3%83%C2%B1a#top"])
|
67
70
|
end
|
68
71
|
|
69
72
|
describe "internal links" do
|
70
73
|
it "should get correct internal links, encoding the URLs as needed but respecting # and ?" do
|
71
74
|
m = MetaInspector.new('http://international.com')
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
75
|
+
|
76
|
+
expect(m.links.internal).to eq([
|
77
|
+
"http://international.com/espa%C3%83%C2%B1a.asp",
|
78
|
+
"http://international.com/roman%C3%83%C2%A9e",
|
79
|
+
"http://international.com/faqs#cami%C3%83%C2%B3n",
|
80
|
+
"http://international.com/search?q=cami%C3%83%C2%B3n",
|
81
|
+
"http://international.com/search?q=espa%C3%83%C2%B1a#top",
|
82
|
+
"http://international.com/index.php?q=espa%C3%83%C2%B1a&url=aHR0zZQ==&cntnt01pageid=21"
|
83
|
+
])
|
78
84
|
end
|
79
85
|
|
80
86
|
it "should not crash when processing malformed hrefs" do
|
@@ -86,11 +92,14 @@ describe MetaInspector do
|
|
86
92
|
describe "external links" do
|
87
93
|
it "should get correct external links, encoding the URLs as needed but respecting # and ?" do
|
88
94
|
m = MetaInspector.new('http://international.com')
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
95
|
+
|
96
|
+
expect(m.links.external).to eq([
|
97
|
+
"http://example.com/espa%C3%83%C2%B1a.asp",
|
98
|
+
"http://example.com/roman%C3%83%C2%A9e",
|
99
|
+
"http://example.com/faqs#cami%C3%83%C2%B3n",
|
100
|
+
"http://example.com/search?q=cami%C3%83%C2%B3n",
|
101
|
+
"http://example.com/search?q=espa%C3%83%C2%B1a#top"
|
102
|
+
])
|
94
103
|
end
|
95
104
|
|
96
105
|
it "should not crash when processing malformed hrefs" do
|
@@ -108,7 +117,7 @@ describe MetaInspector do
|
|
108
117
|
|
109
118
|
it "should handle links that have an invalid byte sequence" do
|
110
119
|
m = MetaInspector.new('http://example.com/invalid_byte_seq')
|
111
|
-
expect(m.links.all).to eq(["http://pagerankalert.posterous.com/", "http://
|
120
|
+
expect(m.links.all).to eq(["http://pagerankalert.posterous.com/", "http://twitter.com/pagerankalert"])
|
112
121
|
end
|
113
122
|
|
114
123
|
end
|
@@ -1,4 +1,7 @@
|
|
1
1
|
require 'spec_helper'
|
2
|
+
require 'resolv'
|
3
|
+
|
4
|
+
class PrivateIPAddressError < StandardError; end
|
2
5
|
|
3
6
|
describe MetaInspector do
|
4
7
|
describe "redirections" do
|
@@ -47,6 +50,25 @@ describe MetaInspector do
|
|
47
50
|
expect(page.url).to eq("http://blogs.clarionledger.com/dechols/2014/03/24/digital-medicine/?nclick_check=1")
|
48
51
|
end
|
49
52
|
end
|
53
|
+
|
54
|
+
context "when there is a callback to be ran between redirects that blocks redirections to private IP addresses" do
|
55
|
+
it "raises an exception" do
|
56
|
+
stub_request(:get, "https://www.facebook.com/")
|
57
|
+
.to_return(:status => 302,
|
58
|
+
:headers => { "Location" => "http://10.0.0.0/" })
|
59
|
+
|
60
|
+
redirect_options = {
|
61
|
+
callback: proc do |_previous_response, next_request|
|
62
|
+
ip_address = Resolv.getaddress(next_request.url.host)
|
63
|
+
raise PrivateIPAddressError if IPAddr.new(ip_address).private?
|
64
|
+
end
|
65
|
+
}
|
66
|
+
|
67
|
+
expect {
|
68
|
+
MetaInspector.new("https://www.facebook.com/", faraday_options: { redirect: redirect_options })
|
69
|
+
}.to raise_error PrivateIPAddressError
|
70
|
+
end
|
71
|
+
end
|
50
72
|
end
|
51
73
|
|
52
74
|
private
|
data/spec/spec_helper.rb
CHANGED
@@ -50,7 +50,8 @@ RSpec.configure do |config|
|
|
50
50
|
stub_request(:get, "http://example.com/author_in_body").to_return(fixture_file("author_in_body.response"))
|
51
51
|
stub_request(:get, "http://example.com/author_in_link").to_return(fixture_file("author_in_link.response"))
|
52
52
|
stub_request(:get, "http://example.com/author_in_twitter").to_return(fixture_file("author_in_twitter.response"))
|
53
|
-
stub_request(:get, "http://example.com
|
53
|
+
stub_request(:get, "http://example.com/?name=joe martins").to_return(fixture_file("example.response"))
|
54
|
+
stub_request(:get, "http://example.com/?name=joe+martins").to_return(fixture_file("example.response"))
|
54
55
|
stub_request(:get, "http://facebook.com/").to_return(fixture_file("facebook.com.response"))
|
55
56
|
stub_request(:get, "http://international.com").to_return(fixture_file("international.response"))
|
56
57
|
stub_request(:get, "http://pagerankalert-shortcut-and-icon.com").to_return(fixture_file("pagerankalert-shortcut-and-icon.com.response"))
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 5.
|
4
|
+
version: 5.16.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jaime Iniesta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2025-05-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -16,14 +16,14 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 1.18.8
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 1.18.8
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: faraday
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -89,7 +89,7 @@ dependencies:
|
|
89
89
|
version: '0.1'
|
90
90
|
- - "<"
|
91
91
|
- !ruby/object:Gem::Version
|
92
|
-
version: '
|
92
|
+
version: '3.0'
|
93
93
|
type: :runtime
|
94
94
|
prerelease: false
|
95
95
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -99,21 +99,21 @@ dependencies:
|
|
99
99
|
version: '0.1'
|
100
100
|
- - "<"
|
101
101
|
- !ruby/object:Gem::Version
|
102
|
-
version: '
|
102
|
+
version: '3.0'
|
103
103
|
- !ruby/object:Gem::Dependency
|
104
104
|
name: faraday-http-cache
|
105
105
|
requirement: !ruby/object:Gem::Requirement
|
106
106
|
requirements:
|
107
107
|
- - "~>"
|
108
108
|
- !ruby/object:Gem::Version
|
109
|
-
version: '2.
|
109
|
+
version: '2.5'
|
110
110
|
type: :runtime
|
111
111
|
prerelease: false
|
112
112
|
version_requirements: !ruby/object:Gem::Requirement
|
113
113
|
requirements:
|
114
114
|
- - "~>"
|
115
115
|
- !ruby/object:Gem::Version
|
116
|
-
version: '2.
|
116
|
+
version: '2.5'
|
117
117
|
- !ruby/object:Gem::Dependency
|
118
118
|
name: faraday-retry
|
119
119
|
requirement: !ruby/object:Gem::Requirement
|
@@ -134,14 +134,14 @@ dependencies:
|
|
134
134
|
requirements:
|
135
135
|
- - "~>"
|
136
136
|
- !ruby/object:Gem::Version
|
137
|
-
version:
|
137
|
+
version: 2.8.4
|
138
138
|
type: :runtime
|
139
139
|
prerelease: false
|
140
140
|
version_requirements: !ruby/object:Gem::Requirement
|
141
141
|
requirements:
|
142
142
|
- - "~>"
|
143
143
|
- !ruby/object:Gem::Version
|
144
|
-
version:
|
144
|
+
version: 2.8.4
|
145
145
|
- !ruby/object:Gem::Dependency
|
146
146
|
name: fastimage
|
147
147
|
requirement: !ruby/object:Gem::Requirement
|
@@ -240,6 +240,20 @@ dependencies:
|
|
240
240
|
- - "~>"
|
241
241
|
- !ruby/object:Gem::Version
|
242
242
|
version: '0.14'
|
243
|
+
- !ruby/object:Gem::Dependency
|
244
|
+
name: puma
|
245
|
+
requirement: !ruby/object:Gem::Requirement
|
246
|
+
requirements:
|
247
|
+
- - "~>"
|
248
|
+
- !ruby/object:Gem::Version
|
249
|
+
version: 6.4.0
|
250
|
+
type: :development
|
251
|
+
prerelease: false
|
252
|
+
version_requirements: !ruby/object:Gem::Requirement
|
253
|
+
requirements:
|
254
|
+
- - "~>"
|
255
|
+
- !ruby/object:Gem::Version
|
256
|
+
version: 6.4.0
|
243
257
|
- !ruby/object:Gem::Dependency
|
244
258
|
name: rubocop
|
245
259
|
requirement: !ruby/object:Gem::Requirement
|
@@ -254,6 +268,34 @@ dependencies:
|
|
254
268
|
- - "~>"
|
255
269
|
- !ruby/object:Gem::Version
|
256
270
|
version: '1.34'
|
271
|
+
- !ruby/object:Gem::Dependency
|
272
|
+
name: resolv
|
273
|
+
requirement: !ruby/object:Gem::Requirement
|
274
|
+
requirements:
|
275
|
+
- - "~>"
|
276
|
+
- !ruby/object:Gem::Version
|
277
|
+
version: 0.2.2
|
278
|
+
type: :development
|
279
|
+
prerelease: false
|
280
|
+
version_requirements: !ruby/object:Gem::Requirement
|
281
|
+
requirements:
|
282
|
+
- - "~>"
|
283
|
+
- !ruby/object:Gem::Version
|
284
|
+
version: 0.2.2
|
285
|
+
- !ruby/object:Gem::Dependency
|
286
|
+
name: sinatra
|
287
|
+
requirement: !ruby/object:Gem::Requirement
|
288
|
+
requirements:
|
289
|
+
- - "~>"
|
290
|
+
- !ruby/object:Gem::Version
|
291
|
+
version: 3.0.6
|
292
|
+
type: :development
|
293
|
+
prerelease: false
|
294
|
+
version_requirements: !ruby/object:Gem::Requirement
|
295
|
+
requirements:
|
296
|
+
- - "~>"
|
297
|
+
- !ruby/object:Gem::Version
|
298
|
+
version: 3.0.6
|
257
299
|
description: MetaInspector lets you scrape a web page and get its links, images, texts,
|
258
300
|
meta tags...
|
259
301
|
email: jaimeiniesta@gmail.com
|
@@ -275,7 +317,9 @@ files:
|
|
275
317
|
- Rakefile
|
276
318
|
- bin/console
|
277
319
|
- examples/basic_scraping.rb
|
320
|
+
- examples/faraday_redirect_options.rb
|
278
321
|
- examples/link_checker.rb
|
322
|
+
- examples/redirect_web_server.rb
|
279
323
|
- examples/spider.rb
|
280
324
|
- lib/meta_inspector.rb
|
281
325
|
- lib/meta_inspector/document.rb
|
@@ -369,7 +413,7 @@ files:
|
|
369
413
|
- spec/request_spec.rb
|
370
414
|
- spec/spec_helper.rb
|
371
415
|
- spec/url_spec.rb
|
372
|
-
homepage: https://github.com/
|
416
|
+
homepage: https://github.com/jaimeiniesta/metainspector
|
373
417
|
licenses:
|
374
418
|
- MIT
|
375
419
|
metadata: {}
|
@@ -388,7 +432,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
388
432
|
- !ruby/object:Gem::Version
|
389
433
|
version: '0'
|
390
434
|
requirements: []
|
391
|
-
rubygems_version: 3.
|
435
|
+
rubygems_version: 3.5.22
|
392
436
|
signing_key:
|
393
437
|
specification_version: 4
|
394
438
|
summary: MetaInspector is a ruby gem for web scraping purposes, that returns metadata
|