metainspector 4.0.0 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +193 -133
- data/examples/basic_scraping.rb +16 -5
- data/examples/link_checker.rb +22 -21
- data/examples/spider.rb +1 -1
- data/lib/meta_inspector/document.rb +6 -2
- data/lib/meta_inspector/url.rb +9 -1
- data/lib/meta_inspector/version.rb +1 -1
- data/meta_inspector.gemspec +3 -3
- data/spec/document_spec.rb +10 -0
- data/spec/spec_helper.rb +5 -0
- metadata +10 -10
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 694ffa2f1b0080c05335ccc14abe02a874a6562f
|
4
|
+
data.tar.gz: 8ef1ff54d9cf15ab21225bf64f2827033dc3b409
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 92ab014d16a8c6ad1332db4dac9e103c1ab114b3f04c823cc2bc140c43ddb038ae3c77d46bb273b6f6651b80bc3a8bde519ae1cc89ca436bcb1510707c03b888
|
7
|
+
data.tar.gz: d9fa07214d680a5af7b1a2d57f1e8481f85dfbe09f76c45978d5fa346c7c7d64614442c305ef4c4a7d3f1dc3613f467a873431abba9c1db60157265249f7dc07
|
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# MetaInspector [](http://travis-ci.org/jaimeiniesta/metainspector) [](https://gemnasium.com/jaimeiniesta/metainspector)
|
1
|
+
# MetaInspector [](http://travis-ci.org/jaimeiniesta/metainspector) [](https://gemnasium.com/jaimeiniesta/metainspector) [](https://codeclimate.com/github/jaimeiniesta/metainspector)
|
2
2
|
|
3
3
|
MetaInspector is a gem for web scraping purposes.
|
4
4
|
|
@@ -40,11 +40,15 @@ Also, we've introduced a new feature:
|
|
40
40
|
|
41
41
|
Install the gem from RubyGems:
|
42
42
|
|
43
|
-
|
43
|
+
```bash
|
44
|
+
gem install metainspector
|
45
|
+
```
|
44
46
|
|
45
47
|
If you're using it on a Rails application, just add it to your Gemfile and run `bundle install`
|
46
48
|
|
47
|
-
|
49
|
+
```ruby
|
50
|
+
gem 'metainspector'
|
51
|
+
```
|
48
52
|
|
49
53
|
This gem is tested on Ruby versions 2.0.0 and 2.1.3.
|
50
54
|
|
@@ -52,15 +56,21 @@ This gem is tested on Ruby versions 2.0.0 and 2.1.3.
|
|
52
56
|
|
53
57
|
Initialize a MetaInspector instance for a URL, like this:
|
54
58
|
|
55
|
-
|
59
|
+
```ruby
|
60
|
+
page = MetaInspector.new('http://sitevalidator.com')
|
61
|
+
```
|
56
62
|
|
57
63
|
If you don't include the scheme on the URL, http:// will be used by default:
|
58
64
|
|
59
|
-
|
65
|
+
```ruby
|
66
|
+
page = MetaInspector.new('sitevalidator.com')
|
67
|
+
```
|
60
68
|
|
61
69
|
You can also include the html which will be used as the document to scrape:
|
62
70
|
|
63
|
-
|
71
|
+
```ruby
|
72
|
+
page = MetaInspector.new("http://sitevalidator.com", :document => "<html><head><title>Hello From Passed Html</title><a href='/hello'>Hello link</a></head><body></body></html>")
|
73
|
+
```
|
64
74
|
|
65
75
|
## Accessing response status and headers
|
66
76
|
|
@@ -75,124 +85,138 @@ page.response.headers # { "server"=>"nginx", "content-type"=>"text/html; charset
|
|
75
85
|
|
76
86
|
You can see the scraped data like this:
|
77
87
|
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
88
|
+
```ruby
|
89
|
+
page.url # URL of the page
|
90
|
+
page.scheme # Scheme of the page (http, https)
|
91
|
+
page.host # Hostname of the page (like, sitevalidator.com, without the scheme)
|
92
|
+
page.root_url # Root url (scheme + host, like http://sitevalidator.com/)
|
93
|
+
page.title # title of the page, as string
|
94
|
+
page.links.raw # every link found, unprocessed
|
95
|
+
page.links.all # every link found on the page as an absolute URL
|
96
|
+
page.links.http # every HTTP link found
|
97
|
+
page.links.non_http # every non-HTTP link found
|
98
|
+
page.links.internal # every internal link found on the page as an absolute URL
|
99
|
+
page.links.external # every external link found on the page as an absolute URL
|
100
|
+
page.meta['keywords'] # meta keywords, as string
|
101
|
+
page.meta['description'] # meta description, as string
|
102
|
+
page.description # returns the meta description, or the first long paragraph if no meta description is found
|
103
|
+
page.images # enumerable collection, with every img found on the page as an absolute URL
|
104
|
+
page.images.best # Most relevant image, if defined with the og:image or twitter:image metatags. Fallback to the first page.images array element
|
105
|
+
page.images.favicon # absolute URL to the favicon
|
106
|
+
page.feed # Get rss or atom links in meta data fields as array
|
107
|
+
page.charset # UTF-8
|
108
|
+
page.content_type # content-type returned by the server when the url was requested
|
109
|
+
```
|
98
110
|
|
99
111
|
## Meta tags
|
100
112
|
|
101
113
|
When it comes to meta tags, you have several options:
|
102
114
|
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
115
|
+
```ruby
|
116
|
+
page.meta_tags # Gives you all the meta tags by type:
|
117
|
+
# (meta name, meta http-equiv, meta property and meta charset)
|
118
|
+
# As meta tags can be repeated (in the case of 'og:image', for example),
|
119
|
+
# the values returned will be arrays
|
120
|
+
#
|
121
|
+
# For example:
|
122
|
+
#
|
123
|
+
# {
|
124
|
+
'name' => {
|
125
|
+
'keywords' => ['one, two, three'],
|
126
|
+
'description' => ['the description'],
|
127
|
+
'author' => ['Joe Sample'],
|
128
|
+
'robots' => ['index,follow'],
|
129
|
+
'revisit' => ['15 days'],
|
130
|
+
'dc.date.issued' => ['2011-09-15']
|
131
|
+
},
|
132
|
+
|
133
|
+
'http-equiv' => {
|
134
|
+
'content-type' => ['text/html; charset=UTF-8'],
|
135
|
+
'content-style-type' => ['text/css']
|
136
|
+
},
|
137
|
+
|
138
|
+
'property' => {
|
139
|
+
'og:title' => ['An OG title'],
|
140
|
+
'og:type' => ['website'],
|
141
|
+
'og:url' => ['http://example.com/meta-tags'],
|
142
|
+
'og:image' => ['http://example.com/rock.jpg',
|
143
|
+
'http://example.com/rock2.jpg',
|
144
|
+
'http://example.com/rock3.jpg'],
|
145
|
+
'og:image:width' => ['300'],
|
146
|
+
'og:image:height' => ['300', '1000']
|
147
|
+
},
|
148
|
+
|
149
|
+
'charset' => ['UTF-8']
|
150
|
+
}
|
151
|
+
```
|
138
152
|
|
139
153
|
As this method returns a hash, you can also take only the key that you need, like in:
|
140
154
|
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
155
|
+
```ruby
|
156
|
+
page.meta_tags['property'] # Returns:
|
157
|
+
# {
|
158
|
+
# 'og:title' => ['An OG title'],
|
159
|
+
# 'og:type' => ['website'],
|
160
|
+
# 'og:url' => ['http://example.com/meta-tags'],
|
161
|
+
# 'og:image' => ['http://example.com/rock.jpg',
|
162
|
+
# 'http://example.com/rock2.jpg',
|
163
|
+
# 'http://example.com/rock3.jpg'],
|
164
|
+
# 'og:image:width' => ['300'],
|
165
|
+
# 'og:image:height' => ['300', '1000']
|
166
|
+
# }
|
167
|
+
```
|
152
168
|
|
153
169
|
In most cases you will only be interested in the first occurrence of a meta tag, so you can
|
154
170
|
use the singular form of that method:
|
155
171
|
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
172
|
+
```ruby
|
173
|
+
page.meta_tag['name'] # Returns:
|
174
|
+
# {
|
175
|
+
# 'keywords' => 'one, two, three',
|
176
|
+
# 'description' => 'the description',
|
177
|
+
# 'author' => 'Joe Sample',
|
178
|
+
# 'robots' => 'index,follow',
|
179
|
+
# 'revisit' => '15 days',
|
180
|
+
# 'dc.date.issued' => '2011-09-15'
|
181
|
+
# }
|
182
|
+
```
|
165
183
|
|
166
184
|
Or, as this is also a hash:
|
167
185
|
|
168
|
-
|
186
|
+
```ruby
|
187
|
+
page.meta_tag['name']['keywords'] # Returns 'one, two, three'
|
188
|
+
```
|
169
189
|
|
170
190
|
And finally, you can use the shorter `meta` method that will merge the different keys so you have
|
171
191
|
a simpler hash:
|
172
192
|
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
193
|
+
```ruby
|
194
|
+
page.meta # Returns:
|
195
|
+
#
|
196
|
+
# {
|
197
|
+
# 'keywords' => 'one, two, three',
|
198
|
+
# 'description' => 'the description',
|
199
|
+
# 'author' => 'Joe Sample',
|
200
|
+
# 'robots' => 'index,follow',
|
201
|
+
# 'revisit' => '15 days',
|
202
|
+
# 'dc.date.issued' => '2011-09-15',
|
203
|
+
# 'content-type' => 'text/html; charset=UTF-8',
|
204
|
+
# 'content-style-type' => 'text/css',
|
205
|
+
# 'og:title' => 'An OG title',
|
206
|
+
# 'og:type' => 'website',
|
207
|
+
# 'og:url' => 'http://example.com/meta-tags',
|
208
|
+
# 'og:image' => 'http://example.com/rock.jpg',
|
209
|
+
# 'og:image:width' => '300',
|
210
|
+
# 'og:image:height' => '300',
|
211
|
+
# 'charset' => 'UTF-8'
|
212
|
+
# }
|
213
|
+
```
|
192
214
|
|
193
215
|
This way, you can get most meta tags just like that:
|
194
216
|
|
195
|
-
|
217
|
+
```ruby
|
218
|
+
page.meta['author'] # Returns "Joe Sample"
|
219
|
+
```
|
196
220
|
|
197
221
|
Please be aware that all keys are converted to downcase, so it's `'dc.date.issued'` and not `'DC.date.issued'`.
|
198
222
|
|
@@ -200,16 +224,22 @@ Please be aware that all keys are converted to downcase, so it's `'dc.date.issue
|
|
200
224
|
|
201
225
|
You can also access most of the scraped data as a hash:
|
202
226
|
|
203
|
-
|
204
|
-
|
227
|
+
```ruby
|
228
|
+
page.to_hash # { "url" => "http://sitevalidator.com",
|
229
|
+
"title" => "MarkupValidator :: site-wide markup validation tool", ... }
|
230
|
+
```
|
205
231
|
|
206
232
|
The original document is accessible from:
|
207
233
|
|
208
|
-
|
234
|
+
```ruby
|
235
|
+
page.to_s # A String with the contents of the HTML document
|
236
|
+
```
|
209
237
|
|
210
238
|
And the full scraped document is accessible from:
|
211
239
|
|
212
|
-
|
240
|
+
```ruby
|
241
|
+
page.parsed # Nokogiri doc that you can use to get any element from the page
|
242
|
+
```
|
213
243
|
|
214
244
|
## Options
|
215
245
|
|
@@ -252,36 +282,64 @@ By default, MetaInspector will follow redirects (up to a limit of 10).
|
|
252
282
|
|
253
283
|
If you want to disallow redirects, you can do it like this:
|
254
284
|
|
255
|
-
|
285
|
+
```ruby
|
286
|
+
page = MetaInspector.new('facebook.com', :allow_redirections => false)
|
287
|
+
```
|
256
288
|
|
257
289
|
### Headers
|
258
290
|
|
259
291
|
By default, the following headers are set:
|
260
292
|
|
261
|
-
|
293
|
+
```ruby
|
294
|
+
{'User-Agent' => "MetaInspector/#{MetaInspector::VERSION} (+https://github.com/jaimeiniesta/metainspector)"}
|
295
|
+
```
|
262
296
|
|
263
297
|
If you want to set custom headers then use the `headers` option:
|
264
298
|
|
265
|
-
|
266
|
-
|
299
|
+
```ruby
|
300
|
+
# Set the User-Agent header
|
301
|
+
page = MetaInspector.new('example.com', :headers => {'User-Agent' => 'My custom User-Agent'})
|
302
|
+
```
|
267
303
|
|
268
304
|
### HTML Content Only
|
269
305
|
|
270
306
|
MetaInspector will try to parse all URLs by default. If you want to raise an exception when trying to parse a non-html URL (one that has a content-type different than text/html), you can state it like this:
|
271
307
|
|
272
|
-
|
308
|
+
```ruby
|
309
|
+
page = MetaInspector.new('sitevalidator.com', :html_content_only => true)
|
310
|
+
```
|
273
311
|
|
274
312
|
This is useful when using MetaInspector for web spidering. Although the initial URL will probably be an HTML URL, while following links you may find yourself trying to parse non-html URLs.
|
275
313
|
|
276
|
-
|
277
|
-
|
278
|
-
|
314
|
+
```ruby
|
315
|
+
page = MetaInspector.new('http://example.com/image.png')
|
316
|
+
page.content_type # "image/png"
|
317
|
+
page.description # will return a garbled string
|
318
|
+
|
319
|
+
page = MetaInspector.new('http://example.com/image.png', :html_content_only => true)
|
320
|
+
page.content_type # "image/png"
|
321
|
+
page.description # raises an exception
|
322
|
+
```
|
323
|
+
|
324
|
+
### URL Normalization
|
325
|
+
|
326
|
+
By default, URLs are normalized using the Addressable gem. For example:
|
327
|
+
|
328
|
+
```ruby
|
329
|
+
# Normalization will add a default scheme and a trailing slash...
|
330
|
+
page = MetaInspector.new('sitevalidator.com')
|
331
|
+
page.url # http://sitevalidator.com/
|
332
|
+
|
333
|
+
# ...and it will also convert international characters
|
334
|
+
page = MetaInspector.new('http://www.詹姆斯.com')
|
335
|
+
page.url # http://www.xn--8ws00zhy3a.com/
|
336
|
+
```
|
279
337
|
|
280
|
-
|
281
|
-
page.content_type # "image/png"
|
282
|
-
page.description # raises an exception
|
338
|
+
While this is generally useful, it can be [tricky](https://github.com/sporkmonger/addressable/issues/182) [sometimes](https://github.com/sporkmonger/addressable/issues/160).
|
283
339
|
|
284
|
-
|
340
|
+
You can disable URL normalization by passing the `normalize_url: false` option.
|
341
|
+
|
342
|
+
## Exception Handling
|
285
343
|
|
286
344
|
By default, MetaInspector will raise the exceptions found. We think that this is the safest default: in case the URL you're trying to scrape is unreachable, you should clearly be notified, and treat the exception as needed in your app.
|
287
345
|
|
@@ -295,27 +353,29 @@ You should avoid using the `:store` option, or use it wisely, as silencing error
|
|
295
353
|
|
296
354
|
You can find some sample scripts in the `examples` folder, including a basic scraping and a spider that will follow external links using a queue. What follows is an example of use from irb:
|
297
355
|
|
298
|
-
|
299
|
-
|
300
|
-
|
356
|
+
```ruby
|
357
|
+
$ irb
|
358
|
+
>> require 'metainspector'
|
359
|
+
=> true
|
301
360
|
|
302
|
-
|
303
|
-
|
361
|
+
>> page = MetaInspector.new('http://sitevalidator.com')
|
362
|
+
=> #<MetaInspector:0x11330c0 @url="http://sitevalidator.com">
|
304
363
|
|
305
|
-
|
306
|
-
|
364
|
+
>> page.title
|
365
|
+
=> "MarkupValidator :: site-wide markup validation tool"
|
307
366
|
|
308
|
-
|
309
|
-
|
367
|
+
>> page.meta['description']
|
368
|
+
=> "Site-wide markup validation tool. Validate the markup of your whole site with just one click."
|
310
369
|
|
311
|
-
|
312
|
-
|
370
|
+
>> page.meta['keywords']
|
371
|
+
=> "html, markup, validation, validator, tool, w3c, development, standards, free"
|
313
372
|
|
314
|
-
|
315
|
-
|
373
|
+
>> page.links.size
|
374
|
+
=> 15
|
316
375
|
|
317
|
-
|
318
|
-
|
376
|
+
>> page.links[4]
|
377
|
+
=> "/plans-and-pricing"
|
378
|
+
```
|
319
379
|
|
320
380
|
## ZOMG Fork! Thank you!
|
321
381
|
|
data/examples/basic_scraping.rb
CHANGED
@@ -11,14 +11,25 @@ url = ARGV[0] || (puts "Enter an url"; gets.strip)
|
|
11
11
|
|
12
12
|
page = MetaInspector.new(url)
|
13
13
|
|
14
|
-
puts "
|
15
|
-
puts "
|
14
|
+
puts "\nScraping #{page.url} returned these results:"
|
15
|
+
puts "\nTITLE: #{page.title}"
|
16
16
|
puts "META DESCRIPTION: #{page.meta['description']}"
|
17
17
|
puts "META KEYWORDS: #{page.meta['keywords']}"
|
18
|
-
|
19
|
-
page.links.
|
18
|
+
|
19
|
+
puts "\n#{page.links.internal.size} internal links found..."
|
20
|
+
page.links.internal.each do |link|
|
21
|
+
puts " ==> #{link}"
|
22
|
+
end
|
23
|
+
|
24
|
+
puts "\n#{page.links.external.size} external links found..."
|
25
|
+
page.links.external.each do |link|
|
26
|
+
puts " ==> #{link}"
|
27
|
+
end
|
28
|
+
|
29
|
+
puts "\n#{page.links.non_http.size} non-http links found..."
|
30
|
+
page.links.non_http.each do |link|
|
20
31
|
puts " ==> #{link}"
|
21
32
|
end
|
22
33
|
|
23
|
-
puts "
|
34
|
+
puts "\nto_hash..."
|
24
35
|
puts page.to_hash
|
data/examples/link_checker.rb
CHANGED
@@ -7,7 +7,6 @@
|
|
7
7
|
require 'metainspector'
|
8
8
|
|
9
9
|
class BrokenLinkChecker
|
10
|
-
attr_reader :broken
|
11
10
|
|
12
11
|
def initialize(url)
|
13
12
|
@url = url
|
@@ -33,32 +32,26 @@ class BrokenLinkChecker
|
|
33
32
|
private
|
34
33
|
|
35
34
|
def check
|
36
|
-
#
|
37
|
-
|
35
|
+
# Resolves redirections of initial URL before placing it on the queue
|
36
|
+
@queue.push(MetaInspector.new(@url).url)
|
38
37
|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
while @queue.any?
|
43
|
-
url = @queue.pop
|
44
|
-
|
45
|
-
page = MetaInspector.new(url, :warn_level => :store)
|
38
|
+
process_next_on_queue while @queue.any?
|
39
|
+
end
|
46
40
|
|
47
|
-
|
48
|
-
|
49
|
-
page.links.select {|l| l =~ /^http(s)?:\/\//i}.each do |link|
|
50
|
-
check_status(link, page.url)
|
51
|
-
end
|
52
|
-
end
|
41
|
+
def process_next_on_queue
|
42
|
+
page = MetaInspector.new(@queue.pop, :warn_level => :store)
|
53
43
|
|
54
|
-
|
44
|
+
page.links.all.select {|l| l =~ /^http(s)?:\/\//i}.each do |link|
|
45
|
+
check_status(link, page.url)
|
46
|
+
end if page.ok?
|
55
47
|
|
56
|
-
|
57
|
-
@queue.push(link) unless @visited.include?(link) || @broken.include?(link) || @queue.include?(link)
|
58
|
-
end
|
48
|
+
@visited.push(page.url)
|
59
49
|
|
60
|
-
|
50
|
+
page.links.internal.each do |link|
|
51
|
+
@queue.push(link) if should_be_enqueued?(link)
|
61
52
|
end
|
53
|
+
|
54
|
+
show_stats
|
62
55
|
end
|
63
56
|
|
64
57
|
# Checks the response status of the linked_url and stores it on the ok or broken collections
|
@@ -78,6 +71,14 @@ class BrokenLinkChecker
|
|
78
71
|
end
|
79
72
|
end
|
80
73
|
|
74
|
+
def should_be_enqueued?(url)
|
75
|
+
!(@visited.include?(url) || @broken.include?(url) || @queue.include?(url))
|
76
|
+
end
|
77
|
+
|
78
|
+
def show_stats
|
79
|
+
puts "#{'%3s' % @visited.size} pages visited, #{'%3s' % @queue.size} pages on queue, #{'%2s' % @broken.size} broken links"
|
80
|
+
end
|
81
|
+
|
81
82
|
# A page is reachable if its response status is less than 400
|
82
83
|
# In the case of exceptions, like timeouts or server connection errors,
|
83
84
|
# we consider it unreachable
|
data/examples/spider.rb
CHANGED
data/lib/meta_inspector/document.rb
CHANGED
@@ -17,6 +17,7 @@ module MetaInspector
|
|
17
17
|
# * warn_level: what to do when encountering exceptions.
|
18
18
|
# Can be :warn, :raise or nil
|
19
19
|
# * headers: object containing custom headers for the request
|
20
|
+
# * normalize_url: true by default
|
20
21
|
def initialize(initial_url, options = {})
|
21
22
|
options = defaults.merge(options)
|
22
23
|
@connection_timeout = options[:connection_timeout]
|
@@ -28,7 +29,9 @@ module MetaInspector
|
|
28
29
|
@headers = options[:headers]
|
29
30
|
@warn_level = options[:warn_level]
|
30
31
|
@exception_log = options[:exception_log] || MetaInspector::ExceptionLog.new(warn_level: warn_level)
|
31
|
-
@
|
32
|
+
@normalize_url = options[:normalize_url]
|
33
|
+
@url = MetaInspector::URL.new(initial_url, exception_log: @exception_log,
|
34
|
+
normalize: @normalize_url)
|
32
35
|
@request = MetaInspector::Request.new(@url, allow_redirections: @allow_redirections,
|
33
36
|
connection_timeout: @connection_timeout,
|
34
37
|
read_timeout: @read_timeout,
|
@@ -77,7 +80,8 @@ module MetaInspector
|
|
77
80
|
:html_content_only => false,
|
78
81
|
:warn_level => :raise,
|
79
82
|
:headers => { 'User-Agent' => default_user_agent },
|
80
|
-
:allow_redirections => true
|
83
|
+
:allow_redirections => true,
|
84
|
+
:normalize_url => true }
|
81
85
|
end
|
82
86
|
|
83
87
|
def default_user_agent
|
data/lib/meta_inspector/url.rb
CHANGED
@@ -7,7 +7,10 @@ module MetaInspector
|
|
7
7
|
include MetaInspector::Exceptionable
|
8
8
|
|
9
9
|
def initialize(initial_url, options = {})
|
10
|
+
options = defaults.merge(options)
|
11
|
+
|
10
12
|
@exception_log = options[:exception_log]
|
13
|
+
@normalize = options[:normalize]
|
11
14
|
|
12
15
|
self.url = initial_url
|
13
16
|
end
|
@@ -25,7 +28,8 @@ module MetaInspector
|
|
25
28
|
end
|
26
29
|
|
27
30
|
def url=(new_url)
|
28
|
-
|
31
|
+
url = with_default_scheme(new_url)
|
32
|
+
@url = @normalize ? normalized(url) : url
|
29
33
|
end
|
30
34
|
|
31
35
|
# Converts a protocol-relative url to its full form,
|
@@ -50,6 +54,10 @@ module MetaInspector
|
|
50
54
|
|
51
55
|
private
|
52
56
|
|
57
|
+
def defaults
|
58
|
+
{ :normalize => true }
|
59
|
+
end
|
60
|
+
|
53
61
|
# Adds 'http' as default scheme, if there is none
|
54
62
|
def with_default_scheme(url)
|
55
63
|
parsed(url) && parsed(url).scheme.nil? ? 'http://' + url : url
|
data/meta_inspector.gemspec
CHANGED
@@ -3,8 +3,8 @@ require File.expand_path('../lib/meta_inspector/version', __FILE__)
|
|
3
3
|
Gem::Specification.new do |gem|
|
4
4
|
gem.authors = ["Jaime Iniesta"]
|
5
5
|
gem.email = ["jaimeiniesta@gmail.com"]
|
6
|
-
gem.description = %q{MetaInspector lets you scrape a web page and get its
|
7
|
-
gem.summary = %q{MetaInspector is a ruby gem for web scraping purposes, that returns
|
6
|
+
gem.description = %q{MetaInspector lets you scrape a web page and get its links, images, texts, meta tags...}
|
7
|
+
gem.summary = %q{MetaInspector is a ruby gem for web scraping purposes, that returns metadata from a given URL}
|
8
8
|
gem.homepage = "http://jaimeiniesta.github.io/metainspector/"
|
9
9
|
gem.license = "MIT"
|
10
10
|
|
@@ -23,7 +23,7 @@ Gem::Specification.new do |gem|
|
|
23
23
|
gem.add_development_dependency 'rspec', '2.14.1'
|
24
24
|
gem.add_development_dependency 'fakeweb', '1.3.0'
|
25
25
|
gem.add_development_dependency 'webmock'
|
26
|
-
gem.add_development_dependency 'awesome_print'
|
26
|
+
gem.add_development_dependency 'awesome_print'
|
27
27
|
gem.add_development_dependency 'rake', '~> 10.1.0'
|
28
28
|
gem.add_development_dependency 'pry'
|
29
29
|
gem.add_development_dependency 'guard'
|
data/spec/document_spec.rb
CHANGED
@@ -171,4 +171,14 @@ describe MetaInspector::Document do
|
|
171
171
|
MetaInspector::Document.new(url, headers: headers)
|
172
172
|
end
|
173
173
|
end
|
174
|
+
|
175
|
+
describe 'url normalization' do
|
176
|
+
it 'should normalize by default' do
|
177
|
+
MetaInspector.new('http://example.com/%EF%BD%9E').url.should == 'http://example.com/~'
|
178
|
+
end
|
179
|
+
|
180
|
+
it 'should not normalize if the normalize_url option is false' do
|
181
|
+
MetaInspector.new('http://example.com/%EF%BD%9E', normalize_url: false).url.should == 'http://example.com/%EF%BD%9E'
|
182
|
+
end
|
183
|
+
end
|
174
184
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -79,3 +79,8 @@ FakeWeb.register_uri(:get, "https://www.facebook.com/", :response => fixture
|
|
79
79
|
# https://unsafe-facebook.com => http://unsafe-facebook.com
|
80
80
|
FakeWeb.register_uri(:get, "https://unsafe-facebook.com/", :response => fixture_file("unsafe_https.facebook.com.response"))
|
81
81
|
FakeWeb.register_uri(:get, "http://unsafe-facebook.com/", :response => fixture_file("unsafe_facebook.com.response"))
|
82
|
+
|
83
|
+
# These examples are used to test URL normalization
|
84
|
+
FakeWeb.register_uri(:get, "http://example.com/%EF%BD%9E", :response => fixture_file("example.response"))
|
85
|
+
FakeWeb.register_uri(:get, "http://example.com/~", :response => fixture_file("example.response"))
|
86
|
+
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 4.0.0
|
4
|
+
version: 4.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jaime Iniesta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2015-01-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -126,16 +126,16 @@ dependencies:
|
|
126
126
|
name: awesome_print
|
127
127
|
requirement: !ruby/object:Gem::Requirement
|
128
128
|
requirements:
|
129
|
-
- - "
|
129
|
+
- - ">="
|
130
130
|
- !ruby/object:Gem::Version
|
131
|
-
version:
|
131
|
+
version: '0'
|
132
132
|
type: :development
|
133
133
|
prerelease: false
|
134
134
|
version_requirements: !ruby/object:Gem::Requirement
|
135
135
|
requirements:
|
136
|
-
- - "
|
136
|
+
- - ">="
|
137
137
|
- !ruby/object:Gem::Version
|
138
|
-
version:
|
138
|
+
version: '0'
|
139
139
|
- !ruby/object:Gem::Dependency
|
140
140
|
name: rake
|
141
141
|
requirement: !ruby/object:Gem::Requirement
|
@@ -206,8 +206,8 @@ dependencies:
|
|
206
206
|
- - ">="
|
207
207
|
- !ruby/object:Gem::Version
|
208
208
|
version: '0'
|
209
|
-
description: MetaInspector lets you scrape a web page and get its
|
210
|
-
|
209
|
+
description: MetaInspector lets you scrape a web page and get its links, images, texts,
|
210
|
+
meta tags...
|
211
211
|
email:
|
212
212
|
- jaimeiniesta@gmail.com
|
213
213
|
executables: []
|
@@ -309,6 +309,6 @@ rubyforge_project:
|
|
309
309
|
rubygems_version: 2.2.2
|
310
310
|
signing_key:
|
311
311
|
specification_version: 4
|
312
|
-
summary: MetaInspector is a ruby gem for web scraping purposes, that returns
|
313
|
-
|
312
|
+
summary: MetaInspector is a ruby gem for web scraping purposes, that returns metadata
|
313
|
+
from a given URL
|
314
314
|
test_files: []
|