metainspector 1.13.1 → 1.14.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +11 -3
- data/lib/meta_inspector/scraper.rb +4 -1
- data/lib/meta_inspector/version.rb +1 -1
- data/meta_inspector.gemspec +1 -1
- data/spec/metainspector_spec.rb +15 -0
- metadata +9 -9
data/README.md
CHANGED
@@ -28,6 +28,10 @@ If you don't include the scheme on the URL, http:// will be used by default:
|
|
28
28
|
|
29
29
|
page = MetaInspector.new('markupvalidator.com')
|
30
30
|
|
31
|
+
You can also pass the HTML that should be used as the document to scrape:
|
32
|
+
|
33
|
+
page = MetaInspector.new("http://markupvalidator.com", :document => "<html><head><title>Hello From Passed Html</title><a href='/hello'>Hello link</a></head><body></body></html>")
|
34
|
+
|
31
35
|
## Accessing scraped data
|
32
36
|
|
33
37
|
Then you can see the scraped data like this:
|
@@ -70,9 +74,13 @@ You can also access most of the scraped data as a hash:
|
|
70
74
|
page.to_hash # { "url" => "http://markupvalidator.com",
|
71
75
|
"title" => "MarkupValidator :: site-wide markup validation tool", ... }
|
72
76
|
|
73
|
-
The
|
77
|
+
The original document is accessible from:
|
78
|
+
|
79
|
+
page.document # A String with the contents of the HTML document
|
80
|
+
|
81
|
+
And the full scraped document is accessible from:
|
74
82
|
|
75
|
-
page.
|
83
|
+
page.parsed_document # Nokogiri doc that you can use to get any element from the page
|
76
84
|
|
77
85
|
## Options
|
78
86
|
|
@@ -166,4 +174,4 @@ Thanks to all the contributors:
|
|
166
174
|
|
167
175
|
[https://github.com/jaimeiniesta/metainspector/graphs/contributors](https://github.com/jaimeiniesta/metainspector/graphs/contributors)
|
168
176
|
|
169
|
-
Copyright (c) 2009-2012 Jaime Iniesta, released under the MIT license
|
177
|
+
Copyright (c) 2009-2012 Jaime Iniesta, released under the MIT license
|
@@ -16,8 +16,10 @@ module MetaInspector
|
|
16
16
|
# Options:
|
17
17
|
# => timeout: defaults to 20 seconds
|
18
18
|
# => html_content_type_only: if an exception should be raised if request content-type is not text/html. Defaults to false
|
19
|
-
# => allow_safe_redirections:
|
19
|
+
# => allow_safe_redirections: if redirects from http to https sites on the same domain should be allowed or not
|
20
20
|
# => allow_unsafe_redirections: if redirects from https to http sites on the same domain should be allowed or not
|
21
|
+
# => document: the html of the url as a string
|
22
|
+
# => verbose: if the errors should be logged to the screen
|
21
23
|
def initialize(url, options = {})
|
22
24
|
options = defaults.merge(options)
|
23
25
|
|
@@ -32,6 +34,7 @@ module MetaInspector
|
|
32
34
|
@allow_safe_redirections = options[:allow_safe_redirections]
|
33
35
|
@allow_unsafe_redirections = options[:allow_unsafe_redirections]
|
34
36
|
@verbose = options[:verbose]
|
37
|
+
@document = options[:document]
|
35
38
|
end
|
36
39
|
|
37
40
|
# Returns the parsed document title, from the content of the <title> tag.
|
data/meta_inspector.gemspec
CHANGED
@@ -21,5 +21,5 @@ Gem::Specification.new do |gem|
|
|
21
21
|
gem.add_development_dependency 'rspec', '2.12.0'
|
22
22
|
gem.add_development_dependency 'fakeweb', '1.3.0'
|
23
23
|
gem.add_development_dependency 'awesome_print', '1.1.0'
|
24
|
-
gem.add_development_dependency 'rake', '10.0.
|
24
|
+
gem.add_development_dependency 'rake', '~> 10.0.3'
|
25
25
|
end
|
data/spec/metainspector_spec.rb
CHANGED
@@ -113,6 +113,21 @@ describe MetaInspector do
|
|
113
113
|
end
|
114
114
|
end
|
115
115
|
|
116
|
+
describe 'Doing a basic scrape from passed url html' do
|
117
|
+
|
118
|
+
before(:each) do
|
119
|
+
@m = MetaInspector.new("http://cnn.com", :document => "<html><head><title>Hello From Passed Html</title><a href='/hello'>Hello link</a></head><body></body></html>")
|
120
|
+
end
|
121
|
+
|
122
|
+
it "should get correct links when the url html is passed as an option" do
|
123
|
+
@m.links.should == ["http://cnn.com/hello"]
|
124
|
+
end
|
125
|
+
|
126
|
+
it "should get the title" do
|
127
|
+
@m.title.should == "Hello From Passed Html"
|
128
|
+
end
|
129
|
+
end
|
130
|
+
|
116
131
|
describe 'Page with missing meta description' do
|
117
132
|
it "should find secondary description" do
|
118
133
|
@m = MetaInspector.new('http://theonion-no-description.com')
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 47
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
|
-
-
|
9
|
-
-
|
10
|
-
version: 1.
|
8
|
+
- 14
|
9
|
+
- 0
|
10
|
+
version: 1.14.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jaime Iniesta
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2013-01-14 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
name: nokogiri
|
@@ -118,14 +118,14 @@ dependencies:
|
|
118
118
|
requirement: &id007 !ruby/object:Gem::Requirement
|
119
119
|
none: false
|
120
120
|
requirements:
|
121
|
-
- -
|
121
|
+
- - ~>
|
122
122
|
- !ruby/object:Gem::Version
|
123
|
-
hash:
|
123
|
+
hash: 73
|
124
124
|
segments:
|
125
125
|
- 10
|
126
126
|
- 0
|
127
|
-
-
|
128
|
-
version: 10.0.
|
127
|
+
- 3
|
128
|
+
version: 10.0.3
|
129
129
|
type: :development
|
130
130
|
version_requirements: *id007
|
131
131
|
description: MetaInspector lets you scrape a web page and get its title, charset, link and meta tags
|