metainspector 1.13.1 → 1.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +11 -3
- data/lib/meta_inspector/scraper.rb +4 -1
- data/lib/meta_inspector/version.rb +1 -1
- data/meta_inspector.gemspec +1 -1
- data/spec/metainspector_spec.rb +15 -0
- metadata +9 -9
data/README.md
CHANGED
@@ -28,6 +28,10 @@ If you don't include the scheme on the URL, http:// will be used by default:
|
|
28
28
|
|
29
29
|
page = MetaInspector.new('markupvalidator.com')
|
30
30
|
|
31
|
+
You can also include the html which will be used as the document to scrape:
|
32
|
+
|
33
|
+
page = MetaInspector.new("http://markupvalidator.com", :document => "<html><head><title>Hello From Passed Html</title><a href='/hello'>Hello link</a></head><body></body></html>")
|
34
|
+
|
31
35
|
## Accessing scraped data
|
32
36
|
|
33
37
|
Then you can see the scraped data like this:
|
@@ -70,9 +74,13 @@ You can also access most of the scraped data as a hash:
|
|
70
74
|
page.to_hash # { "url" => "http://markupvalidator.com",
|
71
75
|
"title" => "MarkupValidator :: site-wide markup validation tool", ... }
|
72
76
|
|
73
|
-
The
|
77
|
+
The original document is accessible from:
|
78
|
+
|
79
|
+
page.document # A String with the contents of the HTML document
|
80
|
+
|
81
|
+
And the full scraped document is accessible from:
|
74
82
|
|
75
|
-
page.
|
83
|
+
page.parsed_document # Nokogiri doc that you can use it to get any element from the page
|
76
84
|
|
77
85
|
## Options
|
78
86
|
|
@@ -166,4 +174,4 @@ Thanks to all the contributors:
|
|
166
174
|
|
167
175
|
[https://github.com/jaimeiniesta/metainspector/graphs/contributors](https://github.com/jaimeiniesta/metainspector/graphs/contributors)
|
168
176
|
|
169
|
-
Copyright (c) 2009-2012 Jaime Iniesta, released under the MIT license
|
177
|
+
Copyright (c) 2009-2012 Jaime Iniesta, released under the MIT license
|
data/lib/meta_inspector/scraper.rb
CHANGED
@@ -16,8 +16,10 @@ module MetaInspector
|
|
16
16
|
# Options:
|
17
17
|
# => timeout: defaults to 20 seconds
|
18
18
|
# => html_content_type_only: if an exception should be raised if request content-type is not text/html. Defaults to false
|
19
|
-
# => allow_safe_redirections:
|
19
|
+
# => allow_safe_redirections: if redirects from http to https sites on the same domain should be allowed or not
|
20
20
|
# => allow_unsafe_redirections: if redirects from https to http sites on the same domain should be allowed or not
|
21
|
+
# => document: the html of the url as a string
|
22
|
+
# => verbose: if the errors should be logged to the screen
|
21
23
|
def initialize(url, options = {})
|
22
24
|
options = defaults.merge(options)
|
23
25
|
|
@@ -32,6 +34,7 @@ module MetaInspector
|
|
32
34
|
@allow_safe_redirections = options[:allow_safe_redirections]
|
33
35
|
@allow_unsafe_redirections = options[:allow_unsafe_redirections]
|
34
36
|
@verbose = options[:verbose]
|
37
|
+
@document = options[:document]
|
35
38
|
end
|
36
39
|
|
37
40
|
# Returns the parsed document title, from the content of the <title> tag.
|
data/meta_inspector.gemspec
CHANGED
@@ -21,5 +21,5 @@ Gem::Specification.new do |gem|
|
|
21
21
|
gem.add_development_dependency 'rspec', '2.12.0'
|
22
22
|
gem.add_development_dependency 'fakeweb', '1.3.0'
|
23
23
|
gem.add_development_dependency 'awesome_print', '1.1.0'
|
24
|
-
gem.add_development_dependency 'rake', '10.0.
|
24
|
+
gem.add_development_dependency 'rake', '~> 10.0.3'
|
25
25
|
end
|
data/spec/metainspector_spec.rb
CHANGED
@@ -113,6 +113,21 @@ describe MetaInspector do
|
|
113
113
|
end
|
114
114
|
end
|
115
115
|
|
116
|
+
describe 'Doing a basic scrape from passed url html' do
|
117
|
+
|
118
|
+
before(:each) do
|
119
|
+
@m = MetaInspector.new("http://cnn.com", :document => "<html><head><title>Hello From Passed Html</title><a href='/hello'>Hello link</a></head><body></body></html>")
|
120
|
+
end
|
121
|
+
|
122
|
+
it "should get correct links when the url html is passed as an option" do
|
123
|
+
@m.links.should == ["http://cnn.com/hello"]
|
124
|
+
end
|
125
|
+
|
126
|
+
it "should get the title" do
|
127
|
+
@m.title.should == "Hello From Passed Html"
|
128
|
+
end
|
129
|
+
end
|
130
|
+
|
116
131
|
describe 'Page with missing meta description' do
|
117
132
|
it "should find secondary description" do
|
118
133
|
@m = MetaInspector.new('http://theonion-no-description.com')
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 47
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
|
-
- 13
|
9
|
-
- 1
|
10
|
-
version: 1.13.1
|
8
|
+
- 14
|
9
|
+
- 0
|
10
|
+
version: 1.14.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jaime Iniesta
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2013-01-14 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
name: nokogiri
|
@@ -118,14 +118,14 @@ dependencies:
|
|
118
118
|
requirement: &id007 !ruby/object:Gem::Requirement
|
119
119
|
none: false
|
120
120
|
requirements:
|
121
|
-
- -
|
121
|
+
- - ~>
|
122
122
|
- !ruby/object:Gem::Version
|
123
|
-
hash:
|
123
|
+
hash: 73
|
124
124
|
segments:
|
125
125
|
- 10
|
126
126
|
- 0
|
127
|
-
-
|
128
|
-
version: 10.0.
|
127
|
+
- 3
|
128
|
+
version: 10.0.3
|
129
129
|
type: :development
|
130
130
|
version_requirements: *id007
|
131
131
|
description: MetaInspector lets you scrape a web page and get its title, charset, link and meta tags
|