metainspector 1.9.9 → 1.9.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +3 -0
- data/lib/meta_inspector/scraper.rb +11 -5
- data/lib/meta_inspector/version.rb +1 -1
- data/meta_inspector.gemspec +0 -1
- data/spec/fixtures/charset_000.response +22 -0
- data/spec/fixtures/charset_001.response +23 -0
- data/spec/fixtures/charset_002.response +23 -0
- data/spec/metainspector_spec.rb +14 -6
- metadata +16 -29
data/README.rdoc
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
= MetaInspector {<img src="http://travis-ci.org/jaimeiniesta/metainspector.png" />}[http://travis-ci.org/jaimeiniesta/metainspector]
|
2
|
+
|
1
3
|
MetaInspector is a gem for web scraping purposes. You give it a URL, and it lets you easily get its title, links, images, charset, description, keywords, meta tags...
|
2
4
|
|
3
5
|
= See it in action!
|
@@ -48,6 +50,7 @@ Then you can see the scraped data like this:
|
|
48
50
|
page.feed # Get rss or atom links in meta data fields as array
|
49
51
|
page.meta_og_title # opengraph title
|
50
52
|
page.meta_og_image # opengraph image
|
53
|
+
page.charset # UTF-8
|
51
54
|
|
52
55
|
MetaInspector uses dynamic methods for meta_tag discovery, so all of these will work; each one is converted into a search for the meta tag with the corresponding name and returns its content attribute
|
53
56
|
|
@@ -2,7 +2,6 @@
|
|
2
2
|
|
3
3
|
require 'open-uri'
|
4
4
|
require 'nokogiri'
|
5
|
-
require 'charguess'
|
6
5
|
require 'hashie/rash'
|
7
6
|
require 'timeout'
|
8
7
|
|
@@ -71,11 +70,11 @@ module MetaInspector
|
|
71
70
|
meta_og_image
|
72
71
|
end
|
73
72
|
|
74
|
-
# Returns the charset
|
75
|
-
#
|
76
|
-
#
|
73
|
+
# Returns the charset from the meta tags, looking for it in the following order:
|
74
|
+
# <meta charset='utf-8' />
|
75
|
+
# <meta http-equiv="Content-Type" content="text/html; charset=windows-1252" />
|
77
76
|
def charset
|
78
|
-
@data.charset ||=
|
77
|
+
@data.charset ||= (charset_from_meta_charset || charset_from_content_type)
|
79
78
|
end
|
80
79
|
|
81
80
|
# Returns all parsed data as a nested Hash
|
@@ -184,5 +183,12 @@ module MetaInspector
|
|
184
183
|
(p = parsed_document.search('//p').map(&:text).select{ |p| p.length > 120 }.first).nil? ? '' : p
|
185
184
|
end
|
186
185
|
|
186
|
+
def charset_from_meta_charset
|
187
|
+
parsed_document.css("meta[charset]")[0].attributes['charset'].value rescue nil
|
188
|
+
end
|
189
|
+
|
190
|
+
def charset_from_content_type
|
191
|
+
parsed_document.css("meta[http-equiv='Content-Type']")[0].attributes['content'].value.split(";")[1].split("=")[1] rescue nil
|
192
|
+
end
|
187
193
|
end
|
188
194
|
end
|
data/meta_inspector.gemspec
CHANGED
@@ -15,7 +15,6 @@ Gem::Specification.new do |gem|
|
|
15
15
|
gem.version = MetaInspector::VERSION
|
16
16
|
|
17
17
|
gem.add_dependency 'nokogiri', '~> 1.5'
|
18
|
-
gem.add_dependency 'charguess', '1.3.20111021164500'
|
19
18
|
gem.add_dependency 'rash', '0.3.2'
|
20
19
|
|
21
20
|
gem.add_development_dependency 'rspec', '2.11.0'
|
@@ -0,0 +1,22 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Date: Mon, 30 May 2011 09:58:20 GMT
|
3
|
+
Server: Microsoft-IIS/6.0
|
4
|
+
X-Powered-By: PleskWin
|
5
|
+
X-Powered-By: ASP.NET
|
6
|
+
Cache-Control: private
|
7
|
+
Content-Length: 25902
|
8
|
+
Content-Type: text/html
|
9
|
+
Expires: Sun, 29 May 2011 09:58:18 GMT
|
10
|
+
Set-Cookie: ASPSESSIONIDCSBSQADC=AHENHHKBGGDIFJLHHCCJBHMP; path=/
|
11
|
+
Cache-control: private
|
12
|
+
|
13
|
+
|
14
|
+
|
15
|
+
<html>
|
16
|
+
<head>
|
17
|
+
<title>A web</title>
|
18
|
+
</head>
|
19
|
+
<body>
|
20
|
+
<p>A sample web without a specified charset.</p>
|
21
|
+
</body>
|
22
|
+
</html>
|
@@ -0,0 +1,23 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Date: Mon, 30 May 2011 09:58:20 GMT
|
3
|
+
Server: Microsoft-IIS/6.0
|
4
|
+
X-Powered-By: PleskWin
|
5
|
+
X-Powered-By: ASP.NET
|
6
|
+
Cache-Control: private
|
7
|
+
Content-Length: 25902
|
8
|
+
Content-Type: text/html
|
9
|
+
Expires: Sun, 29 May 2011 09:58:18 GMT
|
10
|
+
Set-Cookie: ASPSESSIONIDCSBSQADC=AHENHHKBGGDIFJLHHCCJBHMP; path=/
|
11
|
+
Cache-control: private
|
12
|
+
|
13
|
+
|
14
|
+
|
15
|
+
<html>
|
16
|
+
<head>
|
17
|
+
<meta charset="utf-8" />
|
18
|
+
<title>A web</title>
|
19
|
+
</head>
|
20
|
+
<body>
|
21
|
+
<p>A sample web with a way of specifying the charset.</p>
|
22
|
+
</body>
|
23
|
+
</html>
|
@@ -0,0 +1,23 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Date: Mon, 30 May 2011 09:58:20 GMT
|
3
|
+
Server: Microsoft-IIS/6.0
|
4
|
+
X-Powered-By: PleskWin
|
5
|
+
X-Powered-By: ASP.NET
|
6
|
+
Cache-Control: private
|
7
|
+
Content-Length: 25902
|
8
|
+
Content-Type: text/html
|
9
|
+
Expires: Sun, 29 May 2011 09:58:18 GMT
|
10
|
+
Set-Cookie: ASPSESSIONIDCSBSQADC=AHENHHKBGGDIFJLHHCCJBHMP; path=/
|
11
|
+
Cache-control: private
|
12
|
+
|
13
|
+
|
14
|
+
|
15
|
+
<html>
|
16
|
+
<head>
|
17
|
+
<meta http-equiv="Content-Type" content="text/html; charset=windows-1252">
|
18
|
+
<title>A web</title>
|
19
|
+
</head>
|
20
|
+
<body>
|
21
|
+
<p>A sample web with another way of specifying the charset.</p>
|
22
|
+
</body>
|
23
|
+
</html>
|
data/spec/metainspector_spec.rb
CHANGED
@@ -19,6 +19,9 @@ describe MetaInspector do
|
|
19
19
|
FakeWeb.register_uri(:get, "https://twitter.com/w3clove", :response => fixture_file("twitter_w3clove.response"))
|
20
20
|
FakeWeb.register_uri(:get, "https://example.com/empty", :response => fixture_file("empty_page.response"))
|
21
21
|
FakeWeb.register_uri(:get, "http://international.com", :response => fixture_file("international.response"))
|
22
|
+
FakeWeb.register_uri(:get, "http://charset000.com", :response => fixture_file("charset_000.response"))
|
23
|
+
FakeWeb.register_uri(:get, "http://charset001.com", :response => fixture_file("charset_001.response"))
|
24
|
+
FakeWeb.register_uri(:get, "http://charset002.com", :response => fixture_file("charset_002.response"))
|
22
25
|
|
23
26
|
describe 'Initialization' do
|
24
27
|
it 'should accept an URL with a scheme' do
|
@@ -275,21 +278,26 @@ describe MetaInspector do
|
|
275
278
|
end
|
276
279
|
|
277
280
|
describe 'Charset detection' do
|
278
|
-
it "should
|
279
|
-
@m = MetaInspector.new('http://
|
281
|
+
it "should get the charset from <meta charset />" do
|
282
|
+
@m = MetaInspector.new('http://charset001.com')
|
283
|
+
@m.charset.should == "utf-8"
|
284
|
+
end
|
285
|
+
|
286
|
+
it "should get the charset from meta content type" do
|
287
|
+
@m = MetaInspector.new('http://charset002.com')
|
280
288
|
@m.charset.should == "windows-1252"
|
281
289
|
end
|
282
290
|
|
283
|
-
it "should
|
284
|
-
@m = MetaInspector.new('http://
|
285
|
-
@m.charset.should ==
|
291
|
+
it "should get nil if no declared charset is found" do
|
292
|
+
@m = MetaInspector.new('http://charset000.com')
|
293
|
+
@m.charset.should == nil
|
286
294
|
end
|
287
295
|
end
|
288
296
|
|
289
297
|
describe 'to_hash' do
|
290
298
|
it "should return a hash with all the values set" do
|
291
299
|
@m = MetaInspector.new('http://pagerankalert.com')
|
292
|
-
@m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "images"=>["http://pagerankalert.com/images/pagerank_alert.png?1305794559"], "charset"=>"
|
300
|
+
@m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "images"=>["http://pagerankalert.com/images/pagerank_alert.png?1305794559"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"]}
|
293
301
|
end
|
294
302
|
end
|
295
303
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 39
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
8
|
- 9
|
9
|
-
-
|
10
|
-
version: 1.9.
|
9
|
+
- 10
|
10
|
+
version: 1.9.10
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jaime Iniesta
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2012-
|
18
|
+
date: 2012-09-12 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
version_requirements: &id001 !ruby/object:Gem::Requirement
|
@@ -34,22 +34,6 @@ dependencies:
|
|
34
34
|
requirement: *id001
|
35
35
|
- !ruby/object:Gem::Dependency
|
36
36
|
version_requirements: &id002 !ruby/object:Gem::Requirement
|
37
|
-
none: false
|
38
|
-
requirements:
|
39
|
-
- - "="
|
40
|
-
- !ruby/object:Gem::Version
|
41
|
-
hash: 40222042329011
|
42
|
-
segments:
|
43
|
-
- 1
|
44
|
-
- 3
|
45
|
-
- 20111021164500
|
46
|
-
version: 1.3.20111021164500
|
47
|
-
prerelease: false
|
48
|
-
type: :runtime
|
49
|
-
name: charguess
|
50
|
-
requirement: *id002
|
51
|
-
- !ruby/object:Gem::Dependency
|
52
|
-
version_requirements: &id003 !ruby/object:Gem::Requirement
|
53
37
|
none: false
|
54
38
|
requirements:
|
55
39
|
- - "="
|
@@ -63,9 +47,9 @@ dependencies:
|
|
63
47
|
prerelease: false
|
64
48
|
type: :runtime
|
65
49
|
name: rash
|
66
|
-
requirement: *
|
50
|
+
requirement: *id002
|
67
51
|
- !ruby/object:Gem::Dependency
|
68
|
-
version_requirements: &
|
52
|
+
version_requirements: &id003 !ruby/object:Gem::Requirement
|
69
53
|
none: false
|
70
54
|
requirements:
|
71
55
|
- - "="
|
@@ -79,9 +63,9 @@ dependencies:
|
|
79
63
|
prerelease: false
|
80
64
|
type: :development
|
81
65
|
name: rspec
|
82
|
-
requirement: *
|
66
|
+
requirement: *id003
|
83
67
|
- !ruby/object:Gem::Dependency
|
84
|
-
version_requirements: &
|
68
|
+
version_requirements: &id004 !ruby/object:Gem::Requirement
|
85
69
|
none: false
|
86
70
|
requirements:
|
87
71
|
- - "="
|
@@ -95,9 +79,9 @@ dependencies:
|
|
95
79
|
prerelease: false
|
96
80
|
type: :development
|
97
81
|
name: fakeweb
|
98
|
-
requirement: *
|
82
|
+
requirement: *id004
|
99
83
|
- !ruby/object:Gem::Dependency
|
100
|
-
version_requirements: &
|
84
|
+
version_requirements: &id005 !ruby/object:Gem::Requirement
|
101
85
|
none: false
|
102
86
|
requirements:
|
103
87
|
- - "="
|
@@ -111,9 +95,9 @@ dependencies:
|
|
111
95
|
prerelease: false
|
112
96
|
type: :development
|
113
97
|
name: awesome_print
|
114
|
-
requirement: *
|
98
|
+
requirement: *id005
|
115
99
|
- !ruby/object:Gem::Dependency
|
116
|
-
version_requirements: &
|
100
|
+
version_requirements: &id006 !ruby/object:Gem::Requirement
|
117
101
|
none: false
|
118
102
|
requirements:
|
119
103
|
- - "="
|
@@ -128,7 +112,7 @@ dependencies:
|
|
128
112
|
prerelease: false
|
129
113
|
type: :development
|
130
114
|
name: rake
|
131
|
-
requirement: *
|
115
|
+
requirement: *id006
|
132
116
|
description: MetaInspector lets you scrape a web page and get its title, charset, link and meta tags
|
133
117
|
email:
|
134
118
|
- jaimeiniesta@gmail.com
|
@@ -155,6 +139,9 @@ files:
|
|
155
139
|
- samples/spider.rb
|
156
140
|
- spec/fixtures/alazan.com.response
|
157
141
|
- spec/fixtures/alazan_websolution.response
|
142
|
+
- spec/fixtures/charset_000.response
|
143
|
+
- spec/fixtures/charset_001.response
|
144
|
+
- spec/fixtures/charset_002.response
|
158
145
|
- spec/fixtures/empty_page.response
|
159
146
|
- spec/fixtures/guardian.co.uk.response
|
160
147
|
- spec/fixtures/international.response
|