metainspector 1.9.9 → 1.9.10
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +3 -0
- data/lib/meta_inspector/scraper.rb +11 -5
- data/lib/meta_inspector/version.rb +1 -1
- data/meta_inspector.gemspec +0 -1
- data/spec/fixtures/charset_000.response +22 -0
- data/spec/fixtures/charset_001.response +23 -0
- data/spec/fixtures/charset_002.response +23 -0
- data/spec/metainspector_spec.rb +14 -6
- metadata +16 -29
data/README.rdoc
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
= MetaInspector {<img src="http://travis-ci.org/jaimeiniesta/metainspector.png" />}[http://travis-ci.org/jaimeiniesta/metainspector]
|
2
|
+
|
1
3
|
MetaInspector is a gem for web scraping purposes. You give it an URL, and it lets you easily get its title, links, images, charset, description, keywords, meta tags...
|
2
4
|
|
3
5
|
= See it in action!
|
@@ -48,6 +50,7 @@ Then you can see the scraped data like this:
|
|
48
50
|
page.feed # Get rss or atom links in meta data fields as array
|
49
51
|
page.meta_og_title # opengraph title
|
50
52
|
page.meta_og_image # opengraph image
|
53
|
+
page.charset # UTF-8
|
51
54
|
|
52
55
|
MetaInspector uses dynamic methods for meta_tag discovery, so all these will work, and will be converted to a search of a meta tag by the corresponding name, and return its content attribute
|
53
56
|
|
@@ -2,7 +2,6 @@
|
|
2
2
|
|
3
3
|
require 'open-uri'
|
4
4
|
require 'nokogiri'
|
5
|
-
require 'charguess'
|
6
5
|
require 'hashie/rash'
|
7
6
|
require 'timeout'
|
8
7
|
|
@@ -71,11 +70,11 @@ module MetaInspector
|
|
71
70
|
meta_og_image
|
72
71
|
end
|
73
72
|
|
74
|
-
# Returns the charset
|
75
|
-
#
|
76
|
-
#
|
73
|
+
# Returns the charset from the meta tags, looking for it in the following order:
|
74
|
+
# <meta charset='utf-8' />
|
75
|
+
# <meta http-equiv="Content-Type" content="text/html; charset=windows-1252" />
|
77
76
|
def charset
|
78
|
-
@data.charset ||=
|
77
|
+
@data.charset ||= (charset_from_meta_charset || charset_from_content_type)
|
79
78
|
end
|
80
79
|
|
81
80
|
# Returns all parsed data as a nested Hash
|
@@ -184,5 +183,12 @@ module MetaInspector
|
|
184
183
|
(p = parsed_document.search('//p').map(&:text).select{ |p| p.length > 120 }.first).nil? ? '' : p
|
185
184
|
end
|
186
185
|
|
186
|
+
def charset_from_meta_charset
|
187
|
+
parsed_document.css("meta[charset]")[0].attributes['charset'].value rescue nil
|
188
|
+
end
|
189
|
+
|
190
|
+
def charset_from_content_type
|
191
|
+
parsed_document.css("meta[http-equiv='Content-Type']")[0].attributes['content'].value.split(";")[1].split("=")[1] rescue nil
|
192
|
+
end
|
187
193
|
end
|
188
194
|
end
|
data/meta_inspector.gemspec
CHANGED
@@ -15,7 +15,6 @@ Gem::Specification.new do |gem|
|
|
15
15
|
gem.version = MetaInspector::VERSION
|
16
16
|
|
17
17
|
gem.add_dependency 'nokogiri', '~> 1.5'
|
18
|
-
gem.add_dependency 'charguess', '1.3.20111021164500'
|
19
18
|
gem.add_dependency 'rash', '0.3.2'
|
20
19
|
|
21
20
|
gem.add_development_dependency 'rspec', '2.11.0'
|
@@ -0,0 +1,22 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Date: Mon, 30 May 2011 09:58:20 GMT
|
3
|
+
Server: Microsoft-IIS/6.0
|
4
|
+
X-Powered-By: PleskWin
|
5
|
+
X-Powered-By: ASP.NET
|
6
|
+
Cache-Control: private
|
7
|
+
Content-Length: 25902
|
8
|
+
Content-Type: text/html
|
9
|
+
Expires: Sun, 29 May 2011 09:58:18 GMT
|
10
|
+
Set-Cookie: ASPSESSIONIDCSBSQADC=AHENHHKBGGDIFJLHHCCJBHMP; path=/
|
11
|
+
Cache-control: private
|
12
|
+
|
13
|
+
|
14
|
+
|
15
|
+
<html>
|
16
|
+
<head>
|
17
|
+
<title>A web</title>
|
18
|
+
</head>
|
19
|
+
<body>
|
20
|
+
<p>A sample web without a specified charset.</p>
|
21
|
+
</body>
|
22
|
+
</html>
|
@@ -0,0 +1,23 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Date: Mon, 30 May 2011 09:58:20 GMT
|
3
|
+
Server: Microsoft-IIS/6.0
|
4
|
+
X-Powered-By: PleskWin
|
5
|
+
X-Powered-By: ASP.NET
|
6
|
+
Cache-Control: private
|
7
|
+
Content-Length: 25902
|
8
|
+
Content-Type: text/html
|
9
|
+
Expires: Sun, 29 May 2011 09:58:18 GMT
|
10
|
+
Set-Cookie: ASPSESSIONIDCSBSQADC=AHENHHKBGGDIFJLHHCCJBHMP; path=/
|
11
|
+
Cache-control: private
|
12
|
+
|
13
|
+
|
14
|
+
|
15
|
+
<html>
|
16
|
+
<head>
|
17
|
+
<meta charset="utf-8" />
|
18
|
+
<title>A web</title>
|
19
|
+
</head>
|
20
|
+
<body>
|
21
|
+
<p>A sample web with a way of specifying the charset.</p>
|
22
|
+
</body>
|
23
|
+
</html>
|
@@ -0,0 +1,23 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Date: Mon, 30 May 2011 09:58:20 GMT
|
3
|
+
Server: Microsoft-IIS/6.0
|
4
|
+
X-Powered-By: PleskWin
|
5
|
+
X-Powered-By: ASP.NET
|
6
|
+
Cache-Control: private
|
7
|
+
Content-Length: 25902
|
8
|
+
Content-Type: text/html
|
9
|
+
Expires: Sun, 29 May 2011 09:58:18 GMT
|
10
|
+
Set-Cookie: ASPSESSIONIDCSBSQADC=AHENHHKBGGDIFJLHHCCJBHMP; path=/
|
11
|
+
Cache-control: private
|
12
|
+
|
13
|
+
|
14
|
+
|
15
|
+
<html>
|
16
|
+
<head>
|
17
|
+
<meta http-equiv="Content-Type" content="text/html; charset=windows-1252">
|
18
|
+
<title>A web</title>
|
19
|
+
</head>
|
20
|
+
<body>
|
21
|
+
<p>A sample web with another way of specifying the charset.</p>
|
22
|
+
</body>
|
23
|
+
</html>
|
data/spec/metainspector_spec.rb
CHANGED
@@ -19,6 +19,9 @@ describe MetaInspector do
|
|
19
19
|
FakeWeb.register_uri(:get, "https://twitter.com/w3clove", :response => fixture_file("twitter_w3clove.response"))
|
20
20
|
FakeWeb.register_uri(:get, "https://example.com/empty", :response => fixture_file("empty_page.response"))
|
21
21
|
FakeWeb.register_uri(:get, "http://international.com", :response => fixture_file("international.response"))
|
22
|
+
FakeWeb.register_uri(:get, "http://charset000.com", :response => fixture_file("charset_000.response"))
|
23
|
+
FakeWeb.register_uri(:get, "http://charset001.com", :response => fixture_file("charset_001.response"))
|
24
|
+
FakeWeb.register_uri(:get, "http://charset002.com", :response => fixture_file("charset_002.response"))
|
22
25
|
|
23
26
|
describe 'Initialization' do
|
24
27
|
it 'should accept an URL with a scheme' do
|
@@ -275,21 +278,26 @@ describe MetaInspector do
|
|
275
278
|
end
|
276
279
|
|
277
280
|
describe 'Charset detection' do
|
278
|
-
it "should
|
279
|
-
@m = MetaInspector.new('http://
|
281
|
+
it "should get the charset from <meta charset />" do
|
282
|
+
@m = MetaInspector.new('http://charset001.com')
|
283
|
+
@m.charset.should == "utf-8"
|
284
|
+
end
|
285
|
+
|
286
|
+
it "should get the charset from meta content type" do
|
287
|
+
@m = MetaInspector.new('http://charset002.com')
|
280
288
|
@m.charset.should == "windows-1252"
|
281
289
|
end
|
282
290
|
|
283
|
-
it "should
|
284
|
-
@m = MetaInspector.new('http://
|
285
|
-
@m.charset.should ==
|
291
|
+
it "should get nil if no declared charset is found" do
|
292
|
+
@m = MetaInspector.new('http://charset000.com')
|
293
|
+
@m.charset.should == nil
|
286
294
|
end
|
287
295
|
end
|
288
296
|
|
289
297
|
describe 'to_hash' do
|
290
298
|
it "should return a hash with all the values set" do
|
291
299
|
@m = MetaInspector.new('http://pagerankalert.com')
|
292
|
-
@m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "images"=>["http://pagerankalert.com/images/pagerank_alert.png?1305794559"], "charset"=>"
|
300
|
+
@m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "images"=>["http://pagerankalert.com/images/pagerank_alert.png?1305794559"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"]}
|
293
301
|
end
|
294
302
|
end
|
295
303
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 39
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
8
|
- 9
|
9
|
-
-
|
10
|
-
version: 1.9.
|
9
|
+
- 10
|
10
|
+
version: 1.9.10
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jaime Iniesta
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2012-
|
18
|
+
date: 2012-09-12 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
version_requirements: &id001 !ruby/object:Gem::Requirement
|
@@ -34,22 +34,6 @@ dependencies:
|
|
34
34
|
requirement: *id001
|
35
35
|
- !ruby/object:Gem::Dependency
|
36
36
|
version_requirements: &id002 !ruby/object:Gem::Requirement
|
37
|
-
none: false
|
38
|
-
requirements:
|
39
|
-
- - "="
|
40
|
-
- !ruby/object:Gem::Version
|
41
|
-
hash: 40222042329011
|
42
|
-
segments:
|
43
|
-
- 1
|
44
|
-
- 3
|
45
|
-
- 20111021164500
|
46
|
-
version: 1.3.20111021164500
|
47
|
-
prerelease: false
|
48
|
-
type: :runtime
|
49
|
-
name: charguess
|
50
|
-
requirement: *id002
|
51
|
-
- !ruby/object:Gem::Dependency
|
52
|
-
version_requirements: &id003 !ruby/object:Gem::Requirement
|
53
37
|
none: false
|
54
38
|
requirements:
|
55
39
|
- - "="
|
@@ -63,9 +47,9 @@ dependencies:
|
|
63
47
|
prerelease: false
|
64
48
|
type: :runtime
|
65
49
|
name: rash
|
66
|
-
requirement: *
|
50
|
+
requirement: *id002
|
67
51
|
- !ruby/object:Gem::Dependency
|
68
|
-
version_requirements: &
|
52
|
+
version_requirements: &id003 !ruby/object:Gem::Requirement
|
69
53
|
none: false
|
70
54
|
requirements:
|
71
55
|
- - "="
|
@@ -79,9 +63,9 @@ dependencies:
|
|
79
63
|
prerelease: false
|
80
64
|
type: :development
|
81
65
|
name: rspec
|
82
|
-
requirement: *
|
66
|
+
requirement: *id003
|
83
67
|
- !ruby/object:Gem::Dependency
|
84
|
-
version_requirements: &
|
68
|
+
version_requirements: &id004 !ruby/object:Gem::Requirement
|
85
69
|
none: false
|
86
70
|
requirements:
|
87
71
|
- - "="
|
@@ -95,9 +79,9 @@ dependencies:
|
|
95
79
|
prerelease: false
|
96
80
|
type: :development
|
97
81
|
name: fakeweb
|
98
|
-
requirement: *
|
82
|
+
requirement: *id004
|
99
83
|
- !ruby/object:Gem::Dependency
|
100
|
-
version_requirements: &
|
84
|
+
version_requirements: &id005 !ruby/object:Gem::Requirement
|
101
85
|
none: false
|
102
86
|
requirements:
|
103
87
|
- - "="
|
@@ -111,9 +95,9 @@ dependencies:
|
|
111
95
|
prerelease: false
|
112
96
|
type: :development
|
113
97
|
name: awesome_print
|
114
|
-
requirement: *
|
98
|
+
requirement: *id005
|
115
99
|
- !ruby/object:Gem::Dependency
|
116
|
-
version_requirements: &
|
100
|
+
version_requirements: &id006 !ruby/object:Gem::Requirement
|
117
101
|
none: false
|
118
102
|
requirements:
|
119
103
|
- - "="
|
@@ -128,7 +112,7 @@ dependencies:
|
|
128
112
|
prerelease: false
|
129
113
|
type: :development
|
130
114
|
name: rake
|
131
|
-
requirement: *
|
115
|
+
requirement: *id006
|
132
116
|
description: MetaInspector lets you scrape a web page and get its title, charset, link and meta tags
|
133
117
|
email:
|
134
118
|
- jaimeiniesta@gmail.com
|
@@ -155,6 +139,9 @@ files:
|
|
155
139
|
- samples/spider.rb
|
156
140
|
- spec/fixtures/alazan.com.response
|
157
141
|
- spec/fixtures/alazan_websolution.response
|
142
|
+
- spec/fixtures/charset_000.response
|
143
|
+
- spec/fixtures/charset_001.response
|
144
|
+
- spec/fixtures/charset_002.response
|
158
145
|
- spec/fixtures/empty_page.response
|
159
146
|
- spec/fixtures/guardian.co.uk.response
|
160
147
|
- spec/fixtures/international.response
|