metainspector 1.9.9 → 1.9.10

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,3 +1,5 @@
1
+ = MetaInspector {<img src="http://travis-ci.org/jaimeiniesta/metainspector.png" />}[http://travis-ci.org/jaimeiniesta/metainspector]
2
+
1
3
  MetaInspector is a gem for web scraping purposes. You give it a URL, and it lets you easily get its title, links, images, charset, description, keywords, meta tags...
2
4
 
3
5
  = See it in action!
@@ -48,6 +50,7 @@ Then you can see the scraped data like this:
48
50
  page.feed # Get rss or atom links in meta data fields as array
49
51
  page.meta_og_title # opengraph title
50
52
  page.meta_og_image # opengraph image
53
+ page.charset # charset as declared in the page's meta tags, e.g. utf-8
51
54
 
52
55
  MetaInspector uses dynamic methods for meta tag discovery, so all of these will work: each call is converted into a search for the meta tag with the corresponding name, and returns that tag's content attribute
53
56
 
@@ -2,7 +2,6 @@
2
2
 
3
3
  require 'open-uri'
4
4
  require 'nokogiri'
5
- require 'charguess'
6
5
  require 'hashie/rash'
7
6
  require 'timeout'
8
7
 
@@ -71,11 +70,11 @@ module MetaInspector
71
70
  meta_og_image
72
71
  end
73
72
 
74
- # Returns the charset
75
- # TODO: We should trust the charset expressed on the Content-Type meta tag
76
- # and only guess it if none given
73
+ # Returns the charset from the meta tags, looking for it in the following order:
74
+ # <meta charset='utf-8' />
75
+ # <meta http-equiv="Content-Type" content="text/html; charset=windows-1252" />
77
76
  def charset
78
- @data.charset ||= CharGuess.guess(document)
77
+ @data.charset ||= (charset_from_meta_charset || charset_from_content_type)
79
78
  end
80
79
 
81
80
  # Returns all parsed data as a nested Hash
@@ -184,5 +183,12 @@ module MetaInspector
184
183
  (p = parsed_document.search('//p').map(&:text).select{ |p| p.length > 120 }.first).nil? ? '' : p
185
184
  end
186
185
 
186
+ def charset_from_meta_charset
187
+ parsed_document.css("meta[charset]")[0].attributes['charset'].value rescue nil
188
+ end
189
+
190
+ def charset_from_content_type
191
+ parsed_document.css("meta[http-equiv='Content-Type']")[0].attributes['content'].value.split(";")[1].split("=")[1] rescue nil
192
+ end
187
193
  end
188
194
  end
@@ -1,5 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  module MetaInspector
4
- VERSION = "1.9.9"
4
+ VERSION = "1.9.10"
5
5
  end
@@ -15,7 +15,6 @@ Gem::Specification.new do |gem|
15
15
  gem.version = MetaInspector::VERSION
16
16
 
17
17
  gem.add_dependency 'nokogiri', '~> 1.5'
18
- gem.add_dependency 'charguess', '1.3.20111021164500'
19
18
  gem.add_dependency 'rash', '0.3.2'
20
19
 
21
20
  gem.add_development_dependency 'rspec', '2.11.0'
@@ -0,0 +1,22 @@
1
+ HTTP/1.1 200 OK
2
+ Date: Mon, 30 May 2011 09:58:20 GMT
3
+ Server: Microsoft-IIS/6.0
4
+ X-Powered-By: PleskWin
5
+ X-Powered-By: ASP.NET
6
+ Cache-Control: private
7
+ Content-Length: 25902
8
+ Content-Type: text/html
9
+ Expires: Sun, 29 May 2011 09:58:18 GMT
10
+ Set-Cookie: ASPSESSIONIDCSBSQADC=AHENHHKBGGDIFJLHHCCJBHMP; path=/
11
+ Cache-control: private
12
+
13
+
14
+
15
+ <html>
16
+ <head>
17
+ <title>A web</title>
18
+ </head>
19
+ <body>
20
+ <p>A sample web without a specified charset.</p>
21
+ </body>
22
+ </html>
@@ -0,0 +1,23 @@
1
+ HTTP/1.1 200 OK
2
+ Date: Mon, 30 May 2011 09:58:20 GMT
3
+ Server: Microsoft-IIS/6.0
4
+ X-Powered-By: PleskWin
5
+ X-Powered-By: ASP.NET
6
+ Cache-Control: private
7
+ Content-Length: 25902
8
+ Content-Type: text/html
9
+ Expires: Sun, 29 May 2011 09:58:18 GMT
10
+ Set-Cookie: ASPSESSIONIDCSBSQADC=AHENHHKBGGDIFJLHHCCJBHMP; path=/
11
+ Cache-control: private
12
+
13
+
14
+
15
+ <html>
16
+ <head>
17
+ <meta charset="utf-8" />
18
+ <title>A web</title>
19
+ </head>
20
+ <body>
21
+ <p>A sample web with a way of specifying the charset.</p>
22
+ </body>
23
+ </html>
@@ -0,0 +1,23 @@
1
+ HTTP/1.1 200 OK
2
+ Date: Mon, 30 May 2011 09:58:20 GMT
3
+ Server: Microsoft-IIS/6.0
4
+ X-Powered-By: PleskWin
5
+ X-Powered-By: ASP.NET
6
+ Cache-Control: private
7
+ Content-Length: 25902
8
+ Content-Type: text/html
9
+ Expires: Sun, 29 May 2011 09:58:18 GMT
10
+ Set-Cookie: ASPSESSIONIDCSBSQADC=AHENHHKBGGDIFJLHHCCJBHMP; path=/
11
+ Cache-control: private
12
+
13
+
14
+
15
+ <html>
16
+ <head>
17
+ <meta http-equiv="Content-Type" content="text/html; charset=windows-1252">
18
+ <title>A web</title>
19
+ </head>
20
+ <body>
21
+ <p>A sample web with another way of specifying the charset.</p>
22
+ </body>
23
+ </html>
@@ -19,6 +19,9 @@ describe MetaInspector do
19
19
  FakeWeb.register_uri(:get, "https://twitter.com/w3clove", :response => fixture_file("twitter_w3clove.response"))
20
20
  FakeWeb.register_uri(:get, "https://example.com/empty", :response => fixture_file("empty_page.response"))
21
21
  FakeWeb.register_uri(:get, "http://international.com", :response => fixture_file("international.response"))
22
+ FakeWeb.register_uri(:get, "http://charset000.com", :response => fixture_file("charset_000.response"))
23
+ FakeWeb.register_uri(:get, "http://charset001.com", :response => fixture_file("charset_001.response"))
24
+ FakeWeb.register_uri(:get, "http://charset002.com", :response => fixture_file("charset_002.response"))
22
25
 
23
26
  describe 'Initialization' do
24
27
  it 'should accept an URL with a scheme' do
@@ -275,21 +278,26 @@ describe MetaInspector do
275
278
  end
276
279
 
277
280
  describe 'Charset detection' do
278
- it "should detect windows-1252 charset" do
279
- @m = MetaInspector.new('http://www.alazan.com')
281
+ it "should get the charset from <meta charset />" do
282
+ @m = MetaInspector.new('http://charset001.com')
283
+ @m.charset.should == "utf-8"
284
+ end
285
+
286
+ it "should get the charset from meta content type" do
287
+ @m = MetaInspector.new('http://charset002.com')
280
288
  @m.charset.should == "windows-1252"
281
289
  end
282
290
 
283
- it "should detect utf-8 charset" do
284
- @m = MetaInspector.new('http://pagerankalert.com')
285
- @m.charset.should == "UTF-8"
291
+ it "should get nil if no declared charset is found" do
292
+ @m = MetaInspector.new('http://charset000.com')
293
+ @m.charset.should == nil
286
294
  end
287
295
  end
288
296
 
289
297
  describe 'to_hash' do
290
298
  it "should return a hash with all the values set" do
291
299
  @m = MetaInspector.new('http://pagerankalert.com')
292
- @m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "images"=>["http://pagerankalert.com/images/pagerank_alert.png?1305794559"], "charset"=>"UTF-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"]}
300
+ @m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "images"=>["http://pagerankalert.com/images/pagerank_alert.png?1305794559"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"]}
293
301
  end
294
302
  end
295
303
 
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- hash: 33
4
+ hash: 39
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
8
  - 9
9
- - 9
10
- version: 1.9.9
9
+ - 10
10
+ version: 1.9.10
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jaime Iniesta
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2012-08-13 00:00:00 Z
18
+ date: 2012-09-12 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  version_requirements: &id001 !ruby/object:Gem::Requirement
@@ -34,22 +34,6 @@ dependencies:
34
34
  requirement: *id001
35
35
  - !ruby/object:Gem::Dependency
36
36
  version_requirements: &id002 !ruby/object:Gem::Requirement
37
- none: false
38
- requirements:
39
- - - "="
40
- - !ruby/object:Gem::Version
41
- hash: 40222042329011
42
- segments:
43
- - 1
44
- - 3
45
- - 20111021164500
46
- version: 1.3.20111021164500
47
- prerelease: false
48
- type: :runtime
49
- name: charguess
50
- requirement: *id002
51
- - !ruby/object:Gem::Dependency
52
- version_requirements: &id003 !ruby/object:Gem::Requirement
53
37
  none: false
54
38
  requirements:
55
39
  - - "="
@@ -63,9 +47,9 @@ dependencies:
63
47
  prerelease: false
64
48
  type: :runtime
65
49
  name: rash
66
- requirement: *id003
50
+ requirement: *id002
67
51
  - !ruby/object:Gem::Dependency
68
- version_requirements: &id004 !ruby/object:Gem::Requirement
52
+ version_requirements: &id003 !ruby/object:Gem::Requirement
69
53
  none: false
70
54
  requirements:
71
55
  - - "="
@@ -79,9 +63,9 @@ dependencies:
79
63
  prerelease: false
80
64
  type: :development
81
65
  name: rspec
82
- requirement: *id004
66
+ requirement: *id003
83
67
  - !ruby/object:Gem::Dependency
84
- version_requirements: &id005 !ruby/object:Gem::Requirement
68
+ version_requirements: &id004 !ruby/object:Gem::Requirement
85
69
  none: false
86
70
  requirements:
87
71
  - - "="
@@ -95,9 +79,9 @@ dependencies:
95
79
  prerelease: false
96
80
  type: :development
97
81
  name: fakeweb
98
- requirement: *id005
82
+ requirement: *id004
99
83
  - !ruby/object:Gem::Dependency
100
- version_requirements: &id006 !ruby/object:Gem::Requirement
84
+ version_requirements: &id005 !ruby/object:Gem::Requirement
101
85
  none: false
102
86
  requirements:
103
87
  - - "="
@@ -111,9 +95,9 @@ dependencies:
111
95
  prerelease: false
112
96
  type: :development
113
97
  name: awesome_print
114
- requirement: *id006
98
+ requirement: *id005
115
99
  - !ruby/object:Gem::Dependency
116
- version_requirements: &id007 !ruby/object:Gem::Requirement
100
+ version_requirements: &id006 !ruby/object:Gem::Requirement
117
101
  none: false
118
102
  requirements:
119
103
  - - "="
@@ -128,7 +112,7 @@ dependencies:
128
112
  prerelease: false
129
113
  type: :development
130
114
  name: rake
131
- requirement: *id007
115
+ requirement: *id006
132
116
  description: MetaInspector lets you scrape a web page and get its title, charset, link and meta tags
133
117
  email:
134
118
  - jaimeiniesta@gmail.com
@@ -155,6 +139,9 @@ files:
155
139
  - samples/spider.rb
156
140
  - spec/fixtures/alazan.com.response
157
141
  - spec/fixtures/alazan_websolution.response
142
+ - spec/fixtures/charset_000.response
143
+ - spec/fixtures/charset_001.response
144
+ - spec/fixtures/charset_002.response
158
145
  - spec/fixtures/empty_page.response
159
146
  - spec/fixtures/guardian.co.uk.response
160
147
  - spec/fixtures/international.response