metainspector 1.9.9 → 1.9.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,5 @@
1
+ = MetaInspector {<img src="http://travis-ci.org/jaimeiniesta/metainspector.png" />}[http://travis-ci.org/jaimeiniesta/metainspector]
2
+
1
3
  MetaInspector is a gem for web scraping purposes. You give it a URL, and it lets you easily get its title, links, images, charset, description, keywords, meta tags...
2
4
 
3
5
  = See it in action!
@@ -48,6 +50,7 @@ Then you can see the scraped data like this:
48
50
  page.feed # Get rss or atom links in meta data fields as array
49
51
  page.meta_og_title # opengraph title
50
52
  page.meta_og_image # opengraph image
53
+ page.charset # UTF-8
51
54
 
52
55
  MetaInspector uses dynamic methods for meta_tag discovery, so all of these will work: each one is converted to a search for a meta tag with the corresponding name, and returns its content attribute.
53
56
 
@@ -2,7 +2,6 @@
2
2
 
3
3
  require 'open-uri'
4
4
  require 'nokogiri'
5
- require 'charguess'
6
5
  require 'hashie/rash'
7
6
  require 'timeout'
8
7
 
@@ -71,11 +70,11 @@ module MetaInspector
71
70
  meta_og_image
72
71
  end
73
72
 
74
- # Returns the charset
75
- # TODO: We should trust the charset expressed on the Content-Type meta tag
76
- # and only guess it if none given
73
+ # Returns the charset from the meta tags, looking for it in the following order:
74
+ # <meta charset='utf-8' />
75
+ # <meta http-equiv="Content-Type" content="text/html; charset=windows-1252" />
77
76
  def charset
78
- @data.charset ||= CharGuess.guess(document)
77
+ @data.charset ||= (charset_from_meta_charset || charset_from_content_type)
79
78
  end
80
79
 
81
80
  # Returns all parsed data as a nested Hash
@@ -184,5 +183,12 @@ module MetaInspector
184
183
  (p = parsed_document.search('//p').map(&:text).select{ |p| p.length > 120 }.first).nil? ? '' : p
185
184
  end
186
185
 
186
+ def charset_from_meta_charset
187
+ parsed_document.css("meta[charset]")[0].attributes['charset'].value rescue nil
188
+ end
189
+
190
+ def charset_from_content_type
191
+ parsed_document.css("meta[http-equiv='Content-Type']")[0].attributes['content'].value.split(";")[1].split("=")[1] rescue nil
192
+ end
187
193
  end
188
194
  end
@@ -1,5 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  module MetaInspector
4
- VERSION = "1.9.9"
4
+ VERSION = "1.9.10"
5
5
  end
@@ -15,7 +15,6 @@ Gem::Specification.new do |gem|
15
15
  gem.version = MetaInspector::VERSION
16
16
 
17
17
  gem.add_dependency 'nokogiri', '~> 1.5'
18
- gem.add_dependency 'charguess', '1.3.20111021164500'
19
18
  gem.add_dependency 'rash', '0.3.2'
20
19
 
21
20
  gem.add_development_dependency 'rspec', '2.11.0'
@@ -0,0 +1,22 @@
1
+ HTTP/1.1 200 OK
2
+ Date: Mon, 30 May 2011 09:58:20 GMT
3
+ Server: Microsoft-IIS/6.0
4
+ X-Powered-By: PleskWin
5
+ X-Powered-By: ASP.NET
6
+ Cache-Control: private
7
+ Content-Length: 25902
8
+ Content-Type: text/html
9
+ Expires: Sun, 29 May 2011 09:58:18 GMT
10
+ Set-Cookie: ASPSESSIONIDCSBSQADC=AHENHHKBGGDIFJLHHCCJBHMP; path=/
11
+ Cache-control: private
12
+
13
+
14
+
15
+ <html>
16
+ <head>
17
+ <title>A web</title>
18
+ </head>
19
+ <body>
20
+ <p>A sample web without a specified charset.</p>
21
+ </body>
22
+ </html>
@@ -0,0 +1,23 @@
1
+ HTTP/1.1 200 OK
2
+ Date: Mon, 30 May 2011 09:58:20 GMT
3
+ Server: Microsoft-IIS/6.0
4
+ X-Powered-By: PleskWin
5
+ X-Powered-By: ASP.NET
6
+ Cache-Control: private
7
+ Content-Length: 25902
8
+ Content-Type: text/html
9
+ Expires: Sun, 29 May 2011 09:58:18 GMT
10
+ Set-Cookie: ASPSESSIONIDCSBSQADC=AHENHHKBGGDIFJLHHCCJBHMP; path=/
11
+ Cache-control: private
12
+
13
+
14
+
15
+ <html>
16
+ <head>
17
+ <meta charset="utf-8" />
18
+ <title>A web</title>
19
+ </head>
20
+ <body>
21
+ <p>A sample web with a way of specifying the charset.</p>
22
+ </body>
23
+ </html>
@@ -0,0 +1,23 @@
1
+ HTTP/1.1 200 OK
2
+ Date: Mon, 30 May 2011 09:58:20 GMT
3
+ Server: Microsoft-IIS/6.0
4
+ X-Powered-By: PleskWin
5
+ X-Powered-By: ASP.NET
6
+ Cache-Control: private
7
+ Content-Length: 25902
8
+ Content-Type: text/html
9
+ Expires: Sun, 29 May 2011 09:58:18 GMT
10
+ Set-Cookie: ASPSESSIONIDCSBSQADC=AHENHHKBGGDIFJLHHCCJBHMP; path=/
11
+ Cache-control: private
12
+
13
+
14
+
15
+ <html>
16
+ <head>
17
+ <meta http-equiv="Content-Type" content="text/html; charset=windows-1252">
18
+ <title>A web</title>
19
+ </head>
20
+ <body>
21
+ <p>A sample web with another way of specifying the charset.</p>
22
+ </body>
23
+ </html>
@@ -19,6 +19,9 @@ describe MetaInspector do
19
19
  FakeWeb.register_uri(:get, "https://twitter.com/w3clove", :response => fixture_file("twitter_w3clove.response"))
20
20
  FakeWeb.register_uri(:get, "https://example.com/empty", :response => fixture_file("empty_page.response"))
21
21
  FakeWeb.register_uri(:get, "http://international.com", :response => fixture_file("international.response"))
22
+ FakeWeb.register_uri(:get, "http://charset000.com", :response => fixture_file("charset_000.response"))
23
+ FakeWeb.register_uri(:get, "http://charset001.com", :response => fixture_file("charset_001.response"))
24
+ FakeWeb.register_uri(:get, "http://charset002.com", :response => fixture_file("charset_002.response"))
22
25
 
23
26
  describe 'Initialization' do
24
27
  it 'should accept an URL with a scheme' do
@@ -275,21 +278,26 @@ describe MetaInspector do
275
278
  end
276
279
 
277
280
  describe 'Charset detection' do
278
- it "should detect windows-1252 charset" do
279
- @m = MetaInspector.new('http://www.alazan.com')
281
+ it "should get the charset from <meta charset />" do
282
+ @m = MetaInspector.new('http://charset001.com')
283
+ @m.charset.should == "utf-8"
284
+ end
285
+
286
+ it "should get the charset from meta content type" do
287
+ @m = MetaInspector.new('http://charset002.com')
280
288
  @m.charset.should == "windows-1252"
281
289
  end
282
290
 
283
- it "should detect utf-8 charset" do
284
- @m = MetaInspector.new('http://pagerankalert.com')
285
- @m.charset.should == "UTF-8"
291
+ it "should get nil if no declared charset is found" do
292
+ @m = MetaInspector.new('http://charset000.com')
293
+ @m.charset.should == nil
286
294
  end
287
295
  end
288
296
 
289
297
  describe 'to_hash' do
290
298
  it "should return a hash with all the values set" do
291
299
  @m = MetaInspector.new('http://pagerankalert.com')
292
- @m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "images"=>["http://pagerankalert.com/images/pagerank_alert.png?1305794559"], "charset"=>"UTF-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"]}
300
+ @m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "images"=>["http://pagerankalert.com/images/pagerank_alert.png?1305794559"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"]}
293
301
  end
294
302
  end
295
303
 
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- hash: 33
4
+ hash: 39
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
8
  - 9
9
- - 9
10
- version: 1.9.9
9
+ - 10
10
+ version: 1.9.10
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jaime Iniesta
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2012-08-13 00:00:00 Z
18
+ date: 2012-09-12 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  version_requirements: &id001 !ruby/object:Gem::Requirement
@@ -34,22 +34,6 @@ dependencies:
34
34
  requirement: *id001
35
35
  - !ruby/object:Gem::Dependency
36
36
  version_requirements: &id002 !ruby/object:Gem::Requirement
37
- none: false
38
- requirements:
39
- - - "="
40
- - !ruby/object:Gem::Version
41
- hash: 40222042329011
42
- segments:
43
- - 1
44
- - 3
45
- - 20111021164500
46
- version: 1.3.20111021164500
47
- prerelease: false
48
- type: :runtime
49
- name: charguess
50
- requirement: *id002
51
- - !ruby/object:Gem::Dependency
52
- version_requirements: &id003 !ruby/object:Gem::Requirement
53
37
  none: false
54
38
  requirements:
55
39
  - - "="
@@ -63,9 +47,9 @@ dependencies:
63
47
  prerelease: false
64
48
  type: :runtime
65
49
  name: rash
66
- requirement: *id003
50
+ requirement: *id002
67
51
  - !ruby/object:Gem::Dependency
68
- version_requirements: &id004 !ruby/object:Gem::Requirement
52
+ version_requirements: &id003 !ruby/object:Gem::Requirement
69
53
  none: false
70
54
  requirements:
71
55
  - - "="
@@ -79,9 +63,9 @@ dependencies:
79
63
  prerelease: false
80
64
  type: :development
81
65
  name: rspec
82
- requirement: *id004
66
+ requirement: *id003
83
67
  - !ruby/object:Gem::Dependency
84
- version_requirements: &id005 !ruby/object:Gem::Requirement
68
+ version_requirements: &id004 !ruby/object:Gem::Requirement
85
69
  none: false
86
70
  requirements:
87
71
  - - "="
@@ -95,9 +79,9 @@ dependencies:
95
79
  prerelease: false
96
80
  type: :development
97
81
  name: fakeweb
98
- requirement: *id005
82
+ requirement: *id004
99
83
  - !ruby/object:Gem::Dependency
100
- version_requirements: &id006 !ruby/object:Gem::Requirement
84
+ version_requirements: &id005 !ruby/object:Gem::Requirement
101
85
  none: false
102
86
  requirements:
103
87
  - - "="
@@ -111,9 +95,9 @@ dependencies:
111
95
  prerelease: false
112
96
  type: :development
113
97
  name: awesome_print
114
- requirement: *id006
98
+ requirement: *id005
115
99
  - !ruby/object:Gem::Dependency
116
- version_requirements: &id007 !ruby/object:Gem::Requirement
100
+ version_requirements: &id006 !ruby/object:Gem::Requirement
117
101
  none: false
118
102
  requirements:
119
103
  - - "="
@@ -128,7 +112,7 @@ dependencies:
128
112
  prerelease: false
129
113
  type: :development
130
114
  name: rake
131
- requirement: *id007
115
+ requirement: *id006
132
116
  description: MetaInspector lets you scrape a web page and get its title, charset, link and meta tags
133
117
  email:
134
118
  - jaimeiniesta@gmail.com
@@ -155,6 +139,9 @@ files:
155
139
  - samples/spider.rb
156
140
  - spec/fixtures/alazan.com.response
157
141
  - spec/fixtures/alazan_websolution.response
142
+ - spec/fixtures/charset_000.response
143
+ - spec/fixtures/charset_001.response
144
+ - spec/fixtures/charset_002.response
158
145
  - spec/fixtures/empty_page.response
159
146
  - spec/fixtures/guardian.co.uk.response
160
147
  - spec/fixtures/international.response