metainspector 1.10.2 → 1.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc CHANGED
@@ -47,7 +47,7 @@ Then you can see the scraped data like this:
47
47
  page.title # title of the page, as string
48
48
  page.links # array of strings, with every link found on the page as an absolute URL
49
49
  page.internal_links # array of strings, with every internal link found on the page as an absolute URL
50
- page.extrenal_links # array of strings, with every external link found on the page as an absolute URL
50
+ page.external_links # array of strings, with every external link found on the page as an absolute URL
51
51
  page.meta_description # meta description, as string
52
52
  page.description # returns the meta description, or the first long paragraph if no meta description is found
53
53
  page.meta_keywords # meta keywords, as string
@@ -85,7 +85,7 @@ The full scraped document if accessible from:
85
85
 
86
86
  You can check if the page has been successfully parsed with:
87
87
 
88
- page.parsed? # Will return true if everything looks OK
88
+ page.ok? # Will return true if everything looks OK
89
89
 
90
90
  In case there have been any errors, you can check them with:
91
91
 
@@ -8,90 +8,93 @@ require 'timeout'
8
8
  # MetaInspector provides an easy way to scrape web pages and get its elements
9
9
  module MetaInspector
10
10
  class Scraper
11
- attr_reader :url, :scheme, :host, :root_url, :errors, :content_type
11
+ attr_reader :url, :scheme, :host, :root_url, :errors, :content_type, :timeout, :html_content_only
12
12
 
13
13
  # Initializes a new instance of MetaInspector, setting the URL to the one given
14
- # If no scheme given, set it to http:// by default
15
14
  # Options:
16
15
  # => timeout: defaults to 20 seconds
17
16
  # => html_content_type_only: if an exception should be raised if request content-type is not text/html. Defaults to false
18
17
  def initialize(url, options = {})
19
- url = encode_url(url)
20
- @url = URI.parse(url).scheme.nil? ? 'http://' + url : url
18
+ @url = with_default_scheme(encode_url(url))
21
19
  @scheme = URI.parse(@url).scheme
22
20
  @host = URI.parse(@url).host
23
21
  @root_url = "#{@scheme}://#{@host}/"
24
22
  @timeout = options[:timeout] || 20
25
- @data = Hashie::Rash.new('url' => @url)
23
+ @data = Hashie::Rash.new
26
24
  @errors = []
27
25
  @html_content_only = options[:html_content_only] || false
28
26
  end
29
27
 
30
28
  # Returns the parsed document title, from the content of the <title> tag.
31
- # This is not the same as the meta_tite tag
29
+ # This is not the same as the meta_title tag
32
30
  def title
33
- @data.title ||= parsed_document.css('title').inner_html.gsub(/\t|\n|\r/, '') rescue nil
31
+ @title ||= parsed_document.css('title').inner_html.gsub(/\t|\n|\r/, '') rescue nil
34
32
  end
35
33
 
36
34
  # A description getter that first checks for a meta description and if not present will
37
- # guess by looking grabbing the first paragraph > 120 characters
35
+ # guess by looking at the first paragraph with more than 120 characters
38
36
  def description
39
37
  meta_description.nil? ? secondary_description : meta_description
40
38
  end
41
39
 
42
40
  # Links found on the page, as absolute URLs
43
41
  def links
44
- @data.links ||= parsed_links.map{ |l| absolutify_url(unrelativize_url(l)) }.compact
42
+ @links ||= parsed_links.map{ |l| absolutify_url(unrelativize_url(l)) }.compact
45
43
  end
46
44
 
47
45
  # Internal links found on the page, as absolute URLs
48
46
  def internal_links
49
- @data.internal_links ||= links.select {|link| URI.parse(link).host == @host }
47
+ @internal_links ||= links.select {|link| URI.parse(link).host == host }
50
48
  end
51
49
 
52
50
  # External links found on the page, as absolute URLs
53
51
  def external_links
54
- @data.external_links ||= links.select {|link| URI.parse(link).host != @host }
52
+ @external_links ||= links.select {|link| URI.parse(link).host != host }
55
53
  end
56
54
 
57
55
  # Images found on the page, as absolute URLs
58
56
  def images
59
- @data.images ||= parsed_images.map{ |i| absolutify_url(i) }
57
+ @images ||= parsed_images.map{ |i| absolutify_url(i) }
58
+ end
59
+
60
+ # Returns the parsed image from Facebook's open graph property tags
61
+ # Almost all major websites now define this property and it is usually very relevant
62
+ # See doc at http://developers.facebook.com/docs/opengraph/
63
+ def image
64
+ meta_og_image
60
65
  end
61
66
 
62
67
  # Returns the parsed document meta rss links
63
68
  def feed
64
- @data.feed ||= parsed_document.xpath("//link").select{ |link|
69
+ @feed ||= parsed_document.xpath("//link").select{ |link|
65
70
  link.attributes["type"] && link.attributes["type"].value =~ /(atom|rss)/
66
71
  }.map { |link|
67
72
  absolutify_url(link.attributes["href"].value)
68
73
  }.first rescue nil
69
74
  end
70
75
 
71
- # Returns the parsed image from Facebook's open graph property tags
72
- # Most all major websites now define this property and is usually very relevant
73
- # See doc at http://developers.facebook.com/docs/opengraph/
74
- def image
75
- meta_og_image
76
- end
77
-
78
76
  # Returns the charset from the meta tags, looking for it in the following order:
79
77
  # <meta charset='utf-8' />
80
78
  # <meta http-equiv="Content-Type" content="text/html; charset=windows-1252" />
81
79
  def charset
82
- @data.charset ||= (charset_from_meta_charset || charset_from_content_type)
80
+ @charset ||= (charset_from_meta_charset || charset_from_content_type)
83
81
  end
84
82
 
85
83
  # Returns all parsed data as a nested Hash
86
84
  def to_hash
87
- # TODO: find a better option to populate the data to the Hash
88
- image;images;feed;links;charset;title;meta_keywords;internal_links;external_links
89
- @data.to_hash
90
- end
85
+ scrape_meta_data
91
86
 
92
- # Returns true if parsing has been successful
93
- def parsed?
94
- !@parsed_document.nil?
87
+ {
88
+ 'url' => url,
89
+ 'title' => title,
90
+ 'links' => links,
91
+ 'internal_links' => internal_links,
92
+ 'external_links' => external_links,
93
+ 'images' => images,
94
+ 'charset' => charset,
95
+ 'feed' => feed,
96
+ 'content_type' => content_type
97
+ }.merge @data.to_hash
95
98
  end
96
99
 
97
100
  # Returns the whole parsed document
@@ -103,24 +106,33 @@ module MetaInspector
103
106
 
104
107
  # Returns the original, unparsed document
105
108
  def document
106
- @document ||= Timeout::timeout(@timeout) {
107
- req = open(@url)
108
- @content_type = @data.content_type = req.content_type
109
+ @document ||= if html_content_only && content_type != "text/html"
110
+ raise "The url provided contains #{content_type} content instead of text/html content" and nil
111
+ else
112
+ request.read
113
+ end
114
+ rescue Exception => e
115
+ add_fatal_error "Scraping exception: #{e.message}"
116
+ end
109
117
 
110
- if @html_content_only && @content_type != "text/html"
111
- raise "The url provided contains #{@content_type} content instead of text/html content"
112
- end
118
+ # Returns the content_type of the fetched document
119
+ def content_type
120
+ @content_type ||= request.content_type
121
+ end
113
122
 
114
- req.read
115
- }
123
+ # Returns true if there are no errors
124
+ def ok?
125
+ errors.empty?
126
+ end
116
127
 
117
- rescue SocketError
118
- add_fatal_error 'Socket error: The url provided does not exist or is temporarily unavailable'
119
- rescue TimeoutError
120
- add_fatal_error 'Timeout!!!'
121
- rescue Exception => e
122
- add_fatal_error "Scraping exception: #{e.message}"
128
+ ##### DEPRECATIONS ####
129
+ def parsed?
130
+ warn "the parsed? method has been deprecated, please use ok? instead"
131
+ !@parsed_document.nil?
123
132
  end
133
+ ##### DEPRECATIONS ####
134
+
135
+ private
124
136
 
125
137
  # Scrapers for all meta_tags in the form of "meta_name" are automatically defined. This has been tested for
126
138
  # meta name: keywords, description, robots, generator
@@ -132,43 +144,57 @@ module MetaInspector
132
144
  def method_missing(method_name)
133
145
  if method_name.to_s =~ /^meta_(.*)/
134
146
  key = $1
135
- #special treatment for og:
136
- if key =~ /^og_(.*)/
137
- key = "og:#{$1}"
138
- end
139
- unless @data.meta
140
- @data.meta!.name!
141
- @data.meta!.property!
142
- parsed_document.xpath("//meta").each do |element|
143
- if element.attributes["content"]
144
- if element.attributes["name"]
145
- @data.meta.name[element.attributes["name"].value.downcase] = element.attributes["content"].value
146
- end
147
-
148
- if element.attributes["property"]
149
- @data.meta.property[element.attributes["property"].value.downcase] = element.attributes["content"].value
150
- end
151
- end
152
- end
153
- end
147
+ key = "og:#{$1}" if key =~ /^og_(.*)/ # special treatment for og:
148
+
149
+ scrape_meta_data
150
+
154
151
  @data.meta.name && (@data.meta.name[key.downcase]) || (@data.meta.property && @data.meta.property[key.downcase])
155
152
  else
156
153
  super
157
154
  end
158
155
  end
159
156
 
160
- private
157
+ # Makes the request to the server
158
+ def request
159
+ Timeout::timeout(timeout) { @request ||= open(url) }
160
+
161
+ rescue TimeoutError
162
+ add_fatal_error 'Timeout!!!'
163
+ rescue SocketError
164
+ add_fatal_error 'Socket error: The url provided does not exist or is temporarily unavailable'
165
+ rescue Exception => e
166
+ add_fatal_error "Scraping exception: #{e.message}"
167
+ end
168
+
169
+ # Scrapes all meta tags found
170
+ def scrape_meta_data
171
+ unless @data.meta
172
+ @data.meta!.name!
173
+ @data.meta!.property!
174
+ parsed_document.xpath("//meta").each do |element|
175
+ if element.attributes["content"]
176
+ if element.attributes["name"]
177
+ @data.meta.name[element.attributes["name"].value.downcase] = element.attributes["content"].value
178
+ end
179
+
180
+ if element.attributes["property"]
181
+ @data.meta.property[element.attributes["property"].value.downcase] = element.attributes["content"].value
182
+ end
183
+ end
184
+ end
185
+ end
186
+ end
161
187
 
162
188
  def parsed_links
163
189
  @parsed_links ||= parsed_document.search("//a") \
164
- .map {|link| link.attributes["href"] \
165
- .to_s.strip}.uniq rescue []
190
+ .map {|link| link.attributes["href"] \
191
+ .to_s.strip}.uniq rescue []
166
192
  end
167
193
 
168
194
  def parsed_images
169
195
  @parsed_images ||= parsed_document.search('//img') \
170
- .reject{|i| (i.attributes['src'].nil? || i.attributes['src'].value.empty?) } \
171
- .map{ |i| i.attributes['src'].value }.uniq
196
+ .reject{|i| (i.attributes['src'].nil? || i.attributes['src'].value.empty?) } \
197
+ .map{ |i| i.attributes['src'].value }.uniq
172
198
  end
173
199
 
174
200
  # Stores the error for later inspection
@@ -182,13 +208,18 @@ module MetaInspector
182
208
  URI.encode(url).to_s.gsub("%23", "#")
183
209
  end
184
210
 
211
+ # Adds 'http' as default scheme, if there is none
212
+ def with_default_scheme(url)
213
+ URI.parse(url).scheme.nil? ? 'http://' + url : url
214
+ end
215
+
185
216
  # Convert a relative url like "/users" to an absolute one like "http://example.com/users"
186
217
  # Respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
187
218
  def absolutify_url(url)
188
219
  if url =~ /^\w*\:/i
189
220
  encode_url(url)
190
221
  else
191
- URI.parse(@root_url).merge(encode_url(url)).to_s
222
+ URI.parse(root_url).merge(encode_url(url)).to_s
192
223
  end
193
224
  rescue URI::InvalidURIError => e
194
225
  add_fatal_error "Link parsing exception: #{e.message}" and nil
@@ -1,5 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  module MetaInspector
4
- VERSION = "1.10.2"
4
+ VERSION = "1.11.0"
5
5
  end
@@ -25,7 +25,7 @@ describe MetaInspector do
25
25
  FakeWeb.register_uri(:get, "http://charset001.com", :response => fixture_file("charset_001.response"))
26
26
  FakeWeb.register_uri(:get, "http://charset002.com", :response => fixture_file("charset_002.response"))
27
27
  FakeWeb.register_uri(:get, "http://www.inkthemes.com/", :response => fixture_file("wordpress_site.response"))
28
- FakeWeb.register_uri(:get, "http://pagerankalert.com/image.png", :body => "Image", :content_type => "image/jpeg")
28
+ FakeWeb.register_uri(:get, "http://pagerankalert.com/image.png", :body => "Image", :content_type => "image/png")
29
29
  FakeWeb.register_uri(:get, "http://pagerankalert.com/file.tar.gz", :body => "Image", :content_type => "application/x-gzip")
30
30
 
31
31
  describe 'Initialization' do
@@ -373,16 +373,14 @@ describe MetaInspector do
373
373
  image_url = MetaInspector.new('http://pagerankalert.com/image.png')
374
374
  desc = image_url.description
375
375
 
376
- image_url.errors == nil
377
- image_url.parsed? == true
376
+ image_url.should be_ok
378
377
  end
379
378
 
380
379
  it "should parse images when parse_html_content_type_only is false" do
381
380
  image_url = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => false)
382
381
  desc = image_url.description
383
382
 
384
- image_url.errors == nil
385
- image_url.parsed? == true
383
+ image_url.should be_ok
386
384
  end
387
385
 
388
386
  it "should handle errors when content is image/jpeg and html_content_type_only is true" do
@@ -392,7 +390,7 @@ describe MetaInspector do
392
390
  title = image_url.title
393
391
  }.to change { image_url.errors.size }
394
392
 
395
- image_url.errors.first.should == "Scraping exception: The url provided contains image/jpeg content instead of text/html content"
393
+ image_url.errors.first.should == "Scraping exception: The url provided contains image/png content instead of text/html content"
396
394
  end
397
395
 
398
396
  it "should handle errors when content is not text/html and html_content_type_only is true" do
@@ -405,55 +403,41 @@ describe MetaInspector do
405
403
  tar_url.errors.first.should == "Scraping exception: The url provided contains application/x-gzip content instead of text/html content"
406
404
  end
407
405
 
408
- describe "parsed?" do
409
- it "should return true if we have a parsed document" do
406
+ describe "ok?" do
407
+ it "should return true if we have no errors" do
410
408
  good = MetaInspector.new('http://pagerankalert.com')
411
- title = good.title
409
+ good.to_hash
412
410
 
413
- good.parsed?.should == true
411
+ good.should be_ok
414
412
  end
415
413
 
416
- it "should return false if we don't have a parsed document" do
417
- bad = MetaInspector.new('http://fdsfdferewrewewewdesdf.com', :timout => 0.00000000000001)
418
- title = bad.title
414
+ it "should return false if there are errors" do
415
+ bad = MetaInspector.new('http://fdsfdferewrewewewdesdf.com', :timeout => 0.00000000000001)
416
+ bad.title
419
417
 
420
- bad.parsed?.should == false
418
+ bad.should_not be_ok
421
419
  end
422
420
 
423
421
  it "should return false if we try to parse a page which content type is not html and html_content_type_only is set to true" do
424
422
  tar = MetaInspector.new('http://pagerankalert.com/file.tar.gz', :timeout => 20, :html_content_only => true)
425
423
  title = tar.title
426
424
 
427
- tar.parsed?.should == false
425
+ tar.should_not be_ok
428
426
  end
429
427
  end
430
428
  end
431
429
 
432
430
  describe "content_type" do
433
- it "should return the correct content type of the url if it is parsed correctly even for non html pages" do
431
+ it "should return the correct content type of the url for non html pages" do
434
432
  good = MetaInspector.new('http://pagerankalert.com/image.png')
435
- title = good.title
436
433
 
437
- good.parsed?.should == true
438
- good.content_type == "image/jpeg"
434
+ good.content_type.should == "image/png"
439
435
  end
440
436
 
441
- it "should return the correct content type of the url if it is parsed correctly even for html pages" do
437
+ it "should return the correct content type of the url for html pages" do
442
438
  good = MetaInspector.new('http://pagerankalert.com')
443
- title = good.title
444
439
 
445
- good.parsed?.should == true
446
- good.content_type == "text/html"
440
+ good.content_type.should == "text/html"
447
441
  end
448
-
449
- it "should return the correct content type of the url if it is not parsed correctly" do
450
- bad = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => true)
451
- title = bad.title
452
-
453
- bad.parsed?.should == false
454
- bad.content_type == "image/jpeg"
455
- end
456
-
457
442
  end
458
-
459
443
  end
metadata CHANGED
@@ -5,9 +5,9 @@ version: !ruby/object:Gem::Version
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
- - 10
9
- - 2
10
- version: 1.10.2
8
+ - 11
9
+ - 0
10
+ version: 1.11.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jaime Iniesta
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2012-11-17 00:00:00 Z
18
+ date: 2012-11-26 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  version_requirements: &id001 !ruby/object:Gem::Requirement