metainspector 1.10.2 → 1.11.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -47,7 +47,7 @@ Then you can see the scraped data like this:
47
47
  page.title # title of the page, as string
48
48
  page.links # array of strings, with every link found on the page as an absolute URL
49
49
  page.internal_links # array of strings, with every internal link found on the page as an absolute URL
50
- page.extrenal_links # array of strings, with every external link found on the page as an absolute URL
50
+ page.external_links # array of strings, with every external link found on the page as an absolute URL
51
51
  page.meta_description # meta description, as string
52
52
  page.description # returns the meta description, or the first long paragraph if no meta description is found
53
53
  page.meta_keywords # meta keywords, as string
@@ -85,7 +85,7 @@ The full scraped document if accessible from:
85
85
 
86
86
  You can check if the page has been successfully parsed with:
87
87
 
88
- page.parsed? # Will return true if everything looks OK
88
+ page.ok? # Will return true if everything looks OK
89
89
 
90
90
  In case there have been any errors, you can check them with:
91
91
 
@@ -8,90 +8,93 @@ require 'timeout'
8
8
  # MetaInspector provides an easy way to scrape web pages and get its elements
9
9
  module MetaInspector
10
10
  class Scraper
11
- attr_reader :url, :scheme, :host, :root_url, :errors, :content_type
11
+ attr_reader :url, :scheme, :host, :root_url, :errors, :content_type, :timeout, :html_content_only
12
12
 
13
13
  # Initializes a new instance of MetaInspector, setting the URL to the one given
14
- # If no scheme given, set it to http:// by default
15
14
  # Options:
16
15
  # => timeout: defaults to 20 seconds
17
16
  # => html_content_only: if an exception should be raised when the response content-type is not text/html. Defaults to false
18
17
  def initialize(url, options = {})
19
- url = encode_url(url)
20
- @url = URI.parse(url).scheme.nil? ? 'http://' + url : url
18
+ @url = with_default_scheme(encode_url(url))
21
19
  @scheme = URI.parse(@url).scheme
22
20
  @host = URI.parse(@url).host
23
21
  @root_url = "#{@scheme}://#{@host}/"
24
22
  @timeout = options[:timeout] || 20
25
- @data = Hashie::Rash.new('url' => @url)
23
+ @data = Hashie::Rash.new
26
24
  @errors = []
27
25
  @html_content_only = options[:html_content_only] || false
28
26
  end
29
27
 
30
28
  # Returns the parsed document title, from the content of the <title> tag.
31
- # This is not the same as the meta_tite tag
29
+ # This is not the same as the meta_title tag
32
30
  def title
33
- @data.title ||= parsed_document.css('title').inner_html.gsub(/\t|\n|\r/, '') rescue nil
31
+ @title ||= parsed_document.css('title').inner_html.gsub(/\t|\n|\r/, '') rescue nil
34
32
  end
35
33
 
36
34
  # A description getter that first checks for a meta description and if not present will
37
- # guess by looking grabbing the first paragraph > 120 characters
35
+ # guess by looking at the first paragraph with more than 120 characters
38
36
  def description
39
37
  meta_description.nil? ? secondary_description : meta_description
40
38
  end
41
39
 
42
40
  # Links found on the page, as absolute URLs
43
41
  def links
44
- @data.links ||= parsed_links.map{ |l| absolutify_url(unrelativize_url(l)) }.compact
42
+ @links ||= parsed_links.map{ |l| absolutify_url(unrelativize_url(l)) }.compact
45
43
  end
46
44
 
47
45
  # Internal links found on the page, as absolute URLs
48
46
  def internal_links
49
- @data.internal_links ||= links.select {|link| URI.parse(link).host == @host }
47
+ @internal_links ||= links.select {|link| URI.parse(link).host == host }
50
48
  end
51
49
 
52
50
  # External links found on the page, as absolute URLs
53
51
  def external_links
54
- @data.external_links ||= links.select {|link| URI.parse(link).host != @host }
52
+ @external_links ||= links.select {|link| URI.parse(link).host != host }
55
53
  end
56
54
 
57
55
  # Images found on the page, as absolute URLs
58
56
  def images
59
- @data.images ||= parsed_images.map{ |i| absolutify_url(i) }
57
+ @images ||= parsed_images.map{ |i| absolutify_url(i) }
58
+ end
59
+
60
+ # Returns the parsed image from Facebook's open graph property tags
61
+ # Almost all major websites now define this property, and it is usually very relevant
62
+ # See doc at http://developers.facebook.com/docs/opengraph/
63
+ def image
64
+ meta_og_image
60
65
  end
61
66
 
62
67
  # Returns the parsed document meta rss links
63
68
  def feed
64
- @data.feed ||= parsed_document.xpath("//link").select{ |link|
69
+ @feed ||= parsed_document.xpath("//link").select{ |link|
65
70
  link.attributes["type"] && link.attributes["type"].value =~ /(atom|rss)/
66
71
  }.map { |link|
67
72
  absolutify_url(link.attributes["href"].value)
68
73
  }.first rescue nil
69
74
  end
70
75
 
71
- # Returns the parsed image from Facebook's open graph property tags
72
- # Most all major websites now define this property and is usually very relevant
73
- # See doc at http://developers.facebook.com/docs/opengraph/
74
- def image
75
- meta_og_image
76
- end
77
-
78
76
  # Returns the charset from the meta tags, looking for it in the following order:
79
77
  # <meta charset='utf-8' />
80
78
  # <meta http-equiv="Content-Type" content="text/html; charset=windows-1252" />
81
79
  def charset
82
- @data.charset ||= (charset_from_meta_charset || charset_from_content_type)
80
+ @charset ||= (charset_from_meta_charset || charset_from_content_type)
83
81
  end
84
82
 
85
83
  # Returns all parsed data as a nested Hash
86
84
  def to_hash
87
- # TODO: find a better option to populate the data to the Hash
88
- image;images;feed;links;charset;title;meta_keywords;internal_links;external_links
89
- @data.to_hash
90
- end
85
+ scrape_meta_data
91
86
 
92
- # Returns true if parsing has been successful
93
- def parsed?
94
- !@parsed_document.nil?
87
+ {
88
+ 'url' => url,
89
+ 'title' => title,
90
+ 'links' => links,
91
+ 'internal_links' => internal_links,
92
+ 'external_links' => external_links,
93
+ 'images' => images,
94
+ 'charset' => charset,
95
+ 'feed' => feed,
96
+ 'content_type' => content_type
97
+ }.merge @data.to_hash
95
98
  end
96
99
 
97
100
  # Returns the whole parsed document
@@ -103,24 +106,33 @@ module MetaInspector
103
106
 
104
107
  # Returns the original, unparsed document
105
108
  def document
106
- @document ||= Timeout::timeout(@timeout) {
107
- req = open(@url)
108
- @content_type = @data.content_type = req.content_type
109
+ @document ||= if html_content_only && content_type != "text/html"
110
+ raise "The url provided contains #{content_type} content instead of text/html content" and nil
111
+ else
112
+ request.read
113
+ end
114
+ rescue Exception => e
115
+ add_fatal_error "Scraping exception: #{e.message}"
116
+ end
109
117
 
110
- if @html_content_only && @content_type != "text/html"
111
- raise "The url provided contains #{@content_type} content instead of text/html content"
112
- end
118
+ # Returns the content_type of the fetched document
119
+ def content_type
120
+ @content_type ||= request.content_type
121
+ end
113
122
 
114
- req.read
115
- }
123
+ # Returns true if there are no errors
124
+ def ok?
125
+ errors.empty?
126
+ end
116
127
 
117
- rescue SocketError
118
- add_fatal_error 'Socket error: The url provided does not exist or is temporarily unavailable'
119
- rescue TimeoutError
120
- add_fatal_error 'Timeout!!!'
121
- rescue Exception => e
122
- add_fatal_error "Scraping exception: #{e.message}"
128
+ ##### DEPRECATIONS ####
129
+ def parsed?
130
+ warn "the parsed? method has been deprecated, please use ok? instead"
131
+ !@parsed_document.nil?
123
132
  end
133
+ ##### DEPRECATIONS ####
134
+
135
+ private
124
136
 
125
137
  # Scrapers for all meta_tags in the form of "meta_name" are automatically defined. This has been tested for
126
138
  # meta name: keywords, description, robots, generator
@@ -132,43 +144,57 @@ module MetaInspector
132
144
  def method_missing(method_name)
133
145
  if method_name.to_s =~ /^meta_(.*)/
134
146
  key = $1
135
- #special treatment for og:
136
- if key =~ /^og_(.*)/
137
- key = "og:#{$1}"
138
- end
139
- unless @data.meta
140
- @data.meta!.name!
141
- @data.meta!.property!
142
- parsed_document.xpath("//meta").each do |element|
143
- if element.attributes["content"]
144
- if element.attributes["name"]
145
- @data.meta.name[element.attributes["name"].value.downcase] = element.attributes["content"].value
146
- end
147
-
148
- if element.attributes["property"]
149
- @data.meta.property[element.attributes["property"].value.downcase] = element.attributes["content"].value
150
- end
151
- end
152
- end
153
- end
147
+ key = "og:#{$1}" if key =~ /^og_(.*)/ # special treatment for og:
148
+
149
+ scrape_meta_data
150
+
154
151
  @data.meta.name && (@data.meta.name[key.downcase]) || (@data.meta.property && @data.meta.property[key.downcase])
155
152
  else
156
153
  super
157
154
  end
158
155
  end
159
156
 
160
- private
157
+ # Makes the request to the server
158
+ def request
159
+ Timeout::timeout(timeout) { @request ||= open(url) }
160
+
161
+ rescue TimeoutError
162
+ add_fatal_error 'Timeout!!!'
163
+ rescue SocketError
164
+ add_fatal_error 'Socket error: The url provided does not exist or is temporarily unavailable'
165
+ rescue Exception => e
166
+ add_fatal_error "Scraping exception: #{e.message}"
167
+ end
168
+
169
+ # Scrapes all meta tags found
170
+ def scrape_meta_data
171
+ unless @data.meta
172
+ @data.meta!.name!
173
+ @data.meta!.property!
174
+ parsed_document.xpath("//meta").each do |element|
175
+ if element.attributes["content"]
176
+ if element.attributes["name"]
177
+ @data.meta.name[element.attributes["name"].value.downcase] = element.attributes["content"].value
178
+ end
179
+
180
+ if element.attributes["property"]
181
+ @data.meta.property[element.attributes["property"].value.downcase] = element.attributes["content"].value
182
+ end
183
+ end
184
+ end
185
+ end
186
+ end
161
187
 
162
188
  def parsed_links
163
189
  @parsed_links ||= parsed_document.search("//a") \
164
- .map {|link| link.attributes["href"] \
165
- .to_s.strip}.uniq rescue []
190
+ .map {|link| link.attributes["href"] \
191
+ .to_s.strip}.uniq rescue []
166
192
  end
167
193
 
168
194
  def parsed_images
169
195
  @parsed_images ||= parsed_document.search('//img') \
170
- .reject{|i| (i.attributes['src'].nil? || i.attributes['src'].value.empty?) } \
171
- .map{ |i| i.attributes['src'].value }.uniq
196
+ .reject{|i| (i.attributes['src'].nil? || i.attributes['src'].value.empty?) } \
197
+ .map{ |i| i.attributes['src'].value }.uniq
172
198
  end
173
199
 
174
200
  # Stores the error for later inspection
@@ -182,13 +208,18 @@ module MetaInspector
182
208
  URI.encode(url).to_s.gsub("%23", "#")
183
209
  end
184
210
 
211
+ # Adds 'http' as default scheme, if there is none
212
+ def with_default_scheme(url)
213
+ URI.parse(url).scheme.nil? ? 'http://' + url : url
214
+ end
215
+
185
216
  # Convert a relative url like "/users" to an absolute one like "http://example.com/users"
186
217
  # Respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
187
218
  def absolutify_url(url)
188
219
  if url =~ /^\w*\:/i
189
220
  encode_url(url)
190
221
  else
191
- URI.parse(@root_url).merge(encode_url(url)).to_s
222
+ URI.parse(root_url).merge(encode_url(url)).to_s
192
223
  end
193
224
  rescue URI::InvalidURIError => e
194
225
  add_fatal_error "Link parsing exception: #{e.message}" and nil
@@ -1,5 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  module MetaInspector
4
- VERSION = "1.10.2"
4
+ VERSION = "1.11.0"
5
5
  end
@@ -25,7 +25,7 @@ describe MetaInspector do
25
25
  FakeWeb.register_uri(:get, "http://charset001.com", :response => fixture_file("charset_001.response"))
26
26
  FakeWeb.register_uri(:get, "http://charset002.com", :response => fixture_file("charset_002.response"))
27
27
  FakeWeb.register_uri(:get, "http://www.inkthemes.com/", :response => fixture_file("wordpress_site.response"))
28
- FakeWeb.register_uri(:get, "http://pagerankalert.com/image.png", :body => "Image", :content_type => "image/jpeg")
28
+ FakeWeb.register_uri(:get, "http://pagerankalert.com/image.png", :body => "Image", :content_type => "image/png")
29
29
  FakeWeb.register_uri(:get, "http://pagerankalert.com/file.tar.gz", :body => "Image", :content_type => "application/x-gzip")
30
30
 
31
31
  describe 'Initialization' do
@@ -373,16 +373,14 @@ describe MetaInspector do
373
373
  image_url = MetaInspector.new('http://pagerankalert.com/image.png')
374
374
  desc = image_url.description
375
375
 
376
- image_url.errors == nil
377
- image_url.parsed? == true
376
+ image_url.should be_ok
378
377
  end
379
378
 
380
379
  it "should parse images when parse_html_content_type_only is false" do
381
380
  image_url = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => false)
382
381
  desc = image_url.description
383
382
 
384
- image_url.errors == nil
385
- image_url.parsed? == true
383
+ image_url.should be_ok
386
384
  end
387
385
 
388
386
  it "should handle errors when content is image/jpeg and html_content_type_only is true" do
@@ -392,7 +390,7 @@ describe MetaInspector do
392
390
  title = image_url.title
393
391
  }.to change { image_url.errors.size }
394
392
 
395
- image_url.errors.first.should == "Scraping exception: The url provided contains image/jpeg content instead of text/html content"
393
+ image_url.errors.first.should == "Scraping exception: The url provided contains image/png content instead of text/html content"
396
394
  end
397
395
 
398
396
  it "should handle errors when content is not text/html and html_content_type_only is true" do
@@ -405,55 +403,41 @@ describe MetaInspector do
405
403
  tar_url.errors.first.should == "Scraping exception: The url provided contains application/x-gzip content instead of text/html content"
406
404
  end
407
405
 
408
- describe "parsed?" do
409
- it "should return true if we have a parsed document" do
406
+ describe "ok?" do
407
+ it "should return true if we have no errors" do
410
408
  good = MetaInspector.new('http://pagerankalert.com')
411
- title = good.title
409
+ good.to_hash
412
410
 
413
- good.parsed?.should == true
411
+ good.should be_ok
414
412
  end
415
413
 
416
- it "should return false if we don't have a parsed document" do
417
- bad = MetaInspector.new('http://fdsfdferewrewewewdesdf.com', :timout => 0.00000000000001)
418
- title = bad.title
414
+ it "should return false if there are errors" do
415
+ bad = MetaInspector.new('http://fdsfdferewrewewewdesdf.com', :timeout => 0.00000000000001)
416
+ bad.title
419
417
 
420
- bad.parsed?.should == false
418
+ bad.should_not be_ok
421
419
  end
422
420
 
423
421
  it "should return false if we try to parse a page which content type is not html and html_content_type_only is set to true" do
424
422
  tar = MetaInspector.new('http://pagerankalert.com/file.tar.gz', :timeout => 20, :html_content_only => true)
425
423
  title = tar.title
426
424
 
427
- tar.parsed?.should == false
425
+ tar.should_not be_ok
428
426
  end
429
427
  end
430
428
  end
431
429
 
432
430
  describe "content_type" do
433
- it "should return the correct content type of the url if it is parsed correctly even for non html pages" do
431
+ it "should return the correct content type of the url for non html pages" do
434
432
  good = MetaInspector.new('http://pagerankalert.com/image.png')
435
- title = good.title
436
433
 
437
- good.parsed?.should == true
438
- good.content_type == "image/jpeg"
434
+ good.content_type.should == "image/png"
439
435
  end
440
436
 
441
- it "should return the correct content type of the url if it is parsed correctly even for html pages" do
437
+ it "should return the correct content type of the url for html pages" do
442
438
  good = MetaInspector.new('http://pagerankalert.com')
443
- title = good.title
444
439
 
445
- good.parsed?.should == true
446
- good.content_type == "text/html"
440
+ good.content_type.should == "text/html"
447
441
  end
448
-
449
- it "should return the correct content type of the url if it is not parsed correctly" do
450
- bad = MetaInspector.new('http://pagerankalert.com/image.png', :timeout => 20, :html_content_only => true)
451
- title = bad.title
452
-
453
- bad.parsed?.should == false
454
- bad.content_type == "image/jpeg"
455
- end
456
-
457
442
  end
458
-
459
443
  end
metadata CHANGED
@@ -5,9 +5,9 @@ version: !ruby/object:Gem::Version
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
- - 10
9
- - 2
10
- version: 1.10.2
8
+ - 11
9
+ - 0
10
+ version: 1.11.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jaime Iniesta
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2012-11-17 00:00:00 Z
18
+ date: 2012-11-26 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  version_requirements: &id001 !ruby/object:Gem::Requirement