jaimeiniesta-metainspector 1.1.1 → 1.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.rdoc +4 -0
- data/README.rdoc +28 -5
- data/lib/metainspector.rb +11 -9
- data/metainspector.gemspec +3 -3
- data/samples/spider.rb +6 -1
- data/test/test_metainspector.rb +1 -1
- metadata +4 -4
data/CHANGELOG.rdoc
CHANGED
data/README.rdoc
CHANGED
@@ -2,6 +2,22 @@
|
|
2
2
|
|
3
3
|
MetaInspector is a gem for web scraping purposes. You give it an URL, and it returns you metadata from it.
|
4
4
|
|
5
|
+
= Dependencies
|
6
|
+
|
7
|
+
MetaInspector uses the nokogiri gem to parse HTML. You can install it from github.
|
8
|
+
|
9
|
+
Run the following if you haven't already:
|
10
|
+
|
11
|
+
gem sources -a http://gems.github.com
|
12
|
+
|
13
|
+
Then install the gem:
|
14
|
+
|
15
|
+
sudo gem install tenderlove-nokogiri
|
16
|
+
|
17
|
+
If you're on Ubuntu, you might need to install these packages before installing nokogiri:
|
18
|
+
|
19
|
+
sudo aptitude install libxslt-dev libxml2 libxml2-dev
|
20
|
+
|
5
21
|
= Installation
|
6
22
|
|
7
23
|
Run the following if you haven't already:
|
@@ -76,7 +92,7 @@ You can find some sample scripts on the samples folder, including a basic scrapi
|
|
76
92
|
=> #<File:/var/folders/X8/X8TBsDiWGYuMKzrB3bhWTU+++TI/-Tmp-/open-uri.6656.0>
|
77
93
|
|
78
94
|
>> page.scraped_doc.class
|
79
|
-
=>
|
95
|
+
=> Nokogiri::HTML::Document
|
80
96
|
|
81
97
|
>> page.scraped?
|
82
98
|
=> true
|
@@ -98,10 +114,17 @@ You can find some sample scripts on the samples folder, including a basic scrapi
|
|
98
114
|
|
99
115
|
= To Do
|
100
116
|
|
101
|
-
*
|
102
|
-
*
|
103
|
-
* Return array of images in page
|
117
|
+
* Get page.base_dir from the address
|
118
|
+
* Distinguish between external and internal links, returning page.links for all of them as found, page.external_links and page.internal_links converted to absolute URLs
|
119
|
+
* Return array of images in page as absolute URLs
|
104
120
|
* Return contents of meta robots tag
|
105
|
-
*
|
121
|
+
* Be able to set a timeout in seconds
|
122
|
+
* Recover from Timeout exception
|
123
|
+
* Recover from Errno::ECONNREFUSED
|
124
|
+
* If keywords seem to be separated by blank spaces, replace them with commas
|
125
|
+
* Mocks
|
126
|
+
* Check content type, process only HTML pages_
|
127
|
+
** Don't try to scrape http://ftp.ruby-lang.org/pub/ruby/ruby-1.9.1-p129.tar.bz2
|
128
|
+
** Don't try to scrape http://isabel.dit.upm.es/component/option,com_docman/task,doc_download/gid,831/Itemid,74/
|
106
129
|
|
107
130
|
Copyright (c) 2009 Jaime Iniesta, released under the MIT license
|
data/lib/metainspector.rb
CHANGED
@@ -1,12 +1,10 @@
|
|
1
1
|
require 'open-uri'
|
2
2
|
require 'rubygems'
|
3
|
-
require 'hpricot'
|
3
|
+
require 'nokogiri'
|
4
4
|
|
5
5
|
# MetaInspector provides an easy way to scrape web pages and get its elements
|
6
6
|
class MetaInspector
|
7
|
-
VERSION = '1.1.1'
|
8
|
-
|
9
|
-
Hpricot.buffer_size = 300000
|
7
|
+
VERSION = '1.1.2'
|
10
8
|
|
11
9
|
attr_reader :address, :title, :description, :keywords, :links, :full_doc, :scraped_doc
|
12
10
|
|
@@ -28,21 +26,21 @@ class MetaInspector
|
|
28
26
|
# Visit web page, get its contents, and parse it
|
29
27
|
def scrape!
|
30
28
|
@full_doc = open(@address)
|
31
|
-
@scraped_doc =
|
29
|
+
@scraped_doc = Nokogiri::HTML(@full_doc)
|
32
30
|
|
33
31
|
# Searching title...
|
34
|
-
@title = @scraped_doc.
|
32
|
+
@title = @scraped_doc.css('title').inner_html rescue nil
|
35
33
|
|
36
34
|
# Searching meta description...
|
37
|
-
@description = @scraped_doc.
|
35
|
+
@description = @scraped_doc.css("meta[@name='description']").first['content'] rescue nil
|
38
36
|
|
39
37
|
# Searching meta keywords...
|
40
|
-
@keywords = @scraped_doc.
|
38
|
+
@keywords = @scraped_doc.css("meta[@name='keywords']").first['content'] rescue nil
|
41
39
|
|
42
40
|
# Searching links...
|
43
41
|
@links = []
|
44
42
|
@scraped_doc.search("//a").each do |link|
|
45
|
-
@links << link.attributes["href"].strip
|
43
|
+
@links << link.attributes["href"].to_s.strip
|
46
44
|
end
|
47
45
|
|
48
46
|
# Mark scraping as success
|
@@ -51,6 +49,10 @@ class MetaInspector
|
|
51
49
|
rescue SocketError
|
52
50
|
puts 'MetaInspector exception: The url provided does not exist or is temporarily unavailable (socket error)'
|
53
51
|
@scraped = false
|
52
|
+
rescue TimeoutError
|
53
|
+
puts 'Timeout!!!'
|
54
|
+
rescue
|
55
|
+
puts 'An exception occurred while trying to scrape the page!'
|
54
56
|
end
|
55
57
|
|
56
58
|
# Syntactic sugar
|
data/metainspector.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = "metainspector"
|
3
|
-
s.version = "1.1.1"
|
4
|
-
s.date = "2009-05-
|
3
|
+
s.version = "1.1.2"
|
4
|
+
s.date = "2009-05-19"
|
5
5
|
s.summary = "Ruby gem for web scraping"
|
6
6
|
s.email = "jaimeiniesta@gmail.com"
|
7
7
|
s.homepage = "http://github.com/jaimeiniesta/metainspector/tree/master"
|
@@ -12,5 +12,5 @@ Gem::Specification.new do |s|
|
|
12
12
|
s.test_files = []
|
13
13
|
s.rdoc_options = []
|
14
14
|
s.extra_rdoc_files = []
|
15
|
-
s.add_dependency("
|
15
|
+
s.add_dependency("nokogiri", ["> 1.2"])
|
16
16
|
end
|
data/samples/spider.rb
CHANGED
@@ -6,6 +6,7 @@ visited_links=[]
|
|
6
6
|
|
7
7
|
puts "Enter a valid http address to spider it following external links"
|
8
8
|
address = gets.strip
|
9
|
+
|
9
10
|
page = MetaInspector.new(address)
|
10
11
|
q.push(address)
|
11
12
|
|
@@ -14,9 +15,13 @@ while q.size > 0
|
|
14
15
|
page.address=address
|
15
16
|
puts "Spidering #{page.address}"
|
16
17
|
page.scrape!
|
18
|
+
|
17
19
|
puts "TITLE: #{page.title}"
|
20
|
+
puts "DESCRIPTION: #{page.description}"
|
21
|
+
puts "KEYWORDS: #{page.keywords}"
|
22
|
+
puts "LINKS: #{page.links.size}"
|
18
23
|
page.links.each do |link|
|
19
|
-
if link[0..6]
|
24
|
+
if link[0..6] == 'http://' && !visited_links.include?(link)
|
20
25
|
q.push(link)
|
21
26
|
end
|
22
27
|
end
|
data/test/test_metainspector.rb
CHANGED
@@ -31,7 +31,7 @@ class TestMetaInspector < Test::Unit::TestCase
|
|
31
31
|
assert_equal m.links.size, 31
|
32
32
|
assert_equal m.links[30], 'http://www.nuvio.cz/'
|
33
33
|
assert_equal m.full_doc.class, Tempfile
|
34
|
-
assert_equal m.scraped_doc.class,
|
34
|
+
assert_equal m.scraped_doc.class, Nokogiri::HTML::Document
|
35
35
|
end
|
36
36
|
|
37
37
|
# Test changing the address resets the state of the instance
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jaimeiniesta-metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.1
|
4
|
+
version: 1.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jaime Iniesta
|
@@ -9,18 +9,18 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-05-
|
12
|
+
date: 2009-05-19 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
|
-
name: hpricot
|
16
|
+
name: nokogiri
|
17
17
|
type: :runtime
|
18
18
|
version_requirement:
|
19
19
|
version_requirements: !ruby/object:Gem::Requirement
|
20
20
|
requirements:
|
21
21
|
- - ">"
|
22
22
|
- !ruby/object:Gem::Version
|
23
|
-
version: "
|
23
|
+
version: "1.2"
|
24
24
|
version:
|
25
25
|
description: MetaInspector is a ruby gem for web scraping purposes, that returns a hash with metadata from a given URL
|
26
26
|
email: jaimeiniesta@gmail.com
|