metainspector 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +4 -0
- data/README.txt +22 -8
- data/Rakefile +5 -6
- data/lib/metainspector.rb +10 -10
- metadata +7 -7
data/History.txt
CHANGED
data/README.txt
CHANGED
|
@@ -1,32 +1,46 @@
|
|
|
1
1
|
metainspector
|
|
2
|
-
by
|
|
3
|
-
|
|
2
|
+
by Jaime Iniesta
|
|
3
|
+
http://metainspector.rubyforge.org/
|
|
4
4
|
|
|
5
5
|
== DESCRIPTION:
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
Ruby gem for web scraping purposes. It scrapes a given URL, and returns you a hash with data from it like for example the title, meta description, meta keywords, an array with all the links, all the images in it, etc.
|
|
8
8
|
|
|
9
9
|
== FEATURES/PROBLEMS:
|
|
10
10
|
|
|
11
|
-
*
|
|
11
|
+
* Scrape a given URL and return data from its HTML
|
|
12
12
|
|
|
13
13
|
== SYNOPSIS:
|
|
14
14
|
|
|
15
|
-
|
|
15
|
+
# Require all gems and libs needed...
|
|
16
|
+
require 'rubygems'
|
|
17
|
+
require 'open-uri'
|
|
18
|
+
require 'hpricot'
|
|
19
|
+
require 'metainspector'
|
|
20
|
+
|
|
21
|
+
# Scrape an URL...
|
|
22
|
+
page_data = MetaInspector.scrape(url)
|
|
23
|
+
|
|
24
|
+
# See extracted data...
|
|
25
|
+
page_data['title']
|
|
26
|
+
page_data['description']
|
|
27
|
+
page_data['keywords']
|
|
28
|
+
page_data['links']
|
|
16
29
|
|
|
17
30
|
== REQUIREMENTS:
|
|
18
31
|
|
|
19
|
-
*
|
|
32
|
+
* open-uri
|
|
33
|
+
* hpricot
|
|
20
34
|
|
|
21
35
|
== INSTALL:
|
|
22
36
|
|
|
23
|
-
*
|
|
37
|
+
* sudo gem install metainspector
|
|
24
38
|
|
|
25
39
|
== LICENSE:
|
|
26
40
|
|
|
27
41
|
(The MIT License)
|
|
28
42
|
|
|
29
|
-
Copyright (c) 2007
|
|
43
|
+
Copyright (c) 2007 Jaime Iniesta
|
|
30
44
|
|
|
31
45
|
Permission is hereby granted, free of charge, to any person obtaining
|
|
32
46
|
a copy of this software and associated documentation files (the
|
data/Rakefile
CHANGED
|
@@ -8,12 +8,11 @@ require './lib/metainspector.rb'
|
|
|
8
8
|
|
|
9
9
|
Hoe.new('metainspector', MetaInspector::VERSION) do |p|
|
|
10
10
|
p.rubyforge_name = 'metainspector'
|
|
11
|
-
p.
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
# p.url = p.paragraphs_of('README.txt', 0).first.split(/\n/)[1..-1]
|
|
11
|
+
p.author = 'Jaime Iniesta'
|
|
12
|
+
p.email = 'jaimeiniesta@gmail.com'
|
|
13
|
+
p.summary = 'Ruby gem for web scraping purposes. It scrapes a given URL, and returns you a hash with data from it like for example the title, meta description, meta keywords, an array with all the links, all the images in it, etc.'
|
|
14
|
+
p.description = p.paragraphs_of('README.txt', 2..5).join("\n\n")
|
|
15
|
+
p.url = p.paragraphs_of('README.txt', 0).first.split(/\n/)[1..-1]
|
|
17
16
|
p.changes = p.paragraphs_of('History.txt', 0..1).join("\n\n")
|
|
18
17
|
end
|
|
19
18
|
|
data/lib/metainspector.rb
CHANGED
|
@@ -1,39 +1,39 @@
|
|
|
1
1
|
class MetaInspector
|
|
2
|
-
VERSION = '1.0.0'
|
|
2
|
+
VERSION = '1.0.1'
|
|
3
3
|
|
|
4
4
|
Hpricot.buffer_size = 300000
|
|
5
5
|
|
|
6
6
|
def self.scrape(url)
|
|
7
7
|
doc = Hpricot(open(url))
|
|
8
8
|
|
|
9
|
-
#
|
|
9
|
+
# Searching title...
|
|
10
10
|
if (!doc.at('title').nil?)
|
|
11
11
|
title = doc.at('title').inner_html
|
|
12
12
|
else
|
|
13
13
|
title = ""
|
|
14
14
|
end
|
|
15
15
|
|
|
16
|
-
#
|
|
16
|
+
# Searching meta description...
|
|
17
17
|
if (!doc.at("meta[@name='description']").nil?)
|
|
18
18
|
description = doc.at("meta[@name='description']")['content']
|
|
19
19
|
else
|
|
20
20
|
description = ""
|
|
21
21
|
end
|
|
22
22
|
|
|
23
|
-
#
|
|
23
|
+
# Searching meta keywords...
|
|
24
24
|
if (!doc.at("meta[@name='keywords']").nil?)
|
|
25
25
|
keywords = doc.at("meta[@name='keywords']")['content']
|
|
26
26
|
else
|
|
27
27
|
keywords = ""
|
|
28
28
|
end
|
|
29
29
|
|
|
30
|
-
#
|
|
31
|
-
|
|
32
|
-
doc.search("//a").each do |
|
|
33
|
-
|
|
30
|
+
# Searching links...
|
|
31
|
+
links = []
|
|
32
|
+
doc.search("//a").each do |link|
|
|
33
|
+
links << link.attributes["href"] if (!link.attributes["href"].nil?)
|
|
34
34
|
end
|
|
35
35
|
|
|
36
|
-
#
|
|
37
|
-
{'ok' => true, 'title' => title, 'description' => description, 'keywords' => keywords, '
|
|
36
|
+
# Returning all data...
|
|
37
|
+
{'ok' => true, 'title' => title, 'description' => description, 'keywords' => keywords, 'links' => links}
|
|
38
38
|
end
|
|
39
39
|
end
|
metadata
CHANGED
|
@@ -3,15 +3,15 @@ rubygems_version: 0.9.2
|
|
|
3
3
|
specification_version: 1
|
|
4
4
|
name: metainspector
|
|
5
5
|
version: !ruby/object:Gem::Version
|
|
6
|
-
version: 1.0.0
|
|
7
|
-
date: 2007-12-
|
|
8
|
-
summary:
|
|
6
|
+
version: 1.0.1
|
|
7
|
+
date: 2007-12-07 00:00:00 +01:00
|
|
8
|
+
summary: Ruby gem for web scraping purposes. It scrapes a given URL, and returns you a hash with data from it like for example the title, meta description, meta keywords, an array with all the links, all the images in it, etc.
|
|
9
9
|
require_paths:
|
|
10
10
|
- lib
|
|
11
|
-
email:
|
|
12
|
-
homepage:
|
|
11
|
+
email: jaimeiniesta@gmail.com
|
|
12
|
+
homepage: " by Jaime Iniesta"
|
|
13
13
|
rubyforge_project: metainspector
|
|
14
|
-
description:
|
|
14
|
+
description: "== FEATURES/PROBLEMS: * Scrape a given URL and return data from its HTML == SYNOPSIS: # Require all gems and libs needed... require 'rubygems' require 'open-uri' require 'hpricot' require 'metainspector' # Scrape an URL... page_data = MetaInspector.scrape(url)"
|
|
15
15
|
autorequire:
|
|
16
16
|
default_executable:
|
|
17
17
|
bindir: bin
|
|
@@ -27,7 +27,7 @@ signing_key:
|
|
|
27
27
|
cert_chain:
|
|
28
28
|
post_install_message:
|
|
29
29
|
authors:
|
|
30
|
-
-
|
|
30
|
+
- Jaime Iniesta
|
|
31
31
|
files:
|
|
32
32
|
- History.txt
|
|
33
33
|
- Manifest.txt
|