jaimeiniesta-metainspector 1.1.3 → 1.1.4
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.rdoc +4 -0
- data/README.rdoc +0 -12
- data/lib/metainspector.rb +1 -6
- data/metainspector.gemspec +2 -2
- data/samples/basic_scraping.rb +2 -3
- data/samples/spider.rb +1 -1
- data/test/test_metainspector.rb +0 -25
- metadata +2 -2
data/CHANGELOG.rdoc
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
= 1.1.4
|
2
|
+
=== 4th June, 2009
|
3
|
+
* Simplified code: removed address setter, just instantiate a new MetaInspector object if you need to scrape a different URL
|
4
|
+
|
1
5
|
= 1.1.3
|
2
6
|
=== 22nd May, 2009
|
3
7
|
* Simplified code: now there's no need to call page.scrape!, just initialize it and go directly to page.address, page.title, page.description, page.keywords or page.links, the page will be scraped on the fly
|
data/README.rdoc
CHANGED
@@ -42,12 +42,6 @@ Once scraped, you can see the scraped data like this:
|
|
42
42
|
page.keywords # meta keywords, as string
|
43
43
|
page.links # array of strings, with every link found on the page
|
44
44
|
|
45
|
-
You can also change the address of the page to be scraped using the address= setter, like this:
|
46
|
-
|
47
|
-
page.address="http://jaimeiniesta.com"
|
48
|
-
|
49
|
-
Doing so resets the state of the MetaInspector instance to the initial state (not scraped yet, cleared stored meta data). The page will be re-scraped when you consult any of its metadata again.
|
50
|
-
|
51
45
|
The full scraped document is accessible from:
|
52
46
|
|
53
47
|
page.document # Nokogiri doc that you can use to get any element from the page
|
@@ -80,12 +74,6 @@ You can find some sample scripts on the samples folder, including a basic scrapi
|
|
80
74
|
|
81
75
|
>> page.document.class
|
82
76
|
=> Nokogiri::HTML::Document
|
83
|
-
|
84
|
-
>> page.address="http://jaimeiniesta.com"
|
85
|
-
=> "http://jaimeiniesta.com"
|
86
|
-
|
87
|
-
>> page.title
|
88
|
-
=> "ruby on rails freelance developer -- Jaime Iniesta"
|
89
77
|
|
90
78
|
= To Do
|
91
79
|
|
data/lib/metainspector.rb
CHANGED
@@ -4,7 +4,7 @@ require 'nokogiri'
|
|
4
4
|
|
5
5
|
# MetaInspector provides an easy way to scrape web pages and get its elements
|
6
6
|
class MetaInspector
|
7
|
-
VERSION = '1.1.3'
|
7
|
+
VERSION = '1.1.4'
|
8
8
|
|
9
9
|
attr_reader :address
|
10
10
|
|
@@ -16,11 +16,6 @@ class MetaInspector
|
|
16
16
|
@document = @title = @description = @keywords = @links = nil
|
17
17
|
end
|
18
18
|
|
19
|
-
# Setter for address. Initializes the whole state as the address is being changed.
|
20
|
-
def address=(address)
|
21
|
-
initialize(address)
|
22
|
-
end
|
23
|
-
|
24
19
|
# Returns the parsed document title
|
25
20
|
def title
|
26
21
|
@title ||= document.css('title').inner_html rescue nil
|
data/metainspector.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = "metainspector"
|
3
|
-
s.version = "1.1.3"
|
4
|
-
s.date = "2009-
|
3
|
+
s.version = "1.1.4"
|
4
|
+
s.date = "2009-06-04"
|
5
5
|
s.summary = "Ruby gem for web scraping"
|
6
6
|
s.email = "jaimeiniesta@gmail.com"
|
7
7
|
s.homepage = "http://github.com/jaimeiniesta/metainspector/tree/master"
|
data/samples/basic_scraping.rb
CHANGED
@@ -3,10 +3,9 @@
|
|
3
3
|
require '../lib/metainspector.rb'
|
4
4
|
|
5
5
|
puts "Enter a valid http address to scrape it"
|
6
|
-
address = gets
|
6
|
+
address = gets.strip
|
7
7
|
page = MetaInspector.new(address)
|
8
|
-
puts "
|
9
|
-
puts "...please wait..."
|
8
|
+
puts "...please wait while scraping the page..."
|
10
9
|
|
11
10
|
puts "Scraping #{page.address} returned these results:"
|
12
11
|
puts "TITLE: #{page.title}"
|
data/samples/spider.rb
CHANGED
data/test/test_metainspector.rb
CHANGED
@@ -16,29 +16,4 @@ class TestMetaInspector < Test::Unit::TestCase
|
|
16
16
|
assert_equal m.links[30], 'http://www.nuvio.cz/'
|
17
17
|
assert_equal m.document.class, Nokogiri::HTML::Document
|
18
18
|
end
|
19
|
-
|
20
|
-
# Test changing the address resets the state of the instance so it causes a new scraping
|
21
|
-
def test_address_setter
|
22
|
-
m = MetaInspector.new('http://pagerankalert.com')
|
23
|
-
assert_equal m.address, 'http://pagerankalert.com'
|
24
|
-
title_1 = m.title
|
25
|
-
description_1 = m.description
|
26
|
-
keywords_1 = m.keywords
|
27
|
-
links_1 = m.links
|
28
|
-
document_1 = m.document
|
29
|
-
|
30
|
-
m.address = 'http://jaimeiniesta.com'
|
31
|
-
assert_equal m.address, 'http://jaimeiniesta.com'
|
32
|
-
title_2 = m.title
|
33
|
-
description_2 = m.description
|
34
|
-
keywords_2 = m.keywords
|
35
|
-
links_2 = m.links
|
36
|
-
document_2 = m.document
|
37
|
-
|
38
|
-
assert_not_equal title_1, title_2
|
39
|
-
assert_not_equal description_1, description_2
|
40
|
-
assert_not_equal keywords_1, keywords_2
|
41
|
-
assert_not_equal links_1, links_2
|
42
|
-
assert_not_equal document_1, document_2
|
43
|
-
end
|
44
19
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jaimeiniesta-metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.3
|
4
|
+
version: 1.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jaime Iniesta
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-06-04 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|