metrocot 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +7 -0
- data/Manifest.txt +7 -0
- data/README.txt +98 -0
- data/Rakefile +12 -0
- data/bin/metrocot +109 -0
- data/lib/metrocot.rb +1112 -0
- data/test/test_metrocot.rb +70 -0
- metadata +72 -0
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'hpricot'
|
3
|
+
require 'metrocot'
|
4
|
+
require 'rake'
|
5
|
+
|
6
|
+
$rakefile = nil # shuts up a warning in rdoctask.rb
|
7
|
+
|
8
|
+
# Test::Unit test case that runs Metrocot scrapes against a small inline HTML document.
# NOTE(review): none of the requires above load 'test/unit'; presumably the test runner does — confirm.
class TestMetrocot < Test::Unit::TestCase
|
9
|
+
|
10
|
+
# Clears the Rake application state; Test::Unit invokes setup before each test method.
# NOTE(review): no test in this class defines Rake tasks, so the reason for this reset is not visible here — confirm it is needed.
def setup
|
11
|
+
Rake.application.clear
|
12
|
+
end
|
13
|
+
|
14
|
+
# Plain value holder with three read/write attributes (a, b, c).
# The collect blocks in the tests below construct instances from their (a, b, c) block arguments.
class Abc < Object
|
15
|
+
attr_accessor :a, :b, :c
|
16
|
+
def initialize( a, b, c )
|
17
|
+
@a = a
|
18
|
+
@b = b
|
19
|
+
@c = c
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
# Scrapes with a pattern whose first field looks for h3 elements; the HTML below
# contains only h1, h2 and p elements. The expected result values are [[]].
def test_nothing
|
24
|
+
|
25
|
+
html = "<html><head></head><body><h1>hello!</h1><h2>A</h2><p>a 1</p><p>a 2</p><h2>B</h2><p>b 1</p><p>b 2</p><h2>C</h2><p>c 1</p><p>c 2</p></body></html>"
|
26
|
+
|
27
|
+
doc = Hpricot(html)
|
28
|
+
|
29
|
+
scraper = Metrocot.new(
|
30
|
+
:a => Metrocot::Scanners::TextScanner,
|
31
|
+
:b => Metrocot::Scanners::TextScanner,
|
32
|
+
:c => Metrocot::Scanners::TextScanner
|
33
|
+
)
|
34
|
+
|
35
|
+
# scraper.verbose = true
|
36
|
+
|
37
|
+
assert_equal( [[]], scraper.scrape(doc).descend("//html/body") { |td|
|
38
|
+
td.collect( "a=.//h3 b=.//p c=.//p" ) { |a, b, c|
|
39
|
+
Abc.new( a, b, c )
|
40
|
+
}
|
41
|
+
}.values )
|
42
|
+
|
43
|
+
end
|
44
|
+
|
45
|
+
# Same HTML and scanner setup as test_nothing, but the pattern's first field looks for
# h2 elements (the document has three). Expects three objects once the values are flattened.
def test_abc
|
46
|
+
|
47
|
+
html = "<html><head></head><body><h1>hello!</h1><h2>A</h2><p>a 1</p><p>a 2</p><h2>B</h2><p>b 1</p><p>b 2</p><h2>C</h2><p>c 1</p><p>c 2</p></body></html>"
|
48
|
+
|
49
|
+
doc = Hpricot(html)
|
50
|
+
|
51
|
+
scraper = Metrocot.new(
|
52
|
+
:a => Metrocot::Scanners::TextScanner,
|
53
|
+
:b => Metrocot::Scanners::TextScanner,
|
54
|
+
:c => Metrocot::Scanners::TextScanner
|
55
|
+
)
|
56
|
+
|
57
|
+
# scraper.verbose = true
|
58
|
+
|
59
|
+
abcs = scraper.scrape(doc).descend("//html/body") { |td|
|
60
|
+
td.collect( "a=.//h2 b=.//p c=.//p" ) { |a, b, c|
|
61
|
+
Abc.new( a, b, c )
|
62
|
+
}
|
63
|
+
}.values.flatten
|
64
|
+
|
65
|
+
assert_equal(3, abcs.size)
|
66
|
+
|
67
|
+
end
|
68
|
+
|
69
|
+
end
|
70
|
+
|
metadata
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: metrocot
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Helmut Hissen
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-01-05 00:00:00 -08:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: hoe
|
17
|
+
type: :development
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 1.8.2
|
24
|
+
version:
|
25
|
+
description: Metrocot builds on top of Hpricot to allow scraping of list data from HTML pages with a minimum of code and page specific information. The specification is done in a very compact readable format.
|
26
|
+
email:
|
27
|
+
- helmut@zeebar.com
|
28
|
+
executables:
|
29
|
+
- metrocot
|
30
|
+
extensions: []
|
31
|
+
|
32
|
+
extra_rdoc_files:
|
33
|
+
- History.txt
|
34
|
+
- Manifest.txt
|
35
|
+
- README.txt
|
36
|
+
files:
|
37
|
+
- History.txt
|
38
|
+
- Manifest.txt
|
39
|
+
- README.txt
|
40
|
+
- Rakefile
|
41
|
+
- bin/metrocot
|
42
|
+
- lib/metrocot.rb
|
43
|
+
- test/test_metrocot.rb
|
44
|
+
has_rdoc: true
|
45
|
+
homepage: http://www.metrocascade.com/mdn/opensource/metrocot (url)
|
46
|
+
post_install_message:
|
47
|
+
rdoc_options:
|
48
|
+
- --main
|
49
|
+
- README.txt
|
50
|
+
require_paths:
|
51
|
+
- lib
|
52
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
53
|
+
requirements:
|
54
|
+
- - ">="
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: "0"
|
57
|
+
version:
|
58
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
59
|
+
requirements:
|
60
|
+
- - ">="
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: "0"
|
63
|
+
version:
|
64
|
+
requirements: []
|
65
|
+
|
66
|
+
rubyforge_project: metrocot
|
67
|
+
rubygems_version: 1.2.0
|
68
|
+
signing_key:
|
69
|
+
specification_version: 2
|
70
|
+
summary: Metrocot builds on top of Hpricot to allow scraping of list data from HTML pages with a minimum of code and page specific information
|
71
|
+
test_files:
|
72
|
+
- test/test_metrocot.rb
|