metrocot 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +7 -0
- data/Manifest.txt +7 -0
- data/README.txt +98 -0
- data/Rakefile +12 -0
- data/bin/metrocot +109 -0
- data/lib/metrocot.rb +1112 -0
- data/test/test_metrocot.rb +70 -0
- metadata +72 -0
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'hpricot'
|
3
|
+
require 'metrocot'
|
4
|
+
require 'rake'
|
5
|
+
|
6
|
+
# Defined purely for its side effect: nothing in this test file reads
# $rakefile. Per the original note, assigning it silences a warning that
# would otherwise come from rdoctask.rb.
$rakefile = nil # shuts up a warning in rdoctask.rb
|
7
|
+
|
8
|
+
# Load Test::Unit explicitly so this file can also be run on its own
# (`ruby test/test_metrocot.rb`). The superclass constant below is otherwise
# unresolved unless the test runner happened to load test/unit first.
require 'test/unit'

# Smoke tests for Metrocot: a small fixed HTML page is parsed with Hpricot and
# scraped with a collect pattern, and the number of resulting records is
# checked.
class TestMetrocot < Test::Unit::TestCase

  # Fixture page shared by all tests: one <h1>, then three <h2> headings
  # (A, B, C), each followed by two <p> elements. There is no <h3> in it.
  FIXTURE_HTML = "<html><head></head><body><h1>hello!</h1><h2>A</h2><p>a 1</p><p>a 2</p><h2>B</h2><p>b 1</p><p>b 2</p><h2>C</h2><p>c 1</p><p>c 2</p></body></html>"

  # Plain value holder for the three fields captured by one pattern match.
  class Abc
    attr_accessor :a, :b, :c

    def initialize(a, b, c)
      @a = a
      @b = b
      @c = c
    end
  end

  # Runs before every test and resets the Rake application.
  # NOTE(review): none of the tests below touch Rake, so this looks like
  # leftover project-template boilerplate -- confirm before removing.
  def setup
    Rake.application.clear
  end

  # The pattern anchors field `a` on <h3>, which the fixture does not contain,
  # so the scrape is expected to yield a single empty result list.
  def test_nothing
    assert_equal([[]], collect_abcs("a=.//h3 b=.//p c=.//p"))
  end

  # The pattern anchors field `a` on <h2>; the fixture has three <h2>
  # sections, and the test expects exactly three Abc records overall.
  def test_abc
    abcs = collect_abcs("a=.//h2 b=.//p c=.//p").flatten

    assert_equal(3, abcs.size)
  end

  private

  # Scrapes FIXTURE_HTML with a fresh Metrocot instance whose fields :a, :b
  # and :c all use the TextScanner, descends into //html/body and collects
  # one Abc per match of +pattern+.
  #
  # pattern - Metrocot collect pattern string naming the fields a, b and c.
  #
  # Returns whatever `values` yields on the result of `descend`; the tests
  # above treat it as an array of per-node result arrays.
  def collect_abcs(pattern)
    scraper = Metrocot.new(
      :a => Metrocot::Scanners::TextScanner,
      :b => Metrocot::Scanners::TextScanner,
      :c => Metrocot::Scanners::TextScanner
    )

    # scraper.verbose = true

    scraper.scrape(Hpricot(FIXTURE_HTML)).descend("//html/body") { |td|
      td.collect(pattern) { |a, b, c|
        Abc.new(a, b, c)
      }
    }.values
  end

end
|
70
|
+
|
metadata
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: metrocot
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Helmut Hissen
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-01-05 00:00:00 -08:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: hoe
|
17
|
+
type: :development
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 1.8.2
|
24
|
+
version:
|
25
|
+
description: Metrocot builds on top of Hpricot to allow scraping of list data from HTML pages with a minimum of code and page specific information. The specification is done in a very compact readable format.
|
26
|
+
email:
|
27
|
+
- helmut@zeebar.com
|
28
|
+
executables:
|
29
|
+
- metrocot
|
30
|
+
extensions: []
|
31
|
+
|
32
|
+
extra_rdoc_files:
|
33
|
+
- History.txt
|
34
|
+
- Manifest.txt
|
35
|
+
- README.txt
|
36
|
+
files:
|
37
|
+
- History.txt
|
38
|
+
- Manifest.txt
|
39
|
+
- README.txt
|
40
|
+
- Rakefile
|
41
|
+
- bin/metrocot
|
42
|
+
- lib/metrocot.rb
|
43
|
+
- test/test_metrocot.rb
|
44
|
+
has_rdoc: true
|
45
|
+
homepage: http://www.metrocascade.com/mdn/opensource/metrocot (url)
|
46
|
+
post_install_message:
|
47
|
+
rdoc_options:
|
48
|
+
- --main
|
49
|
+
- README.txt
|
50
|
+
require_paths:
|
51
|
+
- lib
|
52
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
53
|
+
requirements:
|
54
|
+
- - ">="
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: "0"
|
57
|
+
version:
|
58
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
59
|
+
requirements:
|
60
|
+
- - ">="
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: "0"
|
63
|
+
version:
|
64
|
+
requirements: []
|
65
|
+
|
66
|
+
rubyforge_project: metrocot
|
67
|
+
rubygems_version: 1.2.0
|
68
|
+
signing_key:
|
69
|
+
specification_version: 2
|
70
|
+
summary: Metrocot builds on top of Hpricot to allow scraping of list data from HTML pages with a minimum of code and page specific information
|
71
|
+
test_files:
|
72
|
+
- test/test_metrocot.rb
|