metrocot 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,70 @@
1
+ require 'rubygems'
2
+ require 'hpricot'
3
+ require 'metrocot'
4
+ require 'rake'
5
+
6
+ $rakefile = nil # shuts up a warning in rdoctask.rb
7
+
8
+ class TestMetrocot < Test::Unit::TestCase # Tests Metrocot scraping against a small inline HTML document parsed with Hpricot.
9
+ # NOTE(review): this file never requires 'test/unit' itself; presumably the test runner loads it -- confirm.
10
+ def setup
11
+ Rake.application.clear # runs in setup, i.e. before every test method; NOTE(review): reason for clearing Rake is not visible here -- confirm
12
+ end
13
+ # Plain holder for the three captured values (a, b, c) yielded by the collect blocks in the tests below.
14
+ class Abc < Object
15
+ attr_accessor :a, :b, :c
16
+ def initialize( a, b, c )
17
+ @a = a
18
+ @b = b
19
+ @c = c
20
+ end
21
+ end
22
+ # No-match case: the pattern asks for .//h3, but the HTML below has no h3 element; the expected values are [[]].
23
+ def test_nothing
24
+
25
+ html = "<html><head></head><body><h1>hello!</h1><h2>A</h2><p>a 1</p><p>a 2</p><h2>B</h2><p>b 1</p><p>b 2</p><h2>C</h2><p>c 1</p><p>c 2</p></body></html>"
26
+
27
+ doc = Hpricot(html)
28
+ # All three named fields (a, b, c) are registered with TextScanner.
29
+ scraper = Metrocot.new(
30
+ :a => Metrocot::Scanners::TextScanner,
31
+ :b => Metrocot::Scanners::TextScanner,
32
+ :c => Metrocot::Scanners::TextScanner
33
+ )
34
+
35
+ # scraper.verbose = true
36
+
37
+ assert_equal( [[]], scraper.scrape(doc).descend("//html/body") { |td|
38
+ td.collect( "a=.//h3 b=.//p c=.//p" ) { |a, b, c|
39
+ Abc.new( a, b, c )
40
+ }
41
+ }.values )
42
+
43
+ end
44
+ # Match case: same HTML and scanners as test_nothing, but the pattern uses .//h2 (the HTML has three h2 headings: A, B, C).
45
+ def test_abc
46
+
47
+ html = "<html><head></head><body><h1>hello!</h1><h2>A</h2><p>a 1</p><p>a 2</p><h2>B</h2><p>b 1</p><p>b 2</p><h2>C</h2><p>c 1</p><p>c 2</p></body></html>"
48
+
49
+ doc = Hpricot(html)
50
+
51
+ scraper = Metrocot.new(
52
+ :a => Metrocot::Scanners::TextScanner,
53
+ :b => Metrocot::Scanners::TextScanner,
54
+ :c => Metrocot::Scanners::TextScanner
55
+ )
56
+
57
+ # scraper.verbose = true
58
+
59
+ abcs = scraper.scrape(doc).descend("//html/body") { |td|
60
+ td.collect( "a=.//h2 b=.//p c=.//p" ) { |a, b, c|
61
+ Abc.new( a, b, c )
62
+ }
63
+ }.values.flatten
64
+ # Only the number of collected Abc objects is checked, not their field contents.
65
+ assert_equal(3, abcs.size)
66
+
67
+ end
68
+
69
+ end
70
+
metadata ADDED
@@ -0,0 +1,72 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: metrocot
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Helmut Hissen
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-01-05 00:00:00 -08:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hoe
17
+ type: :development
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 1.8.2
24
+ version:
25
+ description: Metrocot builds on top of Hpricot to allow scraping of list data from HTML pages with a minimum of code and page specific information. The specification is done in a very compact readable format.
26
+ email:
27
+ - helmut@zeebar.com
28
+ executables:
29
+ - metrocot
30
+ extensions: []
31
+
32
+ extra_rdoc_files:
33
+ - History.txt
34
+ - Manifest.txt
35
+ - README.txt
36
+ files:
37
+ - History.txt
38
+ - Manifest.txt
39
+ - README.txt
40
+ - Rakefile
41
+ - bin/metrocot
42
+ - lib/metrocot.rb
43
+ - test/test_metrocot.rb
44
+ has_rdoc: true
45
+ homepage: http://www.metrocascade.com/mdn/opensource/metrocot
46
+ post_install_message:
47
+ rdoc_options:
48
+ - --main
49
+ - README.txt
50
+ require_paths:
51
+ - lib
52
+ required_ruby_version: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ version: "0"
57
+ version:
58
+ required_rubygems_version: !ruby/object:Gem::Requirement
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: "0"
63
+ version:
64
+ requirements: []
65
+
66
+ rubyforge_project: metrocot
67
+ rubygems_version: 1.2.0
68
+ signing_key:
69
+ specification_version: 2
70
+ summary: Metrocot builds on top of Hpricot to allow scraping of list data from HTML pages with a minimum of code and page specific information
71
+ test_files:
72
+ - test/test_metrocot.rb