table_parser 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.autotest +23 -0
- data/History.txt +6 -0
- data/Manifest.txt +11 -0
- data/README.txt +49 -0
- data/Rakefile +8 -0
- data/lib/table_parser/parser.rb +63 -0
- data/lib/table_parser/table.rb +23 -0
- data/lib/table_parser/table_header.rb +22 -0
- data/lib/table_parser/table_node.rb +16 -0
- data/lib/table_parser.rb +8 -0
- data/test/test_table_parser.rb +75 -0
- metadata +78 -0
data/.autotest
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# -*- ruby -*-
|
2
|
+
|
3
|
+
require 'autotest/restart'
|
4
|
+
|
5
|
+
# Autotest.add_hook :initialize do |at|
|
6
|
+
# at.extra_files << "../some/external/dependency.rb"
|
7
|
+
#
|
8
|
+
# at.libs << ":../some/external"
|
9
|
+
#
|
10
|
+
# at.add_exception 'vendor'
|
11
|
+
#
|
12
|
+
# at.add_mapping(/dependency.rb/) do |f, _|
|
13
|
+
# at.files_matching(/test_.*rb$/)
|
14
|
+
# end
|
15
|
+
#
|
16
|
+
# %w(TestA TestB).each do |klass|
|
17
|
+
# at.extra_class_map[klass] = "test/test_misc.rb"
|
18
|
+
# end
|
19
|
+
# end
|
20
|
+
|
21
|
+
# Autotest.add_hook :run_command do |at|
|
22
|
+
# system "rake build"
|
23
|
+
# end
|
data/History.txt
ADDED
data/Manifest.txt
ADDED
data/README.txt
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
= TableParser
|
2
|
+
|
3
|
+
== DESCRIPTION:
|
4
|
+
|
5
|
+
Parsing a table can be difficult when its structure contains colspan or rowspan. TableParser parses HTML tables and groups their cells by column, with colspan and rowspan respected.
|
6
|
+
|
7
|
+
== REQUIREMENTS:
|
8
|
+
|
9
|
+
* Nokogiri
|
10
|
+
|
11
|
+
== INSTALL:
|
12
|
+
|
13
|
+
* sudo gem install table_parser
|
14
|
+
|
15
|
+
== DEVELOPERS:
|
16
|
+
|
17
|
+
* Francis Chong francis at ignition dot hk
|
18
|
+
|
19
|
+
After checking out the source, run:
|
20
|
+
|
21
|
+
$ rake newb
|
22
|
+
|
23
|
+
This task will install any missing dependencies, run the tests/specs,
|
24
|
+
and generate the RDoc.
|
25
|
+
|
26
|
+
== LICENSE:
|
27
|
+
|
28
|
+
(The MIT License)
|
29
|
+
|
30
|
+
Copyright (c) 2010 Ignition Soft
|
31
|
+
|
32
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
33
|
+
a copy of this software and associated documentation files (the
|
34
|
+
'Software'), to deal in the Software without restriction, including
|
35
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
36
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
37
|
+
permit persons to whom the Software is furnished to do so, subject to
|
38
|
+
the following conditions:
|
39
|
+
|
40
|
+
The above copyright notice and this permission notice shall be
|
41
|
+
included in all copies or substantial portions of the Software.
|
42
|
+
|
43
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
44
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
45
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
46
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
47
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
48
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
49
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Rakefile
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'open-uri'
|
4
|
+
|
5
|
+
module TableParser
  # Stateless helpers that turn an HTML table into rows of cells, a list of
  # column headers, and per-column cell nodes with colspan/rowspan expanded.
  class Parser
    # Collects the cells of the table selected by +xpath+.
    #
    # input - HTML markup, handed unchanged to Nokogiri::HTML.
    #         NOTE(review): a URL string is presumably not fetched here;
    #         callers would need to download the page first - confirm.
    # xpath - XPath expression selecting the table element(s).
    #
    # Returns an Array of rows, each an Array of the <td> elements found
    # directly under a <tr> that is a direct child of the selected table.
    #
    #   extract_table(html, "/html/body/table/tr/td/table")
    def self.extract_table(input, xpath)
      table = Nokogiri::HTML(input).xpath(xpath)
      table.xpath("./tr").map { |row| row.xpath("./td").to_a }
    end

    # Builds the column headers from the first row and removes that row from
    # +rows+, so only data rows are left for extract_nodes.
    #
    # A header cell with colspan N yields N TableHeader objects, one per
    # column it covers.
    #
    # rows - Array of rows as returned by extract_table; mutated in place.
    #
    # Returns an Array of TableHeader (empty when +rows+ is empty).
    def self.extract_headers(rows)
      header_row = rows.shift
      return [] unless header_row

      headers = []
      header_row.each do |cell|
        header = TableHeader.new(cell)
        headers << header
        (header.colspan - 1).times { headers << TableHeader.new(cell) }
      end
      headers
    end

    # Wraps every data cell in a TableNode, expands colspan/rowspan so each
    # row has one node per column, and appends each node to the children of
    # the header at the same column index.
    #
    # rows    - Array of data rows (header row already removed); not mutated.
    # headers - Array of TableHeader as returned by extract_headers.
    #
    # Returns the expanded Array of rows of TableNode.
    def self.extract_nodes(rows, headers)
      data = rows.map { |row| row.map { |cell| TableNode.new(cell) } }

      data.each_with_index do |row, row_index|
        next_row = data[row_index + 1]

        # Array#each_index tolerates modification during iteration, so the
        # nodes inserted into +row+ below are visited in turn as well.
        row.each_index do |col_index|
          node = row[col_index]
          headers[col_index].children << node

          # A cell spanning several columns is repeated in the next column
          # with one fewer column left to cover.
          if node.colspan > 1
            row.insert(col_index + 1, TableNode.new(node.element, node.rowspan, node.colspan - 1))
          end

          # A cell spanning several rows is repeated in the row below. The
          # copy covers a single column: the colspan copies made above carry
          # the same rowspan and propagate downwards on their own.
          if node.rowspan > 1 && next_row
            next_row.insert(col_index, TableNode.new(node.element, node.rowspan - 1, 1))
          end
        end
      end

      data
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module TableParser
  # An HTML table parsed into column headers, each holding the cells of its
  # column.
  class Table
    attr_reader :nodes, :headers

    # Parses the table selected by +xpath_to_table+ out of +input+.
    #
    # input          - HTML markup containing the table.
    # xpath_to_table - XPath expression selecting the table. Defaults to the
    #                  first table in the document; XPath positions start at
    #                  1, so a "[0]" predicate would never match anything.
    def initialize(input, xpath_to_table = "(//table)[1]")
      rows = Parser.extract_table(input, xpath_to_table)
      # Headers must be extracted before nodes: both calls receive the same
      # +rows+ array, and the nodes are built from what remains afterwards.
      @headers = Parser.extract_headers(rows)
      @nodes = Parser.extract_nodes(rows, @headers)
    end

    # Returns a String listing every header, one per line.
    def to_s
      "Table<#{@headers.map(&:to_s).join("\n")}>"
    end

    # Returns the number of columns.
    def header_count
      @headers.size
    end

    # Returns the header (column) at +index+.
    def [](index)
      @headers[index]
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module TableParser
  # A header cell that also collects the nodes belonging to its column.
  class TableHeader < TableNode
    attr_reader :children

    # Takes the same arguments as TableNode#initialize and starts with an
    # empty list of children.
    def initialize(element, rowspan=nil, colspan=nil)
      super
      @children = []
    end

    # Number of nodes collected under this header.
    def size
      children.size
    end

    # Child node at +index+.
    def [](index)
      children[index]
    end

    # Header text followed by the comma-separated children.
    def to_s
      listing = children.map { |child| child.to_s }.join(",")
      "[name=#{text}, children=#{listing}]"
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module TableParser
  # A single table cell together with the number of rows and columns it
  # spans.
  class TableNode
    attr_reader :element, :text, :rowspan, :colspan

    # element - the cell; must respond to #text and to #[] for attribute
    #           lookup.
    # rowspan - explicit row span; when nil it is read from the element's
    #           "rowspan" attribute.
    # colspan - explicit column span; when nil it is read from the element's
    #           "colspan" attribute.
    #
    # A span read from the element defaults to 1 when the attribute is
    # missing, non-numeric or less than 1.
    def initialize(element, rowspan=nil, colspan=nil)
      @element = element
      @text = element.text

      @colspan = colspan || span_from(element, "colspan")
      @rowspan = rowspan || span_from(element, "rowspan")
    end

    def to_s
      "[#{@text}]"
    end

    private

    # Reads a span attribute, never returning less than 1. A missing
    # attribute converts to 0 via nil.to_i, which is why the lower bound is
    # applied explicitly. Lookup failures fall back to 1, keeping the
    # best-effort behaviour callers already rely on.
    def span_from(element, attribute)
      [element[attribute].to_i, 1].max
    rescue StandardError
      1
    end
  end
end
|
data/lib/table_parser.rb
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
require "test/unit"
|
2
|
+
require "table_parser"
|
3
|
+
|
4
|
+
# Exercises TableParser::Table against tables that use rowspan and colspan,
# checking that every column ends up with one node per data row.
class TestTableParser < Test::Unit::TestCase
  # A rowspan cell must be repeated in the row below it.
  def test_parse_rowspan
    html = '<html><body><table><tr><td>A</td><td>B</td></tr>' \
           '<tr><td rowspan="2">1</td><td>2</td></tr> ' \
           '<tr><td>3</td></tr></table></body></html>'
    table = TableParser::Table.new html, "/html/body/table"

    assert_equal(2, table.header_count, 'header_count should = 2 ')
    assert_equal(2, table[0].size)
    assert_equal(2, table[1].size)
  end

  # A header with colspan must produce one column per spanned cell.
  def test_parse_colspan
    html = '<html><body><table><tr><td>A</td><td colspan="2">B</td></tr>' \
           '<tr><td rowspan="2">A1</td><td>B1</td><td>C1</td></tr> ' \
           '<tr><td>B2</td><td>C2</td></tr>' \
           '<tr><td>A3</td><td>B3</td><td>C3</td></tr><tr><td>A4</td><td>B4</td><td>C4</td></tr></table></body></html>'
    table = TableParser::Table.new html, "/html/body/table"

    assert_equal(3, table.header_count, 'header_count should = 3 ')
    assert_equal(4, table[0].size)
    assert_equal(4, table[1].size)
    assert_equal(4, table[2].size)
  end

  # Several overlapping rowspans within the same rows.
  def test_parse_complex
    html = '<html><body><table><tr><td>Header1</td><td>Header2</td><td>Header3</td><td>Header4</td></tr>' \
           '<tr><td rowspan="3">A1</td><td>A2</td><td rowspan="2">A3</td><td>B4</td></tr>' \
           '<tr><td>B2</td><td>B4</td></tr>' \
           '<tr><td>C2</td><td>C3</td><td>B4</td></tr>' \
           '</table></body></html>'
    table = TableParser::Table.new html, "/html/body/table"

    assert_equal 4, table.header_count
    assert_equal 3, table[0].size
    assert_equal 3, table[1].size
    assert_equal 3, table[2].size
    # The last column was previously left unchecked.
    assert_equal 3, table[3].size
  end

  # Rowspans that start and end at different rows across a longer table.
  def test_parse_complex2
    html = '<html><body><table><tr><td>Header1</td><td>Header2</td><td>Header3</td><td>Header4</td></tr>' \
           '<tr><td rowspan="3">A1</td><td>A2</td><td rowspan="2">A3</td><td>A4</td></tr>' \
           '<tr><td>B2</td><td>B4</td></tr>' \
           '<tr><td>C2</td><td rowspan="2">C3</td><td>C4</td></tr>' \
           '<tr><td rowspan="3">D1</td><td>D2</td><td>D4</td></tr>' \
           '<tr><td>E2</td><td rowspan="2">E3</td><td>E4</td></tr>' \
           '<tr><td>F2</td><td>F4</td></tr>' \
           '<tr><td rowspan="3">G1</td><td>G2</td><td rowspan="2">G3</td><td>G4</td></tr>' \
           '<tr><td>H2</td><td>H4</td></tr>' \
           '<tr><td>I2</td><td>I3</td><td>I4</td></tr>' \
           '</table></body></html>'
    table = TableParser::Table.new html, "/html/body/table"

    assert_equal 4, table.header_count
    assert_equal 9, table[0].size
    assert_equal 9, table[1].size
    assert_equal 9, table[2].size
    assert_equal 9, table[3].size
  end

  # Parses a saved page; expects a test.html fixture in the working
  # directory.
  def test_parse_web
    # File.read closes the file itself, unlike open(...).read.
    table = TableParser::Table.new File.read("test.html"),
                                   "/html/body/table"

    assert_equal 11, table.header_count
    assert_equal 9, table[0].size
    assert_equal 9, table[1].size
    assert_equal 9, table[2].size
    assert_equal 9, table[3].size
  end
end
|
metadata
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: table_parser
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.2.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Francis Chong
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-01-05 00:00:00 +08:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: hoe
|
17
|
+
type: :development
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 2.4.0
|
24
|
+
version:
|
25
|
+
description: Parsing table could be difficult when its structure contains colspan or rowspan. TableParser parser HTML tables, group them by columns, with colspan and rowspan respected.
|
26
|
+
email:
|
27
|
+
- francis@ignition.hk
|
28
|
+
executables: []
|
29
|
+
|
30
|
+
extensions: []
|
31
|
+
|
32
|
+
extra_rdoc_files:
|
33
|
+
- History.txt
|
34
|
+
- Manifest.txt
|
35
|
+
- README.txt
|
36
|
+
files:
|
37
|
+
- .autotest
|
38
|
+
- History.txt
|
39
|
+
- Manifest.txt
|
40
|
+
- README.txt
|
41
|
+
- Rakefile
|
42
|
+
- lib/table_parser.rb
|
43
|
+
- lib/table_parser/parser.rb
|
44
|
+
- lib/table_parser/table.rb
|
45
|
+
- lib/table_parser/table_header.rb
|
46
|
+
- lib/table_parser/table_node.rb
|
47
|
+
- test/test_table_parser.rb
|
48
|
+
has_rdoc: true
|
49
|
+
homepage:
|
50
|
+
licenses: []
|
51
|
+
|
52
|
+
post_install_message:
|
53
|
+
rdoc_options:
|
54
|
+
- --main
|
55
|
+
- README.txt
|
56
|
+
require_paths:
|
57
|
+
- lib
|
58
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
59
|
+
requirements:
|
60
|
+
- - ">="
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: "0"
|
63
|
+
version:
|
64
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: "0"
|
69
|
+
version:
|
70
|
+
requirements: []
|
71
|
+
|
72
|
+
rubyforge_project: table_parser
|
73
|
+
rubygems_version: 1.3.5
|
74
|
+
signing_key:
|
75
|
+
specification_version: 3
|
76
|
+
summary: Parsing table could be difficult when its structure contains colspan or rowspan
|
77
|
+
test_files:
|
78
|
+
- test/test_table_parser.rb
|