table_parser 0.5.7 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/table_parser/parser.rb +37 -27
- data/lib/table_parser/table_node.rb +15 -5
- data/lib/table_parser.rb +1 -1
- data/test/test_table_parser.rb +74 -30
- metadata +2 -2
data/lib/table_parser/parser.rb
CHANGED
@@ -42,44 +42,54 @@ module TableParser
|
|
42
42
|
headers
|
43
43
|
end
|
44
44
|
|
45
|
-
def self.extract_nodes(
|
46
|
-
data =
|
45
|
+
def self.extract_nodes(input_rows, headers, dup_rows, dup_cols)
|
46
|
+
data = input_rows.collect do |row|
|
47
47
|
row.collect do |ele|
|
48
48
|
node = TableNode.new(ele)
|
49
49
|
end
|
50
50
|
end
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
if headers[col_index]
|
58
|
-
headers[col_index].children << col if col.class != EmptyTableNode
|
51
|
+
|
52
|
+
columns = []
|
53
|
+
curr_x = 0
|
54
|
+
curr_y = 0
|
55
|
+
data.each do |row|
|
56
|
+
columns[curr_y] = [] unless columns[curr_y]
|
59
57
|
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
row.insert(col_index, EmptyTableNode.new(col.rowspan, col.colspan - 1))
|
65
|
-
end
|
66
|
-
end
|
67
|
-
|
68
|
-
if col.rowspan > 1
|
69
|
-
unless data[row_index+1]
|
70
|
-
data.insert(row_index, [])
|
71
|
-
end
|
58
|
+
curr_y = 0
|
59
|
+
row.each do |node|
|
60
|
+
rowspan = node.rowspan - 1
|
61
|
+
colspan = node.colspan - 1
|
72
62
|
|
73
|
-
|
74
|
-
|
63
|
+
(0..rowspan).each do |x|
|
64
|
+
(0..colspan).each do |y|
|
65
|
+
columns[curr_y+y] = [] unless columns[curr_y+y]
|
66
|
+
if (x == 0 || dup_rows) && (y == 0 || dup_cols)
|
67
|
+
while columns[curr_y+y][curr_x+x]
|
68
|
+
curr_y += 1
|
69
|
+
end
|
70
|
+
columns[curr_y+y][curr_x+x] = node
|
75
71
|
else
|
76
|
-
|
72
|
+
while columns[curr_y+y][curr_x+x]
|
73
|
+
curr_y += 1
|
74
|
+
end
|
75
|
+
columns[curr_y+y][curr_x+x] = EmptyTableNode.new(node.element)
|
77
76
|
end
|
78
77
|
end
|
79
78
|
end
|
79
|
+
curr_y += 1
|
80
80
|
end
|
81
|
-
|
82
|
-
|
81
|
+
curr_x += 1
|
82
|
+
end
|
83
|
+
|
84
|
+
columns.each_index do |col_index|
|
85
|
+
columns[col_index].each do |node|
|
86
|
+
if headers[col_index]
|
87
|
+
headers[col_index].children << node unless node.class == EmptyTableNode
|
88
|
+
end
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
columns
|
83
93
|
end
|
84
94
|
|
85
95
|
end
|
@@ -3,9 +3,19 @@ module TableParser
|
|
3
3
|
attr_reader :element, :text, :rowspan, :colspan
|
4
4
|
def initialize(element, rowspan=nil, colspan=nil)
|
5
5
|
@element = element
|
6
|
-
@text = element.text.strip rescue ""
|
7
|
-
|
8
|
-
|
6
|
+
@text = element.text.strip rescue ""
|
7
|
+
|
8
|
+
if element.nil?
|
9
|
+
@colspan = colspan || 1
|
10
|
+
@rowspan = rowspan || 1
|
11
|
+
else
|
12
|
+
@colspan = colspan || element["colspan"].nil? ? 1 : element["colspan"].to_i
|
13
|
+
@rowspan = rowspan || element["rowspan"].nil? ? 1 : element["rowspan"].to_i
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
def span(row, col)
|
18
|
+
TableNode.new(element, rowspan-row, colspan-col)
|
9
19
|
end
|
10
20
|
|
11
21
|
def to_s
|
@@ -17,8 +27,8 @@ module TableParser
|
|
17
27
|
def initialize(rowspan=nil, colspan=nil)
|
18
28
|
@element = nil
|
19
29
|
@text = ""
|
20
|
-
@colspan = colspan ||
|
21
|
-
@rowspan = rowspan ||
|
30
|
+
@colspan = colspan || 1
|
31
|
+
@rowspan = rowspan || 1
|
22
32
|
end
|
23
33
|
end
|
24
34
|
end
|
data/lib/table_parser.rb
CHANGED
data/test/test_table_parser.rb
CHANGED
@@ -5,15 +5,28 @@ require 'open-uri'
|
|
5
5
|
|
6
6
|
class TestTableParser < Test::Unit::TestCase
|
7
7
|
def test_parse_rowspan
|
8
|
-
html = "
|
9
|
-
<tr><td rowspan=\"2\">1</td><td>2</td></tr> \
|
10
|
-
<tr><td>3</td></tr></table></body></html>"
|
8
|
+
html = open("rowspan.html").read
|
11
9
|
doc = Nokogiri::HTML(html)
|
12
|
-
table = TableParser::Table.new doc, "/html/body/table"
|
10
|
+
table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => false, :dup_cols => false}
|
11
|
+
assert_equal(2, table.columns.size, 'header_count should = 2 ')
|
12
|
+
assert_equal(2, table[0].size)
|
13
|
+
assert_equal(2, table[1].size)
|
13
14
|
|
15
|
+
table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => true, :dup_cols => false}
|
16
|
+
assert_equal(2, table.columns.size, 'header_count should = 2 ')
|
17
|
+
assert_equal(3, table[0].size)
|
18
|
+
assert_equal(3, table[1].size)
|
19
|
+
|
20
|
+
table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => false, :dup_cols => true}
|
14
21
|
assert_equal(2, table.columns.size, 'header_count should = 2 ')
|
15
22
|
assert_equal(2, table[0].size)
|
16
23
|
assert_equal(2, table[1].size)
|
24
|
+
|
25
|
+
table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => true, :dup_cols => true}
|
26
|
+
assert_equal(2, table.columns.size, 'header_count should = 2 ')
|
27
|
+
assert_equal(3, table[0].size)
|
28
|
+
assert_equal(3, table[1].size)
|
29
|
+
|
17
30
|
end
|
18
31
|
|
19
32
|
def test_parse_rowspan_disable_dup
|
@@ -29,27 +42,27 @@ class TestTableParser < Test::Unit::TestCase
|
|
29
42
|
end
|
30
43
|
|
31
44
|
def test_parse_colspan
|
32
|
-
html = "
|
33
|
-
<tr><td>A</td><td colspan=\"2\">B</td></tr>\
|
34
|
-
<tr><td rowspan=\"2\">A1</td><td>B1</td><td>C1</td></tr> \
|
35
|
-
<tr><td>B2</td><td>C2</td></tr>\
|
36
|
-
<tr><td>A3</td><td>B3</td><td>C3</td></tr>\
|
37
|
-
<tr><td>A4</td><td colspan=\"2\" rowspan=\"2\">B4</td></tr>\
|
38
|
-
<tr><td>A5</td></tr>\
|
39
|
-
<tr><td rowspan=\"2\">A1</td><td>B1</td><td>C1</td></tr> \
|
40
|
-
<tr><td>B2</td><td>C2</td></tr>\
|
41
|
-
<tr><td>A3</td><td>B3</td><td>C3</td></tr>\
|
42
|
-
<tr><td>A4</td><td colspan=\"2\" rowspan=\"2\">B4</td></tr>\
|
43
|
-
<tr><td>A5</td></tr>\
|
44
|
-
</table></body></html>"
|
45
|
+
html = open("colspan.html").read
|
45
46
|
doc = Nokogiri::HTML(html)
|
46
47
|
table = TableParser::Table.new doc, "/html/body/table"
|
47
|
-
puts table
|
48
|
-
|
49
48
|
assert_equal(3, table.columns.size, 'header_count should = 3 ')
|
50
49
|
assert_equal(10, table[0].size)
|
51
50
|
assert_equal(10, table[1].size)
|
52
51
|
assert_equal(10, table[2].size)
|
52
|
+
|
53
|
+
table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => false}
|
54
|
+
puts table.to_s
|
55
|
+
assert_equal(3, table.columns.size, 'header_count should = 3 ')
|
56
|
+
assert_equal(8, table[0].size)
|
57
|
+
assert_equal(8, table[1].size)
|
58
|
+
assert_equal(8, table[2].size)
|
59
|
+
|
60
|
+
table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => false, :dup_cols => false}
|
61
|
+
puts table.to_s
|
62
|
+
assert_equal(3, table.columns.size, 'header_count should = 3 ')
|
63
|
+
assert_equal(8, table[0].size)
|
64
|
+
assert_equal(8, table[1].size)
|
65
|
+
assert_equal(6, table[2].size)
|
53
66
|
end
|
54
67
|
|
55
68
|
def test_parse_complex
|
@@ -68,17 +81,7 @@ class TestTableParser < Test::Unit::TestCase
|
|
68
81
|
end
|
69
82
|
|
70
83
|
def test_parse_complex2
|
71
|
-
html = "
|
72
|
-
<tr><td rowspan=\"3\">A1</td><td>A2</td><td rowspan=\"2\">A3</td><td>A4</td></tr>\
|
73
|
-
<tr><td>B2</td><td>B4</td></tr>\
|
74
|
-
<tr><td>C2</td><td rowspan=\"2\">C3</td><td>C4</td></tr>\
|
75
|
-
<tr><td rowspan=\"3\">D1</td><td>D2</td><td>D4</td></tr>\
|
76
|
-
<tr><td>E2</td><td rowspan=\"2\">E3</td><td>E4</td></tr>\
|
77
|
-
<tr><td>F2</td><td>F4</td></tr>\
|
78
|
-
<tr><td rowspan=\"3\">G1</td><td>G2</td><td rowspan=\"2\">G3</td><td>G4</td></tr>\
|
79
|
-
<tr><td>H2</td><td>H4</td></tr>\
|
80
|
-
<tr><td>I2</td><td>I3</td><td>I4</td></tr>\
|
81
|
-
</table></body></html>"
|
84
|
+
html = open("complex2.html").read
|
82
85
|
doc = Nokogiri::HTML(html)
|
83
86
|
table = TableParser::Table.new doc, "/html/body/table"
|
84
87
|
|
@@ -87,6 +90,15 @@ class TestTableParser < Test::Unit::TestCase
|
|
87
90
|
assert_equal 9, table[1].size
|
88
91
|
assert_equal 9, table[2].size
|
89
92
|
assert_equal 9, table[3].size
|
93
|
+
|
94
|
+
table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => false}
|
95
|
+
puts "table=" + table.to_s
|
96
|
+
assert_equal 4, table.columns.size
|
97
|
+
assert_equal 3, table[0].size
|
98
|
+
assert_equal 9, table[1].size
|
99
|
+
assert_equal 5, table[2].size
|
100
|
+
assert_equal 9, table[3].size
|
101
|
+
|
90
102
|
end
|
91
103
|
|
92
104
|
def test_parse_noheader
|
@@ -101,6 +113,37 @@ class TestTableParser < Test::Unit::TestCase
|
|
101
113
|
assert_equal(3, table[1].size)
|
102
114
|
end
|
103
115
|
|
116
|
+
def test_parse_complex_colrowspan
|
117
|
+
html = open("table_rowcol.html").read
|
118
|
+
|
119
|
+
doc = Nokogiri::HTML(html)
|
120
|
+
table = TableParser::Table.new doc, "/html/body/table", {:dup_cols => false, :dup_rows => false}
|
121
|
+
puts table
|
122
|
+
assert_equal(5, table.columns.size, 'header_count should = 5 ')
|
123
|
+
assert_equal(1, table[0].size)
|
124
|
+
assert_equal(3, table[1].size)
|
125
|
+
assert_equal(3, table[2].size)
|
126
|
+
assert_equal(4, table[3].size)
|
127
|
+
assert_equal(5, table[4].size)
|
128
|
+
|
129
|
+
table = TableParser::Table.new doc, "/html/body/table", {:dup_cols => false, :dup_rows => true}
|
130
|
+
puts table
|
131
|
+
assert_equal(5, table.columns.size, 'header_count should = 5 ')
|
132
|
+
assert_equal(5, table[0].size)
|
133
|
+
assert_equal(5, table[1].size)
|
134
|
+
assert_equal(3, table[2].size)
|
135
|
+
assert_equal(5, table[3].size)
|
136
|
+
assert_equal(5, table[4].size)
|
137
|
+
|
138
|
+
table = TableParser::Table.new doc, "/html/body/table", {:dup_cols => true, :dup_rows => true}
|
139
|
+
puts table
|
140
|
+
assert_equal(5, table.columns.size, 'header_count should = 5 ')
|
141
|
+
assert_equal(5, table[0].size)
|
142
|
+
assert_equal(5, table[1].size)
|
143
|
+
assert_equal(5, table[2].size)
|
144
|
+
assert_equal(5, table[3].size)
|
145
|
+
assert_equal(5, table[4].size)
|
146
|
+
end
|
104
147
|
|
105
148
|
def test_web
|
106
149
|
html = open("test4.html").read
|
@@ -108,4 +151,5 @@ class TestTableParser < Test::Unit::TestCase
|
|
108
151
|
table = TableParser::Table.new doc, "/html/body/div/div[3]/div/div[2]/table", {:header => false, :dup_rows => false}
|
109
152
|
puts table.columns[0].size
|
110
153
|
end
|
154
|
+
|
111
155
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: table_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.7
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Francis Chong
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-01-
|
12
|
+
date: 2010-01-27 00:00:00 +08:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|