table_parser 0.5.7 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/table_parser/parser.rb +37 -27
- data/lib/table_parser/table_node.rb +15 -5
- data/lib/table_parser.rb +1 -1
- data/test/test_table_parser.rb +74 -30
- metadata +2 -2
data/lib/table_parser/parser.rb
CHANGED
@@ -42,44 +42,54 @@ module TableParser
|
|
42
42
|
headers
|
43
43
|
end
|
44
44
|
|
45
|
-
def self.extract_nodes(
|
46
|
-
data =
|
45
|
+
def self.extract_nodes(input_rows, headers, dup_rows, dup_cols)
|
46
|
+
data = input_rows.collect do |row|
|
47
47
|
row.collect do |ele|
|
48
48
|
node = TableNode.new(ele)
|
49
49
|
end
|
50
50
|
end
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
if headers[col_index]
|
58
|
-
headers[col_index].children << col if col.class != EmptyTableNode
|
51
|
+
|
52
|
+
columns = []
|
53
|
+
curr_x = 0
|
54
|
+
curr_y = 0
|
55
|
+
data.each do |row|
|
56
|
+
columns[curr_y] = [] unless columns[curr_y]
|
59
57
|
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
row.insert(col_index, EmptyTableNode.new(col.rowspan, col.colspan - 1))
|
65
|
-
end
|
66
|
-
end
|
67
|
-
|
68
|
-
if col.rowspan > 1
|
69
|
-
unless data[row_index+1]
|
70
|
-
data.insert(row_index, [])
|
71
|
-
end
|
58
|
+
curr_y = 0
|
59
|
+
row.each do |node|
|
60
|
+
rowspan = node.rowspan - 1
|
61
|
+
colspan = node.colspan - 1
|
72
62
|
|
73
|
-
|
74
|
-
|
63
|
+
(0..rowspan).each do |x|
|
64
|
+
(0..colspan).each do |y|
|
65
|
+
columns[curr_y+y] = [] unless columns[curr_y+y]
|
66
|
+
if (x == 0 || dup_rows) && (y == 0 || dup_cols)
|
67
|
+
while columns[curr_y+y][curr_x+x]
|
68
|
+
curr_y += 1
|
69
|
+
end
|
70
|
+
columns[curr_y+y][curr_x+x] = node
|
75
71
|
else
|
76
|
-
|
72
|
+
while columns[curr_y+y][curr_x+x]
|
73
|
+
curr_y += 1
|
74
|
+
end
|
75
|
+
columns[curr_y+y][curr_x+x] = EmptyTableNode.new(node.element)
|
77
76
|
end
|
78
77
|
end
|
79
78
|
end
|
79
|
+
curr_y += 1
|
80
80
|
end
|
81
|
-
|
82
|
-
|
81
|
+
curr_x += 1
|
82
|
+
end
|
83
|
+
|
84
|
+
columns.each_index do |col_index|
|
85
|
+
columns[col_index].each do |node|
|
86
|
+
if headers[col_index]
|
87
|
+
headers[col_index].children << node unless node.class == EmptyTableNode
|
88
|
+
end
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
columns
|
83
93
|
end
|
84
94
|
|
85
95
|
end
|
@@ -3,9 +3,19 @@ module TableParser
|
|
3
3
|
attr_reader :element, :text, :rowspan, :colspan
|
4
4
|
def initialize(element, rowspan=nil, colspan=nil)
|
5
5
|
@element = element
|
6
|
-
@text = element.text.strip rescue ""
|
7
|
-
|
8
|
-
|
6
|
+
@text = element.text.strip rescue ""
|
7
|
+
|
8
|
+
if element.nil?
|
9
|
+
@colspan = colspan || 1
|
10
|
+
@rowspan = rowspan || 1
|
11
|
+
else
|
12
|
+
@colspan = colspan || element["colspan"].nil? ? 1 : element["colspan"].to_i
|
13
|
+
@rowspan = rowspan || element["rowspan"].nil? ? 1 : element["rowspan"].to_i
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
def span(row, col)
|
18
|
+
TableNode.new(element, rowspan-row, colspan-col)
|
9
19
|
end
|
10
20
|
|
11
21
|
def to_s
|
@@ -17,8 +27,8 @@ module TableParser
|
|
17
27
|
def initialize(rowspan=nil, colspan=nil)
|
18
28
|
@element = nil
|
19
29
|
@text = ""
|
20
|
-
@colspan = colspan ||
|
21
|
-
@rowspan = rowspan ||
|
30
|
+
@colspan = colspan || 1
|
31
|
+
@rowspan = rowspan || 1
|
22
32
|
end
|
23
33
|
end
|
24
34
|
end
|
data/lib/table_parser.rb
CHANGED
data/test/test_table_parser.rb
CHANGED
@@ -5,15 +5,28 @@ require 'open-uri'
|
|
5
5
|
|
6
6
|
class TestTableParser < Test::Unit::TestCase
|
7
7
|
def test_parse_rowspan
|
8
|
-
html = "
|
9
|
-
<tr><td rowspan=\"2\">1</td><td>2</td></tr> \
|
10
|
-
<tr><td>3</td></tr></table></body></html>"
|
8
|
+
html = open("rowspan.html").read
|
11
9
|
doc = Nokogiri::HTML(html)
|
12
|
-
table = TableParser::Table.new doc, "/html/body/table"
|
10
|
+
table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => false, :dup_cols => false}
|
11
|
+
assert_equal(2, table.columns.size, 'header_count should = 2 ')
|
12
|
+
assert_equal(2, table[0].size)
|
13
|
+
assert_equal(2, table[1].size)
|
13
14
|
|
15
|
+
table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => true, :dup_cols => false}
|
16
|
+
assert_equal(2, table.columns.size, 'header_count should = 2 ')
|
17
|
+
assert_equal(3, table[0].size)
|
18
|
+
assert_equal(3, table[1].size)
|
19
|
+
|
20
|
+
table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => false, :dup_cols => true}
|
14
21
|
assert_equal(2, table.columns.size, 'header_count should = 2 ')
|
15
22
|
assert_equal(2, table[0].size)
|
16
23
|
assert_equal(2, table[1].size)
|
24
|
+
|
25
|
+
table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => true, :dup_cols => true}
|
26
|
+
assert_equal(2, table.columns.size, 'header_count should = 2 ')
|
27
|
+
assert_equal(3, table[0].size)
|
28
|
+
assert_equal(3, table[1].size)
|
29
|
+
|
17
30
|
end
|
18
31
|
|
19
32
|
def test_parse_rowspan_disable_dup
|
@@ -29,27 +42,27 @@ class TestTableParser < Test::Unit::TestCase
|
|
29
42
|
end
|
30
43
|
|
31
44
|
def test_parse_colspan
|
32
|
-
html = "
|
33
|
-
<tr><td>A</td><td colspan=\"2\">B</td></tr>\
|
34
|
-
<tr><td rowspan=\"2\">A1</td><td>B1</td><td>C1</td></tr> \
|
35
|
-
<tr><td>B2</td><td>C2</td></tr>\
|
36
|
-
<tr><td>A3</td><td>B3</td><td>C3</td></tr>\
|
37
|
-
<tr><td>A4</td><td colspan=\"2\" rowspan=\"2\">B4</td></tr>\
|
38
|
-
<tr><td>A5</td></tr>\
|
39
|
-
<tr><td rowspan=\"2\">A1</td><td>B1</td><td>C1</td></tr> \
|
40
|
-
<tr><td>B2</td><td>C2</td></tr>\
|
41
|
-
<tr><td>A3</td><td>B3</td><td>C3</td></tr>\
|
42
|
-
<tr><td>A4</td><td colspan=\"2\" rowspan=\"2\">B4</td></tr>\
|
43
|
-
<tr><td>A5</td></tr>\
|
44
|
-
</table></body></html>"
|
45
|
+
html = open("colspan.html").read
|
45
46
|
doc = Nokogiri::HTML(html)
|
46
47
|
table = TableParser::Table.new doc, "/html/body/table"
|
47
|
-
puts table
|
48
|
-
|
49
48
|
assert_equal(3, table.columns.size, 'header_count should = 3 ')
|
50
49
|
assert_equal(10, table[0].size)
|
51
50
|
assert_equal(10, table[1].size)
|
52
51
|
assert_equal(10, table[2].size)
|
52
|
+
|
53
|
+
table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => false}
|
54
|
+
puts table.to_s
|
55
|
+
assert_equal(3, table.columns.size, 'header_count should = 3 ')
|
56
|
+
assert_equal(8, table[0].size)
|
57
|
+
assert_equal(8, table[1].size)
|
58
|
+
assert_equal(8, table[2].size)
|
59
|
+
|
60
|
+
table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => false, :dup_cols => false}
|
61
|
+
puts table.to_s
|
62
|
+
assert_equal(3, table.columns.size, 'header_count should = 3 ')
|
63
|
+
assert_equal(8, table[0].size)
|
64
|
+
assert_equal(8, table[1].size)
|
65
|
+
assert_equal(6, table[2].size)
|
53
66
|
end
|
54
67
|
|
55
68
|
def test_parse_complex
|
@@ -68,17 +81,7 @@ class TestTableParser < Test::Unit::TestCase
|
|
68
81
|
end
|
69
82
|
|
70
83
|
def test_parse_complex2
|
71
|
-
html = "
|
72
|
-
<tr><td rowspan=\"3\">A1</td><td>A2</td><td rowspan=\"2\">A3</td><td>A4</td></tr>\
|
73
|
-
<tr><td>B2</td><td>B4</td></tr>\
|
74
|
-
<tr><td>C2</td><td rowspan=\"2\">C3</td><td>C4</td></tr>\
|
75
|
-
<tr><td rowspan=\"3\">D1</td><td>D2</td><td>D4</td></tr>\
|
76
|
-
<tr><td>E2</td><td rowspan=\"2\">E3</td><td>E4</td></tr>\
|
77
|
-
<tr><td>F2</td><td>F4</td></tr>\
|
78
|
-
<tr><td rowspan=\"3\">G1</td><td>G2</td><td rowspan=\"2\">G3</td><td>G4</td></tr>\
|
79
|
-
<tr><td>H2</td><td>H4</td></tr>\
|
80
|
-
<tr><td>I2</td><td>I3</td><td>I4</td></tr>\
|
81
|
-
</table></body></html>"
|
84
|
+
html = open("complex2.html").read
|
82
85
|
doc = Nokogiri::HTML(html)
|
83
86
|
table = TableParser::Table.new doc, "/html/body/table"
|
84
87
|
|
@@ -87,6 +90,15 @@ class TestTableParser < Test::Unit::TestCase
|
|
87
90
|
assert_equal 9, table[1].size
|
88
91
|
assert_equal 9, table[2].size
|
89
92
|
assert_equal 9, table[3].size
|
93
|
+
|
94
|
+
table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => false}
|
95
|
+
puts "table=" + table.to_s
|
96
|
+
assert_equal 4, table.columns.size
|
97
|
+
assert_equal 3, table[0].size
|
98
|
+
assert_equal 9, table[1].size
|
99
|
+
assert_equal 5, table[2].size
|
100
|
+
assert_equal 9, table[3].size
|
101
|
+
|
90
102
|
end
|
91
103
|
|
92
104
|
def test_parse_noheader
|
@@ -101,6 +113,37 @@ class TestTableParser < Test::Unit::TestCase
|
|
101
113
|
assert_equal(3, table[1].size)
|
102
114
|
end
|
103
115
|
|
116
|
+
def test_parse_complex_colrowspan
|
117
|
+
html = open("table_rowcol.html").read
|
118
|
+
|
119
|
+
doc = Nokogiri::HTML(html)
|
120
|
+
table = TableParser::Table.new doc, "/html/body/table", {:dup_cols => false, :dup_rows => false}
|
121
|
+
puts table
|
122
|
+
assert_equal(5, table.columns.size, 'header_count should = 5 ')
|
123
|
+
assert_equal(1, table[0].size)
|
124
|
+
assert_equal(3, table[1].size)
|
125
|
+
assert_equal(3, table[2].size)
|
126
|
+
assert_equal(4, table[3].size)
|
127
|
+
assert_equal(5, table[4].size)
|
128
|
+
|
129
|
+
table = TableParser::Table.new doc, "/html/body/table", {:dup_cols => false, :dup_rows => true}
|
130
|
+
puts table
|
131
|
+
assert_equal(5, table.columns.size, 'header_count should = 5 ')
|
132
|
+
assert_equal(5, table[0].size)
|
133
|
+
assert_equal(5, table[1].size)
|
134
|
+
assert_equal(3, table[2].size)
|
135
|
+
assert_equal(5, table[3].size)
|
136
|
+
assert_equal(5, table[4].size)
|
137
|
+
|
138
|
+
table = TableParser::Table.new doc, "/html/body/table", {:dup_cols => true, :dup_rows => true}
|
139
|
+
puts table
|
140
|
+
assert_equal(5, table.columns.size, 'header_count should = 5 ')
|
141
|
+
assert_equal(5, table[0].size)
|
142
|
+
assert_equal(5, table[1].size)
|
143
|
+
assert_equal(5, table[2].size)
|
144
|
+
assert_equal(5, table[3].size)
|
145
|
+
assert_equal(5, table[4].size)
|
146
|
+
end
|
104
147
|
|
105
148
|
def test_web
|
106
149
|
html = open("test4.html").read
|
@@ -108,4 +151,5 @@ class TestTableParser < Test::Unit::TestCase
|
|
108
151
|
table = TableParser::Table.new doc, "/html/body/div/div[3]/div/div[2]/table", {:header => false, :dup_rows => false}
|
109
152
|
puts table.columns[0].size
|
110
153
|
end
|
154
|
+
|
111
155
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: table_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Francis Chong
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-01-
|
12
|
+
date: 2010-01-27 00:00:00 +08:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|