table_parser 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.txt +33 -7
- data/lib/table_parser/parser.rb +8 -5
- data/lib/table_parser/table.rb +3 -3
- data/lib/table_parser/table_column.rb +1 -1
- data/lib/table_parser/table_node.rb +10 -2
- data/lib/table_parser.rb +1 -1
- data/test/test_table_parser.rb +13 -0
- metadata +1 -1
data/README.txt
CHANGED
@@ -12,20 +12,46 @@ Parsing table could be difficult when its structure contains colspan or rowspan.
|
|
12
12
|
|
13
13
|
* sudo gem install table_parser
|
14
14
|
|
15
|
-
== DEVELOPERS:
|
16
15
|
|
17
|
-
|
16
|
+
== USAGE:
|
17
|
+
|
18
|
+
Use TableParser::Table to parse an HTML table.
|
19
|
+
|
20
|
+
For example, the following code:
|
21
|
+
<pre>
|
22
|
+
html = "<html><body><table><tr><td>A</td><td>B</td></tr>\
|
23
|
+
<tr><td rowspan=\"2\">1</td><td>2</td></tr> \
|
24
|
+
<tr><td>3</td></tr></table></body></html>"
|
25
|
+
table = TableParser::Table.new(html, "/html/body/table")
|
26
|
+
</pre>
|
27
|
+
|
28
|
+
Results in the following parsed table:
|
18
29
|
|
19
|
-
|
30
|
+
<pre>
|
31
|
+
Table<TableColumn<name=A, children=[1],[1]>,TableColumn<name=B, children=[2],[3]>>
|
32
|
+
</pre>
|
33
|
+
Note that the first column contains a duplicated item, because the first data row contains a cell with a "rowspan" attribute. If this is not desired, use the following syntax to skip duplication:
|
34
|
+
<pre>
|
35
|
+
html = "<html><body><table><tr><td>A</td><td>B</td></tr>\
|
36
|
+
<tr><td rowspan=\"2\">1</td><td>2</td></tr> \
|
37
|
+
<tr><td>3</td></tr></table></body></html>"
|
38
|
+
table = TableParser::Table.new(html, "/html/body/table", false)
|
39
|
+
</pre>
|
20
40
|
|
21
|
-
|
41
|
+
Which results in the following parsed table:
|
22
42
|
|
23
|
-
|
24
|
-
|
43
|
+
<pre>
|
44
|
+
Table<TableColumn<name=A, children=[1]>,TableColumn<name=B, children=[2],[3]>>
|
45
|
+
</pre>
|
46
|
+
|
47
|
+
|
48
|
+
== DEVELOPERS:
|
49
|
+
|
50
|
+
* Francis Chong francis at ignition dot hk
|
25
51
|
|
26
52
|
== LICENSE:
|
27
53
|
|
28
|
-
|
54
|
+
The MIT License
|
29
55
|
|
30
56
|
Copyright (c) 2010 Ignition Soft
|
31
57
|
|
data/lib/table_parser/parser.rb
CHANGED
@@ -32,7 +32,7 @@ module TableParser
|
|
32
32
|
headers
|
33
33
|
end
|
34
34
|
|
35
|
-
def self.extract_nodes(rows, headers)
|
35
|
+
def self.extract_nodes(rows, headers, duplicate_colspan)
|
36
36
|
data = rows.collect do |row|
|
37
37
|
row.collect do |ele|
|
38
38
|
node = TableNode.new(ele)
|
@@ -44,18 +44,21 @@ module TableParser
|
|
44
44
|
row = data[row_index]
|
45
45
|
row.each_index do |col_index|
|
46
46
|
col = row[col_index]
|
47
|
-
headers[col_index].children << col
|
47
|
+
headers[col_index].children << col if col.class != EmptyTableNode
|
48
48
|
|
49
49
|
if col.colspan > 1
|
50
50
|
col.insert(col_index, TableNode.new(col.element, col.rowspan, col.colspan - 1))
|
51
51
|
end
|
52
52
|
|
53
53
|
if col.rowspan > 1 && data[row_index+1]
|
54
|
-
|
54
|
+
if duplicate_colspan
|
55
|
+
data[row_index+1].insert(col_index, TableNode.new(col.element, col.rowspan - 1))
|
56
|
+
else
|
57
|
+
data[row_index+1].insert(col_index, EmptyTableNode.new(col.rowspan - 1))
|
58
|
+
end
|
55
59
|
end
|
56
60
|
end
|
57
|
-
end
|
58
|
-
|
61
|
+
end
|
59
62
|
data
|
60
63
|
end
|
61
64
|
|
data/lib/table_parser/table.rb
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
module TableParser
|
2
2
|
class Table
|
3
3
|
attr_reader :nodes, :columns
|
4
|
-
def initialize(input, xpath_to_table="//table[0]")
|
4
|
+
def initialize(input, xpath_to_table="//table[0]", duplicate_colspan=true)
|
5
5
|
table = Parser.extract_table(input, xpath_to_table)
|
6
6
|
@columns = Parser.extract_column_headers(table)
|
7
|
-
@nodes = Parser.extract_nodes(table, @columns)
|
7
|
+
@nodes = Parser.extract_nodes(table, @columns, duplicate_colspan)
|
8
8
|
end
|
9
9
|
|
10
10
|
def to_s
|
11
|
-
"Table<#{@
|
11
|
+
"Table<#{@columns.collect{|h| h.to_s }.join(",")}>"
|
12
12
|
end
|
13
13
|
|
14
14
|
# get column by index
|
@@ -3,8 +3,7 @@ module TableParser
|
|
3
3
|
attr_reader :element, :text, :rowspan, :colspan
|
4
4
|
def initialize(element, rowspan=nil, colspan=nil)
|
5
5
|
@element = element
|
6
|
-
@text = element.text
|
7
|
-
|
6
|
+
@text = element.text
|
8
7
|
@colspan = colspan || element["colspan"].to_i rescue 1
|
9
8
|
@rowspan = rowspan || element["rowspan"].to_i rescue 1
|
10
9
|
end
|
@@ -13,4 +12,13 @@ module TableParser
|
|
13
12
|
"[#{@text}]"
|
14
13
|
end
|
15
14
|
end
|
15
|
+
|
16
|
+
class EmptyTableNode < TableNode
|
17
|
+
def initialize(rowspan=nil, colspan=nil)
|
18
|
+
@element = nil
|
19
|
+
@text = ""
|
20
|
+
@colspan = colspan || element["colspan"].to_i rescue 1
|
21
|
+
@rowspan = rowspan || element["rowspan"].to_i rescue 1
|
22
|
+
end
|
23
|
+
end
|
16
24
|
end
|
data/lib/table_parser.rb
CHANGED
data/test/test_table_parser.rb
CHANGED
@@ -8,11 +8,24 @@ class TestTableParser < Test::Unit::TestCase
|
|
8
8
|
<tr><td>3</td></tr></table></body></html>",
|
9
9
|
"/html/body/table"
|
10
10
|
|
11
|
+
puts table
|
11
12
|
assert_equal(2, table.columns.size, 'header_count should = 2 ')
|
12
13
|
assert_equal(2, table[0].size)
|
13
14
|
assert_equal(2, table[1].size)
|
14
15
|
end
|
15
16
|
|
17
|
+
def test_parse_rowspan_disable_dup
|
18
|
+
table = TableParser::Table.new "<html><body><table><tr><td>A</td><td>B</td></tr>\
|
19
|
+
<tr><td rowspan=\"2\">1</td><td>2</td></tr> \
|
20
|
+
<tr><td>3</td></tr></table></body></html>",
|
21
|
+
"/html/body/table", false
|
22
|
+
|
23
|
+
puts table
|
24
|
+
assert_equal(2, table.columns.size, 'header_count should = 2 ')
|
25
|
+
assert_equal(1, table[0].size)
|
26
|
+
assert_equal(2, table[1].size)
|
27
|
+
end
|
28
|
+
|
16
29
|
def test_parse_colspan
|
17
30
|
table = TableParser::Table.new "<html><body><table><tr><td>A</td><td colspan=\"2\">B</td></tr>\
|
18
31
|
<tr><td rowspan=\"2\">A1</td><td>B1</td><td>C1</td></tr> \
|