table_parser 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.txt +33 -7
- data/lib/table_parser/parser.rb +8 -5
- data/lib/table_parser/table.rb +3 -3
- data/lib/table_parser/table_column.rb +1 -1
- data/lib/table_parser/table_node.rb +10 -2
- data/lib/table_parser.rb +1 -1
- data/test/test_table_parser.rb +13 -0
- metadata +1 -1
data/README.txt
CHANGED
@@ -12,20 +12,46 @@ Parsing table could be difficult when its structure contains colspan or rowspan.
|
|
12
12
|
|
13
13
|
* sudo gem install table_parser
|
14
14
|
|
15
|
-
== DEVELOPERS:
|
16
15
|
|
17
|
-
|
16
|
+
== USAGE:
|
17
|
+
|
18
|
+
Use TableParser::Table to create a parsed HTML table.
|
19
|
+
|
20
|
+
For example, the following code:
|
21
|
+
<pre>
|
22
|
+
html = "<html><body><table><tr><td>A</td><td>B</td></tr>\
|
23
|
+
<tr><td rowspan=\"2\">1</td><td>2</td></tr> \
|
24
|
+
<tr><td>3</td></tr></table></body></html>"
|
25
|
+
table = TableParser::Table.new(html, "/html/body/table")
|
26
|
+
</pre>
|
27
|
+
|
28
|
+
Results in the following parsed table:
|
18
29
|
|
19
|
-
|
30
|
+
<pre>
|
31
|
+
Table<TableColumn<name=A, children=[1],[1]>,TableColumn<name=B, children=[2],[3]>>
|
32
|
+
</pre>
|
33
|
+
Note that the first column contains a duplicated item, because a cell in that column has a "rowspan" attribute. If this is not desired, pass false as the third argument to skip the duplication:
|
34
|
+
<pre>
|
35
|
+
html = "<html><body><table><tr><td>A</td><td>B</td></tr>\
|
36
|
+
<tr><td rowspan=\"2\">1</td><td>2</td></tr> \
|
37
|
+
<tr><td>3</td></tr></table></body></html>"
|
38
|
+
table = TableParser::Table.new(html, "/html/body/table", false)
|
39
|
+
</pre>
|
20
40
|
|
21
|
-
|
41
|
+
Which results in the following parsed table:
|
22
42
|
|
23
|
-
|
24
|
-
|
43
|
+
<pre>
|
44
|
+
Table<TableColumn<name=A, children=[1]>,TableColumn<name=B, children=[2],[3]>>
|
45
|
+
</pre>
|
46
|
+
|
47
|
+
|
48
|
+
== DEVELOPERS:
|
49
|
+
|
50
|
+
* Francis Chong francis at ignition dot hk
|
25
51
|
|
26
52
|
== LICENSE:
|
27
53
|
|
28
|
-
|
54
|
+
The MIT License
|
29
55
|
|
30
56
|
Copyright (c) 2010 Ignition Soft
|
31
57
|
|
data/lib/table_parser/parser.rb
CHANGED
@@ -32,7 +32,7 @@ module TableParser
|
|
32
32
|
headers
|
33
33
|
end
|
34
34
|
|
35
|
-
def self.extract_nodes(rows, headers)
|
35
|
+
def self.extract_nodes(rows, headers, duplicate_colspan)
|
36
36
|
data = rows.collect do |row|
|
37
37
|
row.collect do |ele|
|
38
38
|
node = TableNode.new(ele)
|
@@ -44,18 +44,21 @@ module TableParser
|
|
44
44
|
row = data[row_index]
|
45
45
|
row.each_index do |col_index|
|
46
46
|
col = row[col_index]
|
47
|
-
headers[col_index].children << col
|
47
|
+
headers[col_index].children << col if col.class != EmptyTableNode
|
48
48
|
|
49
49
|
if col.colspan > 1
|
50
50
|
col.insert(col_index, TableNode.new(col.element, col.rowspan, col.colspan - 1))
|
51
51
|
end
|
52
52
|
|
53
53
|
if col.rowspan > 1 && data[row_index+1]
|
54
|
-
|
54
|
+
if duplicate_colspan
|
55
|
+
data[row_index+1].insert(col_index, TableNode.new(col.element, col.rowspan - 1))
|
56
|
+
else
|
57
|
+
data[row_index+1].insert(col_index, EmptyTableNode.new(col.rowspan - 1))
|
58
|
+
end
|
55
59
|
end
|
56
60
|
end
|
57
|
-
end
|
58
|
-
|
61
|
+
end
|
59
62
|
data
|
60
63
|
end
|
61
64
|
|
data/lib/table_parser/table.rb
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
module TableParser
|
2
2
|
class Table
|
3
3
|
attr_reader :nodes, :columns
|
4
|
-
def initialize(input, xpath_to_table="//table[0]")
|
4
|
+
def initialize(input, xpath_to_table="//table[0]", duplicate_colspan=true)
|
5
5
|
table = Parser.extract_table(input, xpath_to_table)
|
6
6
|
@columns = Parser.extract_column_headers(table)
|
7
|
-
@nodes = Parser.extract_nodes(table, @columns)
|
7
|
+
@nodes = Parser.extract_nodes(table, @columns, duplicate_colspan)
|
8
8
|
end
|
9
9
|
|
10
10
|
def to_s
|
11
|
-
"Table<#{@
|
11
|
+
"Table<#{@columns.collect{|h| h.to_s }.join(",")}>"
|
12
12
|
end
|
13
13
|
|
14
14
|
# get column by index
|
@@ -3,8 +3,7 @@ module TableParser
|
|
3
3
|
attr_reader :element, :text, :rowspan, :colspan
|
4
4
|
def initialize(element, rowspan=nil, colspan=nil)
|
5
5
|
@element = element
|
6
|
-
@text = element.text
|
7
|
-
|
6
|
+
@text = element.text
|
8
7
|
@colspan = colspan || element["colspan"].to_i rescue 1
|
9
8
|
@rowspan = rowspan || element["rowspan"].to_i rescue 1
|
10
9
|
end
|
@@ -13,4 +12,13 @@ module TableParser
|
|
13
12
|
"[#{@text}]"
|
14
13
|
end
|
15
14
|
end
|
15
|
+
|
16
|
+
class EmptyTableNode < TableNode
|
17
|
+
def initialize(rowspan=nil, colspan=nil)
|
18
|
+
@element = nil
|
19
|
+
@text = ""
|
20
|
+
@colspan = colspan || element["colspan"].to_i rescue 1
|
21
|
+
@rowspan = rowspan || element["rowspan"].to_i rescue 1
|
22
|
+
end
|
23
|
+
end
|
16
24
|
end
|
data/lib/table_parser.rb
CHANGED
data/test/test_table_parser.rb
CHANGED
@@ -8,11 +8,24 @@ class TestTableParser < Test::Unit::TestCase
|
|
8
8
|
<tr><td>3</td></tr></table></body></html>",
|
9
9
|
"/html/body/table"
|
10
10
|
|
11
|
+
puts table
|
11
12
|
assert_equal(2, table.columns.size, 'header_count should = 2 ')
|
12
13
|
assert_equal(2, table[0].size)
|
13
14
|
assert_equal(2, table[1].size)
|
14
15
|
end
|
15
16
|
|
17
|
+
def test_parse_rowspan_disable_dup
|
18
|
+
table = TableParser::Table.new "<html><body><table><tr><td>A</td><td>B</td></tr>\
|
19
|
+
<tr><td rowspan=\"2\">1</td><td>2</td></tr> \
|
20
|
+
<tr><td>3</td></tr></table></body></html>",
|
21
|
+
"/html/body/table", false
|
22
|
+
|
23
|
+
puts table
|
24
|
+
assert_equal(2, table.columns.size, 'header_count should = 2 ')
|
25
|
+
assert_equal(1, table[0].size)
|
26
|
+
assert_equal(2, table[1].size)
|
27
|
+
end
|
28
|
+
|
16
29
|
def test_parse_colspan
|
17
30
|
table = TableParser::Table.new "<html><body><table><tr><td>A</td><td colspan=\"2\">B</td></tr>\
|
18
31
|
<tr><td rowspan=\"2\">A1</td><td>B1</td><td>C1</td></tr> \
|