table_parser 0.5.1 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Manifest.txt +1 -1
- data/{README.txt → README.rdoc} +15 -18
- data/lib/table_parser.rb +1 -1
- data/lib/table_parser/parser.rb +20 -15
- data/lib/table_parser/table.rb +11 -5
- data/lib/table_parser/table_node.rb +1 -1
- data/test/test_table_parser.rb +45 -32
- metadata +2 -3
data/Manifest.txt
CHANGED
data/{README.txt → README.rdoc}
RENAMED
@@ -18,32 +18,29 @@ Parsing table could be difficult when its structure contains colspan or rowspan.
|
|
18
18
|
Use TableParser::Table to create a parsed HTML table.
|
19
19
|
|
20
20
|
For example, the following code:
|
21
|
-
<
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
</pre>
|
21
|
+
html = "<html><body><table><tr><td>A</td><td>B</td></tr>\
|
22
|
+
<tr><td rowspan=\"2\">1</td><td>2</td></tr> \
|
23
|
+
<tr><td>3</td></tr></table></body></html>"
|
24
|
+
doc = Nokogiri::HTML(html)
|
25
|
+
table = TableParser::Table.new doc, "/html/body/table"
|
27
26
|
|
28
27
|
Results in the following parsed table:
|
29
28
|
|
30
|
-
<
|
31
|
-
|
32
|
-
</pre>
|
29
|
+
Table<TableColumn<name=A, children=[1],[1]>,TableColumn<name=B, children=[2],[3]>>
|
30
|
+
|
33
31
|
Note that the first column contains a duplicated item, because the first row contains a "rowspan" element. If this is not desired, use the following syntax to skip duplication:
|
34
|
-
|
35
|
-
html = "
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
32
|
+
|
33
|
+
html = "<html><body><table><tr><td>A</td><td>B</td></tr>\
|
34
|
+
<tr><td rowspan=\"2\">1</td><td>2</td></tr> \
|
35
|
+
<tr><td>3</td></tr></table></body></html>"
|
36
|
+
doc = Nokogiri::HTML(html)
|
37
|
+
table = TableParser::Table.new(doc, "/html/body/table", {:dup_rows => false})
|
40
38
|
|
41
39
|
Which results in the following parsed table:
|
42
40
|
|
43
|
-
<
|
44
|
-
Table<TableColumn<name=A, children=[1],[1]>,TableColumn<name=B, children=[2],[3]>>
|
45
|
-
</pre>
|
41
|
+
Table<TableColumn<name=A, children=[1]>,TableColumn<name=B, children=[2],[3]>>
|
46
42
|
|
43
|
+
Read the spec (test/test_table_parser.rb) for more usage examples.
|
47
44
|
|
48
45
|
== DEVELOPERS:
|
49
46
|
|
data/lib/table_parser.rb
CHANGED
data/lib/table_parser/parser.rb
CHANGED
@@ -5,9 +5,7 @@ require 'open-uri'
|
|
5
5
|
module TableParser
|
6
6
|
class Parser
|
7
7
|
# extract_table("http://www.bs4.jp/table/index.html", "/html/body/table/tr/td/table")
|
8
|
-
def self.extract_table(
|
9
|
-
doc = Nokogiri::HTML(input)
|
10
|
-
|
8
|
+
def self.extract_table(doc, xpath)
|
11
9
|
rows = []
|
12
10
|
table = doc.xpath(xpath)
|
13
11
|
rows = table.xpath("./tr").collect do |row|
|
@@ -18,7 +16,7 @@ module TableParser
|
|
18
16
|
rows
|
19
17
|
end
|
20
18
|
|
21
|
-
def self.extract_column_headers(rows)
|
19
|
+
def self.extract_column_headers(rows, dup_rows, dup_cols)
|
22
20
|
headers = []
|
23
21
|
rows.first.collect do |col|
|
24
22
|
header = TableColumn.new(col)
|
@@ -32,7 +30,7 @@ module TableParser
|
|
32
30
|
headers
|
33
31
|
end
|
34
32
|
|
35
|
-
def self.extract_nodes(rows, headers,
|
33
|
+
def self.extract_nodes(rows, headers, dup_rows, dup_cols)
|
36
34
|
data = rows.collect do |row|
|
37
35
|
row.collect do |ele|
|
38
36
|
node = TableNode.new(ele)
|
@@ -44,19 +42,26 @@ module TableParser
|
|
44
42
|
row = data[row_index]
|
45
43
|
row.each_index do |col_index|
|
46
44
|
col = row[col_index]
|
47
|
-
headers[col_index]
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
45
|
+
if headers[col_index]
|
46
|
+
headers[col_index].children << col if col.class != EmptyTableNode
|
47
|
+
if col.colspan > 1
|
48
|
+
if dup_cols
|
49
|
+
row.insert(col_index, TableNode.new(col.element, col.rowspan, col.colspan - 1))
|
50
|
+
else
|
51
|
+
row.insert(col_index, EmptyTableNode.new(col.rowspan, col.colspan - 1))
|
52
|
+
end
|
53
|
+
end
|
52
54
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
55
|
+
if col.rowspan > 1 && data[row_index+1]
|
56
|
+
if dup_rows
|
57
|
+
data[row_index+1].insert(col_index, TableNode.new(col.element, col.rowspan - 1))
|
58
|
+
else
|
59
|
+
data[row_index+1].insert(col_index, EmptyTableNode.new(col.rowspan - 1))
|
60
|
+
end
|
58
61
|
end
|
59
62
|
end
|
63
|
+
|
64
|
+
|
60
65
|
end
|
61
66
|
end
|
62
67
|
data
|
data/lib/table_parser/table.rb
CHANGED
@@ -1,17 +1,23 @@
|
|
1
1
|
module TableParser
|
2
2
|
class Table
|
3
3
|
attr_reader :nodes, :columns
|
4
|
-
def initialize(
|
4
|
+
def initialize(doc, xpath_to_table="//table[0]", options={})
|
5
5
|
|
6
|
-
if options.has_key?
|
6
|
+
if options.has_key?(:dup_rows)
|
7
7
|
dup_rows = options[:dup_rows]
|
8
8
|
else
|
9
9
|
dup_rows = true
|
10
10
|
end
|
11
|
+
|
12
|
+
if options.has_key?(:dup_cols)
|
13
|
+
dup_cols = options[:dup_cols]
|
14
|
+
else
|
15
|
+
dup_cols = true
|
16
|
+
end
|
11
17
|
|
12
|
-
table = Parser.extract_table(
|
13
|
-
@columns = Parser.extract_column_headers(table)
|
14
|
-
@nodes = Parser.extract_nodes(table, @columns, dup_rows)
|
18
|
+
table = Parser.extract_table(doc, xpath_to_table)
|
19
|
+
@columns = Parser.extract_column_headers(table, dup_rows, dup_cols)
|
20
|
+
@nodes = Parser.extract_nodes(table, @columns, dup_rows, dup_cols)
|
15
21
|
end
|
16
22
|
|
17
23
|
def to_s
|
@@ -3,7 +3,7 @@ module TableParser
|
|
3
3
|
attr_reader :element, :text, :rowspan, :colspan
|
4
4
|
def initialize(element, rowspan=nil, colspan=nil)
|
5
5
|
@element = element
|
6
|
-
@text = element.text
|
6
|
+
@text = element.text.strip
|
7
7
|
@colspan = colspan || element["colspan"].to_i rescue 1
|
8
8
|
@rowspan = rowspan || element["rowspan"].to_i rescue 1
|
9
9
|
end
|
data/test/test_table_parser.rb
CHANGED
@@ -3,35 +3,36 @@ require "table_parser"
|
|
3
3
|
|
4
4
|
class TestTableParser < Test::Unit::TestCase
|
5
5
|
def test_parse_rowspan
|
6
|
-
|
6
|
+
html = "<html><body><table><tr><td>A</td><td>B</td></tr>\
|
7
7
|
<tr><td rowspan=\"2\">1</td><td>2</td></tr> \
|
8
|
-
<tr><td>3</td></tr></table></body></html>"
|
9
|
-
|
8
|
+
<tr><td>3</td></tr></table></body></html>"
|
9
|
+
doc = Nokogiri::HTML(html)
|
10
|
+
table = TableParser::Table.new doc, "/html/body/table"
|
10
11
|
|
11
|
-
puts table
|
12
12
|
assert_equal(2, table.columns.size, 'header_count should = 2 ')
|
13
13
|
assert_equal(2, table[0].size)
|
14
14
|
assert_equal(2, table[1].size)
|
15
15
|
end
|
16
16
|
|
17
17
|
def test_parse_rowspan_disable_dup
|
18
|
-
|
18
|
+
html = "<html><body><table><tr><td>A</td><td>B</td></tr>\
|
19
19
|
<tr><td rowspan=\"2\">1</td><td>2</td></tr> \
|
20
|
-
<tr><td>3</td></tr></table></body></html>"
|
21
|
-
|
22
|
-
|
23
|
-
|
20
|
+
<tr><td>3</td></tr></table></body></html>"
|
21
|
+
doc = Nokogiri::HTML(html)
|
22
|
+
table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => false}
|
23
|
+
|
24
24
|
assert_equal(2, table.columns.size, 'header_count should = 2 ')
|
25
25
|
assert_equal(1, table[0].size)
|
26
26
|
assert_equal(2, table[1].size)
|
27
27
|
end
|
28
28
|
|
29
29
|
def test_parse_colspan
|
30
|
-
|
30
|
+
html = "<html><body><table><tr><td>A</td><td colspan=\"2\">B</td></tr>\
|
31
31
|
<tr><td rowspan=\"2\">A1</td><td>B1</td><td>C1</td></tr> \
|
32
32
|
<tr><td>B2</td><td>C2</td></tr>\
|
33
|
-
<tr><td>A3</td><td>B3</td><td>C3</td></tr><tr><td>A4</td><td>B4</td><td>C4</td></tr></table></body></html>"
|
34
|
-
|
33
|
+
<tr><td>A3</td><td>B3</td><td>C3</td></tr><tr><td>A4</td><td>B4</td><td>C4</td></tr></table></body></html>"
|
34
|
+
doc = Nokogiri::HTML(html)
|
35
|
+
table = TableParser::Table.new doc, "/html/body/table"
|
35
36
|
|
36
37
|
assert_equal(3, table.columns.size, 'header_count should = 3 ')
|
37
38
|
assert_equal(4, table[0].size)
|
@@ -41,12 +42,13 @@ class TestTableParser < Test::Unit::TestCase
|
|
41
42
|
end
|
42
43
|
|
43
44
|
def test_parse_complex
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
45
|
+
html = "<html><body><table><tr><td>Header1</td><td>Header2</td><td>Header3</td><td>Header4</td></tr>\
|
46
|
+
<tr><td rowspan=\"3\">A1</td><td>A2</td><td rowspan=\"2\">A3</td><td>B4</td></tr>\
|
47
|
+
<tr><td>B2</td><td>B4</td></tr>\
|
48
|
+
<tr><td>C2</td><td>C3</td><td>B4</td></tr>\
|
49
|
+
</table></body></html>"
|
50
|
+
doc = Nokogiri::HTML(html)
|
51
|
+
table = TableParser::Table.new doc, "/html/body/table"
|
50
52
|
|
51
53
|
assert_equal 4, table.columns.size
|
52
54
|
assert_equal 3, table[0].size
|
@@ -55,18 +57,19 @@ class TestTableParser < Test::Unit::TestCase
|
|
55
57
|
end
|
56
58
|
|
57
59
|
def test_parse_complex2
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
60
|
+
html = "<html><body><table><tr><td>Header1</td><td>Header2</td><td>Header3</td><td>Header4</td></tr>\
|
61
|
+
<tr><td rowspan=\"3\">A1</td><td>A2</td><td rowspan=\"2\">A3</td><td>A4</td></tr>\
|
62
|
+
<tr><td>B2</td><td>B4</td></tr>\
|
63
|
+
<tr><td>C2</td><td rowspan=\"2\">C3</td><td>C4</td></tr>\
|
64
|
+
<tr><td rowspan=\"3\">D1</td><td>D2</td><td>D4</td></tr>\
|
65
|
+
<tr><td>E2</td><td rowspan=\"2\">E3</td><td>E4</td></tr>\
|
66
|
+
<tr><td>F2</td><td>F4</td></tr>\
|
67
|
+
<tr><td rowspan=\"3\">G1</td><td>G2</td><td rowspan=\"2\">G3</td><td>G4</td></tr>\
|
68
|
+
<tr><td>H2</td><td>H4</td></tr>\
|
69
|
+
<tr><td>I2</td><td>I3</td><td>I4</td></tr>\
|
70
|
+
</table></body></html>"
|
71
|
+
doc = Nokogiri::HTML(html)
|
72
|
+
table = TableParser::Table.new doc, "/html/body/table"
|
70
73
|
|
71
74
|
assert_equal 4, table.columns.size
|
72
75
|
assert_equal 9, table[0].size
|
@@ -76,8 +79,8 @@ class TestTableParser < Test::Unit::TestCase
|
|
76
79
|
end
|
77
80
|
|
78
81
|
def test_parse_web
|
79
|
-
|
80
|
-
|
82
|
+
doc = Nokogiri::HTML(open("test.html").read)
|
83
|
+
table = TableParser::Table.new doc, "/html/body/table"
|
81
84
|
|
82
85
|
assert_equal 11, table.columns.size
|
83
86
|
assert_equal 9, table[0].size
|
@@ -85,4 +88,14 @@ class TestTableParser < Test::Unit::TestCase
|
|
85
88
|
assert_equal 9, table[2].size
|
86
89
|
assert_equal 9, table[3].size
|
87
90
|
end
|
91
|
+
|
92
|
+
def test_parse_web2
|
93
|
+
doc = Nokogiri::HTML(open("test2.html").read)
|
94
|
+
|
95
|
+
table = doc.xpath("//div[@id='timetable_box-week']/table")
|
96
|
+
table.xpath("./tr[1]").remove
|
97
|
+
|
98
|
+
table = TableParser::Table.new doc, "//div[@id='timetable_box-week']/table", {:dup_cols => false, :dup_rows => false}
|
99
|
+
puts table.columns.select(){|c| c.text =~ /[0-9]+月[0-9]+日/ }
|
100
|
+
end
|
88
101
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: table_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.
|
4
|
+
version: 0.5.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Francis Chong
|
@@ -32,12 +32,11 @@ extensions: []
|
|
32
32
|
extra_rdoc_files:
|
33
33
|
- History.txt
|
34
34
|
- Manifest.txt
|
35
|
-
- README.txt
|
36
35
|
files:
|
37
36
|
- .autotest
|
38
37
|
- History.txt
|
39
38
|
- Manifest.txt
|
40
|
-
- README.
|
39
|
+
- README.rdoc
|
41
40
|
- Rakefile
|
42
41
|
- lib/table_parser.rb
|
43
42
|
- lib/table_parser/parser.rb
|