table_parser 0.5.1 → 0.5.2
Sign up to get free protection for your applications and to get access to all the features.
- data/Manifest.txt +1 -1
- data/{README.txt → README.rdoc} +15 -18
- data/lib/table_parser.rb +1 -1
- data/lib/table_parser/parser.rb +20 -15
- data/lib/table_parser/table.rb +11 -5
- data/lib/table_parser/table_node.rb +1 -1
- data/test/test_table_parser.rb +45 -32
- metadata +2 -3
data/Manifest.txt
CHANGED
data/{README.txt → README.rdoc}
RENAMED
@@ -18,32 +18,29 @@ Parsing table could be difficult when its structure contains colspan or rowspan.
|
|
18
18
|
Use TableParser::Table to create a parsed HTML table.
|
19
19
|
|
20
20
|
For example, the following code:
|
21
|
-
<
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
</pre>
|
21
|
+
html = "<html><body><table><tr><td>A</td><td>B</td></tr>\
|
22
|
+
<tr><td rowspan=\"2\">1</td><td>2</td></tr> \
|
23
|
+
<tr><td>3</td></tr></table></body></html>"
|
24
|
+
doc = Nokogiri::HTML(html)
|
25
|
+
table = TableParser::Table.new doc, "/html/body/table"
|
27
26
|
|
28
27
|
Results in the following parsed table:
|
29
28
|
|
30
|
-
<
|
31
|
-
|
32
|
-
</pre>
|
29
|
+
Table<TableColumn<name=A, children=[1],[1]>,TableColumn<name=B, children=[2],[3]>>
|
30
|
+
|
33
31
|
Note that the first column contains a duplicated item, because the first row below the header contains a cell with a "rowspan" attribute. If this is not desired, use the following syntax to skip duplication:
|
34
|
-
|
35
|
-
html = "
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
32
|
+
|
33
|
+
html = "<html><body><table><tr><td>A</td><td>B</td></tr>\
|
34
|
+
<tr><td rowspan=\"2\">1</td><td>2</td></tr> \
|
35
|
+
<tr><td>3</td></tr></table></body></html>"
|
36
|
+
doc = Nokogiri::HTML(html)
|
37
|
+
table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => false}
|
40
38
|
|
41
39
|
Which results in the following parsed table:
|
42
40
|
|
43
|
-
<
|
44
|
-
Table<TableColumn<name=A, children=[1],[1]>,TableColumn<name=B, children=[2],[3]>>
|
45
|
-
</pre>
|
41
|
+
Table<TableColumn<name=A, children=[1]>,TableColumn<name=B, children=[2],[3]>>
|
46
42
|
|
43
|
+
See the tests (test/test_table_parser.rb) for more usage examples.
|
47
44
|
|
48
45
|
== DEVELOPERS:
|
49
46
|
|
data/lib/table_parser.rb
CHANGED
data/lib/table_parser/parser.rb
CHANGED
@@ -5,9 +5,7 @@ require 'open-uri'
|
|
5
5
|
module TableParser
|
6
6
|
class Parser
|
7
7
|
# extract_table("http://www.bs4.jp/table/index.html", "/html/body/table/tr/td/table")
|
8
|
-
def self.extract_table(
|
9
|
-
doc = Nokogiri::HTML(input)
|
10
|
-
|
8
|
+
def self.extract_table(doc, xpath)
|
11
9
|
rows = []
|
12
10
|
table = doc.xpath(xpath)
|
13
11
|
rows = table.xpath("./tr").collect do |row|
|
@@ -18,7 +16,7 @@ module TableParser
|
|
18
16
|
rows
|
19
17
|
end
|
20
18
|
|
21
|
-
def self.extract_column_headers(rows)
|
19
|
+
def self.extract_column_headers(rows, dup_rows, dup_cols)
|
22
20
|
headers = []
|
23
21
|
rows.first.collect do |col|
|
24
22
|
header = TableColumn.new(col)
|
@@ -32,7 +30,7 @@ module TableParser
|
|
32
30
|
headers
|
33
31
|
end
|
34
32
|
|
35
|
-
def self.extract_nodes(rows, headers,
|
33
|
+
def self.extract_nodes(rows, headers, dup_rows, dup_cols)
|
36
34
|
data = rows.collect do |row|
|
37
35
|
row.collect do |ele|
|
38
36
|
node = TableNode.new(ele)
|
@@ -44,19 +42,26 @@ module TableParser
|
|
44
42
|
row = data[row_index]
|
45
43
|
row.each_index do |col_index|
|
46
44
|
col = row[col_index]
|
47
|
-
headers[col_index]
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
45
|
+
if headers[col_index]
|
46
|
+
headers[col_index].children << col if col.class != EmptyTableNode
|
47
|
+
if col.colspan > 1
|
48
|
+
if dup_cols
|
49
|
+
row.insert(col_index, TableNode.new(col.element, col.rowspan, col.colspan - 1))
|
50
|
+
else
|
51
|
+
row.insert(col_index, EmptyTableNode.new(col.rowspan, col.colspan - 1))
|
52
|
+
end
|
53
|
+
end
|
52
54
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
55
|
+
if col.rowspan > 1 && data[row_index+1]
|
56
|
+
if dup_rows
|
57
|
+
data[row_index+1].insert(col_index, TableNode.new(col.element, col.rowspan - 1))
|
58
|
+
else
|
59
|
+
data[row_index+1].insert(col_index, EmptyTableNode.new(col.rowspan - 1))
|
60
|
+
end
|
58
61
|
end
|
59
62
|
end
|
63
|
+
|
64
|
+
|
60
65
|
end
|
61
66
|
end
|
62
67
|
data
|
data/lib/table_parser/table.rb
CHANGED
@@ -1,17 +1,23 @@
|
|
1
1
|
module TableParser
|
2
2
|
class Table
|
3
3
|
attr_reader :nodes, :columns
|
4
|
-
def initialize(
|
4
|
+
def initialize(doc, xpath_to_table="//table[0]", options={})
|
5
5
|
|
6
|
-
if options.has_key?
|
6
|
+
if options.has_key?(:dup_rows)
|
7
7
|
dup_rows = options[:dup_rows]
|
8
8
|
else
|
9
9
|
dup_rows = true
|
10
10
|
end
|
11
|
+
|
12
|
+
if options.has_key?(:dup_cols)
|
13
|
+
dup_cols = options[:dup_cols]
|
14
|
+
else
|
15
|
+
dup_cols = true
|
16
|
+
end
|
11
17
|
|
12
|
-
table = Parser.extract_table(
|
13
|
-
@columns = Parser.extract_column_headers(table)
|
14
|
-
@nodes = Parser.extract_nodes(table, @columns, dup_rows)
|
18
|
+
table = Parser.extract_table(doc, xpath_to_table)
|
19
|
+
@columns = Parser.extract_column_headers(table, dup_rows, dup_cols)
|
20
|
+
@nodes = Parser.extract_nodes(table, @columns, dup_rows, dup_cols)
|
15
21
|
end
|
16
22
|
|
17
23
|
def to_s
|
@@ -3,7 +3,7 @@ module TableParser
|
|
3
3
|
attr_reader :element, :text, :rowspan, :colspan
|
4
4
|
def initialize(element, rowspan=nil, colspan=nil)
|
5
5
|
@element = element
|
6
|
-
@text = element.text
|
6
|
+
@text = element.text.strip
|
7
7
|
@colspan = colspan || element["colspan"].to_i rescue 1
|
8
8
|
@rowspan = rowspan || element["rowspan"].to_i rescue 1
|
9
9
|
end
|
data/test/test_table_parser.rb
CHANGED
@@ -3,35 +3,36 @@ require "table_parser"
|
|
3
3
|
|
4
4
|
class TestTableParser < Test::Unit::TestCase
|
5
5
|
def test_parse_rowspan
|
6
|
-
|
6
|
+
html = "<html><body><table><tr><td>A</td><td>B</td></tr>\
|
7
7
|
<tr><td rowspan=\"2\">1</td><td>2</td></tr> \
|
8
|
-
<tr><td>3</td></tr></table></body></html>"
|
9
|
-
|
8
|
+
<tr><td>3</td></tr></table></body></html>"
|
9
|
+
doc = Nokogiri::HTML(html)
|
10
|
+
table = TableParser::Table.new doc, "/html/body/table"
|
10
11
|
|
11
|
-
puts table
|
12
12
|
assert_equal(2, table.columns.size, 'header_count should = 2 ')
|
13
13
|
assert_equal(2, table[0].size)
|
14
14
|
assert_equal(2, table[1].size)
|
15
15
|
end
|
16
16
|
|
17
17
|
def test_parse_rowspan_disable_dup
|
18
|
-
|
18
|
+
html = "<html><body><table><tr><td>A</td><td>B</td></tr>\
|
19
19
|
<tr><td rowspan=\"2\">1</td><td>2</td></tr> \
|
20
|
-
<tr><td>3</td></tr></table></body></html>"
|
21
|
-
|
22
|
-
|
23
|
-
|
20
|
+
<tr><td>3</td></tr></table></body></html>"
|
21
|
+
doc = Nokogiri::HTML(html)
|
22
|
+
table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => false}
|
23
|
+
|
24
24
|
assert_equal(2, table.columns.size, 'header_count should = 2 ')
|
25
25
|
assert_equal(1, table[0].size)
|
26
26
|
assert_equal(2, table[1].size)
|
27
27
|
end
|
28
28
|
|
29
29
|
def test_parse_colspan
|
30
|
-
|
30
|
+
html = "<html><body><table><tr><td>A</td><td colspan=\"2\">B</td></tr>\
|
31
31
|
<tr><td rowspan=\"2\">A1</td><td>B1</td><td>C1</td></tr> \
|
32
32
|
<tr><td>B2</td><td>C2</td></tr>\
|
33
|
-
<tr><td>A3</td><td>B3</td><td>C3</td></tr><tr><td>A4</td><td>B4</td><td>C4</td></tr></table></body></html>"
|
34
|
-
|
33
|
+
<tr><td>A3</td><td>B3</td><td>C3</td></tr><tr><td>A4</td><td>B4</td><td>C4</td></tr></table></body></html>"
|
34
|
+
doc = Nokogiri::HTML(html)
|
35
|
+
table = TableParser::Table.new doc, "/html/body/table"
|
35
36
|
|
36
37
|
assert_equal(3, table.columns.size, 'header_count should = 3 ')
|
37
38
|
assert_equal(4, table[0].size)
|
@@ -41,12 +42,13 @@ class TestTableParser < Test::Unit::TestCase
|
|
41
42
|
end
|
42
43
|
|
43
44
|
def test_parse_complex
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
45
|
+
html = "<html><body><table><tr><td>Header1</td><td>Header2</td><td>Header3</td><td>Header4</td></tr>\
|
46
|
+
<tr><td rowspan=\"3\">A1</td><td>A2</td><td rowspan=\"2\">A3</td><td>B4</td></tr>\
|
47
|
+
<tr><td>B2</td><td>B4</td></tr>\
|
48
|
+
<tr><td>C2</td><td>C3</td><td>B4</td></tr>\
|
49
|
+
</table></body></html>"
|
50
|
+
doc = Nokogiri::HTML(html)
|
51
|
+
table = TableParser::Table.new doc, "/html/body/table"
|
50
52
|
|
51
53
|
assert_equal 4, table.columns.size
|
52
54
|
assert_equal 3, table[0].size
|
@@ -55,18 +57,19 @@ class TestTableParser < Test::Unit::TestCase
|
|
55
57
|
end
|
56
58
|
|
57
59
|
def test_parse_complex2
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
60
|
+
html = "<html><body><table><tr><td>Header1</td><td>Header2</td><td>Header3</td><td>Header4</td></tr>\
|
61
|
+
<tr><td rowspan=\"3\">A1</td><td>A2</td><td rowspan=\"2\">A3</td><td>A4</td></tr>\
|
62
|
+
<tr><td>B2</td><td>B4</td></tr>\
|
63
|
+
<tr><td>C2</td><td rowspan=\"2\">C3</td><td>C4</td></tr>\
|
64
|
+
<tr><td rowspan=\"3\">D1</td><td>D2</td><td>D4</td></tr>\
|
65
|
+
<tr><td>E2</td><td rowspan=\"2\">E3</td><td>E4</td></tr>\
|
66
|
+
<tr><td>F2</td><td>F4</td></tr>\
|
67
|
+
<tr><td rowspan=\"3\">G1</td><td>G2</td><td rowspan=\"2\">G3</td><td>G4</td></tr>\
|
68
|
+
<tr><td>H2</td><td>H4</td></tr>\
|
69
|
+
<tr><td>I2</td><td>I3</td><td>I4</td></tr>\
|
70
|
+
</table></body></html>"
|
71
|
+
doc = Nokogiri::HTML(html)
|
72
|
+
table = TableParser::Table.new doc, "/html/body/table"
|
70
73
|
|
71
74
|
assert_equal 4, table.columns.size
|
72
75
|
assert_equal 9, table[0].size
|
@@ -76,8 +79,8 @@ class TestTableParser < Test::Unit::TestCase
|
|
76
79
|
end
|
77
80
|
|
78
81
|
def test_parse_web
|
79
|
-
|
80
|
-
|
82
|
+
doc = Nokogiri::HTML(open("test.html").read)
|
83
|
+
table = TableParser::Table.new doc, "/html/body/table"
|
81
84
|
|
82
85
|
assert_equal 11, table.columns.size
|
83
86
|
assert_equal 9, table[0].size
|
@@ -85,4 +88,14 @@ class TestTableParser < Test::Unit::TestCase
|
|
85
88
|
assert_equal 9, table[2].size
|
86
89
|
assert_equal 9, table[3].size
|
87
90
|
end
|
91
|
+
|
92
|
+
def test_parse_web2
|
93
|
+
doc = Nokogiri::HTML(open("test2.html").read)
|
94
|
+
|
95
|
+
table = doc.xpath("//div[@id='timetable_box-week']/table")
|
96
|
+
table.xpath("./tr[1]").remove
|
97
|
+
|
98
|
+
table = TableParser::Table.new doc, "//div[@id='timetable_box-week']/table", {:dup_cols => false, :dup_rows => false}
|
99
|
+
puts table.columns.select(){|c| c.text =~ /[0-9]+月[0-9]+日/ }
|
100
|
+
end
|
88
101
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: table_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.
|
4
|
+
version: 0.5.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Francis Chong
|
@@ -32,12 +32,11 @@ extensions: []
|
|
32
32
|
extra_rdoc_files:
|
33
33
|
- History.txt
|
34
34
|
- Manifest.txt
|
35
|
-
- README.txt
|
36
35
|
files:
|
37
36
|
- .autotest
|
38
37
|
- History.txt
|
39
38
|
- Manifest.txt
|
40
|
-
- README.
|
39
|
+
- README.rdoc
|
41
40
|
- Rakefile
|
42
41
|
- lib/table_parser.rb
|
43
42
|
- lib/table_parser/parser.rb
|