table_parser 0.5.7 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -42,44 +42,54 @@ module TableParser
42
42
  headers
43
43
  end
44
44
 
45
- def self.extract_nodes(rows, headers, dup_rows, dup_cols)
46
- data = rows.collect do |row|
45
+ def self.extract_nodes(input_rows, headers, dup_rows, dup_cols)
46
+ data = input_rows.collect do |row|
47
47
  row.collect do |ele|
48
48
  node = TableNode.new(ele)
49
49
  end
50
50
  end
51
-
52
- # handle rowspan
53
- data.each_index do |row_index|
54
- row = data[row_index]
55
- row.each_index do |col_index|
56
- col = row[col_index]
57
- if headers[col_index]
58
- headers[col_index].children << col if col.class != EmptyTableNode
51
+
52
+ columns = []
53
+ curr_x = 0
54
+ curr_y = 0
55
+ data.each do |row|
56
+ columns[curr_y] = [] unless columns[curr_y]
59
57
 
60
- if col.colspan > 1
61
- if dup_cols
62
- row.insert(col_index, TableNode.new(col.element, col.rowspan, col.colspan - 1))
63
- else
64
- row.insert(col_index, EmptyTableNode.new(col.rowspan, col.colspan - 1))
65
- end
66
- end
67
-
68
- if col.rowspan > 1
69
- unless data[row_index+1]
70
- data.insert(row_index, [])
71
- end
58
+ curr_y = 0
59
+ row.each do |node|
60
+ rowspan = node.rowspan - 1
61
+ colspan = node.colspan - 1
72
62
 
73
- if dup_rows
74
- data[row_index+1].insert(col_index, TableNode.new(col.element, col.rowspan - 1))
63
+ (0..rowspan).each do |x|
64
+ (0..colspan).each do |y|
65
+ columns[curr_y+y] = [] unless columns[curr_y+y]
66
+ if (x == 0 || dup_rows) && (y == 0 || dup_cols)
67
+ while columns[curr_y+y][curr_x+x]
68
+ curr_y += 1
69
+ end
70
+ columns[curr_y+y][curr_x+x] = node
75
71
  else
76
- data[row_index+1].insert(col_index, EmptyTableNode.new(col.rowspan - 1))
72
+ while columns[curr_y+y][curr_x+x]
73
+ curr_y += 1
74
+ end
75
+ columns[curr_y+y][curr_x+x] = EmptyTableNode.new(node.element)
77
76
  end
78
77
  end
79
78
  end
79
+ curr_y += 1
80
80
  end
81
- end
82
- data
81
+ curr_x += 1
82
+ end
83
+
84
+ columns.each_index do |col_index|
85
+ columns[col_index].each do |node|
86
+ if headers[col_index]
87
+ headers[col_index].children << node unless node.class == EmptyTableNode
88
+ end
89
+ end
90
+ end
91
+
92
+ columns
83
93
  end
84
94
 
85
95
  end
@@ -3,9 +3,19 @@ module TableParser
3
3
  attr_reader :element, :text, :rowspan, :colspan
4
4
  def initialize(element, rowspan=nil, colspan=nil)
5
5
  @element = element
6
- @text = element.text.strip rescue ""
7
- @colspan = colspan || element["colspan"].to_i rescue 1
8
- @rowspan = rowspan || element["rowspan"].to_i rescue 1
6
+ @text = element.text.strip rescue ""
7
+
8
+ if element.nil?
9
+ @colspan = colspan || 1
10
+ @rowspan = rowspan || 1
11
+ else
12
+ @colspan = colspan || element["colspan"].nil? ? 1 : element["colspan"].to_i
13
+ @rowspan = rowspan || element["rowspan"].nil? ? 1 : element["rowspan"].to_i
14
+ end
15
+ end
16
+
17
+ def span(row, col)
18
+ TableNode.new(element, rowspan-row, colspan-col)
9
19
  end
10
20
 
11
21
  def to_s
@@ -17,8 +27,8 @@ module TableParser
17
27
  def initialize(rowspan=nil, colspan=nil)
18
28
  @element = nil
19
29
  @text = ""
20
- @colspan = colspan || element["colspan"].to_i rescue 1
21
- @rowspan = rowspan || element["rowspan"].to_i rescue 1
30
+ @colspan = colspan || 1
31
+ @rowspan = rowspan || 1
22
32
  end
23
33
  end
24
34
  end
data/lib/table_parser.rb CHANGED
@@ -4,5 +4,5 @@ require 'table_parser/table'
4
4
  require 'table_parser/parser'
5
5
 
6
6
  module TableParser
7
- VERSION = '0.5.7'
7
+ VERSION = '0.6.0'
8
8
  end
@@ -5,15 +5,28 @@ require 'open-uri'
5
5
 
6
6
  class TestTableParser < Test::Unit::TestCase
7
7
  def test_parse_rowspan
8
- html = "<html><body><table><tr><td>A</td><td>B</td></tr>\
9
- <tr><td rowspan=\"2\">1</td><td>2</td></tr> \
10
- <tr><td>3</td></tr></table></body></html>"
8
+ html = open("rowspan.html").read
11
9
  doc = Nokogiri::HTML(html)
12
- table = TableParser::Table.new doc, "/html/body/table"
10
+ table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => false, :dup_cols => false}
11
+ assert_equal(2, table.columns.size, 'header_count should = 2 ')
12
+ assert_equal(2, table[0].size)
13
+ assert_equal(2, table[1].size)
13
14
 
15
+ table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => true, :dup_cols => false}
16
+ assert_equal(2, table.columns.size, 'header_count should = 2 ')
17
+ assert_equal(3, table[0].size)
18
+ assert_equal(3, table[1].size)
19
+
20
+ table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => false, :dup_cols => true}
14
21
  assert_equal(2, table.columns.size, 'header_count should = 2 ')
15
22
  assert_equal(2, table[0].size)
16
23
  assert_equal(2, table[1].size)
24
+
25
+ table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => true, :dup_cols => true}
26
+ assert_equal(2, table.columns.size, 'header_count should = 2 ')
27
+ assert_equal(3, table[0].size)
28
+ assert_equal(3, table[1].size)
29
+
17
30
  end
18
31
 
19
32
  def test_parse_rowspan_disable_dup
@@ -29,27 +42,27 @@ class TestTableParser < Test::Unit::TestCase
29
42
  end
30
43
 
31
44
  def test_parse_colspan
32
- html = "<html><body><table>\
33
- <tr><td>A</td><td colspan=\"2\">B</td></tr>\
34
- <tr><td rowspan=\"2\">A1</td><td>B1</td><td>C1</td></tr> \
35
- <tr><td>B2</td><td>C2</td></tr>\
36
- <tr><td>A3</td><td>B3</td><td>C3</td></tr>\
37
- <tr><td>A4</td><td colspan=\"2\" rowspan=\"2\">B4</td></tr>\
38
- <tr><td>A5</td></tr>\
39
- <tr><td rowspan=\"2\">A1</td><td>B1</td><td>C1</td></tr> \
40
- <tr><td>B2</td><td>C2</td></tr>\
41
- <tr><td>A3</td><td>B3</td><td>C3</td></tr>\
42
- <tr><td>A4</td><td colspan=\"2\" rowspan=\"2\">B4</td></tr>\
43
- <tr><td>A5</td></tr>\
44
- </table></body></html>"
45
+ html = open("colspan.html").read
45
46
  doc = Nokogiri::HTML(html)
46
47
  table = TableParser::Table.new doc, "/html/body/table"
47
- puts table
48
-
49
48
  assert_equal(3, table.columns.size, 'header_count should = 3 ')
50
49
  assert_equal(10, table[0].size)
51
50
  assert_equal(10, table[1].size)
52
51
  assert_equal(10, table[2].size)
52
+
53
+ table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => false}
54
+ puts table.to_s
55
+ assert_equal(3, table.columns.size, 'header_count should = 3 ')
56
+ assert_equal(8, table[0].size)
57
+ assert_equal(8, table[1].size)
58
+ assert_equal(8, table[2].size)
59
+
60
+ table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => false, :dup_cols => false}
61
+ puts table.to_s
62
+ assert_equal(3, table.columns.size, 'header_count should = 3 ')
63
+ assert_equal(8, table[0].size)
64
+ assert_equal(8, table[1].size)
65
+ assert_equal(6, table[2].size)
53
66
  end
54
67
 
55
68
  def test_parse_complex
@@ -68,17 +81,7 @@ class TestTableParser < Test::Unit::TestCase
68
81
  end
69
82
 
70
83
  def test_parse_complex2
71
- html = "<html><body><table><tr><td>Header1</td><td>Header2</td><td>Header3</td><td>Header4</td></tr>\
72
- <tr><td rowspan=\"3\">A1</td><td>A2</td><td rowspan=\"2\">A3</td><td>A4</td></tr>\
73
- <tr><td>B2</td><td>B4</td></tr>\
74
- <tr><td>C2</td><td rowspan=\"2\">C3</td><td>C4</td></tr>\
75
- <tr><td rowspan=\"3\">D1</td><td>D2</td><td>D4</td></tr>\
76
- <tr><td>E2</td><td rowspan=\"2\">E3</td><td>E4</td></tr>\
77
- <tr><td>F2</td><td>F4</td></tr>\
78
- <tr><td rowspan=\"3\">G1</td><td>G2</td><td rowspan=\"2\">G3</td><td>G4</td></tr>\
79
- <tr><td>H2</td><td>H4</td></tr>\
80
- <tr><td>I2</td><td>I3</td><td>I4</td></tr>\
81
- </table></body></html>"
84
+ html = open("complex2.html").read
82
85
  doc = Nokogiri::HTML(html)
83
86
  table = TableParser::Table.new doc, "/html/body/table"
84
87
 
@@ -87,6 +90,15 @@ class TestTableParser < Test::Unit::TestCase
87
90
  assert_equal 9, table[1].size
88
91
  assert_equal 9, table[2].size
89
92
  assert_equal 9, table[3].size
93
+
94
+ table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => false}
95
+ puts "table=" + table.to_s
96
+ assert_equal 4, table.columns.size
97
+ assert_equal 3, table[0].size
98
+ assert_equal 9, table[1].size
99
+ assert_equal 5, table[2].size
100
+ assert_equal 9, table[3].size
101
+
90
102
  end
91
103
 
92
104
  def test_parse_noheader
@@ -101,6 +113,37 @@ class TestTableParser < Test::Unit::TestCase
101
113
  assert_equal(3, table[1].size)
102
114
  end
103
115
 
116
+ def test_parse_complex_colrowspan
117
+ html = open("table_rowcol.html").read
118
+
119
+ doc = Nokogiri::HTML(html)
120
+ table = TableParser::Table.new doc, "/html/body/table", {:dup_cols => false, :dup_rows => false}
121
+ puts table
122
+ assert_equal(5, table.columns.size, 'header_count should = 5 ')
123
+ assert_equal(1, table[0].size)
124
+ assert_equal(3, table[1].size)
125
+ assert_equal(3, table[2].size)
126
+ assert_equal(4, table[3].size)
127
+ assert_equal(5, table[4].size)
128
+
129
+ table = TableParser::Table.new doc, "/html/body/table", {:dup_cols => false, :dup_rows => true}
130
+ puts table
131
+ assert_equal(5, table.columns.size, 'header_count should = 5 ')
132
+ assert_equal(5, table[0].size)
133
+ assert_equal(5, table[1].size)
134
+ assert_equal(3, table[2].size)
135
+ assert_equal(5, table[3].size)
136
+ assert_equal(5, table[4].size)
137
+
138
+ table = TableParser::Table.new doc, "/html/body/table", {:dup_cols => true, :dup_rows => true}
139
+ puts table
140
+ assert_equal(5, table.columns.size, 'header_count should = 5 ')
141
+ assert_equal(5, table[0].size)
142
+ assert_equal(5, table[1].size)
143
+ assert_equal(5, table[2].size)
144
+ assert_equal(5, table[3].size)
145
+ assert_equal(5, table[4].size)
146
+ end
104
147
 
105
148
  def test_web
106
149
  html = open("test4.html").read
@@ -108,4 +151,5 @@ class TestTableParser < Test::Unit::TestCase
108
151
  table = TableParser::Table.new doc, "/html/body/div/div[3]/div/div[2]/table", {:header => false, :dup_rows => false}
109
152
  puts table.columns[0].size
110
153
  end
154
+
111
155
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: table_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.7
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Francis Chong
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-01-26 00:00:00 +08:00
12
+ date: 2010-01-27 00:00:00 +08:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency