table_parser 0.5.7 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -42,44 +42,54 @@ module TableParser
42
42
  headers
43
43
  end
44
44
 
45
- def self.extract_nodes(rows, headers, dup_rows, dup_cols)
46
- data = rows.collect do |row|
45
+ def self.extract_nodes(input_rows, headers, dup_rows, dup_cols)
46
+ data = input_rows.collect do |row|
47
47
  row.collect do |ele|
48
48
  node = TableNode.new(ele)
49
49
  end
50
50
  end
51
-
52
- # handle rowspan
53
- data.each_index do |row_index|
54
- row = data[row_index]
55
- row.each_index do |col_index|
56
- col = row[col_index]
57
- if headers[col_index]
58
- headers[col_index].children << col if col.class != EmptyTableNode
51
+
52
+ columns = []
53
+ curr_x = 0
54
+ curr_y = 0
55
+ data.each do |row|
56
+ columns[curr_y] = [] unless columns[curr_y]
59
57
 
60
- if col.colspan > 1
61
- if dup_cols
62
- row.insert(col_index, TableNode.new(col.element, col.rowspan, col.colspan - 1))
63
- else
64
- row.insert(col_index, EmptyTableNode.new(col.rowspan, col.colspan - 1))
65
- end
66
- end
67
-
68
- if col.rowspan > 1
69
- unless data[row_index+1]
70
- data.insert(row_index, [])
71
- end
58
+ curr_y = 0
59
+ row.each do |node|
60
+ rowspan = node.rowspan - 1
61
+ colspan = node.colspan - 1
72
62
 
73
- if dup_rows
74
- data[row_index+1].insert(col_index, TableNode.new(col.element, col.rowspan - 1))
63
+ (0..rowspan).each do |x|
64
+ (0..colspan).each do |y|
65
+ columns[curr_y+y] = [] unless columns[curr_y+y]
66
+ if (x == 0 || dup_rows) && (y == 0 || dup_cols)
67
+ while columns[curr_y+y][curr_x+x]
68
+ curr_y += 1
69
+ end
70
+ columns[curr_y+y][curr_x+x] = node
75
71
  else
76
- data[row_index+1].insert(col_index, EmptyTableNode.new(col.rowspan - 1))
72
+ while columns[curr_y+y][curr_x+x]
73
+ curr_y += 1
74
+ end
75
+ columns[curr_y+y][curr_x+x] = EmptyTableNode.new(node.element)
77
76
  end
78
77
  end
79
78
  end
79
+ curr_y += 1
80
80
  end
81
- end
82
- data
81
+ curr_x += 1
82
+ end
83
+
84
+ columns.each_index do |col_index|
85
+ columns[col_index].each do |node|
86
+ if headers[col_index]
87
+ headers[col_index].children << node unless node.class == EmptyTableNode
88
+ end
89
+ end
90
+ end
91
+
92
+ columns
83
93
  end
84
94
 
85
95
  end
@@ -3,9 +3,19 @@ module TableParser
3
3
  attr_reader :element, :text, :rowspan, :colspan
4
4
  def initialize(element, rowspan=nil, colspan=nil)
5
5
  @element = element
6
- @text = element.text.strip rescue ""
7
- @colspan = colspan || element["colspan"].to_i rescue 1
8
- @rowspan = rowspan || element["rowspan"].to_i rescue 1
6
+ @text = element.text.strip rescue ""
7
+
8
+ if element.nil?
9
+ @colspan = colspan || 1
10
+ @rowspan = rowspan || 1
11
+ else
12
+ @colspan = colspan || element["colspan"].nil? ? 1 : element["colspan"].to_i
13
+ @rowspan = rowspan || element["rowspan"].nil? ? 1 : element["rowspan"].to_i
14
+ end
15
+ end
16
+
17
+ def span(row, col)
18
+ TableNode.new(element, rowspan-row, colspan-col)
9
19
  end
10
20
 
11
21
  def to_s
@@ -17,8 +27,8 @@ module TableParser
17
27
  def initialize(rowspan=nil, colspan=nil)
18
28
  @element = nil
19
29
  @text = ""
20
- @colspan = colspan || element["colspan"].to_i rescue 1
21
- @rowspan = rowspan || element["rowspan"].to_i rescue 1
30
+ @colspan = colspan || 1
31
+ @rowspan = rowspan || 1
22
32
  end
23
33
  end
24
34
  end
data/lib/table_parser.rb CHANGED
@@ -4,5 +4,5 @@ require 'table_parser/table'
4
4
  require 'table_parser/parser'
5
5
 
6
6
  module TableParser
7
- VERSION = '0.5.7'
7
+ VERSION = '0.6.0'
8
8
  end
@@ -5,15 +5,28 @@ require 'open-uri'
5
5
 
6
6
  class TestTableParser < Test::Unit::TestCase
7
7
  def test_parse_rowspan
8
- html = "<html><body><table><tr><td>A</td><td>B</td></tr>\
9
- <tr><td rowspan=\"2\">1</td><td>2</td></tr> \
10
- <tr><td>3</td></tr></table></body></html>"
8
+ html = open("rowspan.html").read
11
9
  doc = Nokogiri::HTML(html)
12
- table = TableParser::Table.new doc, "/html/body/table"
10
+ table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => false, :dup_cols => false}
11
+ assert_equal(2, table.columns.size, 'header_count should = 2 ')
12
+ assert_equal(2, table[0].size)
13
+ assert_equal(2, table[1].size)
13
14
 
15
+ table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => true, :dup_cols => false}
16
+ assert_equal(2, table.columns.size, 'header_count should = 2 ')
17
+ assert_equal(3, table[0].size)
18
+ assert_equal(3, table[1].size)
19
+
20
+ table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => false, :dup_cols => true}
14
21
  assert_equal(2, table.columns.size, 'header_count should = 2 ')
15
22
  assert_equal(2, table[0].size)
16
23
  assert_equal(2, table[1].size)
24
+
25
+ table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => true, :dup_cols => true}
26
+ assert_equal(2, table.columns.size, 'header_count should = 2 ')
27
+ assert_equal(3, table[0].size)
28
+ assert_equal(3, table[1].size)
29
+
17
30
  end
18
31
 
19
32
  def test_parse_rowspan_disable_dup
@@ -29,27 +42,27 @@ class TestTableParser < Test::Unit::TestCase
29
42
  end
30
43
 
31
44
  def test_parse_colspan
32
- html = "<html><body><table>\
33
- <tr><td>A</td><td colspan=\"2\">B</td></tr>\
34
- <tr><td rowspan=\"2\">A1</td><td>B1</td><td>C1</td></tr> \
35
- <tr><td>B2</td><td>C2</td></tr>\
36
- <tr><td>A3</td><td>B3</td><td>C3</td></tr>\
37
- <tr><td>A4</td><td colspan=\"2\" rowspan=\"2\">B4</td></tr>\
38
- <tr><td>A5</td></tr>\
39
- <tr><td rowspan=\"2\">A1</td><td>B1</td><td>C1</td></tr> \
40
- <tr><td>B2</td><td>C2</td></tr>\
41
- <tr><td>A3</td><td>B3</td><td>C3</td></tr>\
42
- <tr><td>A4</td><td colspan=\"2\" rowspan=\"2\">B4</td></tr>\
43
- <tr><td>A5</td></tr>\
44
- </table></body></html>"
45
+ html = open("colspan.html").read
45
46
  doc = Nokogiri::HTML(html)
46
47
  table = TableParser::Table.new doc, "/html/body/table"
47
- puts table
48
-
49
48
  assert_equal(3, table.columns.size, 'header_count should = 3 ')
50
49
  assert_equal(10, table[0].size)
51
50
  assert_equal(10, table[1].size)
52
51
  assert_equal(10, table[2].size)
52
+
53
+ table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => false}
54
+ puts table.to_s
55
+ assert_equal(3, table.columns.size, 'header_count should = 3 ')
56
+ assert_equal(8, table[0].size)
57
+ assert_equal(8, table[1].size)
58
+ assert_equal(8, table[2].size)
59
+
60
+ table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => false, :dup_cols => false}
61
+ puts table.to_s
62
+ assert_equal(3, table.columns.size, 'header_count should = 3 ')
63
+ assert_equal(8, table[0].size)
64
+ assert_equal(8, table[1].size)
65
+ assert_equal(6, table[2].size)
53
66
  end
54
67
 
55
68
  def test_parse_complex
@@ -68,17 +81,7 @@ class TestTableParser < Test::Unit::TestCase
68
81
  end
69
82
 
70
83
  def test_parse_complex2
71
- html = "<html><body><table><tr><td>Header1</td><td>Header2</td><td>Header3</td><td>Header4</td></tr>\
72
- <tr><td rowspan=\"3\">A1</td><td>A2</td><td rowspan=\"2\">A3</td><td>A4</td></tr>\
73
- <tr><td>B2</td><td>B4</td></tr>\
74
- <tr><td>C2</td><td rowspan=\"2\">C3</td><td>C4</td></tr>\
75
- <tr><td rowspan=\"3\">D1</td><td>D2</td><td>D4</td></tr>\
76
- <tr><td>E2</td><td rowspan=\"2\">E3</td><td>E4</td></tr>\
77
- <tr><td>F2</td><td>F4</td></tr>\
78
- <tr><td rowspan=\"3\">G1</td><td>G2</td><td rowspan=\"2\">G3</td><td>G4</td></tr>\
79
- <tr><td>H2</td><td>H4</td></tr>\
80
- <tr><td>I2</td><td>I3</td><td>I4</td></tr>\
81
- </table></body></html>"
84
+ html = open("complex2.html").read
82
85
  doc = Nokogiri::HTML(html)
83
86
  table = TableParser::Table.new doc, "/html/body/table"
84
87
 
@@ -87,6 +90,15 @@ class TestTableParser < Test::Unit::TestCase
87
90
  assert_equal 9, table[1].size
88
91
  assert_equal 9, table[2].size
89
92
  assert_equal 9, table[3].size
93
+
94
+ table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => false}
95
+ puts "table=" + table.to_s
96
+ assert_equal 4, table.columns.size
97
+ assert_equal 3, table[0].size
98
+ assert_equal 9, table[1].size
99
+ assert_equal 5, table[2].size
100
+ assert_equal 9, table[3].size
101
+
90
102
  end
91
103
 
92
104
  def test_parse_noheader
@@ -101,6 +113,37 @@ class TestTableParser < Test::Unit::TestCase
101
113
  assert_equal(3, table[1].size)
102
114
  end
103
115
 
116
+ def test_parse_complex_colrowspan
117
+ html = open("table_rowcol.html").read
118
+
119
+ doc = Nokogiri::HTML(html)
120
+ table = TableParser::Table.new doc, "/html/body/table", {:dup_cols => false, :dup_rows => false}
121
+ puts table
122
+ assert_equal(5, table.columns.size, 'header_count should = 5 ')
123
+ assert_equal(1, table[0].size)
124
+ assert_equal(3, table[1].size)
125
+ assert_equal(3, table[2].size)
126
+ assert_equal(4, table[3].size)
127
+ assert_equal(5, table[4].size)
128
+
129
+ table = TableParser::Table.new doc, "/html/body/table", {:dup_cols => false, :dup_rows => true}
130
+ puts table
131
+ assert_equal(5, table.columns.size, 'header_count should = 5 ')
132
+ assert_equal(5, table[0].size)
133
+ assert_equal(5, table[1].size)
134
+ assert_equal(3, table[2].size)
135
+ assert_equal(5, table[3].size)
136
+ assert_equal(5, table[4].size)
137
+
138
+ table = TableParser::Table.new doc, "/html/body/table", {:dup_cols => true, :dup_rows => true}
139
+ puts table
140
+ assert_equal(5, table.columns.size, 'header_count should = 5 ')
141
+ assert_equal(5, table[0].size)
142
+ assert_equal(5, table[1].size)
143
+ assert_equal(5, table[2].size)
144
+ assert_equal(5, table[3].size)
145
+ assert_equal(5, table[4].size)
146
+ end
104
147
 
105
148
  def test_web
106
149
  html = open("test4.html").read
@@ -108,4 +151,5 @@ class TestTableParser < Test::Unit::TestCase
108
151
  table = TableParser::Table.new doc, "/html/body/div/div[3]/div/div[2]/table", {:header => false, :dup_rows => false}
109
152
  puts table.columns[0].size
110
153
  end
154
+
111
155
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: table_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.7
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Francis Chong
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-01-26 00:00:00 +08:00
12
+ date: 2010-01-27 00:00:00 +08:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency