table_parser 0.5.1 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Manifest.txt CHANGED
@@ -1,7 +1,7 @@
1
1
  .autotest
2
2
  History.txt
3
3
  Manifest.txt
4
- README.txt
4
+ README.rdoc
5
5
  Rakefile
6
6
  lib/table_parser.rb
7
7
  lib/table_parser/parser.rb
@@ -18,32 +18,29 @@ Parsing table could be difficult when its structure contains colspan or rowspan.
18
18
  Use TableParser::Table to create a parsed HTML table.
19
19
 
20
20
  For example, the following code:
21
- <pre>
22
- html = "&lt;html&gt;&lt;body&gt;&lt;table&gt;&lt;tr&gt;&lt;td&gt;A&lt;/td&gt;&lt;td&gt;B&lt;/td&gt;&lt;/tr&gt;\
23
- &lt;tr&gt;&lt;td rowspan=\"2\"&gt;1&lt;/td&gt;&lt;td&gt;2&lt;/td&gt;&lt;/tr&gt; \
24
- &lt;tr&gt;&lt;td&gt;3&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;&lt;/body&gt;&lt;/html&gt;"
25
- table = TableParser::Table.new(html, "/html/body/table")
26
- </pre>
21
+ html = "<html><body><table><tr><td>A</td><td>B</td></tr>\
22
+ <tr><td rowspan=\"2\">1</td><td>2</td></tr> \
23
+ <tr><td>3</td></tr></table></body></html>"
24
+ doc = Nokogiri::HTML(html)
25
+ table = TableParser::Table.new doc, "/html/body/table"
27
26
 
28
27
  Results in the following parsed table:
29
28
 
30
- <pre>
31
- Table&lt;TableColumn&lt;name=A, children=[1],[1]&gt;,TableColumn&lt;name=B, children=[2],[3]&gt;&gt;
32
- </pre>
29
+ Table<TableColumn<name=A, children=[1],[1]>,TableColumn<name=B, children=[2],[3]>>
30
+
33
31
  Note that the first column contains a duplicated item, because the first row contains a cell with a "rowspan" attribute. If this is not desired, use the following syntax to skip duplication:
34
- <pre>
35
- html = "&lt;html&gt;&lt;body&gt;&lt;table&gt;&lt;tr&gt;&lt;td&gt;A&lt;/td&gt;&lt;td&gt;B&lt;/td&gt;&lt;/tr&gt;\
36
- &lt;tr&gt;&lt;td rowspan=\"2\"&gt;1&lt;/td&gt;&lt;td&gt;2&lt;/td&gt;&lt;/tr&gt; \
37
- &lt;tr&gt;&lt;td&gt;3&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;&lt;/body&gt;&lt;/html&gt;"
38
- table = TableParser::Table.new(html, "/html/body/table", {:dup_rows => false})
39
- </pre>
32
+
33
+ html = "<html><body><table><tr><td>A</td><td>B</td></tr>\
34
+ <tr><td rowspan=\"2\">1</td><td>2</td></tr> \
35
+ <tr><td>3</td></tr></table></body></html>"
36
+ doc = Nokogiri::HTML(html)
37
+ table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => false}
40
38
 
41
39
  Which results in the following parsed table:
42
40
 
43
- <pre>
44
- Table&lt;TableColumn&lt;name=A, children=[1],[1]&gt;,TableColumn&lt;name=B, children=[2],[3]&gt;&gt;
45
- </pre>
41
+ Table<TableColumn<name=A, children=[1]>,TableColumn<name=B, children=[2],[3]>>
46
42
 
43
+ Read the spec (test/test_table_parser.rb) for more usage examples.
47
44
 
48
45
  == DEVELOPERS:
49
46
 
data/lib/table_parser.rb CHANGED
@@ -4,5 +4,5 @@ require 'table_parser/table'
4
4
  require 'table_parser/parser'
5
5
 
6
6
  module TableParser
7
- VERSION = '0.5.1'
7
+ VERSION = '0.5.2'
8
8
  end
@@ -5,9 +5,7 @@ require 'open-uri'
5
5
  module TableParser
6
6
  class Parser
7
7
  # extract_table("http://www.bs4.jp/table/index.html", "/html/body/table/tr/td/table")
8
- def self.extract_table(input, xpath)
9
- doc = Nokogiri::HTML(input)
10
-
8
+ def self.extract_table(doc, xpath)
11
9
  rows = []
12
10
  table = doc.xpath(xpath)
13
11
  rows = table.xpath("./tr").collect do |row|
@@ -18,7 +16,7 @@ module TableParser
18
16
  rows
19
17
  end
20
18
 
21
- def self.extract_column_headers(rows)
19
+ def self.extract_column_headers(rows, dup_rows, dup_cols)
22
20
  headers = []
23
21
  rows.first.collect do |col|
24
22
  header = TableColumn.new(col)
@@ -32,7 +30,7 @@ module TableParser
32
30
  headers
33
31
  end
34
32
 
35
- def self.extract_nodes(rows, headers, duplicate_colspan)
33
+ def self.extract_nodes(rows, headers, dup_rows, dup_cols)
36
34
  data = rows.collect do |row|
37
35
  row.collect do |ele|
38
36
  node = TableNode.new(ele)
@@ -44,19 +42,26 @@ module TableParser
44
42
  row = data[row_index]
45
43
  row.each_index do |col_index|
46
44
  col = row[col_index]
47
- headers[col_index].children << col if col.class != EmptyTableNode
48
-
49
- if col.colspan > 1
50
- col.insert(col_index, TableNode.new(col.element, col.rowspan, col.colspan - 1))
51
- end
45
+ if headers[col_index]
46
+ headers[col_index].children << col if col.class != EmptyTableNode
47
+ if col.colspan > 1
48
+ if dup_cols
49
+ row.insert(col_index, TableNode.new(col.element, col.rowspan, col.colspan - 1))
50
+ else
51
+ row.insert(col_index, EmptyTableNode.new(col.rowspan, col.colspan - 1))
52
+ end
53
+ end
52
54
 
53
- if col.rowspan > 1 && data[row_index+1]
54
- if duplicate_colspan
55
- data[row_index+1].insert(col_index, TableNode.new(col.element, col.rowspan - 1))
56
- else
57
- data[row_index+1].insert(col_index, EmptyTableNode.new(col.rowspan - 1))
55
+ if col.rowspan > 1 && data[row_index+1]
56
+ if dup_rows
57
+ data[row_index+1].insert(col_index, TableNode.new(col.element, col.rowspan - 1))
58
+ else
59
+ data[row_index+1].insert(col_index, EmptyTableNode.new(col.rowspan - 1))
60
+ end
58
61
  end
59
62
  end
63
+
64
+
60
65
  end
61
66
  end
62
67
  data
@@ -1,17 +1,23 @@
1
1
  module TableParser
2
2
  class Table
3
3
  attr_reader :nodes, :columns
4
- def initialize(input, xpath_to_table="//table[0]", options={})
4
+ def initialize(doc, xpath_to_table="//table[0]", options={})
5
5
 
6
- if options.has_key? :dup_rows
6
+ if options.has_key?(:dup_rows)
7
7
  dup_rows = options[:dup_rows]
8
8
  else
9
9
  dup_rows = true
10
10
  end
11
+
12
+ if options.has_key?(:dup_cols)
13
+ dup_cols = options[:dup_cols]
14
+ else
15
+ dup_cols = true
16
+ end
11
17
 
12
- table = Parser.extract_table(input, xpath_to_table)
13
- @columns = Parser.extract_column_headers(table)
14
- @nodes = Parser.extract_nodes(table, @columns, dup_rows)
18
+ table = Parser.extract_table(doc, xpath_to_table)
19
+ @columns = Parser.extract_column_headers(table, dup_rows, dup_cols)
20
+ @nodes = Parser.extract_nodes(table, @columns, dup_rows, dup_cols)
15
21
  end
16
22
 
17
23
  def to_s
@@ -3,7 +3,7 @@ module TableParser
3
3
  attr_reader :element, :text, :rowspan, :colspan
4
4
  def initialize(element, rowspan=nil, colspan=nil)
5
5
  @element = element
6
- @text = element.text
6
+ @text = element.text.strip
7
7
  @colspan = colspan || element["colspan"].to_i rescue 1
8
8
  @rowspan = rowspan || element["rowspan"].to_i rescue 1
9
9
  end
@@ -3,35 +3,36 @@ require "table_parser"
3
3
 
4
4
  class TestTableParser < Test::Unit::TestCase
5
5
  def test_parse_rowspan
6
- table = TableParser::Table.new "<html><body><table><tr><td>A</td><td>B</td></tr>\
6
+ html = "<html><body><table><tr><td>A</td><td>B</td></tr>\
7
7
  <tr><td rowspan=\"2\">1</td><td>2</td></tr> \
8
- <tr><td>3</td></tr></table></body></html>",
9
- "/html/body/table"
8
+ <tr><td>3</td></tr></table></body></html>"
9
+ doc = Nokogiri::HTML(html)
10
+ table = TableParser::Table.new doc, "/html/body/table"
10
11
 
11
- puts table
12
12
  assert_equal(2, table.columns.size, 'header_count should = 2 ')
13
13
  assert_equal(2, table[0].size)
14
14
  assert_equal(2, table[1].size)
15
15
  end
16
16
 
17
17
  def test_parse_rowspan_disable_dup
18
- table = TableParser::Table.new "<html><body><table><tr><td>A</td><td>B</td></tr>\
18
+ html = "<html><body><table><tr><td>A</td><td>B</td></tr>\
19
19
  <tr><td rowspan=\"2\">1</td><td>2</td></tr> \
20
- <tr><td>3</td></tr></table></body></html>",
21
- "/html/body/table", {:dup_rows => false}
22
-
23
- puts table
20
+ <tr><td>3</td></tr></table></body></html>"
21
+ doc = Nokogiri::HTML(html)
22
+ table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => false}
23
+
24
24
  assert_equal(2, table.columns.size, 'header_count should = 2 ')
25
25
  assert_equal(1, table[0].size)
26
26
  assert_equal(2, table[1].size)
27
27
  end
28
28
 
29
29
  def test_parse_colspan
30
- table = TableParser::Table.new "<html><body><table><tr><td>A</td><td colspan=\"2\">B</td></tr>\
30
+ html = "<html><body><table><tr><td>A</td><td colspan=\"2\">B</td></tr>\
31
31
  <tr><td rowspan=\"2\">A1</td><td>B1</td><td>C1</td></tr> \
32
32
  <tr><td>B2</td><td>C2</td></tr>\
33
- <tr><td>A3</td><td>B3</td><td>C3</td></tr><tr><td>A4</td><td>B4</td><td>C4</td></tr></table></body></html>",
34
- "/html/body/table"
33
+ <tr><td>A3</td><td>B3</td><td>C3</td></tr><tr><td>A4</td><td>B4</td><td>C4</td></tr></table></body></html>"
34
+ doc = Nokogiri::HTML(html)
35
+ table = TableParser::Table.new doc, "/html/body/table"
35
36
 
36
37
  assert_equal(3, table.columns.size, 'header_count should = 3 ')
37
38
  assert_equal(4, table[0].size)
@@ -41,12 +42,13 @@ class TestTableParser < Test::Unit::TestCase
41
42
  end
42
43
 
43
44
  def test_parse_complex
44
- table = TableParser::Table.new "<html><body><table><tr><td>Header1</td><td>Header2</td><td>Header3</td><td>Header4</td></tr>\
45
- <tr><td rowspan=\"3\">A1</td><td>A2</td><td rowspan=\"2\">A3</td><td>B4</td></tr>\
46
- <tr><td>B2</td><td>B4</td></tr>\
47
- <tr><td>C2</td><td>C3</td><td>B4</td></tr>\
48
- </table></body></html>",
49
- "/html/body/table"
45
+ html = "<html><body><table><tr><td>Header1</td><td>Header2</td><td>Header3</td><td>Header4</td></tr>\
46
+ <tr><td rowspan=\"3\">A1</td><td>A2</td><td rowspan=\"2\">A3</td><td>B4</td></tr>\
47
+ <tr><td>B2</td><td>B4</td></tr>\
48
+ <tr><td>C2</td><td>C3</td><td>B4</td></tr>\
49
+ </table></body></html>"
50
+ doc = Nokogiri::HTML(html)
51
+ table = TableParser::Table.new doc, "/html/body/table"
50
52
 
51
53
  assert_equal 4, table.columns.size
52
54
  assert_equal 3, table[0].size
@@ -55,18 +57,19 @@ class TestTableParser < Test::Unit::TestCase
55
57
  end
56
58
 
57
59
  def test_parse_complex2
58
- table = TableParser::Table.new "<html><body><table><tr><td>Header1</td><td>Header2</td><td>Header3</td><td>Header4</td></tr>\
59
- <tr><td rowspan=\"3\">A1</td><td>A2</td><td rowspan=\"2\">A3</td><td>A4</td></tr>\
60
- <tr><td>B2</td><td>B4</td></tr>\
61
- <tr><td>C2</td><td rowspan=\"2\">C3</td><td>C4</td></tr>\
62
- <tr><td rowspan=\"3\">D1</td><td>D2</td><td>D4</td></tr>\
63
- <tr><td>E2</td><td rowspan=\"2\">E3</td><td>E4</td></tr>\
64
- <tr><td>F2</td><td>F4</td></tr>\
65
- <tr><td rowspan=\"3\">G1</td><td>G2</td><td rowspan=\"2\">G3</td><td>G4</td></tr>\
66
- <tr><td>H2</td><td>H4</td></tr>\
67
- <tr><td>I2</td><td>I3</td><td>I4</td></tr>\
68
- </table></body></html>",
69
- "/html/body/table"
60
+ html = "<html><body><table><tr><td>Header1</td><td>Header2</td><td>Header3</td><td>Header4</td></tr>\
61
+ <tr><td rowspan=\"3\">A1</td><td>A2</td><td rowspan=\"2\">A3</td><td>A4</td></tr>\
62
+ <tr><td>B2</td><td>B4</td></tr>\
63
+ <tr><td>C2</td><td rowspan=\"2\">C3</td><td>C4</td></tr>\
64
+ <tr><td rowspan=\"3\">D1</td><td>D2</td><td>D4</td></tr>\
65
+ <tr><td>E2</td><td rowspan=\"2\">E3</td><td>E4</td></tr>\
66
+ <tr><td>F2</td><td>F4</td></tr>\
67
+ <tr><td rowspan=\"3\">G1</td><td>G2</td><td rowspan=\"2\">G3</td><td>G4</td></tr>\
68
+ <tr><td>H2</td><td>H4</td></tr>\
69
+ <tr><td>I2</td><td>I3</td><td>I4</td></tr>\
70
+ </table></body></html>"
71
+ doc = Nokogiri::HTML(html)
72
+ table = TableParser::Table.new doc, "/html/body/table"
70
73
 
71
74
  assert_equal 4, table.columns.size
72
75
  assert_equal 9, table[0].size
@@ -76,8 +79,8 @@ class TestTableParser < Test::Unit::TestCase
76
79
  end
77
80
 
78
81
  def test_parse_web
79
- table = TableParser::Table.new open("test.html").read,
80
- "/html/body/table"
82
+ doc = Nokogiri::HTML(open("test.html").read)
83
+ table = TableParser::Table.new doc, "/html/body/table"
81
84
 
82
85
  assert_equal 11, table.columns.size
83
86
  assert_equal 9, table[0].size
@@ -85,4 +88,14 @@ class TestTableParser < Test::Unit::TestCase
85
88
  assert_equal 9, table[2].size
86
89
  assert_equal 9, table[3].size
87
90
  end
91
+
92
+ def test_parse_web2
93
+ doc = Nokogiri::HTML(open("test2.html").read)
94
+
95
+ table = doc.xpath("//div[@id='timetable_box-week']/table")
96
+ table.xpath("./tr[1]").remove
97
+
98
+ table = TableParser::Table.new doc, "//div[@id='timetable_box-week']/table", {:dup_cols => false, :dup_rows => false}
99
+ puts table.columns.select(){|c| c.text =~ /[0-9]+月[0-9]+日/ }
100
+ end
88
101
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: table_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.1
4
+ version: 0.5.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Francis Chong
@@ -32,12 +32,11 @@ extensions: []
32
32
  extra_rdoc_files:
33
33
  - History.txt
34
34
  - Manifest.txt
35
- - README.txt
36
35
  files:
37
36
  - .autotest
38
37
  - History.txt
39
38
  - Manifest.txt
40
- - README.txt
39
+ - README.rdoc
41
40
  - Rakefile
42
41
  - lib/table_parser.rb
43
42
  - lib/table_parser/parser.rb