table_parser 0.5.1 → 0.5.2

Sign up to get free protection for your applications and to get access to all the features.
data/Manifest.txt CHANGED
@@ -1,7 +1,7 @@
1
1
  .autotest
2
2
  History.txt
3
3
  Manifest.txt
4
- README.txt
4
+ README.rdoc
5
5
  Rakefile
6
6
  lib/table_parser.rb
7
7
  lib/table_parser/parser.rb
@@ -18,32 +18,29 @@ Parsing table could be difficult when its structure contains colspan or rowspan.
18
18
  Use TableParser::Table to create parsed HTML table.
19
19
 
20
20
  For example, the following code:
21
- <pre>
22
- html = "&lt;html&gt;&lt;body&gt;&lt;table&gt;&lt;tr&gt;&lt;td&gt;A&lt;/td&gt;&lt;td&gt;B&lt;/td&gt;&lt;/tr&gt;\
23
- &lt;tr&gt;&lt;td rowspan=\"2\"&gt;1&lt;/td&gt;&lt;td&gt;2&lt;/td&gt;&lt;/tr&gt; \
24
- &lt;tr&gt;&lt;td&gt;3&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;&lt;/body&gt;&lt;/html&gt;"
25
- table = TableParser::Table.new(html, "/html/body/table")
26
- </pre>
21
+ html = "<html><body><table><tr><td>A</td><td>B</td></tr>\
22
+ <tr><td rowspan=\"2\">1</td><td>2</td></tr> \
23
+ <tr><td>3</td></tr></table></body></html>"
24
+ doc = Nokogiri::HTML(html)
25
+ table = TableParser::Table.new doc, "/html/body/table"
27
26
 
28
27
  Results in the following parsed table:
29
28
 
30
- <pre>
31
- Table&lt;TableColumn&lt;name=A, children=[1],[1]&gt;,TableColumn&lt;name=B, children=[2],[3]&gt;&gt;
32
- </pre>
29
+ Table<TableColumn<name=A, children=[1],[1]>,TableColumn<name=B, children=[2],[3]>>
30
+
33
31
  Note that the first column contains a duplicated item, because the first row contains a "rowspan" element. If this is not desired, use the following syntax to skip duplication:
34
- <pre>
35
- html = "&lt;html&gt;&lt;body&gt;&lt;table&gt;&lt;tr&gt;&lt;td&gt;A&lt;/td&gt;&lt;td&gt;B&lt;/td&gt;&lt;/tr&gt;\
36
- &lt;tr&gt;&lt;td rowspan=\"2\"&gt;1&lt;/td&gt;&lt;td&gt;2&lt;/td&gt;&lt;/tr&gt; \
37
- &lt;tr&gt;&lt;td&gt;3&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;&lt;/body&gt;&lt;/html&gt;"
38
- table = TableParser::Table.new(html, "/html/body/table", {:dup_rows => false})
39
- </pre>
32
+
33
+ html = "<html><body><table><tr><td>A</td><td>B</td></tr>\
34
+ <tr><td rowspan=\"2\">1</td><td>2</td></tr> \
35
+ <tr><td>3</td></tr></table></body></html>"
36
+ doc = Nokogiri::HTML(html)
37
+ table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => false}
40
38
 
41
39
  Which results in the following parsed table:
42
40
 
43
- <pre>
44
- Table&lt;TableColumn&lt;name=A, children=[1],[1]&gt;,TableColumn&lt;name=B, children=[2],[3]&gt;&gt;
45
- </pre>
41
+ Table<TableColumn<name=A, children=[1]>,TableColumn<name=B, children=[2],[3]>>
46
42
 
43
+ Read the spec (test/test_table_parser.rb) for more usage examples.
47
44
 
48
45
  == DEVELOPERS:
49
46
 
data/lib/table_parser.rb CHANGED
@@ -4,5 +4,5 @@ require 'table_parser/table'
4
4
  require 'table_parser/parser'
5
5
 
6
6
  module TableParser
7
- VERSION = '0.5.1'
7
+ VERSION = '0.5.2'
8
8
  end
@@ -5,9 +5,7 @@ require 'open-uri'
5
5
  module TableParser
6
6
  class Parser
7
7
  # extract_table("http://www.bs4.jp/table/index.html", "/html/body/table/tr/td/table")
8
- def self.extract_table(input, xpath)
9
- doc = Nokogiri::HTML(input)
10
-
8
+ def self.extract_table(doc, xpath)
11
9
  rows = []
12
10
  table = doc.xpath(xpath)
13
11
  rows = table.xpath("./tr").collect do |row|
@@ -18,7 +16,7 @@ module TableParser
18
16
  rows
19
17
  end
20
18
 
21
- def self.extract_column_headers(rows)
19
+ def self.extract_column_headers(rows, dup_rows, dup_cols)
22
20
  headers = []
23
21
  rows.first.collect do |col|
24
22
  header = TableColumn.new(col)
@@ -32,7 +30,7 @@ module TableParser
32
30
  headers
33
31
  end
34
32
 
35
- def self.extract_nodes(rows, headers, duplicate_colspan)
33
+ def self.extract_nodes(rows, headers, dup_rows, dup_cols)
36
34
  data = rows.collect do |row|
37
35
  row.collect do |ele|
38
36
  node = TableNode.new(ele)
@@ -44,19 +42,26 @@ module TableParser
44
42
  row = data[row_index]
45
43
  row.each_index do |col_index|
46
44
  col = row[col_index]
47
- headers[col_index].children << col if col.class != EmptyTableNode
48
-
49
- if col.colspan > 1
50
- col.insert(col_index, TableNode.new(col.element, col.rowspan, col.colspan - 1))
51
- end
45
+ if headers[col_index]
46
+ headers[col_index].children << col if col.class != EmptyTableNode
47
+ if col.colspan > 1
48
+ if dup_cols
49
+ row.insert(col_index, TableNode.new(col.element, col.rowspan, col.colspan - 1))
50
+ else
51
+ row.insert(col_index, EmptyTableNode.new(col.rowspan, col.colspan - 1))
52
+ end
53
+ end
52
54
 
53
- if col.rowspan > 1 && data[row_index+1]
54
- if duplicate_colspan
55
- data[row_index+1].insert(col_index, TableNode.new(col.element, col.rowspan - 1))
56
- else
57
- data[row_index+1].insert(col_index, EmptyTableNode.new(col.rowspan - 1))
55
+ if col.rowspan > 1 && data[row_index+1]
56
+ if dup_rows
57
+ data[row_index+1].insert(col_index, TableNode.new(col.element, col.rowspan - 1))
58
+ else
59
+ data[row_index+1].insert(col_index, EmptyTableNode.new(col.rowspan - 1))
60
+ end
58
61
  end
59
62
  end
63
+
64
+
60
65
  end
61
66
  end
62
67
  data
@@ -1,17 +1,23 @@
1
1
  module TableParser
2
2
  class Table
3
3
  attr_reader :nodes, :columns
4
- def initialize(input, xpath_to_table="//table[0]", options={})
4
+ def initialize(doc, xpath_to_table="//table[0]", options={})
5
5
 
6
- if options.has_key? :dup_rows
6
+ if options.has_key?(:dup_rows)
7
7
  dup_rows = options[:dup_rows]
8
8
  else
9
9
  dup_rows = true
10
10
  end
11
+
12
+ if options.has_key?(:dup_cols)
13
+ dup_cols = options[:dup_cols]
14
+ else
15
+ dup_cols = true
16
+ end
11
17
 
12
- table = Parser.extract_table(input, xpath_to_table)
13
- @columns = Parser.extract_column_headers(table)
14
- @nodes = Parser.extract_nodes(table, @columns, dup_rows)
18
+ table = Parser.extract_table(doc, xpath_to_table)
19
+ @columns = Parser.extract_column_headers(table, dup_rows, dup_cols)
20
+ @nodes = Parser.extract_nodes(table, @columns, dup_rows, dup_cols)
15
21
  end
16
22
 
17
23
  def to_s
@@ -3,7 +3,7 @@ module TableParser
3
3
  attr_reader :element, :text, :rowspan, :colspan
4
4
  def initialize(element, rowspan=nil, colspan=nil)
5
5
  @element = element
6
- @text = element.text
6
+ @text = element.text.strip
7
7
  @colspan = colspan || element["colspan"].to_i rescue 1
8
8
  @rowspan = rowspan || element["rowspan"].to_i rescue 1
9
9
  end
@@ -3,35 +3,36 @@ require "table_parser"
3
3
 
4
4
  class TestTableParser < Test::Unit::TestCase
5
5
  def test_parse_rowspan
6
- table = TableParser::Table.new "<html><body><table><tr><td>A</td><td>B</td></tr>\
6
+ html = "<html><body><table><tr><td>A</td><td>B</td></tr>\
7
7
  <tr><td rowspan=\"2\">1</td><td>2</td></tr> \
8
- <tr><td>3</td></tr></table></body></html>",
9
- "/html/body/table"
8
+ <tr><td>3</td></tr></table></body></html>"
9
+ doc = Nokogiri::HTML(html)
10
+ table = TableParser::Table.new doc, "/html/body/table"
10
11
 
11
- puts table
12
12
  assert_equal(2, table.columns.size, 'header_count should = 2 ')
13
13
  assert_equal(2, table[0].size)
14
14
  assert_equal(2, table[1].size)
15
15
  end
16
16
 
17
17
  def test_parse_rowspan_disable_dup
18
- table = TableParser::Table.new "<html><body><table><tr><td>A</td><td>B</td></tr>\
18
+ html = "<html><body><table><tr><td>A</td><td>B</td></tr>\
19
19
  <tr><td rowspan=\"2\">1</td><td>2</td></tr> \
20
- <tr><td>3</td></tr></table></body></html>",
21
- "/html/body/table", {:dup_rows => false}
22
-
23
- puts table
20
+ <tr><td>3</td></tr></table></body></html>"
21
+ doc = Nokogiri::HTML(html)
22
+ table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => false}
23
+
24
24
  assert_equal(2, table.columns.size, 'header_count should = 2 ')
25
25
  assert_equal(1, table[0].size)
26
26
  assert_equal(2, table[1].size)
27
27
  end
28
28
 
29
29
  def test_parse_colspan
30
- table = TableParser::Table.new "<html><body><table><tr><td>A</td><td colspan=\"2\">B</td></tr>\
30
+ html = "<html><body><table><tr><td>A</td><td colspan=\"2\">B</td></tr>\
31
31
  <tr><td rowspan=\"2\">A1</td><td>B1</td><td>C1</td></tr> \
32
32
  <tr><td>B2</td><td>C2</td></tr>\
33
- <tr><td>A3</td><td>B3</td><td>C3</td></tr><tr><td>A4</td><td>B4</td><td>C4</td></tr></table></body></html>",
34
- "/html/body/table"
33
+ <tr><td>A3</td><td>B3</td><td>C3</td></tr><tr><td>A4</td><td>B4</td><td>C4</td></tr></table></body></html>"
34
+ doc = Nokogiri::HTML(html)
35
+ table = TableParser::Table.new doc, "/html/body/table"
35
36
 
36
37
  assert_equal(3, table.columns.size, 'header_count should = 3 ')
37
38
  assert_equal(4, table[0].size)
@@ -41,12 +42,13 @@ class TestTableParser < Test::Unit::TestCase
41
42
  end
42
43
 
43
44
  def test_parse_complex
44
- table = TableParser::Table.new "<html><body><table><tr><td>Header1</td><td>Header2</td><td>Header3</td><td>Header4</td></tr>\
45
- <tr><td rowspan=\"3\">A1</td><td>A2</td><td rowspan=\"2\">A3</td><td>B4</td></tr>\
46
- <tr><td>B2</td><td>B4</td></tr>\
47
- <tr><td>C2</td><td>C3</td><td>B4</td></tr>\
48
- </table></body></html>",
49
- "/html/body/table"
45
+ html = "<html><body><table><tr><td>Header1</td><td>Header2</td><td>Header3</td><td>Header4</td></tr>\
46
+ <tr><td rowspan=\"3\">A1</td><td>A2</td><td rowspan=\"2\">A3</td><td>B4</td></tr>\
47
+ <tr><td>B2</td><td>B4</td></tr>\
48
+ <tr><td>C2</td><td>C3</td><td>B4</td></tr>\
49
+ </table></body></html>"
50
+ doc = Nokogiri::HTML(html)
51
+ table = TableParser::Table.new doc, "/html/body/table"
50
52
 
51
53
  assert_equal 4, table.columns.size
52
54
  assert_equal 3, table[0].size
@@ -55,18 +57,19 @@ class TestTableParser < Test::Unit::TestCase
55
57
  end
56
58
 
57
59
  def test_parse_complex2
58
- table = TableParser::Table.new "<html><body><table><tr><td>Header1</td><td>Header2</td><td>Header3</td><td>Header4</td></tr>\
59
- <tr><td rowspan=\"3\">A1</td><td>A2</td><td rowspan=\"2\">A3</td><td>A4</td></tr>\
60
- <tr><td>B2</td><td>B4</td></tr>\
61
- <tr><td>C2</td><td rowspan=\"2\">C3</td><td>C4</td></tr>\
62
- <tr><td rowspan=\"3\">D1</td><td>D2</td><td>D4</td></tr>\
63
- <tr><td>E2</td><td rowspan=\"2\">E3</td><td>E4</td></tr>\
64
- <tr><td>F2</td><td>F4</td></tr>\
65
- <tr><td rowspan=\"3\">G1</td><td>G2</td><td rowspan=\"2\">G3</td><td>G4</td></tr>\
66
- <tr><td>H2</td><td>H4</td></tr>\
67
- <tr><td>I2</td><td>I3</td><td>I4</td></tr>\
68
- </table></body></html>",
69
- "/html/body/table"
60
+ html = "<html><body><table><tr><td>Header1</td><td>Header2</td><td>Header3</td><td>Header4</td></tr>\
61
+ <tr><td rowspan=\"3\">A1</td><td>A2</td><td rowspan=\"2\">A3</td><td>A4</td></tr>\
62
+ <tr><td>B2</td><td>B4</td></tr>\
63
+ <tr><td>C2</td><td rowspan=\"2\">C3</td><td>C4</td></tr>\
64
+ <tr><td rowspan=\"3\">D1</td><td>D2</td><td>D4</td></tr>\
65
+ <tr><td>E2</td><td rowspan=\"2\">E3</td><td>E4</td></tr>\
66
+ <tr><td>F2</td><td>F4</td></tr>\
67
+ <tr><td rowspan=\"3\">G1</td><td>G2</td><td rowspan=\"2\">G3</td><td>G4</td></tr>\
68
+ <tr><td>H2</td><td>H4</td></tr>\
69
+ <tr><td>I2</td><td>I3</td><td>I4</td></tr>\
70
+ </table></body></html>"
71
+ doc = Nokogiri::HTML(html)
72
+ table = TableParser::Table.new doc, "/html/body/table"
70
73
 
71
74
  assert_equal 4, table.columns.size
72
75
  assert_equal 9, table[0].size
@@ -76,8 +79,8 @@ class TestTableParser < Test::Unit::TestCase
76
79
  end
77
80
 
78
81
  def test_parse_web
79
- table = TableParser::Table.new open("test.html").read,
80
- "/html/body/table"
82
+ doc = Nokogiri::HTML(open("test.html").read)
83
+ table = TableParser::Table.new doc, "/html/body/table"
81
84
 
82
85
  assert_equal 11, table.columns.size
83
86
  assert_equal 9, table[0].size
@@ -85,4 +88,14 @@ class TestTableParser < Test::Unit::TestCase
85
88
  assert_equal 9, table[2].size
86
89
  assert_equal 9, table[3].size
87
90
  end
91
+
92
+ def test_parse_web2
93
+ doc = Nokogiri::HTML(open("test2.html").read)
94
+
95
+ table = doc.xpath("//div[@id='timetable_box-week']/table")
96
+ table.xpath("./tr[1]").remove
97
+
98
+ table = TableParser::Table.new doc, "//div[@id='timetable_box-week']/table", {:dup_cols => false, :dup_rows => false}
99
+ puts table.columns.select(){|c| c.text =~ /[0-9]+月[0-9]+日/ }
100
+ end
88
101
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: table_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.1
4
+ version: 0.5.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Francis Chong
@@ -32,12 +32,11 @@ extensions: []
32
32
  extra_rdoc_files:
33
33
  - History.txt
34
34
  - Manifest.txt
35
- - README.txt
36
35
  files:
37
36
  - .autotest
38
37
  - History.txt
39
38
  - Manifest.txt
40
- - README.txt
39
+ - README.rdoc
41
40
  - Rakefile
42
41
  - lib/table_parser.rb
43
42
  - lib/table_parser/parser.rb