table_parser 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.autotest ADDED
@@ -0,0 +1,23 @@
1
+ # -*- ruby -*-
2
+
3
+ require 'autotest/restart'
4
+
5
+ # Autotest.add_hook :initialize do |at|
6
+ # at.extra_files << "../some/external/dependency.rb"
7
+ #
8
+ # at.libs << ":../some/external"
9
+ #
10
+ # at.add_exception 'vendor'
11
+ #
12
+ # at.add_mapping(/dependency.rb/) do |f, _|
13
+ # at.files_matching(/test_.*rb$/)
14
+ # end
15
+ #
16
+ # %w(TestA TestB).each do |klass|
17
+ # at.extra_class_map[klass] = "test/test_misc.rb"
18
+ # end
19
+ # end
20
+
21
+ # Autotest.add_hook :run_command do |at|
22
+ # system "rake build"
23
+ # end
data/History.txt ADDED
@@ -0,0 +1,6 @@
1
+ === 1.0.0 / 2010-01-04
2
+
3
+ * 1 major enhancement
4
+
5
+ * Birthday!
6
+
data/Manifest.txt ADDED
@@ -0,0 +1,11 @@
1
+ .autotest
2
+ History.txt
3
+ Manifest.txt
4
+ README.txt
5
+ Rakefile
6
+ lib/table_parser.rb
7
+ lib/table_parser/parser.rb
8
+ lib/table_parser/table.rb
9
+ lib/table_parser/table_header.rb
10
+ lib/table_parser/table_node.rb
11
+ test/test_table_parser.rb
data/README.txt ADDED
@@ -0,0 +1,49 @@
1
+ = TableParser
2
+
3
+ == DESCRIPTION:
4
+
5
+ Parsing a table can be difficult when its structure contains colspan or rowspan. TableParser parses HTML tables and groups their cells by column, with colspan and rowspan respected.
6
+
7
+ == REQUIREMENTS:
8
+
9
+ * Nokogiri
10
+
11
+ == INSTALL:
12
+
13
+ * sudo gem install table_parser
14
+
15
+ == DEVELOPERS:
16
+
17
+ * Francis Chong francis at ignition dot hk
18
+
19
+ After checking out the source, run:
20
+
21
+ $ rake newb
22
+
23
+ This task will install any missing dependencies, run the tests/specs,
24
+ and generate the RDoc.
25
+
26
+ == LICENSE:
27
+
28
+ (The MIT License)
29
+
30
+ Copyright (c) 2010 Ignition Soft
31
+
32
+ Permission is hereby granted, free of charge, to any person obtaining
33
+ a copy of this software and associated documentation files (the
34
+ 'Software'), to deal in the Software without restriction, including
35
+ without limitation the rights to use, copy, modify, merge, publish,
36
+ distribute, sublicense, and/or sell copies of the Software, and to
37
+ permit persons to whom the Software is furnished to do so, subject to
38
+ the following conditions:
39
+
40
+ The above copyright notice and this permission notice shall be
41
+ included in all copies or substantial portions of the Software.
42
+
43
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
44
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
45
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
46
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
47
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
48
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
49
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ require 'rubygems'
2
+ require 'hoe'
3
+
4
# Activate Hoe's gemcutter plugin.
Hoe.plugin :gemcutter

# Gem specification for table_parser.
Hoe.spec 'table_parser' do
  developer('Francis Chong', 'francis@ignition.hk')

  # lib/table_parser/parser.rb requires nokogiri at load time, so it has to be
  # declared as a runtime dependency; otherwise `gem install table_parser`
  # yields a gem that fails on `require 'table_parser'`.
  extra_deps << ['nokogiri', '>= 0']
end
8
+
@@ -0,0 +1,63 @@
1
+ require 'rubygems'
2
+ require 'nokogiri'
3
+ require 'open-uri'
4
+
5
module TableParser
  # Stateless helpers that turn the markup of an HTML table into TableHeader
  # and TableNode objects.
  class Parser
    # Parses +input+ as HTML and collects the cells of the table selected by
    # +xpath+.
    #
    # input - HTML markup, handed straight to Nokogiri::HTML. The markup is
    #         parsed, not fetched: pass the page content, not its URL.
    # xpath - XPath expression selecting the table element, for example
    #         "/html/body/table/tr/td/table".
    #
    # Only <tr> elements that are direct children of the selected node(s),
    # and their direct <td> children, are collected.
    #
    # Returns an Array of rows, each row being an Array of cell elements.
    def self.extract_table(input, xpath)
      doc = Nokogiri::HTML(input)
      doc.xpath(xpath).xpath("./tr").collect do |row|
        row.xpath("./td").to_a
      end
    end

    # Builds the column headers from the first row and removes that row from
    # +rows+, so that only data rows remain for extract_nodes.
    #
    # rows - Array of rows as returned by extract_table. It is modified in
    #        place.
    #
    # A header cell spanning several columns yields one TableHeader per
    # column it covers. Returns an empty Array when +rows+ is empty.
    def self.extract_headers(rows)
      header_row = rows.shift
      return [] if header_row.nil?

      header_row.each_with_object([]) do |cell, headers|
        header = TableHeader.new(cell)
        headers << header
        # One extra header per additional column covered by the cell.
        (header.colspan - 1).times { headers << TableHeader.new(cell) }
      end
    end

    # Wraps every data cell in a TableNode, expands cells that span several
    # rows or columns into one node per covered grid position, and appends
    # each node to the children of the header of its column.
    #
    # rows    - Array of data rows (header row already removed).
    # headers - Array of TableHeader as returned by extract_headers.
    #
    # Returns the expanded grid as an Array of rows of TableNode.
    def self.extract_nodes(rows, headers)
      data = rows.collect do |row|
        row.collect { |cell| TableNode.new(cell) }
      end

      data.each_with_index do |row, row_index|
        below = data[row_index + 1]
        col_index = 0

        # The row grows while it is scanned (colspan copies are inserted to
        # the right of the current position), hence the explicit index loop.
        while col_index < row.size
          node = row[col_index]

          # A nil entry is padding created when a spanning cell was pushed
          # into a row that was shorter than the target column.
          if node
            # Rows wider than the header row have no header to attach to.
            header = headers[col_index]
            header.children << node if header

            if node.colspan > 1
              # Fill the next column of this row; the copy keeps the rowspan
              # so that it is carried down independently.
              row.insert(col_index + 1,
                         TableNode.new(node.element, node.rowspan, node.colspan - 1))
            end

            if node.rowspan > 1 && below
              # Carry the cell into the row below. The colspan is forced to 1
              # because the horizontal copies are carried down on their own.
              below.insert(col_index,
                           TableNode.new(node.element, node.rowspan - 1, 1))
            end
          end

          col_index += 1
        end
      end

      data
    end
  end
end
@@ -0,0 +1,23 @@
1
module TableParser
  # Column-oriented view of one HTML table: every header gives access to the
  # cells of its column.
  class Table
    attr_reader :nodes, :headers

    # input          - HTML markup containing the table.
    # xpath_to_table - XPath expression selecting the table. Defaults to the
    #                  first table of the document. XPath positions start at
    #                  1, so the former default "//table[0]" never matched.
    def initialize(input, xpath_to_table="(//table)[1]")
      rows = Parser.extract_table(input, xpath_to_table)
      # extract_headers removes the header row from rows, leaving only the
      # data rows for extract_nodes.
      @headers = Parser.extract_headers(rows)
      @nodes = Parser.extract_nodes(rows, @headers)
    end

    # Returns the string forms of all headers, one per line, wrapped in
    # "Table<...>".
    def to_s
      "Table<#{@headers.collect{|h| h.to_s }.join("\n")}>"
    end

    # Returns the number of columns.
    def header_count
      @headers.size
    end

    # Returns the header of the column at +index+.
    def [](index)
      @headers[index]
    end
  end
end
@@ -0,0 +1,22 @@
1
module TableParser
  # Header cell of a column. In addition to the TableNode data it keeps the
  # list of nodes that belong to its column.
  class TableHeader < TableNode
    attr_reader :children

    # Takes the same arguments as TableNode#initialize and starts with an
    # empty list of children.
    def initialize(element, rowspan=nil, colspan=nil)
      super
      @children = []
    end

    # Number of nodes collected for this column.
    def size
      children.size
    end

    # Returns the node of this column at +index+.
    def [](index)
      children[index]
    end

    # Header text followed by the comma-separated string forms of its nodes.
    def to_s
      listing = children.map(&:to_s).join(",")
      "[name=#{text}, children=#{listing}]"
    end
  end
end
@@ -0,0 +1,16 @@
1
module TableParser
  # A single table cell together with the number of rows and columns it
  # covers.
  class TableNode
    attr_reader :element, :text, :rowspan, :colspan

    # element - the cell; must respond to #text. Its "rowspan" and "colspan"
    #           attributes are read through #[] when available.
    # rowspan - explicit row span; when nil it is read from the element.
    # colspan - explicit column span; when nil it is read from the element.
    def initialize(element, rowspan=nil, colspan=nil)
      @element = element
      @text = element.text

      @colspan = colspan || span_attribute(element, "colspan")
      @rowspan = rowspan || span_attribute(element, "rowspan")
    end

    # Returns the cell text wrapped in square brackets.
    def to_s
      "[#{@text}]"
    end

    private

    # Reads a span attribute from the element and returns it as an Integer.
    # A missing, non-numeric or non-positive value counts as 1. The earlier
    # `... .to_i rescue 1` never reached its fallback because nil.to_i is 0
    # and does not raise, so cells without the attribute reported a span of 0.
    def span_attribute(element, name)
      raw = element.respond_to?(:[]) ? element[name] : nil
      span = raw.to_i
      span < 1 ? 1 : span
    end
  end
end
@@ -0,0 +1,8 @@
1
+ require 'table_parser/table_node'
2
+ require 'table_parser/table_header'
3
+ require 'table_parser/table'
4
+ require 'table_parser/parser'
5
+
6
module TableParser
  # Version of the table_parser gem. Frozen so the shared constant cannot be
  # modified in place.
  VERSION = '0.2.0'.freeze
end
@@ -0,0 +1,75 @@
1
+ require "test/unit"
2
+ require "table_parser"
3
+
4
# Checks that every column ends up with one entry per data row, whatever
# rowspan/colspan layout the table uses.
class TestTableParser < Test::Unit::TestCase
  # A cell with rowspan="2" must be counted in both rows of its column.
  def test_parse_rowspan
    html = '<html><body><table><tr><td>A</td><td>B</td></tr>' \
           '<tr><td rowspan="2">1</td><td>2</td></tr> ' \
           '<tr><td>3</td></tr></table></body></html>'
    table = TableParser::Table.new html, "/html/body/table"

    assert_equal(2, table.header_count, 'header_count should = 2 ')
    assert_equal(2, table[0].size)
    assert_equal(2, table[1].size)
  end

  # A header cell with colspan="2" must produce two columns.
  def test_parse_colspan
    html = '<html><body><table><tr><td>A</td><td colspan="2">B</td></tr>' \
           '<tr><td rowspan="2">A1</td><td>B1</td><td>C1</td></tr> ' \
           '<tr><td>B2</td><td>C2</td></tr>' \
           '<tr><td>A3</td><td>B3</td><td>C3</td></tr><tr><td>A4</td><td>B4</td><td>C4</td></tr></table></body></html>'
    table = TableParser::Table.new html, "/html/body/table"

    assert_equal(3, table.header_count, 'header_count should = 3 ')
    assert_equal(4, table[0].size)
    assert_equal(4, table[1].size)
    assert_equal(4, table[2].size)
  end

  # Two overlapping rowspans of different lengths in the same rows.
  def test_parse_complex
    html = '<html><body><table><tr><td>Header1</td><td>Header2</td><td>Header3</td><td>Header4</td></tr>' \
           '<tr><td rowspan="3">A1</td><td>A2</td><td rowspan="2">A3</td><td>B4</td></tr>' \
           '<tr><td>B2</td><td>B4</td></tr>' \
           '<tr><td>C2</td><td>C3</td><td>B4</td></tr>' \
           '</table></body></html>'
    table = TableParser::Table.new html, "/html/body/table"

    assert_equal 4, table.header_count
    assert_equal 3, table[0].size
    assert_equal 3, table[1].size
    assert_equal 3, table[2].size
  end

  # Rowspans that start while other rowspans are still open.
  def test_parse_complex2
    html = '<html><body><table><tr><td>Header1</td><td>Header2</td><td>Header3</td><td>Header4</td></tr>' \
           '<tr><td rowspan="3">A1</td><td>A2</td><td rowspan="2">A3</td><td>A4</td></tr>' \
           '<tr><td>B2</td><td>B4</td></tr>' \
           '<tr><td>C2</td><td rowspan="2">C3</td><td>C4</td></tr>' \
           '<tr><td rowspan="3">D1</td><td>D2</td><td>D4</td></tr>' \
           '<tr><td>E2</td><td rowspan="2">E3</td><td>E4</td></tr>' \
           '<tr><td>F2</td><td>F4</td></tr>' \
           '<tr><td rowspan="3">G1</td><td>G2</td><td rowspan="2">G3</td><td>G4</td></tr>' \
           '<tr><td>H2</td><td>H4</td></tr>' \
           '<tr><td>I2</td><td>I3</td><td>I4</td></tr>' \
           '</table></body></html>'
    table = TableParser::Table.new html, "/html/body/table"

    assert_equal 4, table.header_count
    assert_equal 9, table[0].size
    assert_equal 9, table[1].size
    assert_equal 9, table[2].size
    assert_equal 9, table[3].size
  end

  # Parses the test.html fixture, looked up relative to the current working
  # directory. File.read closes the file once it has been read, whereas
  # open(...).read left the handle open.
  def test_parse_web
    table = TableParser::Table.new File.read("test.html"),
                                   "/html/body/table"

    assert_equal 11, table.header_count
    assert_equal 9, table[0].size
    assert_equal 9, table[1].size
    assert_equal 9, table[2].size
    assert_equal 9, table[3].size
  end
end
metadata ADDED
@@ -0,0 +1,78 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: table_parser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
+ platform: ruby
6
+ authors:
7
+ - Francis Chong
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-01-05 00:00:00 +08:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hoe
17
+ type: :development
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 2.4.0
24
+ version:
25
+ description: Parsing a table can be difficult when its structure contains colspan or rowspan. TableParser parses HTML tables and groups their cells by column, with colspan and rowspan respected.
26
+ email:
27
+ - francis@ignition.hk
28
+ executables: []
29
+
30
+ extensions: []
31
+
32
+ extra_rdoc_files:
33
+ - History.txt
34
+ - Manifest.txt
35
+ - README.txt
36
+ files:
37
+ - .autotest
38
+ - History.txt
39
+ - Manifest.txt
40
+ - README.txt
41
+ - Rakefile
42
+ - lib/table_parser.rb
43
+ - lib/table_parser/parser.rb
44
+ - lib/table_parser/table.rb
45
+ - lib/table_parser/table_header.rb
46
+ - lib/table_parser/table_node.rb
47
+ - test/test_table_parser.rb
48
+ has_rdoc: true
49
+ homepage:
50
+ licenses: []
51
+
52
+ post_install_message:
53
+ rdoc_options:
54
+ - --main
55
+ - README.txt
56
+ require_paths:
57
+ - lib
58
+ required_ruby_version: !ruby/object:Gem::Requirement
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: "0"
63
+ version:
64
+ required_rubygems_version: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: "0"
69
+ version:
70
+ requirements: []
71
+
72
+ rubyforge_project: table_parser
73
+ rubygems_version: 1.3.5
74
+ signing_key:
75
+ specification_version: 3
76
+ summary: Parsing a table can be difficult when its structure contains colspan or rowspan
77
+ test_files:
78
+ - test/test_table_parser.rb