table_parser 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/.autotest ADDED
@@ -0,0 +1,23 @@
1
+ # -*- ruby -*-
2
+
3
+ require 'autotest/restart'
4
+
5
+ # Autotest.add_hook :initialize do |at|
6
+ # at.extra_files << "../some/external/dependency.rb"
7
+ #
8
+ # at.libs << ":../some/external"
9
+ #
10
+ # at.add_exception 'vendor'
11
+ #
12
+ # at.add_mapping(/dependency.rb/) do |f, _|
13
+ # at.files_matching(/test_.*rb$/)
14
+ # end
15
+ #
16
+ # %w(TestA TestB).each do |klass|
17
+ # at.extra_class_map[klass] = "test/test_misc.rb"
18
+ # end
19
+ # end
20
+
21
+ # Autotest.add_hook :run_command do |at|
22
+ # system "rake build"
23
+ # end
data/History.txt ADDED
@@ -0,0 +1,6 @@
1
+ === 1.0.0 / 2010-01-04
2
+
3
+ * 1 major enhancement
4
+
5
+ * Birthday!
6
+
data/Manifest.txt ADDED
@@ -0,0 +1,11 @@
1
+ .autotest
2
+ History.txt
3
+ Manifest.txt
4
+ README.txt
5
+ Rakefile
6
+ lib/table_parser.rb
7
+ lib/table_parser/parser.rb
8
+ lib/table_parser/table.rb
9
+ lib/table_parser/table_header.rb
10
+ lib/table_parser/table_node.rb
11
+ test/test_table_parser.rb
data/README.txt ADDED
@@ -0,0 +1,49 @@
1
+ = TableParser
2
+
3
+ == DESCRIPTION:
4
+
5
+ Parsing a table can be difficult when its structure contains colspan or rowspan. TableParser parses HTML tables and groups their cells by column, with colspan and rowspan respected.
6
+
7
+ == REQUIREMENTS:
8
+
9
+ * Nokogiri
10
+
11
+ == INSTALL:
12
+
13
+ * sudo gem install table_parser
14
+
15
+ == DEVELOPERS:
16
+
17
+ * Francis Chong francis at ignition dot hk
18
+
19
+ After checking out the source, run:
20
+
21
+ $ rake newb
22
+
23
+ This task will install any missing dependencies, run the tests/specs,
24
+ and generate the RDoc.
25
+
26
+ == LICENSE:
27
+
28
+ (The MIT License)
29
+
30
+ Copyright (c) 2010 Ignition Soft
31
+
32
+ Permission is hereby granted, free of charge, to any person obtaining
33
+ a copy of this software and associated documentation files (the
34
+ 'Software'), to deal in the Software without restriction, including
35
+ without limitation the rights to use, copy, modify, merge, publish,
36
+ distribute, sublicense, and/or sell copies of the Software, and to
37
+ permit persons to whom the Software is furnished to do so, subject to
38
+ the following conditions:
39
+
40
+ The above copyright notice and this permission notice shall be
41
+ included in all copies or substantial portions of the Software.
42
+
43
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
44
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
45
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
46
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
47
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
48
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
49
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ require 'rubygems'
2
+ require 'hoe'
3
+
4
# Register the :gemcutter plugin with Hoe before the spec is declared.
Hoe.plugin(:gemcutter)

# Declare the 'table_parser' gem spec and its developer contact.
Hoe.spec('table_parser') { developer 'Francis Chong', 'francis@ignition.hk' }
8
+
@@ -0,0 +1,63 @@
1
+ require 'rubygems'
2
+ require 'nokogiri'
3
+ require 'open-uri'
4
+
5
module TableParser
  # Stateless helpers that turn an HTML table into header and cell objects.
  class Parser
    # Collects the cells of the table selected by +xpath+.
    #
    # input - handed unchanged to Nokogiri::HTML (the test suite passes an
    #         HTML string).
    # xpath - XPath expression selecting the table element.
    #
    # Only <tr> children of the selected node and <td> children of each row
    # are picked up.
    #
    # Returns an Array of rows, each row being an Array of <td> nodes.
    def self.extract_table(input, xpath)
      doc = Nokogiri::HTML(input)
      doc.xpath(xpath).xpath("./tr").map do |row|
        row.xpath("./td").to_a
      end
    end

    # Builds the column headers from the first row and REMOVES that row from
    # +rows+, so that only body rows remain for extract_nodes.
    #
    # A header cell with colspan N yields N TableHeader objects, one per
    # column it covers.
    #
    # rows - Array of rows as returned by extract_table (mutated).
    #
    # Returns an Array of TableHeader; empty when +rows+ has no rows.
    def self.extract_headers(rows)
      header_row = rows.shift
      return [] unless header_row

      headers = []
      header_row.each do |cell|
        header = TableHeader.new(cell)
        headers << header
        (header.colspan - 1).times { headers << TableHeader.new(cell) }
      end
      headers
    end

    # Wraps every body cell in a TableNode, expands colspan/rowspan so each
    # row has one node per column, and appends each node to the +children+
    # of the header at the same column index.
    #
    # rows    - Array of body rows (header row already removed).
    # headers - Array of TableHeader as returned by extract_headers.
    #
    # Returns the expanded Array of rows of TableNode.
    def self.extract_nodes(rows, headers)
      data = rows.map { |row| row.map { |cell| TableNode.new(cell) } }

      data.each_with_index do |row, row_index|
        next_row = data[row_index + 1]

        # Array#each_index re-checks the length on every step, so nodes
        # inserted into +row+ below are visited as well.
        row.each_index do |col_index|
          node = row[col_index]

          # A row wider than the header row has no column to attach to;
          # keep the node in +data+ but do not fail.
          header = headers[col_index]
          header.children << node if header

          # colspan: repeat the cell in the next column of the SAME row,
          # with one column less left to cover. The copy keeps the rowspan
          # so it is carried down independently.
          if node.colspan > 1
            row.insert(col_index + 1, TableNode.new(node.element, node.rowspan, node.colspan - 1))
          end

          # rowspan: repeat the cell at the same column of the next row.
          # colspan is forced to 1 because the horizontal copies above
          # carry themselves down.
          if node.rowspan > 1 && next_row
            next_row.insert(col_index, TableNode.new(node.element, node.rowspan - 1, 1))
          end
        end
      end

      data
    end
  end
end
@@ -0,0 +1,23 @@
1
module TableParser
  # Column-oriented view of one HTML table.
  class Table
    attr_reader :nodes, :headers

    # input          - HTML passed on to Parser.extract_table.
    # xpath_to_table - XPath selecting the table. XPath positions are
    #                  1-based, so the default picks the first <table> in
    #                  the document.
    def initialize(input, xpath_to_table = "(//table)[1]")
      rows = Parser.extract_table(input, xpath_to_table)
      # extract_headers removes the header row from +rows+, so the same
      # array then holds only body rows for extract_nodes.
      @headers = Parser.extract_headers(rows)
      @nodes = Parser.extract_nodes(rows, @headers)
    end

    # Returns the headers' string forms, one per line, wrapped in "Table<...>".
    def to_s
      "Table<#{@headers.map(&:to_s).join("\n")}>"
    end

    # Returns the number of columns (headers).
    def header_count
      @headers.size
    end

    # Returns the header (column) at +index+.
    def [](index)
      @headers[index]
    end
  end
end
@@ -0,0 +1,22 @@
1
module TableParser
  # A header cell that also acts as a column: nodes appended to +children+
  # are the cells listed under this header.
  class TableHeader < TableNode
    attr_reader :children

    # Takes the same arguments as TableNode#initialize and starts with an
    # empty list of children.
    def initialize(element, rowspan=nil, colspan=nil)
      super
      @children = []
    end

    # Number of child nodes collected under this header.
    def size
      children.length
    end

    # Child node at +index+.
    def [](index)
      children[index]
    end

    # Header text followed by the comma-separated string forms of the children.
    def to_s
      joined = children.map { |child| child.to_s }.join(",")
      "[name=#{text}, children=#{joined}]"
    end
  end
end
@@ -0,0 +1,16 @@
1
module TableParser
  # One table cell: the underlying element, its text, and its spans.
  class TableNode
    attr_reader :element, :text, :rowspan, :colspan

    # element - cell object responding to +text+ and +[]+ (attribute lookup).
    # rowspan - optional explicit rowspan; used as given when not nil.
    # colspan - optional explicit colspan; used as given when not nil.
    #
    # When a span is not given explicitly it is read from the element's
    # attribute of the same name and defaults to 1.
    def initialize(element, rowspan=nil, colspan=nil)
      @element = element
      @text = element.text

      @colspan = colspan || span_attribute(element, "colspan")
      @rowspan = rowspan || span_attribute(element, "rowspan")
    end

    # Returns the cell text wrapped in square brackets.
    def to_s
      "[#{@text}]"
    end

    private

    # Reads a span attribute as an integer with a lower bound of 1.
    # A lookup that yields nil, or a non-numeric value, converts to 0 via
    # to_i, so an explicit bound is required to get the default of 1.
    def span_attribute(element, name)
      span = element[name].to_i
      span < 1 ? 1 : span
    end
  end
end
@@ -0,0 +1,8 @@
1
+ require 'table_parser/table_node'
2
+ require 'table_parser/table_header'
3
+ require 'table_parser/table'
4
+ require 'table_parser/parser'
5
+
6
module TableParser
  # Version string of the table_parser gem.
  VERSION = '0.2.0'
end
@@ -0,0 +1,75 @@
1
+ require "test/unit"
2
+ require "table_parser"
3
+
4
# Exercises TableParser::Table on tables that use rowspan and colspan.
# Every test checks the number of columns and that each checked column
# ends up with one node per body row.
class TestTableParser < Test::Unit::TestCase
  def test_parse_rowspan
    table = parse_table(<<-HTML)
      <html><body><table><tr><td>A</td><td>B</td></tr>
      <tr><td rowspan="2">1</td><td>2</td></tr>
      <tr><td>3</td></tr></table></body></html>
    HTML

    assert_equal(2, table.header_count, 'header_count should = 2 ')
    assert_equal(2, table[0].size)
    assert_equal(2, table[1].size)
  end

  def test_parse_colspan
    table = parse_table(<<-HTML)
      <html><body><table><tr><td>A</td><td colspan="2">B</td></tr>
      <tr><td rowspan="2">A1</td><td>B1</td><td>C1</td></tr>
      <tr><td>B2</td><td>C2</td></tr>
      <tr><td>A3</td><td>B3</td><td>C3</td></tr><tr><td>A4</td><td>B4</td><td>C4</td></tr></table></body></html>
    HTML

    assert_equal(3, table.header_count, 'header_count should = 3 ')
    assert_equal(4, table[0].size)
    assert_equal(4, table[1].size)
    assert_equal(4, table[2].size)
  end

  def test_parse_complex
    table = parse_table(<<-HTML)
      <html><body><table><tr><td>Header1</td><td>Header2</td><td>Header3</td><td>Header4</td></tr>
      <tr><td rowspan="3">A1</td><td>A2</td><td rowspan="2">A3</td><td>B4</td></tr>
      <tr><td>B2</td><td>B4</td></tr>
      <tr><td>C2</td><td>C3</td><td>B4</td></tr>
      </table></body></html>
    HTML

    assert_equal 4, table.header_count
    assert_equal 3, table[0].size
    assert_equal 3, table[1].size
    assert_equal 3, table[2].size
    assert_equal 3, table[3].size
  end

  def test_parse_complex2
    table = parse_table(<<-HTML)
      <html><body><table><tr><td>Header1</td><td>Header2</td><td>Header3</td><td>Header4</td></tr>
      <tr><td rowspan="3">A1</td><td>A2</td><td rowspan="2">A3</td><td>A4</td></tr>
      <tr><td>B2</td><td>B4</td></tr>
      <tr><td>C2</td><td rowspan="2">C3</td><td>C4</td></tr>
      <tr><td rowspan="3">D1</td><td>D2</td><td>D4</td></tr>
      <tr><td>E2</td><td rowspan="2">E3</td><td>E4</td></tr>
      <tr><td>F2</td><td>F4</td></tr>
      <tr><td rowspan="3">G1</td><td>G2</td><td rowspan="2">G3</td><td>G4</td></tr>
      <tr><td>H2</td><td>H4</td></tr>
      <tr><td>I2</td><td>I3</td><td>I4</td></tr>
      </table></body></html>
    HTML

    assert_equal 4, table.header_count
    assert_equal 9, table[0].size
    assert_equal 9, table[1].size
    assert_equal 9, table[2].size
    assert_equal 9, table[3].size
  end

  # Reads the fixture "test.html" relative to the current working directory.
  # File.read closes the file once its content has been read.
  def test_parse_web
    table = parse_table(File.read("test.html"))

    assert_equal 11, table.header_count
    assert_equal 9, table[0].size
    assert_equal 9, table[1].size
    assert_equal 9, table[2].size
    assert_equal 9, table[3].size
  end

  private

  # Builds a Table from +html+ using the XPath shared by all tests here.
  def parse_table(html)
    TableParser::Table.new(html, "/html/body/table")
  end
end
metadata ADDED
@@ -0,0 +1,78 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: table_parser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
+ platform: ruby
6
+ authors:
7
+ - Francis Chong
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-01-05 00:00:00 +08:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hoe
17
+ type: :development
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 2.4.0
24
+ version:
25
+ description: Parsing a table can be difficult when its structure contains colspan or rowspan. TableParser parses HTML tables and groups their cells by column, with colspan and rowspan respected.
26
+ email:
27
+ - francis@ignition.hk
28
+ executables: []
29
+
30
+ extensions: []
31
+
32
+ extra_rdoc_files:
33
+ - History.txt
34
+ - Manifest.txt
35
+ - README.txt
36
+ files:
37
+ - .autotest
38
+ - History.txt
39
+ - Manifest.txt
40
+ - README.txt
41
+ - Rakefile
42
+ - lib/table_parser.rb
43
+ - lib/table_parser/parser.rb
44
+ - lib/table_parser/table.rb
45
+ - lib/table_parser/table_header.rb
46
+ - lib/table_parser/table_node.rb
47
+ - test/test_table_parser.rb
48
+ has_rdoc: true
49
+ homepage:
50
+ licenses: []
51
+
52
+ post_install_message:
53
+ rdoc_options:
54
+ - --main
55
+ - README.txt
56
+ require_paths:
57
+ - lib
58
+ required_ruby_version: !ruby/object:Gem::Requirement
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: "0"
63
+ version:
64
+ required_rubygems_version: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: "0"
69
+ version:
70
+ requirements: []
71
+
72
+ rubyforge_project: table_parser
73
+ rubygems_version: 1.3.5
74
+ signing_key:
75
+ specification_version: 3
76
+ summary: Parsing a table can be difficult when its structure contains colspan or rowspan
77
+ test_files:
78
+ - test/test_table_parser.rb