table_parser 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.autotest +23 -0
- data/History.txt +6 -0
- data/Manifest.txt +11 -0
- data/README.txt +49 -0
- data/Rakefile +8 -0
- data/lib/table_parser/parser.rb +63 -0
- data/lib/table_parser/table.rb +23 -0
- data/lib/table_parser/table_header.rb +22 -0
- data/lib/table_parser/table_node.rb +16 -0
- data/lib/table_parser.rb +8 -0
- data/test/test_table_parser.rb +75 -0
- metadata +78 -0
data/.autotest
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# -*- ruby -*-
|
2
|
+
|
3
|
+
require 'autotest/restart'
|
4
|
+
|
5
|
+
# Autotest.add_hook :initialize do |at|
|
6
|
+
# at.extra_files << "../some/external/dependency.rb"
|
7
|
+
#
|
8
|
+
# at.libs << ":../some/external"
|
9
|
+
#
|
10
|
+
# at.add_exception 'vendor'
|
11
|
+
#
|
12
|
+
# at.add_mapping(/dependency.rb/) do |f, _|
|
13
|
+
# at.files_matching(/test_.*rb$/)
|
14
|
+
# end
|
15
|
+
#
|
16
|
+
# %w(TestA TestB).each do |klass|
|
17
|
+
# at.extra_class_map[klass] = "test/test_misc.rb"
|
18
|
+
# end
|
19
|
+
# end
|
20
|
+
|
21
|
+
# Autotest.add_hook :run_command do |at|
|
22
|
+
# system "rake build"
|
23
|
+
# end
|
data/History.txt
ADDED
data/Manifest.txt
ADDED
data/README.txt
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
= TableParser
|
2
|
+
|
3
|
+
== DESCRIPTION:
|
4
|
+
|
5
|
+
Parsing a table can be difficult when its structure contains colspan or rowspan attributes. TableParser parses HTML tables and groups their cells by column, respecting colspan and rowspan.
|
6
|
+
|
7
|
+
== REQUIREMENTS:
|
8
|
+
|
9
|
+
* Nokogiri
|
10
|
+
|
11
|
+
== INSTALL:
|
12
|
+
|
13
|
+
* sudo gem install table_parser
|
14
|
+
|
15
|
+
== DEVELOPERS:
|
16
|
+
|
17
|
+
* Francis Chong francis at ignition dot hk
|
18
|
+
|
19
|
+
After checking out the source, run:
|
20
|
+
|
21
|
+
$ rake newb
|
22
|
+
|
23
|
+
This task will install any missing dependencies, run the tests/specs,
|
24
|
+
and generate the RDoc.
|
25
|
+
|
26
|
+
== LICENSE:
|
27
|
+
|
28
|
+
(The MIT License)
|
29
|
+
|
30
|
+
Copyright (c) 2010 Ignition Soft
|
31
|
+
|
32
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
33
|
+
a copy of this software and associated documentation files (the
|
34
|
+
'Software'), to deal in the Software without restriction, including
|
35
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
36
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
37
|
+
permit persons to whom the Software is furnished to do so, subject to
|
38
|
+
the following conditions:
|
39
|
+
|
40
|
+
The above copyright notice and this permission notice shall be
|
41
|
+
included in all copies or substantial portions of the Software.
|
42
|
+
|
43
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
44
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
45
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
46
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
47
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
48
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
49
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Rakefile
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'open-uri'
|
4
|
+
|
5
|
+
module TableParser
  # Stateless helpers that turn an HTML table into header and cell objects.
  class Parser
    # Parses +input+ as HTML and collects the cells of the table matched by
    # +xpath+.
    #
    # input - HTML markup handed straight to Nokogiri::HTML. It is parsed, not
    #         fetched, so pass the document's markup rather than its URL.
    # xpath - XPath expression locating the table element.
    #
    # Example:
    #   extract_table(html, "/html/body/table/tr/td/table")
    #
    # Returns an Array of rows; each row is an Array holding the td elements
    # that are direct children of one of the table's direct tr children.
    def self.extract_table(input, xpath)
      doc = Nokogiri::HTML(input)
      doc.xpath(xpath).xpath("./tr").collect do |row|
        row.xpath("./td").to_a
      end
    end

    # Builds the column headers from the first row and removes that row from
    # +rows+, so that only body rows remain for extract_nodes.
    #
    # rows - Array of rows as returned by extract_table (mutated in place).
    #
    # Returns an Array of TableHeader, one per column; a header cell with
    # colspan N contributes N headers built from the same element. Returns an
    # empty Array when +rows+ is empty.
    def self.extract_headers(rows)
      return [] if rows.empty?

      headers = []
      rows.first.each do |col|
        header = TableHeader.new(col)
        headers << header

        # One extra header per additional column the cell spans.
        (header.colspan - 1).times do
          headers << TableHeader.new(col)
        end
      end
      rows.delete_at(0)
      headers
    end

    # Wraps every body cell in a TableNode, expands colspan/rowspan so each
    # row has one node per column, and files each node under its header.
    #
    # rows    - Array of body rows (header row already removed).
    # headers - Array of TableHeader as returned by extract_headers; every
    #           node is appended to the +children+ of the header at its
    #           column index.
    #
    # Returns the expanded rows as an Array of Arrays of TableNode.
    def self.extract_nodes(rows, headers)
      data = rows.collect do |row|
        row.collect { |ele| TableNode.new(ele) }
      end

      data.each_index do |row_index|
        row = data[row_index]
        next_row = data[row_index + 1]

        # The row grows while it is walked (colspan placeholders are inserted
        # to the right), so the bound is re-read on every step.
        col_index = 0
        while col_index < row.size
          col = row[col_index]
          headers[col_index].children << col

          # colspan: put a placeholder for the same element in the next
          # column of THIS row, carrying the remaining span. It is visited on
          # the next step and keeps expanding until the span is used up.
          if col.colspan > 1
            row.insert(col_index + 1, TableNode.new(col.element, col.rowspan, col.colspan - 1))
          end

          # rowspan: put a placeholder at the same column of the next row.
          # Its colspan is pinned to 1 because every colspan placeholder in
          # this row carries the rowspan itself and adds its own placeholder.
          if col.rowspan > 1 && next_row
            next_row.insert(col_index, TableNode.new(col.element, col.rowspan - 1, 1))
          end

          col_index += 1
        end
      end

      data
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module TableParser
  # A parsed HTML table whose cells are grouped under their column headers.
  class Table
    attr_reader :nodes, :headers

    # Parses +input+ and builds the headers and cell nodes of one table.
    #
    # input          - HTML markup, passed on to Parser.extract_table.
    # xpath_to_table - XPath locating the table. XPath positions start at 1,
    #                  so the default selects the first table in the
    #                  document (a "[0]" predicate never matches anything).
    def initialize(input, xpath_to_table="(//table)[1]")
      table = Parser.extract_table(input, xpath_to_table)
      # extract_headers is called first; the same rows object is then handed
      # to extract_nodes.
      @headers = Parser.extract_headers(table)
      @nodes = Parser.extract_nodes(table, @headers)
    end

    # Returns the string forms of all headers, one per line, wrapped in
    # "Table<...>".
    def to_s
      "Table<#{@headers.collect{|h| h.to_s }.join("\n")}>"
    end

    # Returns the number of columns (headers).
    def header_count
      @headers.size
    end

    # get column by index
    def [](index)
      @headers[index]
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module TableParser
  # Header cell of one column. Besides the TableNode data it keeps the list
  # of cells that have been filed under the column.
  class TableHeader < TableNode
    attr_reader :children

    # Same arguments as TableNode#initialize; starts with no children.
    def initialize(element, rowspan=nil, colspan=nil)
      super
      @children = []
    end

    # Number of cells collected under this header.
    def size
      children.length
    end

    # Cell collected under this header at +index+.
    def [](index)
      children[index]
    end

    # Header text followed by the comma-separated string forms of its cells.
    def to_s
      rendered = children.map { |child| child.to_s }.join(",")
      "[name=#{text}, children=#{rendered}]"
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module TableParser
  # One table cell: the underlying element, its text and its spans.
  class TableNode
    attr_reader :element, :text, :rowspan, :colspan

    # element - cell element; must respond to +text+, and its "colspan" /
    #           "rowspan" attributes are read with +[]+ when available.
    # rowspan - explicit row span; when nil it is read from the element.
    # colspan - explicit column span; when nil it is read from the element.
    #
    # A span that is missing, non-numeric or not positive on the element
    # counts as 1 (a plain cell occupies exactly one row and one column).
    def initialize(element, rowspan=nil, colspan=nil)
      @element = element
      @text = element.text

      @colspan = colspan || span_attribute("colspan")
      @rowspan = rowspan || span_attribute("rowspan")
    end

    def to_s
      "[#{@text}]"
    end

    private

    # Reads a span attribute from the element, falling back to 1. to_i maps
    # both nil and non-numeric text to 0, so the fallback is applied by
    # value instead of relying on an exception being raised.
    def span_attribute(name)
      return 1 unless @element.respond_to?(:[])

      span = @element[name].to_i
      span > 0 ? span : 1
    end
  end
end
|
data/lib/table_parser.rb
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
require "test/unit"
|
2
|
+
require "table_parser"
|
3
|
+
|
4
|
+
# Exercises TableParser::Table on tables that use rowspan and colspan and
# checks that every column ends up with one node per body row.
class TestTableParser < Test::Unit::TestCase
  # A rowspan cell in the first column must be repeated in the following row.
  def test_parse_rowspan
    table = TableParser::Table.new "<html><body><table><tr><td>A</td><td>B</td></tr>" \
      "<tr><td rowspan=\"2\">1</td><td>2</td></tr> " \
      "<tr><td>3</td></tr></table></body></html>",
      "/html/body/table"

    assert_equal(2, table.header_count, 'header_count should = 2 ')
    assert_equal(2, table[0].size)
    assert_equal(2, table[1].size)
  end

  # A header cell with colspan 2 must yield two columns.
  def test_parse_colspan
    table = TableParser::Table.new "<html><body><table><tr><td>A</td><td colspan=\"2\">B</td></tr>" \
      "<tr><td rowspan=\"2\">A1</td><td>B1</td><td>C1</td></tr> " \
      "<tr><td>B2</td><td>C2</td></tr>" \
      "<tr><td>A3</td><td>B3</td><td>C3</td></tr><tr><td>A4</td><td>B4</td><td>C4</td></tr></table></body></html>",
      "/html/body/table"

    assert_equal(3, table.header_count, 'header_count should = 3 ')
    assert_equal(4, table[0].size)
    assert_equal(4, table[1].size)
    assert_equal(4, table[2].size)
  end

  # Two overlapping rowspans of different length in the same rows.
  def test_parse_complex
    table = TableParser::Table.new "<html><body><table><tr><td>Header1</td><td>Header2</td><td>Header3</td><td>Header4</td></tr>" \
      "<tr><td rowspan=\"3\">A1</td><td>A2</td><td rowspan=\"2\">A3</td><td>B4</td></tr>" \
      "<tr><td>B2</td><td>B4</td></tr>" \
      "<tr><td>C2</td><td>C3</td><td>B4</td></tr>" \
      "</table></body></html>",
      "/html/body/table"

    assert_equal 4, table.header_count
    assert_equal 3, table[0].size
    assert_equal 3, table[1].size
    assert_equal 3, table[2].size
  end

  # Several groups of staggered rowspans following one another.
  def test_parse_complex2
    table = TableParser::Table.new "<html><body><table><tr><td>Header1</td><td>Header2</td><td>Header3</td><td>Header4</td></tr>" \
      "<tr><td rowspan=\"3\">A1</td><td>A2</td><td rowspan=\"2\">A3</td><td>A4</td></tr>" \
      "<tr><td>B2</td><td>B4</td></tr>" \
      "<tr><td>C2</td><td rowspan=\"2\">C3</td><td>C4</td></tr>" \
      "<tr><td rowspan=\"3\">D1</td><td>D2</td><td>D4</td></tr>" \
      "<tr><td>E2</td><td rowspan=\"2\">E3</td><td>E4</td></tr>" \
      "<tr><td>F2</td><td>F4</td></tr>" \
      "<tr><td rowspan=\"3\">G1</td><td>G2</td><td rowspan=\"2\">G3</td><td>G4</td></tr>" \
      "<tr><td>H2</td><td>H4</td></tr>" \
      "<tr><td>I2</td><td>I3</td><td>I4</td></tr>" \
      "</table></body></html>",
      "/html/body/table"

    assert_equal 4, table.header_count
    assert_equal 9, table[0].size
    assert_equal 9, table[1].size
    assert_equal 9, table[2].size
    assert_equal 9, table[3].size
  end

  # Parses a saved page. The fixture path is relative to the working
  # directory the tests are run from.
  def test_parse_web
    # File.read closes the file again; open(...).read left the handle open.
    table = TableParser::Table.new File.read("test.html"),
      "/html/body/table"

    assert_equal 11, table.header_count
    assert_equal 9, table[0].size
    assert_equal 9, table[1].size
    assert_equal 9, table[2].size
    assert_equal 9, table[3].size
  end
end
|
metadata
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: table_parser
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.2.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Francis Chong
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-01-05 00:00:00 +08:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: hoe
|
17
|
+
type: :development
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 2.4.0
|
24
|
+
version:
|
25
|
+
description: Parsing a table can be difficult when its structure contains colspan or rowspan attributes. TableParser parses HTML tables and groups their cells by column, respecting colspan and rowspan.
|
26
|
+
email:
|
27
|
+
- francis@ignition.hk
|
28
|
+
executables: []
|
29
|
+
|
30
|
+
extensions: []
|
31
|
+
|
32
|
+
extra_rdoc_files:
|
33
|
+
- History.txt
|
34
|
+
- Manifest.txt
|
35
|
+
- README.txt
|
36
|
+
files:
|
37
|
+
- .autotest
|
38
|
+
- History.txt
|
39
|
+
- Manifest.txt
|
40
|
+
- README.txt
|
41
|
+
- Rakefile
|
42
|
+
- lib/table_parser.rb
|
43
|
+
- lib/table_parser/parser.rb
|
44
|
+
- lib/table_parser/table.rb
|
45
|
+
- lib/table_parser/table_header.rb
|
46
|
+
- lib/table_parser/table_node.rb
|
47
|
+
- test/test_table_parser.rb
|
48
|
+
has_rdoc: true
|
49
|
+
homepage:
|
50
|
+
licenses: []
|
51
|
+
|
52
|
+
post_install_message:
|
53
|
+
rdoc_options:
|
54
|
+
- --main
|
55
|
+
- README.txt
|
56
|
+
require_paths:
|
57
|
+
- lib
|
58
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
59
|
+
requirements:
|
60
|
+
- - ">="
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: "0"
|
63
|
+
version:
|
64
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: "0"
|
69
|
+
version:
|
70
|
+
requirements: []
|
71
|
+
|
72
|
+
rubyforge_project: table_parser
|
73
|
+
rubygems_version: 1.3.5
|
74
|
+
signing_key:
|
75
|
+
specification_version: 3
|
76
|
+
summary: Parsing table could be difficult when its structure contains colspan or rowspan
|
77
|
+
test_files:
|
78
|
+
- test/test_table_parser.rb
|