simple-spreadsheet-extractor 0.4.2 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +18 -6
- data/VERSION +1 -1
- data/jars/lib/{poi-3.6.jar → poi-3.7.jar} +0 -0
- data/jars/lib/poi-ooxml-3.7.jar +0 -0
- data/jars/lib/{poi-ooxml-schemas-3.6.jar → poi-ooxml-schemas-3.7.jar} +0 -0
- data/jars/simple-spreadsheet-extractor-0.5.0.jar +0 -0
- data/lib/simple-spreadsheet-extractor.rb +33 -18
- data/simple-spreadsheet-extractor.gemspec +8 -9
- data/test/test_extraction.rb +25 -0
- metadata +20 -10
- data/jars/lib/poi-ooxml-3.6.jar +0 -0
- data/jars/simple-spreadsheet-extractor-0.4.1.jar +0 -0
data/README.rdoc
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
= Simple Spreadsheet Extractor
|
2
2
|
|
3
|
-
Authors::
|
4
|
-
Version:: 0.
|
3
|
+
Authors:: Stuart Owen, Finn Bacall
|
4
|
+
Version:: 0.5.0
|
5
5
|
Contact:: mailto:stuart.owen@manchester.ac.uk
|
6
6
|
Licence:: BSD (See LICENCE or http://www.opensource.org/licenses/bsd-license.php)
|
7
7
|
Copyright:: (c) 2010 The University of Manchester, UK
|
@@ -11,6 +11,8 @@ Copyright:: (c) 2010 The University of Manchester, UK
|
|
11
11
|
|
12
12
|
This is a simple gem that provides a facility to read an XLS or XLSX Excel spreadsheet document and produce an XML representation of its content.
|
13
13
|
|
14
|
+
CSV output can also be generated for a single sheet.
|
15
|
+
|
14
16
|
Internally it uses Apache POI, via the sister tool http://github.com/myGrid/simple-spreadsheet-extractor.
|
15
17
|
|
16
18
|
This is a simple tool developed for use within SysMO-DB[http://www.sysmo-db.org].
|
@@ -29,7 +31,7 @@ Note that on Windows you must also do:
|
|
29
31
|
|
30
32
|
* require 'simple-spreadsheet-extractor'
|
31
33
|
* include the module SysMODB::SpreadsheetExtractor
|
32
|
-
* pass an IO object to the method spreedsheet_to_xml which responds with the XML for the contents of the
|
34
|
+
* pass an IO object to the method spreadsheet_to_xml, which responds with the XML for the contents of the spreadsheet. Alternatively use spreadsheet_to_csv for CSV.
|
33
35
|
* if something goes wrong with the extraction then a SysMODB::SpreadsheetExtractionException will be raised
|
34
36
|
|
35
37
|
e.g.
|
@@ -42,7 +44,7 @@ e.g.
|
|
42
44
|
|
43
45
|
path=ARGV.first
|
44
46
|
|
45
|
-
f=open
|
47
|
+
f=open path
|
46
48
|
begin
|
47
49
|
puts spreadsheet_to_xml(f)
|
48
50
|
rescue SysMODB::SpreadsheetExtractionException=>e
|
@@ -53,10 +55,20 @@ Formulas are evaluated placing the result in the XML produced for that cell, how
|
|
53
55
|
|
54
56
|
Row and column indexes start at 1, rather than 0, to keep consistent with namings of the cells in Excel.
|
55
57
|
|
56
|
-
An XSD schema for the XML is available in doc/schema-v1.xsd["
|
58
|
+
An XSD schema for the XML is available in doc/schema-v1.xsd["tree/master/doc/schema-v1.xsd"]
|
57
59
|
|
58
60
|
The desired spreadsheet extractor jar can be specified by defining SPREADSHEET_EXTRACTOR_JAR_PATH in a config file (e.g. environment.rb)
|
59
61
|
|
62
|
+
CSV can be generated in a similar way using spreadsheet_to_csv, which also takes an optional sheet number. If the sheet number is omitted then the first sheet is used.
|
63
|
+
|
64
|
+
Note that sheet numbers start at 1 (the first sheet is sheet 1), and the sheet number can be given as either a string or an integer.
|
65
|
+
|
66
|
+
e.g.
|
67
|
+
|
68
|
+
puts spreadsheet_to_csv(f,"1")
|
69
|
+
|
70
|
+
|
71
|
+
|
60
72
|
== Example XML
|
61
73
|
|
62
74
|
<?xml version="1.0" encoding="UTF-8"?>
|
@@ -105,4 +117,4 @@ The desired spreadsheet extractor jar can be specified by defining SPREADSHEET_E
|
|
105
117
|
</sheet>
|
106
118
|
<sheet name="Sheet2" index="2" hidden="false" very_hidden="false" first_row="1" last_row="1"/>
|
107
119
|
<sheet name="Sheet3" index="3" hidden="false" very_hidden="false" first_row="1" last_row="1"/>
|
108
|
-
</workbook>
|
120
|
+
</workbook>
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.5.0
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -10,32 +10,46 @@ module SysMODB
|
|
10
10
|
|
11
11
|
module SpreadsheetExtractor
|
12
12
|
|
13
|
-
DEFAULT_PATH = File.dirname(__FILE__) + "/../jars/simple-spreadsheet-extractor-0.
|
13
|
+
DEFAULT_PATH = File.dirname(__FILE__) + "/../jars/simple-spreadsheet-extractor-0.5.0.jar"
|
14
14
|
|
15
|
-
def spreadsheet_to_xml spreadsheet_data
|
16
|
-
|
17
|
-
|
18
|
-
output = read_with_popen4 spreadsheet_data
|
15
|
+
def spreadsheet_to_xml spreadsheet_data
|
16
|
+
if is_windows?
|
17
|
+
read_with_popen4 spreadsheet_data,"xml"
|
19
18
|
else
|
20
|
-
|
19
|
+
read_with_open4 spreadsheet_data,"xml"
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
def spreadsheet_to_csv spreadsheet_data,sheet=1,trim=false
|
24
|
+
if is_windows?
|
25
|
+
read_with_popen4 spreadsheet_data,"csv",sheet,trim
|
26
|
+
else
|
27
|
+
read_with_open4 spreadsheet_data,"csv",sheet,trim
|
21
28
|
end
|
22
|
-
|
23
|
-
return output
|
24
29
|
end
|
25
30
|
|
26
31
|
|
27
32
|
|
28
|
-
def spreadsheet_extractor_command
|
29
|
-
"java -jar #{(defined? SPREADSHEET_EXTRACTOR_JAR_PATH) ? SPREADSHEET_EXTRACTOR_JAR_PATH : DEFAULT_PATH}"
|
33
|
+
def spreadsheet_extractor_command format="xml",sheet=nil,trim=false
|
34
|
+
command = "java -jar #{(defined? SPREADSHEET_EXTRACTOR_JAR_PATH) ? SPREADSHEET_EXTRACTOR_JAR_PATH : DEFAULT_PATH}"
|
35
|
+
command += " -o #{format}"
|
36
|
+
command += " -s #{sheet}" if sheet
|
37
|
+
command += " -t" if trim
|
38
|
+
command
|
30
39
|
end
|
31
40
|
|
32
41
|
private
|
33
|
-
|
34
|
-
|
35
|
-
|
42
|
+
|
43
|
+
def is_windows?
|
44
|
+
!(RUBY_PLATFORM =~ /mswin32/ || RUBY_PLATFORM =~ /mingw32/).nil?
|
45
|
+
end
|
46
|
+
|
47
|
+
#opens using POpen4 - this is for the benefit of Windows. It has been found to be unstable in Linux and give occasional segmentation faults
|
48
|
+
def read_with_popen4 spreadsheet_data,format="xml",sheet=nil,trim=false
|
36
49
|
output=""
|
37
50
|
err_message = ""
|
38
|
-
|
51
|
+
command = spreadsheet_extractor_command format,sheet,trim
|
52
|
+
status = POpen4::popen4(command) do |stdout, stderr, stdin, pid|
|
39
53
|
stdin=stdin.binmode
|
40
54
|
spreadsheet_data.each_byte{|b| stdin.putc(b)}
|
41
55
|
stdin.close
|
@@ -49,13 +63,14 @@ module SysMODB
|
|
49
63
|
raise SpreadsheetExtractionException.new(err_message)
|
50
64
|
end
|
51
65
|
|
52
|
-
|
66
|
+
output.strip
|
53
67
|
end
|
54
68
|
|
55
|
-
def read_with_open4 spreadsheet_data
|
69
|
+
def read_with_open4 spreadsheet_data,format="xml",sheet=nil,trim=false
|
56
70
|
output = ""
|
57
71
|
err_message = ""
|
58
|
-
|
72
|
+
command = spreadsheet_extractor_command format,sheet,trim
|
73
|
+
status = Open4::popen4(command) do |pid, stdin, stdout, stderr|
|
59
74
|
while ((line = spreadsheet_data.gets) != nil) do
|
60
75
|
stdin << line
|
61
76
|
end
|
@@ -76,7 +91,7 @@ module SysMODB
|
|
76
91
|
raise SpreadsheetExtractionException.new(err_message)
|
77
92
|
end
|
78
93
|
|
79
|
-
|
94
|
+
output.strip
|
80
95
|
end
|
81
96
|
|
82
97
|
end
|
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{simple-spreadsheet-extractor}
|
8
|
-
s.version = "0.
|
8
|
+
s.version = "0.5.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Stuart Owen", "Finn Bacall"]
|
12
|
-
s.date = %q{2011-
|
12
|
+
s.date = %q{2011-03-03}
|
13
13
|
s.description = %q{Takes a stream to a spreadsheet file and produces and XML representation of its contents}
|
14
14
|
s.email = %q{stuart.owen@manchester.ac.uk}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -24,27 +24,26 @@ Gem::Specification.new do |s|
|
|
24
24
|
"doc/schema-v1.xsd",
|
25
25
|
"example.rb",
|
26
26
|
"jars/lib/dom4j-1.6.1.jar",
|
27
|
-
"jars/lib/poi-3.
|
28
|
-
"jars/lib/poi-ooxml-3.
|
29
|
-
"jars/lib/poi-ooxml-schemas-3.
|
27
|
+
"jars/lib/poi-3.7.jar",
|
28
|
+
"jars/lib/poi-ooxml-3.7.jar",
|
29
|
+
"jars/lib/poi-ooxml-schemas-3.7.jar",
|
30
30
|
"jars/lib/xmlbeans-2.3.0.jar",
|
31
|
-
"jars/simple-spreadsheet-extractor-0.
|
31
|
+
"jars/simple-spreadsheet-extractor-0.5.0.jar",
|
32
32
|
"lib/simple-spreadsheet-extractor.rb",
|
33
33
|
"simple-spreadsheet-extractor.gemspec"
|
34
34
|
]
|
35
35
|
s.homepage = %q{http://github.com/myGrid/simple-spreadsheet-extractor-gem}
|
36
36
|
s.require_paths = ["lib"]
|
37
|
-
s.rubygems_version = %q{1.
|
37
|
+
s.rubygems_version = %q{1.4.2}
|
38
38
|
s.summary = %q{Basic spreadsheet content extraction using Apache POI}
|
39
39
|
s.test_files = [
|
40
40
|
"test/test_extraction.rb"
|
41
41
|
]
|
42
42
|
|
43
43
|
if s.respond_to? :specification_version then
|
44
|
-
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
45
44
|
s.specification_version = 3
|
46
45
|
|
47
|
-
if Gem::Version.new(Gem::
|
46
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
48
47
|
s.add_runtime_dependency(%q<POpen4>, ["= 0.1.4"])
|
49
48
|
s.add_runtime_dependency(%q<open4>, ["= 1.0.1"])
|
50
49
|
else
|
data/test/test_extraction.rb
CHANGED
@@ -40,6 +40,31 @@ class TestExtraction < Test::Unit::TestCase
|
|
40
40
|
assert false,"Error validating against schema: #{e.message}"
|
41
41
|
end
|
42
42
|
end
|
43
|
+
|
44
|
+
def test_csv_output
|
45
|
+
test_sheet = File.dirname(__FILE__) + "/files/test-spreadsheet.xls"
|
46
|
+
expected_file = File.dirname(__FILE__) + "/files/test-csv-output1.csv"
|
47
|
+
expected = open(expected_file,"rb").read
|
48
|
+
|
49
|
+
f=open(test_sheet,"rb")
|
50
|
+
csv = spreadsheet_to_csv(f,2)
|
51
|
+
assert_equal expected,csv
|
52
|
+
|
53
|
+
#try sheet as a string
|
54
|
+
f=open(test_sheet,"rb")
|
55
|
+
csv = spreadsheet_to_csv(f,"2")
|
56
|
+
assert_equal expected,csv
|
57
|
+
end
|
58
|
+
|
59
|
+
# def test_csv_output_trimmed
|
60
|
+
# test_sheet = File.dirname(__FILE__) + "/files/test-spreadsheet.xls"
|
61
|
+
# expected_file = File.dirname(__FILE__) + "/files/test-csv-output1-trimmed.csv"
|
62
|
+
# expected = open(expected_file,"rb").read
|
63
|
+
#
|
64
|
+
# f=open(test_sheet,"rb")
|
65
|
+
# csv = spreadsheet_to_csv(f,2,true)
|
66
|
+
# assert_equal expected,csv
|
67
|
+
# end
|
43
68
|
|
44
69
|
def test_for_segfault
|
45
70
|
test_sheet = File.dirname(__FILE__) + "/files/test-spreadsheet.xls"
|
metadata
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: simple-spreadsheet-extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
hash: 11
|
5
|
+
prerelease:
|
5
6
|
segments:
|
6
7
|
- 0
|
7
|
-
-
|
8
|
-
-
|
9
|
-
version: 0.
|
8
|
+
- 5
|
9
|
+
- 0
|
10
|
+
version: 0.5.0
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- Stuart Owen
|
@@ -15,16 +16,18 @@ autorequire:
|
|
15
16
|
bindir: bin
|
16
17
|
cert_chain: []
|
17
18
|
|
18
|
-
date: 2011-
|
19
|
+
date: 2011-03-03 00:00:00 +00:00
|
19
20
|
default_executable:
|
20
21
|
dependencies:
|
21
22
|
- !ruby/object:Gem::Dependency
|
22
23
|
name: POpen4
|
23
24
|
prerelease: false
|
24
25
|
requirement: &id001 !ruby/object:Gem::Requirement
|
26
|
+
none: false
|
25
27
|
requirements:
|
26
28
|
- - "="
|
27
29
|
- !ruby/object:Gem::Version
|
30
|
+
hash: 19
|
28
31
|
segments:
|
29
32
|
- 0
|
30
33
|
- 1
|
@@ -36,9 +39,11 @@ dependencies:
|
|
36
39
|
name: open4
|
37
40
|
prerelease: false
|
38
41
|
requirement: &id002 !ruby/object:Gem::Requirement
|
42
|
+
none: false
|
39
43
|
requirements:
|
40
44
|
- - "="
|
41
45
|
- !ruby/object:Gem::Version
|
46
|
+
hash: 21
|
42
47
|
segments:
|
43
48
|
- 1
|
44
49
|
- 0
|
@@ -63,13 +68,14 @@ files:
|
|
63
68
|
- doc/schema-v1.xsd
|
64
69
|
- example.rb
|
65
70
|
- jars/lib/dom4j-1.6.1.jar
|
66
|
-
- jars/lib/poi-3.
|
67
|
-
- jars/lib/poi-ooxml-3.
|
68
|
-
- jars/lib/poi-ooxml-schemas-3.
|
71
|
+
- jars/lib/poi-3.7.jar
|
72
|
+
- jars/lib/poi-ooxml-3.7.jar
|
73
|
+
- jars/lib/poi-ooxml-schemas-3.7.jar
|
69
74
|
- jars/lib/xmlbeans-2.3.0.jar
|
70
|
-
- jars/simple-spreadsheet-extractor-0.
|
75
|
+
- jars/simple-spreadsheet-extractor-0.5.0.jar
|
71
76
|
- lib/simple-spreadsheet-extractor.rb
|
72
77
|
- simple-spreadsheet-extractor.gemspec
|
78
|
+
- test/test_extraction.rb
|
73
79
|
has_rdoc: true
|
74
80
|
homepage: http://github.com/myGrid/simple-spreadsheet-extractor-gem
|
75
81
|
licenses: []
|
@@ -80,23 +86,27 @@ rdoc_options: []
|
|
80
86
|
require_paths:
|
81
87
|
- lib
|
82
88
|
required_ruby_version: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
83
90
|
requirements:
|
84
91
|
- - ">="
|
85
92
|
- !ruby/object:Gem::Version
|
93
|
+
hash: 3
|
86
94
|
segments:
|
87
95
|
- 0
|
88
96
|
version: "0"
|
89
97
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
98
|
+
none: false
|
90
99
|
requirements:
|
91
100
|
- - ">="
|
92
101
|
- !ruby/object:Gem::Version
|
102
|
+
hash: 3
|
93
103
|
segments:
|
94
104
|
- 0
|
95
105
|
version: "0"
|
96
106
|
requirements: []
|
97
107
|
|
98
108
|
rubyforge_project:
|
99
|
-
rubygems_version: 1.
|
109
|
+
rubygems_version: 1.4.2
|
100
110
|
signing_key:
|
101
111
|
specification_version: 3
|
102
112
|
summary: Basic spreadsheet content extraction using Apache POI
|
data/jars/lib/poi-ooxml-3.6.jar
DELETED
Binary file
|
Binary file
|