simple-spreadsheet-extractor 0.4.2 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +18 -6
- data/VERSION +1 -1
- data/jars/lib/{poi-3.6.jar → poi-3.7.jar} +0 -0
- data/jars/lib/poi-ooxml-3.7.jar +0 -0
- data/jars/lib/{poi-ooxml-schemas-3.6.jar → poi-ooxml-schemas-3.7.jar} +0 -0
- data/jars/simple-spreadsheet-extractor-0.5.0.jar +0 -0
- data/lib/simple-spreadsheet-extractor.rb +33 -18
- data/simple-spreadsheet-extractor.gemspec +8 -9
- data/test/test_extraction.rb +25 -0
- metadata +20 -10
- data/jars/lib/poi-ooxml-3.6.jar +0 -0
- data/jars/simple-spreadsheet-extractor-0.4.1.jar +0 -0
data/README.rdoc
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
= Simple Spreadsheet Extractor
|
2
2
|
|
3
|
-
Authors::
|
4
|
-
Version:: 0.
|
3
|
+
Authors:: Stuart Owen, Finn Bacall
|
4
|
+
Version:: 0.5.0
|
5
5
|
Contact:: mailto:stuart.owen@manchester.ac.uk
|
6
6
|
Licence:: BSD (See LICENCE or http://www.opensource.org/licenses/bsd-license.php)
|
7
7
|
Copyright:: (c) 2010 The University of Manchester, UK
|
@@ -11,6 +11,8 @@ Copyright:: (c) 2010 The University of Manchester, UK
|
|
11
11
|
|
12
12
|
This is a simple gem that provides a facility to read an XLS or XLSX Excel spreadsheet document and produce an XML representation of its content.
|
13
13
|
|
14
|
+
CSV output can also be generated for a single sheet.
|
15
|
+
|
14
16
|
Internally it uses Apache POI, using the sister http://github.com/myGrid/simple-spreadsheet-extractor tool.
|
15
17
|
|
16
18
|
This is a simple tool developed for use within SysMO-DB[http://www.sysmo-db.org].
|
@@ -29,7 +31,7 @@ Note that on Windows you must also do:
|
|
29
31
|
|
30
32
|
* require 'simple-spreadsheet-extractor'
|
31
33
|
* include the module SysMODB::SpreadsheetExtractor
|
32
|
-
* pass an IO object to the method spreedsheet_to_xml which responds with the XML for the contents of the
|
34
|
+
* pass an IO object to the method spreadsheet_to_xml which responds with the XML for the contents of the spreadsheet. Alternatively use spreadsheet_to_csv for CSV.
|
33
35
|
* if something goes wrong with the extraction then a SysMODB::SpreadsheetExtractionException will be thrown
|
34
36
|
|
35
37
|
e.g.
|
@@ -42,7 +44,7 @@ e.g.
|
|
42
44
|
|
43
45
|
path=ARGV.first
|
44
46
|
|
45
|
-
f=open
|
47
|
+
f=open path
|
46
48
|
begin
|
47
49
|
puts spreadsheet_to_xml(f)
|
48
50
|
rescue SysMODB::SpreadsheetExtractionException=>e
|
@@ -53,10 +55,20 @@ Formulas are evaluated placing the result in the XML produced for that cell, how
|
|
53
55
|
|
54
56
|
Row and column indexes start at 1, rather than 0, to keep consistent with namings of the cells in Excel.
|
55
57
|
|
56
|
-
An XSD schema for the XML is available in doc/schema-v1.xsd["
|
58
|
+
An XSD schema for the XML is available in doc/schema-v1.xsd["tree/master/doc/schema-v1.xsd"]
|
57
59
|
|
58
60
|
The desired spreadsheet extractor jar can be specified by defining SPREADSHEET_EXTRACTOR_JAR_PATH in a config file (e.g. environment.rb)
|
59
61
|
|
62
|
+
CSV can be generated in a similar way using spreadsheet_to_csv, which also takes an optional sheet number. If the sheet number is missing then the first sheet is used.
|
63
|
+
|
64
|
+
Note that sheet numbers start at 1 (the first sheet is sheet 1), and the sheet number can be given as either a string or an integer.
|
65
|
+
|
66
|
+
e.g.
|
67
|
+
|
68
|
+
puts spreadsheet_to_csv(f,"1")
|
69
|
+
|
70
|
+
|
71
|
+
|
60
72
|
== Example XML
|
61
73
|
|
62
74
|
<?xml version="1.0" encoding="UTF-8"?>
|
@@ -105,4 +117,4 @@ The desired spreadsheet extractor jar can be specified by defining SPREADSHEET_E
|
|
105
117
|
</sheet>
|
106
118
|
<sheet name="Sheet2" index="2" hidden="false" very_hidden="false" first_row="1" last_row="1"/>
|
107
119
|
<sheet name="Sheet3" index="3" hidden="false" very_hidden="false" first_row="1" last_row="1"/>
|
108
|
-
</workbook>
|
120
|
+
</workbook>
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.5.0
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -10,32 +10,46 @@ module SysMODB
|
|
10
10
|
|
11
11
|
module SpreadsheetExtractor
|
12
12
|
|
13
|
-
DEFAULT_PATH = File.dirname(__FILE__) + "/../jars/simple-spreadsheet-extractor-0.
|
13
|
+
DEFAULT_PATH = File.dirname(__FILE__) + "/../jars/simple-spreadsheet-extractor-0.5.0.jar"
|
14
14
|
|
15
|
-
def spreadsheet_to_xml spreadsheet_data
|
16
|
-
|
17
|
-
|
18
|
-
output = read_with_popen4 spreadsheet_data
|
15
|
+
def spreadsheet_to_xml spreadsheet_data
|
16
|
+
if is_windows?
|
17
|
+
read_with_popen4 spreadsheet_data,"xml"
|
19
18
|
else
|
20
|
-
|
19
|
+
read_with_open4 spreadsheet_data,"xml"
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
def spreadsheet_to_csv spreadsheet_data,sheet=1,trim=false
|
24
|
+
if is_windows?
|
25
|
+
read_with_popen4 spreadsheet_data,"csv",sheet,trim
|
26
|
+
else
|
27
|
+
read_with_open4 spreadsheet_data,"csv",sheet,trim
|
21
28
|
end
|
22
|
-
|
23
|
-
return output
|
24
29
|
end
|
25
30
|
|
26
31
|
|
27
32
|
|
28
|
-
def spreadsheet_extractor_command
|
29
|
-
"java -jar #{(defined? SPREADSHEET_EXTRACTOR_JAR_PATH) ? SPREADSHEET_EXTRACTOR_JAR_PATH : DEFAULT_PATH}"
|
33
|
+
def spreadsheet_extractor_command format="xml",sheet=nil,trim=false
|
34
|
+
command = "java -jar #{(defined? SPREADSHEET_EXTRACTOR_JAR_PATH) ? SPREADSHEET_EXTRACTOR_JAR_PATH : DEFAULT_PATH}"
|
35
|
+
command += " -o #{format}"
|
36
|
+
command += " -s #{sheet}" if sheet
|
37
|
+
command += " -t" if trim
|
38
|
+
command
|
30
39
|
end
|
31
40
|
|
32
41
|
private
|
33
|
-
|
34
|
-
|
35
|
-
|
42
|
+
|
43
|
+
def is_windows?
|
44
|
+
!(RUBY_PLATFORM =~ /mswin32/ || RUBY_PLATFORM =~ /mingw32/).nil?
|
45
|
+
end
|
46
|
+
|
47
|
+
#opens using POpen4 - this is for the benefit of Windows. It has been found to be unstable on Linux, giving occasional segmentation faults
|
48
|
+
def read_with_popen4 spreadsheet_data,format="xml",sheet=nil,trim=false
|
36
49
|
output=""
|
37
50
|
err_message = ""
|
38
|
-
|
51
|
+
command = spreadsheet_extractor_command format,sheet,trim
|
52
|
+
status = POpen4::popen4(command) do |stdout, stderr, stdin, pid|
|
39
53
|
stdin=stdin.binmode
|
40
54
|
spreadsheet_data.each_byte{|b| stdin.putc(b)}
|
41
55
|
stdin.close
|
@@ -49,13 +63,14 @@ module SysMODB
|
|
49
63
|
raise SpreadsheetExtractionException.new(err_message)
|
50
64
|
end
|
51
65
|
|
52
|
-
|
66
|
+
output.strip
|
53
67
|
end
|
54
68
|
|
55
|
-
def read_with_open4 spreadsheet_data
|
69
|
+
def read_with_open4 spreadsheet_data,format="xml",sheet=nil,trim=false
|
56
70
|
output = ""
|
57
71
|
err_message = ""
|
58
|
-
|
72
|
+
command = spreadsheet_extractor_command format,sheet,trim
|
73
|
+
status = Open4::popen4(command) do |pid, stdin, stdout, stderr|
|
59
74
|
while ((line = spreadsheet_data.gets) != nil) do
|
60
75
|
stdin << line
|
61
76
|
end
|
@@ -76,7 +91,7 @@ module SysMODB
|
|
76
91
|
raise SpreadsheetExtractionException.new(err_message)
|
77
92
|
end
|
78
93
|
|
79
|
-
|
94
|
+
output.strip
|
80
95
|
end
|
81
96
|
|
82
97
|
end
|
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{simple-spreadsheet-extractor}
|
8
|
-
s.version = "0.
|
8
|
+
s.version = "0.5.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Stuart Owen", "Finn Bacall"]
|
12
|
-
s.date = %q{2011-
|
12
|
+
s.date = %q{2011-03-03}
|
13
13
|
s.description = %q{Takes a stream to a spreadsheet file and produces and XML representation of its contents}
|
14
14
|
s.email = %q{stuart.owen@manchester.ac.uk}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -24,27 +24,26 @@ Gem::Specification.new do |s|
|
|
24
24
|
"doc/schema-v1.xsd",
|
25
25
|
"example.rb",
|
26
26
|
"jars/lib/dom4j-1.6.1.jar",
|
27
|
-
"jars/lib/poi-3.
|
28
|
-
"jars/lib/poi-ooxml-3.
|
29
|
-
"jars/lib/poi-ooxml-schemas-3.
|
27
|
+
"jars/lib/poi-3.7.jar",
|
28
|
+
"jars/lib/poi-ooxml-3.7.jar",
|
29
|
+
"jars/lib/poi-ooxml-schemas-3.7.jar",
|
30
30
|
"jars/lib/xmlbeans-2.3.0.jar",
|
31
|
-
"jars/simple-spreadsheet-extractor-0.
|
31
|
+
"jars/simple-spreadsheet-extractor-0.5.0.jar",
|
32
32
|
"lib/simple-spreadsheet-extractor.rb",
|
33
33
|
"simple-spreadsheet-extractor.gemspec"
|
34
34
|
]
|
35
35
|
s.homepage = %q{http://github.com/myGrid/simple-spreadsheet-extractor-gem}
|
36
36
|
s.require_paths = ["lib"]
|
37
|
-
s.rubygems_version = %q{1.
|
37
|
+
s.rubygems_version = %q{1.4.2}
|
38
38
|
s.summary = %q{Basic spreadsheet content extraction using Apache POI}
|
39
39
|
s.test_files = [
|
40
40
|
"test/test_extraction.rb"
|
41
41
|
]
|
42
42
|
|
43
43
|
if s.respond_to? :specification_version then
|
44
|
-
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
45
44
|
s.specification_version = 3
|
46
45
|
|
47
|
-
if Gem::Version.new(Gem::
|
46
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
48
47
|
s.add_runtime_dependency(%q<POpen4>, ["= 0.1.4"])
|
49
48
|
s.add_runtime_dependency(%q<open4>, ["= 1.0.1"])
|
50
49
|
else
|
data/test/test_extraction.rb
CHANGED
@@ -40,6 +40,31 @@ class TestExtraction < Test::Unit::TestCase
|
|
40
40
|
assert false,"Error validating against schema: #{e.message}"
|
41
41
|
end
|
42
42
|
end
|
43
|
+
|
44
|
+
def test_csv_output
|
45
|
+
test_sheet = File.dirname(__FILE__) + "/files/test-spreadsheet.xls"
|
46
|
+
expected_file = File.dirname(__FILE__) + "/files/test-csv-output1.csv"
|
47
|
+
expected = open(expected_file,"rb").read
|
48
|
+
|
49
|
+
f=open(test_sheet,"rb")
|
50
|
+
csv = spreadsheet_to_csv(f,2)
|
51
|
+
assert_equal expected,csv
|
52
|
+
|
53
|
+
#try sheet as a string
|
54
|
+
f=open(test_sheet,"rb")
|
55
|
+
csv = spreadsheet_to_csv(f,"2")
|
56
|
+
assert_equal expected,csv
|
57
|
+
end
|
58
|
+
|
59
|
+
# def test_csv_output_trimmed
|
60
|
+
# test_sheet = File.dirname(__FILE__) + "/files/test-spreadsheet.xls"
|
61
|
+
# expected_file = File.dirname(__FILE__) + "/files/test-csv-output1-trimmed.csv"
|
62
|
+
# expected = open(expected_file,"rb").read
|
63
|
+
#
|
64
|
+
# f=open(test_sheet,"rb")
|
65
|
+
# csv = spreadsheet_to_csv(f,2,true)
|
66
|
+
# assert_equal expected,csv
|
67
|
+
# end
|
43
68
|
|
44
69
|
def test_for_segfault
|
45
70
|
test_sheet = File.dirname(__FILE__) + "/files/test-spreadsheet.xls"
|
metadata
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: simple-spreadsheet-extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
hash: 11
|
5
|
+
prerelease:
|
5
6
|
segments:
|
6
7
|
- 0
|
7
|
-
-
|
8
|
-
-
|
9
|
-
version: 0.
|
8
|
+
- 5
|
9
|
+
- 0
|
10
|
+
version: 0.5.0
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- Stuart Owen
|
@@ -15,16 +16,18 @@ autorequire:
|
|
15
16
|
bindir: bin
|
16
17
|
cert_chain: []
|
17
18
|
|
18
|
-
date: 2011-
|
19
|
+
date: 2011-03-03 00:00:00 +00:00
|
19
20
|
default_executable:
|
20
21
|
dependencies:
|
21
22
|
- !ruby/object:Gem::Dependency
|
22
23
|
name: POpen4
|
23
24
|
prerelease: false
|
24
25
|
requirement: &id001 !ruby/object:Gem::Requirement
|
26
|
+
none: false
|
25
27
|
requirements:
|
26
28
|
- - "="
|
27
29
|
- !ruby/object:Gem::Version
|
30
|
+
hash: 19
|
28
31
|
segments:
|
29
32
|
- 0
|
30
33
|
- 1
|
@@ -36,9 +39,11 @@ dependencies:
|
|
36
39
|
name: open4
|
37
40
|
prerelease: false
|
38
41
|
requirement: &id002 !ruby/object:Gem::Requirement
|
42
|
+
none: false
|
39
43
|
requirements:
|
40
44
|
- - "="
|
41
45
|
- !ruby/object:Gem::Version
|
46
|
+
hash: 21
|
42
47
|
segments:
|
43
48
|
- 1
|
44
49
|
- 0
|
@@ -63,13 +68,14 @@ files:
|
|
63
68
|
- doc/schema-v1.xsd
|
64
69
|
- example.rb
|
65
70
|
- jars/lib/dom4j-1.6.1.jar
|
66
|
-
- jars/lib/poi-3.
|
67
|
-
- jars/lib/poi-ooxml-3.
|
68
|
-
- jars/lib/poi-ooxml-schemas-3.
|
71
|
+
- jars/lib/poi-3.7.jar
|
72
|
+
- jars/lib/poi-ooxml-3.7.jar
|
73
|
+
- jars/lib/poi-ooxml-schemas-3.7.jar
|
69
74
|
- jars/lib/xmlbeans-2.3.0.jar
|
70
|
-
- jars/simple-spreadsheet-extractor-0.
|
75
|
+
- jars/simple-spreadsheet-extractor-0.5.0.jar
|
71
76
|
- lib/simple-spreadsheet-extractor.rb
|
72
77
|
- simple-spreadsheet-extractor.gemspec
|
78
|
+
- test/test_extraction.rb
|
73
79
|
has_rdoc: true
|
74
80
|
homepage: http://github.com/myGrid/simple-spreadsheet-extractor-gem
|
75
81
|
licenses: []
|
@@ -80,23 +86,27 @@ rdoc_options: []
|
|
80
86
|
require_paths:
|
81
87
|
- lib
|
82
88
|
required_ruby_version: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
83
90
|
requirements:
|
84
91
|
- - ">="
|
85
92
|
- !ruby/object:Gem::Version
|
93
|
+
hash: 3
|
86
94
|
segments:
|
87
95
|
- 0
|
88
96
|
version: "0"
|
89
97
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
98
|
+
none: false
|
90
99
|
requirements:
|
91
100
|
- - ">="
|
92
101
|
- !ruby/object:Gem::Version
|
102
|
+
hash: 3
|
93
103
|
segments:
|
94
104
|
- 0
|
95
105
|
version: "0"
|
96
106
|
requirements: []
|
97
107
|
|
98
108
|
rubyforge_project:
|
99
|
-
rubygems_version: 1.
|
109
|
+
rubygems_version: 1.4.2
|
100
110
|
signing_key:
|
101
111
|
specification_version: 3
|
102
112
|
summary: Basic spreadsheet content extraction using Apache POI
|
data/jars/lib/poi-ooxml-3.6.jar
DELETED
Binary file
|
Binary file
|