simple-spreadsheet-extractor 0.3.2 → 0.3.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +3 -0
- data/README.rdoc +10 -3
- data/Rakefile +37 -0
- data/VERSION +1 -0
- data/doc/schema-v1.xsd +56 -0
- data/lib/simple-spreadsheet-extractor.rb +9 -16
- data/simple-spreadsheet-extractor.gemspec +57 -0
- data/test/test_extraction.rb +45 -0
- metadata +25 -19
data/.gitignore
ADDED
data/README.rdoc
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
= Simple Spreadsheet Extractor
|
2
2
|
|
3
3
|
Authors:: Stuart Owen
|
4
|
-
Version:: 0.3.2
|
4
|
+
Version:: 0.3.3
|
5
5
|
Contact:: mailto:stuart.owen@manchester.ac.uk
|
6
6
|
Licence:: BSD (See LICENCE or http://www.opensource.org/licenses/bsd-license.php)
|
7
7
|
Copyright:: (c) 2010 The University of Manchester, UK
|
@@ -21,6 +21,10 @@ Java 1.6 (JRE) is required.
|
|
21
21
|
|
22
22
|
gem install simple-spreadsheet-extractor
|
23
23
|
|
24
|
+
Note that on Windows you must also do:
|
25
|
+
|
26
|
+
gem install win32-open3
|
27
|
+
|
24
28
|
== Usage
|
25
29
|
|
26
30
|
* require 'simple-spreadsheet-extractor'
|
@@ -30,12 +34,15 @@ Java 1.6 (JRE) is required.
|
|
30
34
|
|
31
35
|
e.g.
|
32
36
|
|
37
|
+
#example.rb - takes path, i.e. ruby example.rb /tmp/spreadsheet.xls
|
33
38
|
require 'rubygems'
|
34
39
|
require 'simple-spreadsheet-extractor'
|
35
40
|
|
36
41
|
include SysMODB::SpreadsheetExtractor
|
37
42
|
|
38
|
-
|
43
|
+
path=ARGV.first
|
44
|
+
|
45
|
+
f=open(path)
|
39
46
|
begin
|
40
47
|
puts spreadsheet_to_xml(f)
|
41
48
|
rescue SysMODB::SpreadsheetExtractionException=>e
|
@@ -46,7 +53,7 @@ Formulas are evaluated placing the result in the XML produced for that cell, how
|
|
46
53
|
|
47
54
|
Row and column indexes start at 1, rather than 0, to keep consistent with namings of the cells in Excel.
|
48
55
|
|
49
|
-
An XSD schema for the XML is available in doc/schema-v1.xsd
|
56
|
+
An XSD schema for the XML is available in doc/schema-v1.xsd[http://github.com/stuzart/simple-spreadsheet-extractor-gem/blob/master/doc/schema-v1.xsd]
|
50
57
|
|
51
58
|
== Example XML
|
52
59
|
|
data/Rakefile
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'rake'
|
2
|
+
require 'rake/testtask'
|
3
|
+
require 'rake/rdoctask'
|
4
|
+
require 'rubygems'
|
5
|
+
|
6
|
+
require 'rake/gempackagetask'
|
7
|
+
|
8
|
+
task :default => [:test]
|
9
|
+
|
10
|
+
begin
|
11
|
+
require 'jeweler'
|
12
|
+
Jeweler::Tasks.new do |gemspec|
|
13
|
+
gemspec.name = "simple-spreadsheet-extractor"
|
14
|
+
gemspec.summary = "Basic spreadsheet content extraction using Apache POI"
|
15
|
+
gemspec.description = "Takes a stream to a spreadsheet file and produces and XML representation of its contents"
|
16
|
+
gemspec.email = "stuart.owen@manchester.ac.uk"
|
17
|
+
gemspec.homepage = "http://github.com/myGrid/simple-spreadsheet-extractor-gem"
|
18
|
+
gemspec.authors = ["Stuart Owen"]
|
19
|
+
|
20
|
+
gemspec.files.include %w(jars)
|
21
|
+
gemspec.files.exclude "test/*"
|
22
|
+
gemspec.extra_rdoc_files = ["README.rdoc", "LICENCE"]
|
23
|
+
gemspec.add_dependency("POpen4","0.1.4")
|
24
|
+
end
|
25
|
+
rescue LoadError
|
26
|
+
puts "Jeweler not available. Install it with: gem install jeweler"
|
27
|
+
end
|
28
|
+
|
29
|
+
task:test do
|
30
|
+
Rake::TestTask.new do |t|
|
31
|
+
t.libs << "test"
|
32
|
+
t.test_files = FileList['test/test*.rb']
|
33
|
+
t.verbose = true
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
#end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.3.3
|
data/doc/schema-v1.xsd
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<xsd:schema targetNamespace="http://www.sysmo-db.org/2010/xml/spreadsheet"
|
3
|
+
xml:lang="en" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns="http://www.sysmo-db.org/2010/xml/spreadsheet"
|
4
|
+
elementFormDefault="qualified">
|
5
|
+
|
6
|
+
<xsd:element name="workbook" type="Workbook"></xsd:element>
|
7
|
+
|
8
|
+
<xsd:complexType name="Workbook">
|
9
|
+
<xsd:sequence>
|
10
|
+
<xsd:element name="sheet" type="Sheet" minOccurs="0"
|
11
|
+
maxOccurs="unbounded" />
|
12
|
+
</xsd:sequence>
|
13
|
+
</xsd:complexType>
|
14
|
+
|
15
|
+
<xsd:complexType name="Sheet">
|
16
|
+
<xsd:sequence>
|
17
|
+
<xsd:element name="row" type="Row" minOccurs="0"
|
18
|
+
maxOccurs="unbounded" />
|
19
|
+
</xsd:sequence>
|
20
|
+
<xsd:attribute name="name" type="xsd:string" use="required"></xsd:attribute>
|
21
|
+
<xsd:attribute name="index" type="xsd:positiveInteger"
|
22
|
+
use="required"></xsd:attribute>
|
23
|
+
<xsd:attribute name="hidden" type="xsd:boolean" use="required"></xsd:attribute>
|
24
|
+
<xsd:attribute name="very_hidden" type="xsd:boolean"
|
25
|
+
use="required"></xsd:attribute>
|
26
|
+
<xsd:attribute name="first_row" type="xsd:positiveInteger"
|
27
|
+
use="required"></xsd:attribute>
|
28
|
+
<xsd:attribute name="last_row" type="xsd:positiveInteger"
|
29
|
+
use="required"></xsd:attribute>
|
30
|
+
</xsd:complexType>
|
31
|
+
|
32
|
+
<xsd:complexType name="Row">
|
33
|
+
<xsd:sequence>
|
34
|
+
<xsd:element name="cell" type="Cell" minOccurs="0"
|
35
|
+
maxOccurs="unbounded" />
|
36
|
+
</xsd:sequence>
|
37
|
+
<xsd:attribute name="index" type="xsd:positiveInteger"
|
38
|
+
use="required"></xsd:attribute>
|
39
|
+
</xsd:complexType>
|
40
|
+
|
41
|
+
<xsd:complexType name="Cell">
|
42
|
+
<xsd:simpleContent>
|
43
|
+
<xsd:extension base="xsd:string">
|
44
|
+
<xsd:attribute name="column" type="xsd:positiveInteger"
|
45
|
+
use="required"></xsd:attribute>
|
46
|
+
<xsd:attribute name="column_alpha" type="xsd:string"
|
47
|
+
use="required"></xsd:attribute>
|
48
|
+
<xsd:attribute name="row" type="xsd:positiveInteger"
|
49
|
+
use="required"></xsd:attribute>
|
50
|
+
<xsd:attribute name="type" type="xsd:string" use="required"></xsd:attribute>
|
51
|
+
<xsd:attribute name="formula" type="xsd:string" use="optional"></xsd:attribute>
|
52
|
+
</xsd:extension>
|
53
|
+
</xsd:simpleContent>
|
54
|
+
</xsd:complexType>
|
55
|
+
|
56
|
+
</xsd:schema>
|
@@ -1,5 +1,5 @@
|
|
1
1
|
require 'rubygems'
|
2
|
-
require '
|
2
|
+
require 'popen4'
|
3
3
|
|
4
4
|
module SysMODB
|
5
5
|
|
@@ -14,24 +14,17 @@ module SysMODB
|
|
14
14
|
command = "java -jar #{JAR_PATH}/simple-spreadsheet-extractor-0.3.2.jar"
|
15
15
|
output = ""
|
16
16
|
err_message = ""
|
17
|
-
status =
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
stdin.close
|
17
|
+
status = POpen4::popen4(command) do |stdout, stderr, stdin, pid|
|
18
|
+
stdin=stdin.binmode
|
19
|
+
spreadsheet_data.each_byte{|b| stdin.putc(b)}
|
20
|
+
stdin.close
|
22
21
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
stdout.close
|
27
|
-
|
28
|
-
while ((line=stderr.gets)!= nil) do
|
29
|
-
err_message << line
|
30
|
-
end
|
31
|
-
stderr.close
|
22
|
+
output=stdout.read.strip
|
23
|
+
err_message=stderr.read.strip
|
24
|
+
|
32
25
|
end
|
33
26
|
|
34
|
-
if status.to_i != 0
|
27
|
+
if status.to_i != 0
|
35
28
|
raise SpreadsheetExtractionException.new(err_message)
|
36
29
|
end
|
37
30
|
|
@@ -0,0 +1,57 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in rakefile, and run the gemspec command
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{simple-spreadsheet-extractor}
|
8
|
+
s.version = "0.3.3"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Stuart Owen"]
|
12
|
+
s.date = %q{2010-07-28}
|
13
|
+
s.description = %q{Takes a stream to a spreadsheet file and produces and XML representation of its contents}
|
14
|
+
s.email = %q{stuart.owen@manchester.ac.uk}
|
15
|
+
s.extra_rdoc_files = [
|
16
|
+
"LICENCE",
|
17
|
+
"README.rdoc"
|
18
|
+
]
|
19
|
+
s.files = [
|
20
|
+
".gitignore",
|
21
|
+
"LICENCE",
|
22
|
+
"README.rdoc",
|
23
|
+
"Rakefile",
|
24
|
+
"VERSION",
|
25
|
+
"doc/schema-v1.xsd",
|
26
|
+
"jars/lib/dom4j-1.6.1.jar",
|
27
|
+
"jars/lib/poi-3.6.jar",
|
28
|
+
"jars/lib/poi-ooxml-3.6.jar",
|
29
|
+
"jars/lib/poi-ooxml-schemas-3.6.jar",
|
30
|
+
"jars/lib/xmlbeans-2.3.0.jar",
|
31
|
+
"jars/simple-spreadsheet-extractor-0.3.2.jar",
|
32
|
+
"lib/simple-spreadsheet-extractor.rb",
|
33
|
+
"simple-spreadsheet-extractor.gemspec"
|
34
|
+
]
|
35
|
+
s.homepage = %q{http://github.com/myGrid/simple-spreadsheet-extractor-gem}
|
36
|
+
s.rdoc_options = ["--charset=UTF-8"]
|
37
|
+
s.require_paths = ["lib"]
|
38
|
+
s.rubygems_version = %q{1.3.7}
|
39
|
+
s.summary = %q{Basic spreadsheet content extraction using Apache POI}
|
40
|
+
s.test_files = [
|
41
|
+
"test/test_extraction.rb"
|
42
|
+
]
|
43
|
+
|
44
|
+
if s.respond_to? :specification_version then
|
45
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
46
|
+
s.specification_version = 3
|
47
|
+
|
48
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
49
|
+
s.add_runtime_dependency(%q<POpen4>, ["= 0.1.4"])
|
50
|
+
else
|
51
|
+
s.add_dependency(%q<POpen4>, ["= 0.1.4"])
|
52
|
+
end
|
53
|
+
else
|
54
|
+
s.add_dependency(%q<POpen4>, ["= 0.1.4"])
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'simple-spreadsheet-extractor'
|
3
|
+
require 'libxml'
|
4
|
+
|
5
|
+
class TestExtraction < Test::Unit::TestCase
|
6
|
+
|
7
|
+
SCHEMA_FILE_PATH = File.dirname(__FILE__) + "/../doc/schema-v1.xsd"
|
8
|
+
|
9
|
+
include SysMODB::SpreadsheetExtractor
|
10
|
+
|
11
|
+
def test_from_file_object
|
12
|
+
test_sheet = File.dirname(__FILE__) + "/files/test-spreadsheet.xls"
|
13
|
+
f=open(test_sheet,"rb")
|
14
|
+
xml = spreadsheet_to_xml(f)
|
15
|
+
assert_not_nil xml
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_validate_xml
|
19
|
+
test_sheet = File.dirname(__FILE__) + "/files/test-spreadsheet.xls"
|
20
|
+
f=open(test_sheet,"rb")
|
21
|
+
xml = spreadsheet_to_xml(f)
|
22
|
+
validate_against_schema(xml)
|
23
|
+
end
|
24
|
+
|
25
|
+
def test_failure
|
26
|
+
test_sheet = File.dirname(__FILE__) + "/files/not-a-spreadsheet.xls"
|
27
|
+
f=open(test_sheet,"rb")
|
28
|
+
assert_raise SysMODB::SpreadsheetExtractionException do
|
29
|
+
spreadsheet_to_xml(f)
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
def validate_against_schema xml
|
34
|
+
document = LibXML::XML::Document.string(xml)
|
35
|
+
schema = LibXML::XML::Schema.new(SCHEMA_FILE_PATH)
|
36
|
+
begin
|
37
|
+
document.validate_schema(schema)
|
38
|
+
rescue LibXML::XML::Error => e
|
39
|
+
puts xml
|
40
|
+
assert false,"Error validating against schema: #{e.message}"
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
|
45
|
+
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: simple-spreadsheet-extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 21
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 3
|
9
|
-
- 2
|
10
|
-
version: 0.3.2
|
9
|
+
- 3
|
10
|
+
version: 0.3.3
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Stuart Owen
|
@@ -15,23 +15,23 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-07-
|
18
|
+
date: 2010-07-28 00:00:00 +01:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
22
|
-
name:
|
22
|
+
name: POpen4
|
23
23
|
prerelease: false
|
24
24
|
requirement: &id001 !ruby/object:Gem::Requirement
|
25
25
|
none: false
|
26
26
|
requirements:
|
27
27
|
- - "="
|
28
28
|
- !ruby/object:Gem::Version
|
29
|
-
hash:
|
29
|
+
hash: 19
|
30
30
|
segments:
|
31
|
-
- 1
|
32
31
|
- 0
|
33
32
|
- 1
|
34
|
-
|
33
|
+
- 4
|
34
|
+
version: 0.1.4
|
35
35
|
type: :runtime
|
36
36
|
version_requirements: *id001
|
37
37
|
description: Takes a stream to a spreadsheet file and produces and XML representation of its contents
|
@@ -41,25 +41,31 @@ executables: []
|
|
41
41
|
extensions: []
|
42
42
|
|
43
43
|
extra_rdoc_files:
|
44
|
-
- README.rdoc
|
45
44
|
- LICENCE
|
45
|
+
- README.rdoc
|
46
46
|
files:
|
47
|
-
-
|
48
|
-
-
|
49
|
-
-
|
50
|
-
-
|
47
|
+
- .gitignore
|
48
|
+
- LICENCE
|
49
|
+
- README.rdoc
|
50
|
+
- Rakefile
|
51
|
+
- VERSION
|
52
|
+
- doc/schema-v1.xsd
|
51
53
|
- jars/lib/dom4j-1.6.1.jar
|
54
|
+
- jars/lib/poi-3.6.jar
|
52
55
|
- jars/lib/poi-ooxml-3.6.jar
|
56
|
+
- jars/lib/poi-ooxml-schemas-3.6.jar
|
57
|
+
- jars/lib/xmlbeans-2.3.0.jar
|
53
58
|
- jars/simple-spreadsheet-extractor-0.3.2.jar
|
54
|
-
-
|
55
|
-
-
|
59
|
+
- lib/simple-spreadsheet-extractor.rb
|
60
|
+
- simple-spreadsheet-extractor.gemspec
|
61
|
+
- test/test_extraction.rb
|
56
62
|
has_rdoc: true
|
57
63
|
homepage: http://github.com/myGrid/simple-spreadsheet-extractor-gem
|
58
64
|
licenses: []
|
59
65
|
|
60
66
|
post_install_message:
|
61
|
-
rdoc_options:
|
62
|
-
|
67
|
+
rdoc_options:
|
68
|
+
- --charset=UTF-8
|
63
69
|
require_paths:
|
64
70
|
- lib
|
65
71
|
required_ruby_version: !ruby/object:Gem::Requirement
|
@@ -87,5 +93,5 @@ rubygems_version: 1.3.7
|
|
87
93
|
signing_key:
|
88
94
|
specification_version: 3
|
89
95
|
summary: Basic spreadsheet content extraction using Apache POI
|
90
|
-
test_files:
|
91
|
-
|
96
|
+
test_files:
|
97
|
+
- test/test_extraction.rb
|