xls-split 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.md +13 -0
- data/README.md +77 -0
- data/bin/xls-split +111 -0
- data/lib/xls-split/version.rb +3 -0
- metadata +62 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 9762885e78b2e3f27485a4493df8031f6b998ec6
|
4
|
+
data.tar.gz: bdc067b5a95852f50df5612d5638cbd8988a0c50
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 0feb5bcabc0fb7dd40ebd27f66a07b4bed76c5957edbf0b8429f1870e6ce752ed3e6064cf1f577f6a9245972efb64f8e7fc68178af9e622820ef68e65b361109
|
7
|
+
data.tar.gz: c8752b0484cea5595671836bfda8bf2319aae9ac379861339353e8771e584093d527ebb928f56d0e4da78f017f4432cc4f643ed64d4fd3668b7312a39e974180
|
data/LICENSE.md
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
2
|
+
you may not use this file except in compliance with the License.
|
3
|
+
|
4
|
+
You may obtain a copy of the License at
|
5
|
+
|
6
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
7
|
+
|
8
|
+
Unless required by applicable law or agreed to in writing,
|
9
|
+
software distributed under the License is distributed on an "AS IS" BASIS,
|
10
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
11
|
+
|
12
|
+
See the License for the specific language governing permissions and limitations
|
13
|
+
under the License.
|
data/README.md
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
A simple ruby command-line tool that allows a spreadsheet to be split into a number of
|
2
|
+
separate CSV files, one file per worksheet.
|
3
|
+
|
4
|
+
Lots of (UK) Government data is published in this way, often with hidden worksheets. This
|
5
|
+
tool makes it simple to split all of the worksheets out into separate files for easier
|
6
|
+
processing, e.g. using tools like Google Refine.
|
7
|
+
|
8
|
+
USAGE
|
9
|
+
-----
|
10
|
+
|
11
|
+
xls-split [opts] [file]
|
12
|
+
|
13
|
+
For example, if you have a spreadsheet containing two worksheets, Table1 and Table2, then the following
|
14
|
+
command will split that into two CSV files: /tmp/my-data-Table1.csv and /tmp/my-data-Table2.csv
|
15
|
+
|
16
|
+
xls-split -d /tmp -b my-data spreadsheet.xls
|
17
|
+
|
18
|
+
There are command-line options available to control location and naming of generated files,
|
19
|
+
as well as the ability to only extract specific worksheets, based on a regex match.
|
20
|
+
|
21
|
+
INSTALLATION
|
22
|
+
------------
|
23
|
+
|
24
|
+
Install as a gem:
|
25
|
+
|
26
|
+
gem install xls-split
|
27
|
+
|
28
|
+
Or, grab the source from github and run:
|
29
|
+
|
30
|
+
rake install
|
31
|
+
|
32
|
+
OPTIONS
|
33
|
+
-------
|
34
|
+
|
35
|
+
--help , -h
|
36
|
+
show this message
|
37
|
+
|
38
|
+
--verbose , -v
|
39
|
+
verbose progress reporting
|
40
|
+
|
41
|
+
--encoding , -e
|
42
|
+
set the encoding of the spreadsheet. Default is UTF-8
|
43
|
+
|
44
|
+
--skip , -s
|
45
|
+
number of rows in each worksheet to skip before writing data
|
46
|
+
|
47
|
+
--skipfooter , -f
|
48
|
+
number of rows to skip at the end of each worksheet
|
49
|
+
|
50
|
+
--dir , -d
|
51
|
+
output directory into which CSV files will be written
|
52
|
+
|
53
|
+
--base , -b
|
54
|
+
set a base file name for generated CSV files. Worksheet name will be appended
|
55
|
+
|
56
|
+
--match , -m
|
57
|
+
regular expression used to match worksheet names. Only matching sheets will be split out
|
58
|
+
|
59
|
+
--tidy , -t
|
60
|
+
tidy up worksheet names. Converts them to lower case and replaces spaces with hyphens
|
61
|
+
|
62
|
+
LICENSE
|
63
|
+
--------
|
64
|
+
|
65
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
66
|
+
you may not use this file except in compliance with the License.
|
67
|
+
|
68
|
+
You may obtain a copy of the License at
|
69
|
+
|
70
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
71
|
+
|
72
|
+
Unless required by applicable law or agreed to in writing,
|
73
|
+
software distributed under the License is distributed on an "AS IS" BASIS,
|
74
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
75
|
+
|
76
|
+
See the License for the specific language governing permissions and limitations
|
77
|
+
under the License.
|
data/bin/xls-split
ADDED
@@ -0,0 +1,111 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "rubygems"
|
4
|
+
require 'getoptlong'
|
5
|
+
require "spreadsheet"
|
6
|
+
require "csv"
|
7
|
+
|
8
|
+
# Help text printed for --help. The option list mirrors the flags that are
# registered with GetoptLong further down in this script: the tidy flag is
# spelled --tidy (not --tidy-names), and --skipfooter is included.
USAGE = <<-EOL
SUMMARY

  xls-split [opts] file.xls

DESCRIPTION

  Split a spreadsheet into separate CSV files, one file per worksheet.

  Makes it easier to process complex, multi-sheet spreadsheets in tools like Google Refine

OPTIONS

  --help , -h
    show this message
  --verbose , -v
    verbose progress reporting
  --encoding , -e
    set the encoding of the spreadsheet. Default is UTF-8
  --skip , -s
    number of rows in each worksheet to skip before writing data
  --skipfooter , -f
    number of rows to skip at the end of each worksheet
  --dir , -d
    output directory into which CSV files will be written
  --base , -b
    set a base file name for generated CSV files. Worksheet name will be appended
  --match , -m
    regular expression used to match worksheet names. Only matches sheets will be split out
  --tidy , -t
    tidy up worksheet names. Lower case and replaces spaces with hyphens
EOL
|
38
|
+
|
39
|
+
# Command-line flags understood by the script: long name, short name and
# whether the flag takes a value.
option_specs = [
  ["--help",       "-h", GetoptLong::NO_ARGUMENT],
  ["--encoding",   "-e", GetoptLong::REQUIRED_ARGUMENT],
  ["--skip",       "-s", GetoptLong::REQUIRED_ARGUMENT],
  ["--dir",        "-d", GetoptLong::REQUIRED_ARGUMENT],
  ["--base",       "-b", GetoptLong::REQUIRED_ARGUMENT],
  ["--match",      "-m", GetoptLong::REQUIRED_ARGUMENT],
  ["--tidy",       "-t", GetoptLong::NO_ARGUMENT],
  ["--verbose",    "-v", GetoptLong::NO_ARGUMENT],
  ["--skipfooter", "-f", GetoptLong::REQUIRED_ARGUMENT]
]

# Collect the parsed options into a hash keyed by the long option name with
# its hyphens removed (e.g. "--skipfooter" becomes "skipfooter").
opts = {}
GetoptLong.new(*option_specs).each do |flag, value|
  opts[flag.delete('-')] = value
end

# --help prints the usage text on standard output and ends the run.
if opts["help"]
  $stdout.write(USAGE)
  exit 0
end
|
55
|
+
|
56
|
+
# Writes +msg+ to standard error, but only when the options hash carries a
# truthy "verbose" entry; otherwise nothing is printed.
def log(opts, msg)
  return unless opts["verbose"]

  $stderr.puts msg
end
|
59
|
+
|
60
|
+
# Use the encoding given with --encoding, falling back to UTF-8 when the
# option was not supplied.
Spreadsheet.client_encoding = opts["encoding"] || 'UTF-8'

log(opts, "Worksheet encoding set to #{Spreadsheet.client_encoding}")
|
67
|
+
|
68
|
+
# Row counts to drop from the top and bottom of every worksheet. A missing
# option is nil, and nil.to_i is 0, so both default to zero.
skip = opts["skip"].to_i
skipfooter = opts["skipfooter"].to_i

log(opts, "Skipping #{skip} rows in each worksheet")
log(opts, "Skipping #{skipfooter} rows at end of each worksheet")

# Prefix for generated file names and the directory they are written to.
base = opts["base"] || ""
dir = opts["dir"] || "."
|
76
|
+
|
77
|
+
# Stop with a clear message when no spreadsheet path was supplied, instead of
# passing nil to Spreadsheet.open.
abort "xls-split: no spreadsheet file given (use --help for usage)" if ARGV.empty?

book = Spreadsheet.open ARGV[0]

# Write every worksheet (or only those whose name matches --match) to its own
# CSV file named "<base>-<worksheet name>.csv" inside the output directory.
book.worksheets.each do |worksheet|
  unless opts["match"].nil? || worksheet.name.match(opts["match"])
    log(opts, "Ignoring unmatched worksheet: #{worksheet.name}")
    next
  end

  name = worksheet.name
  # --tidy lower-cases the worksheet name and replaces spaces with hyphens.
  name = name.downcase.gsub(" ", "-") if opts["tidy"]
  filename = File.join(dir, "#{base}-#{name}.csv")

  log(opts, "Writing worksheet #{worksheet.name} to #{filename}")

  # Per the spreadsheet gem's documentation, dimensions[1] is the index of the
  # first unused row (one past the last used row). Subtracting skipfooter
  # gives the first row index that must NOT be written, so the comparison
  # below is strict; using <= here would keep one footer row too many.
  first_excluded_row = worksheet.dimensions[1] - skipfooter

  CSV.open(filename, "w") do |writer|
    worksheet.each(skip) do |row|
      writer << row if row.idx < first_excluded_row
    end
  end
end
|
111
|
+
|
metadata
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: xls-split
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '0.2'
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Leigh Dodds
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-01-13 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: spreadsheet
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.6.4.1
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.6.4.1
|
27
|
+
description: Tools for splitting Excel files into CSV files
|
28
|
+
email:
|
29
|
+
- leigh@ldodds.com
|
30
|
+
executables:
|
31
|
+
- xls-split
|
32
|
+
extensions: []
|
33
|
+
extra_rdoc_files: []
|
34
|
+
files:
|
35
|
+
- LICENSE.md
|
36
|
+
- README.md
|
37
|
+
- bin/xls-split
|
38
|
+
- lib/xls-split/version.rb
|
39
|
+
homepage: http://github.com/ldodds/xls-split
|
40
|
+
licenses: []
|
41
|
+
metadata: {}
|
42
|
+
post_install_message:
|
43
|
+
rdoc_options: []
|
44
|
+
require_paths:
|
45
|
+
- lib
|
46
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
47
|
+
requirements:
|
48
|
+
- - '>='
|
49
|
+
- !ruby/object:Gem::Version
|
50
|
+
version: '0'
|
51
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
52
|
+
requirements:
|
53
|
+
- - '>='
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: '0'
|
56
|
+
requirements: []
|
57
|
+
rubyforge_project:
|
58
|
+
rubygems_version: 2.2.1
|
59
|
+
signing_key:
|
60
|
+
specification_version: 4
|
61
|
+
summary: Extract worksheets from XLS files into CSV files
|
62
|
+
test_files: []
|