xls-split 0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 9762885e78b2e3f27485a4493df8031f6b998ec6
4
+ data.tar.gz: bdc067b5a95852f50df5612d5638cbd8988a0c50
5
+ SHA512:
6
+ metadata.gz: 0feb5bcabc0fb7dd40ebd27f66a07b4bed76c5957edbf0b8429f1870e6ce752ed3e6064cf1f577f6a9245972efb64f8e7fc68178af9e622820ef68e65b361109
7
+ data.tar.gz: c8752b0484cea5595671836bfda8bf2319aae9ac379861339353e8771e584093d527ebb928f56d0e4da78f017f4432cc4f643ed64d4fd3668b7312a39e974180
@@ -0,0 +1,13 @@
1
+ Licensed under the Apache License, Version 2.0 (the "License");
2
+ you may not use this file except in compliance with the License.
3
+
4
+ You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing,
9
+ software distributed under the License is distributed on an "AS IS" BASIS,
10
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+
12
+ See the License for the specific language governing permissions and limitations
13
+ under the License.
@@ -0,0 +1,77 @@
1
+ A simple Ruby command-line tool that allows a spreadsheet to be split into a number of
2
+ separate CSV files, one file per worksheet.
3
+
4
+ Lots of (UK) Government data is published in this way, often with hidden worksheets. This
5
+ tool makes it simple to split all of the worksheets out into separate files for easier
6
+ processing, e.g. using tools like Google Refine.
7
+
8
+ USAGE
9
+ -----
10
+
11
+ xls-split [opts] [file]
12
+
13
+ For example, if you have a spreadsheet containing two worksheets, Table1 and Table2, then the following
14
+ command will split that into two CSV files: /tmp/my-data-Table1.csv and /tmp/my-data-Table2.csv
15
+
16
+ xls-split -d /tmp -b my-data spreadsheet.xls
17
+
18
+ There are command-line options available to control location and naming of generated files,
19
+ as well as the ability to only extract specific worksheets, based on a regex match.
20
+
21
+ INSTALLATION
22
+ ------------
23
+
24
+ Install as a gem:
25
+
26
+ gem install xls-split
27
+
28
+ Or, grab the source from github and run:
29
+
30
+ rake install
31
+
32
+ OPTIONS
33
+ -------
34
+
35
+ --help , -h
36
+ show this message
37
+
38
+ --verbose , -v
39
+ verbose progress reporting
40
+
41
+ --encoding , -e
42
+ set the encoding of the spreadsheet. Default is UTF-8
43
+
44
+ --skip , -s
45
+ number of rows in each worksheet to skip before writing data
46
+
47
+ --skipfooter , -f
48
+ number of rows to skip at the end of each worksheet
49
+
50
+ --dir , -d
51
+ output directory into which CSV files will be written
52
+
53
+ --base , -b
54
+ set a base file name for generated CSV files. Worksheet name will be appended
55
+
56
+ --match , -m
57
+ regular expression used to match worksheet names. Only matching sheets will be split out
58
+
59
+ --tidy , -t
60
+ tidy up worksheet names. Lower-cases them and replaces spaces with hyphens
61
+
62
+ LICENSE
63
+ --------
64
+
65
+ Licensed under the Apache License, Version 2.0 (the "License");
66
+ you may not use this file except in compliance with the License.
67
+
68
+ You may obtain a copy of the License at
69
+
70
+ http://www.apache.org/licenses/LICENSE-2.0
71
+
72
+ Unless required by applicable law or agreed to in writing,
73
+ software distributed under the License is distributed on an "AS IS" BASIS,
74
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
75
+
76
+ See the License for the specific language governing permissions and limitations
77
+ under the License.
@@ -0,0 +1,111 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "rubygems"
4
+ require 'getoptlong'
5
+ require "spreadsheet"
6
+ require "csv"
7
+
8
# Help text shown for --help. Every entry here should correspond to an option
# the command-line parser actually accepts: the long flag is spelled "--tidy"
# (not "--tidy-names"), and "--skipfooter" is a supported option.
USAGE = <<-EOL
SUMMARY

  xls-split [opts] file.xls

DESCRIPTION

  Split a spreadsheet into separate CSV files, one file per worksheet.

  Makes it easier to process complex, multi-sheet spreadsheets in tools like Google Refine

OPTIONS

  --help , -h
     show this message
  --verbose , -v
     verbose progress reporting
  --encoding , -e
     set the encoding of the spreadsheet. Default is UTF-8
  --skip , -s
     number of rows in each worksheet to skip before writing data
  --skipfooter , -f
     number of rows to skip at the end of each worksheet
  --dir , -d
     output directory into which CSV files will be written
  --base , -b
     set a base file name for generated CSV files. Worksheet name will be appended
  --match , -m
     regular expression used to match worksheet names. Only matching sheets will be split out
  --tidy , -t
     tidy up worksheet names. Lower-cases them and replaces spaces with hyphens
EOL
38
+
39
# Parses the command line (ARGV) with GetoptLong and returns a Hash keyed by
# the canonical long option name with every hyphen removed
# (e.g. "--skipfooter" => "skipfooter"). Options that were not supplied are
# absent from the Hash, so looking them up yields nil.
def parse_options
  parser = GetoptLong.new(
    [ "--help",       "-h", GetoptLong::NO_ARGUMENT ],
    [ "--encoding",   "-e", GetoptLong::REQUIRED_ARGUMENT ],
    [ "--skip",       "-s", GetoptLong::REQUIRED_ARGUMENT ],
    [ "--dir",        "-d", GetoptLong::REQUIRED_ARGUMENT ],
    [ "--base",       "-b", GetoptLong::REQUIRED_ARGUMENT ],
    [ "--match",      "-m", GetoptLong::REQUIRED_ARGUMENT ],
    # The documentation spells this flag "--tidy-names"; accept that spelling
    # as an alias. The first element is the canonical name, so both forms are
    # reported as "--tidy" and end up under the "tidy" key.
    [ "--tidy", "--tidy-names", "-t", GetoptLong::NO_ARGUMENT ],
    [ "--verbose",    "-v", GetoptLong::NO_ARGUMENT ],
    [ "--skipfooter", "-f", GetoptLong::REQUIRED_ARGUMENT ]
  )

  # GetoptLong#each yields (name, argument) pairs; enum_for packs each pair
  # into an array so it can be destructured while building the Hash.
  parser.enum_for.each_with_object({}) do |(name, argument), parsed|
    parsed[name.delete("-")] = argument
  end
end

opts = parse_options

# --help short-circuits everything else: print the usage text and stop.
if opts["help"]
  USAGE.display
  exit(0)
end
55
+
56
# Progress reporting helper: writes +msg+ to standard error, but only when the
# "verbose" entry of the +opts+ hash is set. Otherwise it does nothing.
def log(opts, msg)
  return unless opts["verbose"]

  $stderr.puts msg
end
59
+
60
# Main body of the script: configure the spreadsheet encoding, work out how many
# leading/trailing rows to drop, then write each (matching) worksheet of the
# workbook named by the first remaining command-line argument to its own CSV file.

Spreadsheet.client_encoding = opts["encoding"] || 'UTF-8'

log(opts, "Worksheet encoding set to #{Spreadsheet.client_encoding}")

# nil.to_i is 0, so an option that was not given simply means "skip nothing".
skip = opts["skip"].to_i
skipfooter = opts["skipfooter"].to_i

log(opts, "Skipping #{skip} rows in each worksheet")
log(opts, "Skipping #{skipfooter} rows at end of each worksheet")

base = opts["base"] || ""
dir = opts["dir"] || "."

# Without a file argument Spreadsheet.open would be handed nil and fail with an
# unhelpful exception; stop early with a readable message instead.
abort "xls-split: no spreadsheet file given (run with --help for usage)" if ARGV[0].nil?

book = Spreadsheet.open ARGV[0]

book.worksheets.each do |worksheet|
  # When --match was given, only worksheets whose name matches it are exported.
  if opts["match"] && !worksheet.name.match(opts["match"])
    log(opts, "Ignoring unmatched worksheet: #{worksheet.name}")
    next
  end

  name = worksheet.name
  name = name.downcase.gsub(" ", "-") if opts["tidy"]
  # NOTE(review): with no --base this yields a file name with a leading hyphen
  # ("-Sheet1.csv"); left unchanged so existing output names stay stable.
  filename = File.join(dir, "#{base}-#{name}.csv")

  log(opts, "Writing worksheet #{worksheet.name} to #{filename}")

  # Per the spreadsheet gem's Worksheet#dimensions documentation, element 1 is
  # the index of the first *unused* row, i.e. one past the last row holding
  # data. Rows are therefore kept while their index is strictly below the
  # limit; a "<=" comparison would keep one footer row too many.
  limit = worksheet.dimensions[1] - skipfooter

  CSV.open(filename, "w") do |writer|
    worksheet.each(skip) do |row|
      writer << row if row.idx < limit
    end
  end
end
111
+
@@ -0,0 +1,3 @@
1
# Namespace for the xls-split gem.
module XLSSplit
  # Version string of this release of the gem.
  VERSION = '0.2'
end
metadata ADDED
@@ -0,0 +1,62 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: xls-split
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.2'
5
+ platform: ruby
6
+ authors:
7
+ - Leigh Dodds
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-01-13 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: spreadsheet
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: 0.6.4.1
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: 0.6.4.1
27
+ description: Tools for splitting Excel files into CSV files
28
+ email:
29
+ - leigh@ldodds.com
30
+ executables:
31
+ - xls-split
32
+ extensions: []
33
+ extra_rdoc_files: []
34
+ files:
35
+ - LICENSE.md
36
+ - README.md
37
+ - bin/xls-split
38
+ - lib/xls-split/version.rb
39
+ homepage: http://github.com/ldodds/xls-split
40
+ licenses: []
41
+ metadata: {}
42
+ post_install_message:
43
+ rdoc_options: []
44
+ require_paths:
45
+ - lib
46
+ required_ruby_version: !ruby/object:Gem::Requirement
47
+ requirements:
48
+ - - '>='
49
+ - !ruby/object:Gem::Version
50
+ version: '0'
51
+ required_rubygems_version: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - '>='
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
56
+ requirements: []
57
+ rubyforge_project:
58
+ rubygems_version: 2.2.1
59
+ signing_key:
60
+ specification_version: 4
61
+ summary: Extract worksheets from XLS files into CSV files
62
+ test_files: []