poi2csv 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 3f419a351e1981e983019c5d6c94be41e8aa77d7
4
+ data.tar.gz: d469b9326fae229ffd4d460f5b9eff199e2455d7
5
+ SHA512:
6
+ metadata.gz: c8ee1be9dce6a77176aab26eae45577a93378fe6cb80b00ecb1fec2de915969dba906c0c267a8628350d52f59141b405d11babcb482a775c7c9a02feac86971b
7
+ data.tar.gz: adcd33315bd73bca8543afcb523c3547b206a914412847e18912e4ee1581a5cb98c094092924a7cbb6b1f6efb360b602fd4dab09b86397aa931998185bcfaf2c
data/.DS_Store ADDED
Binary file
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in poi2csv.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Douglas English
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # Poi2csv
2
+
3
+ This gem wraps the Apache POI library (http://poi.apache.org/) to convert Excel (.xls and .xlsx) files to CSV.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'poi2csv'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install poi2csv
18
+
19
+ ## Usage
20
+
21
+ Poi2csv::to_csv(input_file_path, output_folder_path, separator=nil, formating_convention=nil)
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
data/build.xml ADDED
@@ -0,0 +1,70 @@
1
+ <?xml version="1.0" encoding="UTF-8" standalone="no"?>
2
+ <project basedir="." default="build" name="poi2csv">
3
+ <property environment="env"/>
4
+ <property name="ECLIPSE_HOME" value="../../../../Applications/eclipse"/>
5
+ <property name="debuglevel" value="source,lines,vars"/>
6
+ <property name="target" value="1.6"/>
7
+ <property name="source" value="1.6"/>
8
+ <path id="poi2csv.classpath">
9
+ <pathelement location="classes"/>
10
+ <pathelement location="lib/commons-codec-1.5.jar"/>
11
+ <pathelement location="lib/commons-logging-1.1.jar"/>
12
+ <pathelement location="lib/dom4j-1.6.1.jar"/>
13
+ <pathelement location="lib/junit-3.8.1.jar"/>
14
+ <pathelement location="lib/log4j-1.2.13.jar"/>
15
+ <pathelement location="lib/poi-3.9-20121203.jar"/>
16
+ <pathelement location="lib/poi-examples-3.9-20121203.jar"/>
17
+ <pathelement location="lib/poi-excelant-3.9-20121203.jar"/>
18
+ <pathelement location="lib/poi-ooxml-3.9-20121203.jar"/>
19
+ <pathelement location="lib/poi-ooxml-schemas-3.9-20121203.jar"/>
20
+ <pathelement location="lib/poi-scratchpad-3.9-20121203.jar"/>
21
+ <pathelement location="lib/stax-api-1.0.1.jar"/>
22
+ <pathelement location="lib/xmlbeans-2.3.0.jar"/>
23
+ </path>
24
+ <target name="init">
25
+ <mkdir dir="classes"/>
26
+ <copy includeemptydirs="false" todir="classes">
27
+ <fileset dir="src">
28
+ <exclude name="**/*.launch"/>
29
+ <exclude name="**/*.java"/>
30
+ </fileset>
31
+ </copy>
32
+ </target>
33
+ <target name="clean">
34
+ <delete dir="classes"/>
35
+ </target>
36
+ <target depends="clean" name="cleanall"/>
37
+ <target depends="build-subprojects,build-project" name="build"/>
38
+ <target name="build-subprojects"/>
39
+ <target depends="init" name="build-project">
40
+ <echo message="${ant.project.name}: ${ant.file}"/>
41
+ <javac debug="true" debuglevel="${debuglevel}" destdir="classes" includeantruntime="false" source="${source}" target="${target}">
42
+ <src path="src"/>
43
+ <classpath refid="poi2csv.classpath"/>
44
+ </javac>
45
+ </target>
46
+ <target description="Build all projects which reference this project. Useful to propagate changes." name="build-refprojects"/>
47
+ <target description="copy Eclipse compiler jars to ant lib directory" name="init-eclipse-compiler">
48
+ <copy todir="${ant.library.dir}">
49
+ <fileset dir="${ECLIPSE_HOME}/plugins" includes="org.eclipse.jdt.core_*.jar"/>
50
+ </copy>
51
+ <unzip dest="${ant.library.dir}">
52
+ <patternset includes="jdtCompilerAdapter.jar"/>
53
+ <fileset dir="${ECLIPSE_HOME}/plugins" includes="org.eclipse.jdt.core_*.jar"/>
54
+ </unzip>
55
+ </target>
56
+ <target description="compile project with Eclipse compiler" name="build-eclipse-compiler">
57
+ <property name="build.compiler" value="org.eclipse.jdt.core.JDTCompilerAdapter"/>
58
+ <antcall target="build"/>
59
+ </target>
60
+ <target name="ExtractExcelToCSV">
61
+ <java classname="ExtractExcelToCSV" failonerror="true" fork="yes">
62
+ <classpath refid="poi2csv.classpath"/>
63
+ </java>
64
+ </target>
65
+ <target name="ToCSV">
66
+ <java classname="ToCSV" failonerror="true" fork="yes">
67
+ <classpath refid="poi2csv.classpath"/>
68
+ </java>
69
+ </target>
70
+ </project>
Binary file
data/lib/.DS_Store ADDED
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -0,0 +1,3 @@
1
# Namespace shared by every file of the poi2csv gem.
module Poi2csv
  # Release number of the gem; the gemspec reads this constant.
  VERSION = "0.0.1"
end
data/lib/poi2csv.rb ADDED
@@ -0,0 +1,12 @@
1
+ require "poi2csv/version"
2
+
3
# Thin Ruby wrapper that runs the bundled Java ToCSV program in a child JVM.
module Poi2csv

  # Runs the Java ToCSV program on the given input and returns whatever the
  # java process wrote to standard output.
  #
  # The command is handed to the operating system as an argument vector and
  # never passes through a shell. Paths that contain spaces, and separators
  # such as ";" or "|", therefore reach the Java program unchanged instead of
  # being split or interpreted by the shell.
  #
  # input_file_path      - first argument passed to ToCSV.
  # output_folder_path   - second argument passed to ToCSV.
  # separator            - optional third argument; left out when nil or empty.
  # formating_convention - optional fourth argument; left out when nil or empty.
  #
  # Returns the standard output of the java process as a String.
  def self.to_csv(input_file_path, output_folder_path, separator=nil, formating_convention=nil)
    IO.popen(java_command(input_file_path, output_folder_path, separator, formating_convention), &:read)
  end

  # Builds the argument vector that to_csv executes.
  #
  # nil and empty optional arguments are dropped, which mirrors the previous
  # behaviour where an empty interpolation simply vanished from the command.
  #
  # Returns an Array of Strings starting with "java".
  def self.java_command(input_file_path, output_folder_path, separator=nil, formating_convention=nil)
    optional_arguments = [separator, formating_convention].map(&:to_s).reject(&:empty?)
    ["java", "-cp", classpath, "ToCSV", input_file_path.to_s, output_folder_path.to_s] + optional_arguments
  end

  # Classpath given to the JVM: every entry matched by "<this directory>/*"
  # (Java's classpath wildcard) followed by the sibling "classes" directory.
  # The value is computed once and memoised.
  #
  # Returns the classpath as a String joined with File::PATH_SEPARATOR.
  def self.classpath
    @_classpath ||= [
      File.expand_path(File.join(File.dirname(__FILE__), '*')),
      File.expand_path(File.join(File.dirname(__FILE__), '..', 'classes'))
    ].join(File::PATH_SEPARATOR)
  end
end
Binary file
Binary file
data/poi2csv.gemspec ADDED
@@ -0,0 +1,23 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'poi2csv/version'
5
+
6
Gem::Specification.new do |spec|
  spec.name          = "poi2csv"
  spec.version       = Poi2csv::VERSION
  spec.authors       = ["Douglas English"]
  spec.email         = ["douglas.english@gmail.com"]
  # RubyGems expects the summary to be the short one-liner and the description
  # to be the longer text; the two were previously the wrong way round.
  spec.summary       = %q{Converts Excel .xls and .xlsx files to CSV.}
  spec.description   = %q{This GEM provides a wrapper to the http://poi.apache.org/ library for converting Excel (.xls and .xlsx) files to CSV.}
  spec.homepage      = "https://github.com/denglish/poi2csv"
  spec.license       = "MIT"

  # Package every tracked file except macOS Finder metadata, which has no
  # business inside a released gem.
  spec.files         = `git ls-files`.split($/).reject { |f| File.basename(f) == ".DS_Store" }
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
end
data/src/ToCSV.java ADDED
@@ -0,0 +1,758 @@
1
+ /* ====================================================================
2
+ Licensed to the Apache Software Foundation (ASF) under one or more
3
+ contributor license agreements. See the NOTICE file distributed with
4
+ this work for additional information regarding copyright ownership.
5
+ The ASF licenses this file to You under the Apache License, Version 2.0
6
+ (the "License"); you may not use this file except in compliance with
7
+ the License. You may obtain a copy of the License at
8
+
9
+ http://www.apache.org/licenses/LICENSE-2.0
10
+
11
+ Unless required by applicable law or agreed to in writing, software
12
+ distributed under the License is distributed on an "AS IS" BASIS,
13
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ See the License for the specific language governing permissions and
15
+ limitations under the License.
16
+ ==================================================================== */
17
+
18
+ import org.apache.poi.ss.usermodel.WorkbookFactory;
19
+ import org.apache.poi.ss.usermodel.Workbook;
20
+ import org.apache.poi.ss.usermodel.Sheet;
21
+ import org.apache.poi.ss.usermodel.Row;
22
+ import org.apache.poi.ss.usermodel.Cell;
23
+ import org.apache.poi.ss.usermodel.DataFormatter;
24
+ import org.apache.poi.ss.usermodel.FormulaEvaluator;
25
+ import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
26
+
27
+ import java.io.File;
28
+ import java.io.FileInputStream;
29
+ import java.io.FileWriter;
30
+ import java.io.BufferedWriter;
31
+ import java.io.FilenameFilter;
32
+ import java.io.IOException;
33
+ import java.io.FileNotFoundException;
34
+ import java.util.ArrayList;
35
+
36
+ /**
37
+ * Demonstrates <em>one</em> way to convert an Excel spreadsheet into a CSV
38
+ * file. This class makes the following assumptions;
39
+ * <list>
40
+ * <li>1. Where the Excel workbook contains more than one worksheet, then a single
41
+ * CSV file will contain the data from all of the worksheets.</li>
42
+ * <li>2. The data matrix contained in the CSV file will be square. This means that
43
+ * the number of fields in each record of the CSV file will match the number
44
+ * of cells in the longest row found in the Excel workbook. Any short records
45
+ * will be 'padded' with empty fields - an empty field is represented in the
46
+ * CSV file in this way - ,,.</li>
47
+ * <li>3. Empty fields will represent missing cells.</li>
48
+ * <li>4. A record consisting of empty fields will be used to represent an empty row
49
+ * in the Excel workbook.</li>
50
+ * </list>
51
+ * Therefore, if the worksheet looked like this;
52
+ *
53
+ * <pre>
54
+ * ___________________________________________
55
+ * | | | | | |
56
+ * | A | B | C | D | E |
57
+ * ___|_______|_______|_______|_______|_______|
58
+ * | | | | | |
59
+ * 1 | 1 | 2 | 3 | 4 | 5 |
60
+ * ___|_______|_______|_______|_______|_______|
61
+ * | | | | | |
62
+ * 2 | | | | | |
63
+ * ___|_______|_______|_______|_______|_______|
64
+ * | | | | | |
65
+ * 3 | | A | | B | |
66
+ * ___|_______|_______|_______|_______|_______|
67
+ * | | | | | |
68
+ * 4 | | | | | Z |
69
+ * ___|_______|_______|_______|_______|_______|
70
+ * | | | | | |
71
+ * 5 | 1,400 | | 250 | | |
72
+ * ___|_______|_______|_______|_______|_______|
73
+ *
74
+ * </pre>
75
+ *
76
+ * Then, the resulting CSV file will contain the following lines (records);
77
+ * <pre>
78
+ * 1,2,3,4,5
79
+ * ,,,,
80
+ * ,A,,B,
81
+ * ,,,,Z
82
+ * "1,400",,250,,
83
+ * </pre><p>
84
+ * Typically, the comma is used to separate each of the fields that, together,
85
+ * constitute a single record or line within the CSV file. This is not however
86
+ * a hard and fast rule and so this class allows the user to determine which
87
+ * character is used as the field separator and assumes the comma if none other
88
+ * is specified.
89
+ * </p><p>
90
+ * If a field contains the separator then it will be escaped. If the file should
91
+ * obey Excel's CSV formatting rules, then the field will be surrounded with
92
+ * speech marks whilst if it should obey UNIX conventions, each occurrence of
93
+ * the separator will be preceded by the backslash character.
94
+ * </p><p>
95
+ * If a field contains an end of line (EOL) character then it too will be
96
+ * escaped. If the file should obey Excel's CSV formatting rules then the field
97
+ * will again be surrounded by speech marks. On the other hand, if the file
98
+ * should follow UNIX conventions then a single backslash will precede the
99
+ * EOL character. There is no single applicable standard for UNIX and some
100
+ * applications replace the CR with \r and the LF with \n but this class will
101
+ * not do so.
102
+ * </p><p>
103
+ * If the field contains double quotes then that character will be escaped. It
104
+ * seems as though UNIX does not define a standard for this whilst Excel does.
105
+ * Should the CSV file have to obey Excel's formatting rules then the speech
106
+ * mark character will be escaped with a second set of speech marks. Finally, an
107
+ * enclosing set of speech marks will also surround the entire field. Thus, if
108
+ * the following line of text appeared in a cell - "Hello" he said - it would
109
+ * look like this when converted into a field within a CSV file - """Hello"" he
110
+ * said".
111
+ * </p><p>
112
+ * Finally, it is worth noting that talk of CSV 'standards' is really slightly
113
+ * misleading as there is no such thing. It may well be that the code in this
114
+ * class has to be modified to produce files to suit a specific application
115
+ * or requirement.
116
+ * </p>
117
+ * @author Mark B
118
+ * @version 1.00 9th April 2010
119
+ * 1.10 13th April 2010 - Added support for processing all Excel
120
+ * workbooks in a folder along with the ability
121
+ * to specify a field separator character.
122
+ * 2.00 14th April 2010 - Added support for embedded characters; the
123
+ * field separator, EOL and double quotes or
124
+ * speech marks. In addition, gave the client
125
+ * the ability to select how these are handled,
126
+ * either obeying Excel's or UNIX formatting
127
+ * conventions.
128
+ */
129
+ public class ToCSV {
130
+
131
+ private Workbook workbook = null;
132
+ private ArrayList<ArrayList<String>> csvData = null;
133
+ private int maxRowWidth = 0;
134
+ private int formattingConvention = 0;
135
+ private DataFormatter formatter = null;
136
+ private FormulaEvaluator evaluator = null;
137
+ private String separator = null;
138
+
139
+ private static final String CSV_FILE_EXTENSION = ".csv";
140
+ private static final String DEFAULT_SEPARATOR = ",";
141
+
142
+ /**
143
+ * Identifies that the CSV file should obey Excel's formatting conventions
144
+ * with regard to escaping certain embedded characters - the field separator,
145
+ * speech mark and end of line (EOL) character
146
+ */
147
+ public static final int EXCEL_STYLE_ESCAPING = 0;
148
+
149
+ /**
150
+ * Identifies that the CSV file should obey UNIX formatting conventions
151
+ * with regard to escaping certain embedded characters - the field separator
152
+ * and end of line (EOL) character
153
+ */
154
+ public static final int UNIX_STYLE_ESCAPING = 1;
155
+
156
+ /**
157
+ * Process the contents of a folder, convert the contents of each Excel
158
+ * workbook into CSV format and save the resulting file to the specified
159
+ * folder using the same name as the original workbook with the .xls or
160
+ * .xlsx extension replaced by .csv. This method will ensure that the
161
+ * CSV file created contains the comma field separator and that embedded
162
+ * characters such as the field separator, the EOL and double quotes are
163
+ * escaped in accordance with Excel's convention.
164
+ *
165
+ * @param strSource An instance of the String class that encapsulates the
166
+ * name of and path to either a folder containing those Excel
167
+ * workbook(s) or the name of and path to an individual Excel workbook
168
+ * that is/are to be converted.
169
+ * @param strDestination An instance of the String class encapsulating the
170
+ * name of and path to a folder that will contain the resulting CSV
171
+ * files.
172
+ * @throws java.io.FileNotFoundException Thrown if any file cannot be located
173
+ * on the file system during processing.
174
+ * @throws java.io.IOException Thrown if the file system encounters any
175
+ * problems during processing.
176
+ * @throws java.lang.IllegalArgumentException Thrown if the values passed
177
+ * to the strSource parameter refers to a file or folder that does not
178
+ * exist or if the value passed to the strDestination parameter refers
179
+ * to a folder that does not exist or simply does not refer to a
180
+ * folder.
181
+ * @throws org.apache.poi.openxml4j.exceptions.InvalidFormatException Thrown
182
+ * if the xml markup encountered whilst parsing a SpreadsheetML
183
+ * file (.xlsx) is invalid.
184
+ */
185
+ public void convertExcelToCSV(String strSource, String strDestination)
186
+ throws FileNotFoundException, IOException,
187
+ IllegalArgumentException, InvalidFormatException {
188
+
189
+ // Simply chain the call to the overloaded convertExcelToCSV(String,
190
+ // String, String, int) method, pass the default separator and ensure
191
+ // that certain embedded characters are escaped in accordance with
192
+ // Excel's formatting conventions
193
+ this.convertExcelToCSV(strSource, strDestination,
194
+ ToCSV.DEFAULT_SEPARATOR, ToCSV.EXCEL_STYLE_ESCAPING);
195
+ }
196
+
197
+ /**
198
+ * Process the contents of a folder, convert the contents of each Excel
199
+ * workbook into CSV format and save the resulting file to the specified
200
+ * folder using the same name as the original workbook with the .xls or
201
+ * .xlsx extension replaced by .csv. This method allows the client to
202
+ * define the field separator but will ensure that embedded characters such
203
+ * as the field separator, the EOL and double quotes are escaped in
204
+ * accordance with Excel's convention.
205
+ *
206
+ * @param strSource An instance of the String class that encapsulates the
207
+ * name of and path to either a folder containing those Excel
208
+ * workbook(s) or the name of and path to an individual Excel workbook
209
+ * that is/are to be converted.
210
+ * @param strDestination An instance of the String class encapsulating the
211
+ * name of and path to a folder that will contain the resulting CSV
212
+ * files.
213
+ * @param separator An instance of the String class that encapsulates the
214
+ * character or characters the client wishes to use as the field
215
+ * separator.
216
+ * @throws java.io.FileNotFoundException Thrown if any file cannot be located
217
+ * on the file system during processing.
218
+ * @throws java.io.IOException Thrown if the file system encounters any
219
+ * problems during processing.
220
+ * @throws java.lang.IllegalArgumentException Thrown if the values passed
221
+ * to the strSource parameter refers to a file or folder that does not
222
+ * exist or if the value passed to the strDestination parameter refers
223
+ * to a folder that does not exist or simply does not refer to a
224
+ * folder.
225
+ * @throws org.apache.poi.openxml4j.exceptions.InvalidFormatException Thrown
226
+ * if the xml markup encountered whilst parsing a SpreadsheetML
227
+ * file (.xlsx) is invalid.
228
+ */
229
+ public void convertExcelToCSV(String strSource, String strDestination,
230
+ String separator)
231
+ throws FileNotFoundException, IOException,
232
+ IllegalArgumentException, InvalidFormatException {
233
+
234
+ // Simply chain the call to the overloaded convertExcelToCSV(String,
235
+ // String, String, int) method and ensure that certain embedded
236
+ // characters are escaped in accordance with Excel's formatting
237
+ // conventions
238
+ this.convertExcelToCSV(strSource, strDestination,
239
+ separator, ToCSV.EXCEL_STYLE_ESCAPING);
240
+ }
241
+
242
+ /**
243
+ * Process the contents of a folder, convert the contents of each Excel
244
+ * workbook into CSV format and save the resulting file to the specified
245
+ * folder using the same name as the original workbook with the .xls or
246
+ * .xlsx extension replaced by .csv
247
+ *
248
+ * @param strSource An instance of the String class that encapsulates the
249
+ * name of and path to either a folder containing those Excel
250
+ * workbook(s) or the name of and path to an individual Excel workbook
251
+ * that is/are to be converted.
252
+ * @param strDestination An instance of the String class encapsulating the name
253
+ * of and path to a folder that will contain the resulting CSV files.
254
+ * @param formattingConvention A primitive int whose value will determine
255
+ * whether certain embedded characters should be escaped in accordance
256
+ * with Excel's or UNIX formatting conventions. Two constants are
257
+ * defined to support this option; ToCSV.EXCEL_STYLE_ESCAPING and
258
+ * ToCSV.UNIX_STYLE_ESCAPING
259
+ * @param separator An instance of the String class encapsulating the
260
+ * characters or characters that should be used to separate items
261
+ * on a line within the CSV file.
262
+ * @throws java.io.FileNotFoundException Thrown if any file cannot be located
263
+ * on the file system during processing.
264
+ * @throws java.io.IOException Thrown if the file system encounters any
265
+ * problems during processing.
266
+ * @throws java.lang.IllegalArgumentException Thrown if the values passed
267
+ * to the strSource parameter refers to a file or folder that does not
268
+ * exist, if the value passed to the strDestination parameter refers
269
+ * to a folder that does not exist, if the value passed to the
270
+ * strDestination parameter does not refer to a folder or if the
271
+ * value passed to the formattingConvention parameter is other than
272
+ * one of the values defined by the constants ToCSV.EXCEL_STYLE_ESCAPING
273
+ * and ToCSV.UNIX_STYLE_ESCAPING.
274
+ * @throws org.apache.poi.openxml4j.exceptions.InvalidFormatException Thrown
275
+ * if the xml markup encountered whilst parsing a SpreadsheetML
276
+ * file (.xlsx) is invalid.
277
+ */
278
+ public void convertExcelToCSV(String strSource, String strDestination,
279
+ String separator, int formattingConvention)
280
+ throws FileNotFoundException, IOException,
281
+ IllegalArgumentException, InvalidFormatException {
282
+ File source = new File(strSource);
283
+ File destination = new File(strDestination);
284
+ File[] filesList = null;
285
+ String destinationFilename = null;
286
+
287
+ // Check that the source file/folder exists.
288
+ if(!source.exists()) {
289
+ throw new IllegalArgumentException("The source for the Excel " +
290
+ "file(s) cannot be found.");
291
+ }
292
+
293
+ // Ensure that the folder the user has chosen to save the CSV files
294
+ // away into firstly exists and secondly is a folder rather than, for
295
+ // instance, a data file.
296
+ if(!destination.exists()) {
297
+ throw new IllegalArgumentException("The folder/directory for the " +
298
+ "converted CSV file(s) does not exist.");
299
+ }
300
+ if(!destination.isDirectory()) {
301
+ throw new IllegalArgumentException("The destination for the CSV " +
302
+ "file(s) is not a directory/folder.");
303
+ }
304
+
305
+ // Ensure the value passed to the formattingConvention parameter is
306
+ // within range.
307
+ if(formattingConvention != ToCSV.EXCEL_STYLE_ESCAPING &&
308
+ formattingConvention != ToCSV.UNIX_STYLE_ESCAPING) {
309
+ throw new IllegalArgumentException("The value passed to the " +
310
+ "formattingConvention parameter is out of range.");
311
+ }
312
+
313
+ // Copy the separator character and formatting convention into local
314
+ // variables for use in other methods.
315
+ this.separator = separator;
316
+ this.formattingConvention = formattingConvention;
317
+
318
+ // Check to see if the sourceFolder variable holds a reference to
319
+ // a file or a folder full of files.
320
+ if(source.isDirectory()) {
321
+ // Get a list of all of the Excel spreadsheet files (workbooks) in
322
+ // the source folder/directory
323
+ filesList = source.listFiles(new ExcelFilenameFilter());
324
+ }
325
+ else {
326
+ // Assume that it must be a file handle - although there are other
327
+ // options the code should perhaps check - and store the reference
328
+ // into the filesList variable.
329
+ filesList = new File[]{source};
330
+ }
331
+
332
+ // Step through each of the files in the source folder and for each
333
+ // open the workbook, convert it's contents to CSV format and then
334
+ // save the resulting file away into the folder specified by the
335
+ // contents of the destination variable. Note that the name of the
336
+ // csv file will be created by taking the name of the Excel file,
337
+ // removing the extension and replacing it with .csv. Note that there
338
+ // is one drawback with this approach; if the folder holding the files
339
+ // contains two workbooks whose names match but one is a binary file
340
+ // (.xls) and the other a SpreadsheetML file (.xlsx), then the names
341
+ // for both CSV files will be identical and one CSV file will,
342
+ // therefore, over-write the other.
343
+ for(File excelFile : filesList) {
344
+ // Open the workbook
345
+ this.openWorkbook(excelFile);
346
+
347
+ // Convert it's contents into a CSV file
348
+ this.convertToCSV();
349
+
350
+ // Build the name of the csv folder from that of the Excel workbook.
351
+ // Simply replace the .xls or .xlsx file extension with .csv
352
+ destinationFilename = excelFile.getName();
353
+ destinationFilename = destinationFilename.substring(
354
+ 0, destinationFilename.lastIndexOf(".")) +
355
+ ToCSV.CSV_FILE_EXTENSION;
356
+
357
+ // Save the CSV file away using the newly constructed file name
358
+ // and to the specified directory.
359
+ this.saveCSVFile(new File(destination, destinationFilename));
360
+ }
361
+ }
362
+
363
+ /**
364
+ * Open an Excel workbook ready for conversion.
365
+ *
366
+ * @param file An instance of the File class that encapsulates a handle
367
+ * to a valid Excel workbook. Note that the workbook can be in
368
+ * either binary (.xls) or SpreadsheetML (.xlsx) format.
369
+ * @throws java.io.FileNotFoundException Thrown if the file cannot be located.
370
+ * @throws java.io.IOException Thrown if a problem occurs in the file system.
371
+ * @throws org.apache.poi.openxml4j.exceptions.InvalidFormatException Thrown
372
+ * if invalid xml is found whilst parsing an input SpreadsheetML
373
+ * file.
374
+ */
375
+ private void openWorkbook(File file) throws FileNotFoundException,
376
+ IOException, InvalidFormatException {
377
+ FileInputStream fis = null;
378
+ try {
379
+ System.out.println("Opening workbook [" + file.getName() + "]");
380
+
381
+ fis = new FileInputStream(file);
382
+
383
+ // Open the workbook and then create the FormulaEvaluator and
384
+ // DataFormatter instances that will be needed to, respectively,
385
+ // force evaluation of formulae found in cells and create a
386
+ // formatted String encapsulating the cells contents.
387
+ this.workbook = WorkbookFactory.create(fis);
388
+ this.evaluator = this.workbook.getCreationHelper().createFormulaEvaluator();
389
+ this.formatter = new DataFormatter(true);
390
+ }
391
+ finally {
392
+ if(fis != null) {
393
+ fis.close();
394
+ }
395
+ }
396
+ }
397
+
398
+ /**
399
+ * Called to convert the contents of the currently opened workbook into
400
+ * a CSV file.
401
+ */
402
+ private void convertToCSV() {
403
+ Sheet sheet = null;
404
+ Row row = null;
405
+ int lastRowNum = 0;
406
+ this.csvData = new ArrayList<ArrayList<String>>();
407
+
408
+ System.out.println("Converting files contents to CSV format.");
409
+
410
+ // Discover how many sheets there are in the workbook....
411
+ int numSheets = this.workbook.getNumberOfSheets();
412
+
413
+ // and then iterate through them.
414
+ for(int i = 0; i < numSheets; i++) {
415
+
416
+ // Get a reference to a sheet and check to see if it contains
417
+ // any rows.
418
+ sheet = this.workbook.getSheetAt(i);
419
+ if(sheet.getPhysicalNumberOfRows() > 0) {
420
+
421
+ // Note down the index number of the bottom-most row and
422
+ // then iterate through all of the rows on the sheet starting
423
+ // from the very first row - number 1 - even if it is missing.
424
+ // Recover a reference to the row and then call another method
425
+ // which will strip the data from the cells and build lines
426
+ // for inclusion in the resulting CSV file.
427
+ lastRowNum = sheet.getLastRowNum();
428
+ for(int j = 0; j <= lastRowNum; j++) {
429
+ row = sheet.getRow(j);
430
+ this.rowToCSV(row);
431
+ }
432
+ }
433
+ }
434
+ }
435
+
436
+ /**
437
+ * Called to actually save the data recovered from the Excel workbook
438
+ * as a CSV file.
439
+ *
440
+ * @param file An instance of the File class that encapsulates a handle
441
+ * referring to the CSV file.
442
+ * @throws java.io.FileNotFoundException Thrown if the file cannot be found.
443
+ * @throws java.io.IOException Thrown to indicate and error occurred in the
444
+ * underlying file system.
445
+ */
446
+ private void saveCSVFile(File file)
447
+ throws FileNotFoundException, IOException {
448
+ FileWriter fw = null;
449
+ BufferedWriter bw = null;
450
+ ArrayList<String> line = null;
451
+ StringBuffer buffer = null;
452
+ String csvLineElement = null;
453
+ try {
454
+
455
+ System.out.println("Saving the CSV file [" + file.getName() + "]");
456
+
457
+ // Open a writer onto the CSV file.
458
+ fw = new FileWriter(file);
459
+ bw = new BufferedWriter(fw);
460
+
461
+ // Step through the elements of the ArrayList that was used to hold
462
+ // all of the data recovered from the Excel workbooks' sheets, rows
463
+ // and cells.
464
+ for(int i = 0; i < this.csvData.size(); i++) {
465
+ buffer = new StringBuffer();
466
+
467
+ // Get an element from the ArrayList that contains the data for
468
+ // the workbook. This element will itself be an ArrayList
469
+ // containing Strings and each String will hold the data recovered
470
+ // from a single cell. The for() loop is used to recover elements
471
+ // from this 'row' ArrayList one at a time and to write the Strings
472
+ // away to a StringBuffer thus assembling a single line for inclusion
473
+ // in the CSV file. If a row was empty or if it was short, then
474
+ // the ArrayList that contains it's data will also be shorter than
475
+ // some of the others. Therefore, it is necessary to check within
476
+ // the for loop to ensure that the ArrayList contains data to be
477
+ // processed. If it does, then an element will be recovered and
478
+ // appended to the StringBuffer.
479
+ line = this.csvData.get(i);
480
+ for(int j = 0; j < this.maxRowWidth; j++) {
481
+ if(line.size() > j) {
482
+ csvLineElement = line.get(j);
483
+ if(csvLineElement != null) {
484
+ buffer.append(this.escapeEmbeddedCharacters(
485
+ csvLineElement));
486
+ }
487
+ }
488
+ if(j < (this.maxRowWidth - 1)) {
489
+ buffer.append(this.separator);
490
+ }
491
+ }
492
+
493
+ // Once the line is built, write it away to the CSV file.
494
+ bw.write(buffer.toString().trim());
495
+
496
+ // Condition the inclusion of new line characters so as to
497
+ // avoid an additional, superfluous, new line at the end of
498
+ // the file.
499
+ if(i < (this.csvData.size() - 1)) {
500
+ bw.newLine();
501
+ }
502
+ }
503
+ }
504
+ finally {
505
+ if(bw != null) {
506
+ bw.flush();
507
+ bw.close();
508
+ }
509
+ }
510
+ }
511
+
512
+ /**
513
+ * Called to convert a row of cells into a line of data that can later be
514
+ * output to the CSV file.
515
+ *
516
+ * @param row An instance of either the HSSFRow or XSSFRow classes that
517
+ * encapsulates information about a row of cells recovered from
518
+ * an Excel workbook.
519
+ */
520
+ private void rowToCSV(Row row) {
521
+ Cell cell = null;
522
+ int lastCellNum = 0;
523
+ ArrayList<String> csvLine = new ArrayList<String>();
524
+
525
+ // Check to ensure that a row was recovered from the sheet as it is
526
+ // possible that one or more rows between other populated rows could be
527
+ // missing - blank. If the row does contain cells then...
528
+ if(row != null) {
529
+
530
+ // Get the index for the right most cell on the row and then
531
+ // step along the row from left to right recovering the contents
532
+ // of each cell, converting that into a formatted String and
533
+ // then storing the String into the csvLine ArrayList.
534
+ lastCellNum = row.getLastCellNum();
535
+ for(int i = 0; i <= lastCellNum; i++) {
536
+ cell = row.getCell(i);
537
+ if(cell == null) {
538
+ csvLine.add("");
539
+ }
540
+ else {
541
+ if(cell.getCellType() != Cell.CELL_TYPE_FORMULA) {
542
+ csvLine.add(this.formatter.formatCellValue(cell));
543
+ }
544
+ else {
545
+ csvLine.add(this.formatter.formatCellValue(cell, this.evaluator));
546
+ }
547
+ }
548
+ }
549
+ // Make a note of the index number of the right most cell. This value
550
+ // will later be used to ensure that the matrix of data in the CSV file
551
+ // is square.
552
+ if(lastCellNum > this.maxRowWidth) {
553
+ this.maxRowWidth = lastCellNum;
554
+ }
555
+ }
556
+ this.csvData.add(csvLine);
557
+ }
558
+
559
+ /**
560
+ * Checks to see whether the field - which consists of the formatted
561
+ * contents of an Excel worksheet cell encapsulated within a String - contains
562
+ * any embedded characters that must be escaped. The method is able to
563
+ * comply with either Excel's or UNIX formatting conventions in the
564
+ * following manner;
565
+ *
566
+ * With regard to UNIX conventions, if the field contains any embedded
567
+ * field separator or EOL characters they will each be escaped by prefixing
568
+ * a leading backslash character. These are the only changes that have yet
569
+ * emerged following some research as being required.
570
+ *
571
+ * Excel has other embedded character escaping requirements, some that emerged
572
+ * from empirical testing, others through research. Firstly, with regards to
573
+ * any embedded speech marks ("), each occurrence should be escaped with
574
+ * another speech mark and the whole field then surrounded with speech marks.
575
+ * Thus if a field holds <em>"Hello" he said</em> then it should be modified
576
+ * to appear as <em>"""Hello"" he said"</em>. Furthermore, if the field
577
+ * contains either embedded separator or EOL characters, it should also
578
+ * be surrounded with speech marks. As a result <em>1,400</em> would become
579
+ * <em>"1,400"</em> assuming that the comma is the required field separator.
580
+ * This has one consequence: if a field contains embedded speech marks
581
+ * and embedded separator characters, checks for both are not required as the
582
+ * additional set of speech marks that should be placed around any field
583
+ * containing embedded speech marks will also account for the embedded
584
+ * separator.
585
+ *
586
+ * It is worth making one further note with regard to embedded EOL
587
+ * characters. If the data in a worksheet is exported as a CSV file using
588
+ * Excel itself, then the field will be surrounded with speech marks. If the
589
+ * resulting CSV file is then re-imported into another worksheet, the EOL
590
+ * character will result in the original single field occupying more than
591
+ * one cell. This same 'feature' is replicated in this class's behaviour.
592
+ *
593
+ * @param field An instance of the String class encapsulating the formatted
594
+ * contents of a cell on an Excel worksheet.
595
+ * @return A String that encapsulates the formatted contents of that
596
+ * Excel worksheet cell but with any embedded separator, EOL or
597
+ * speech mark characters correctly escaped.
598
+ */
599
+ private String escapeEmbeddedCharacters(String field) {
600
+ StringBuffer buffer = null;
601
+
602
+ // If the fields contents should be formatted to confirm with Excel's
603
+ // convention....
604
+ if(this.formattingConvention == ToCSV.EXCEL_STYLE_ESCAPING) {
605
+
606
+ // Firstly, check if there are any speech marks (") in the field;
607
+ // each occurrence must be escaped with another set of speech marks
608
+ // and then the entire field should be enclosed within another
609
+ // set of speech marks. Thus, "Yes" he said would become
610
+ // """Yes"" he said"
611
+ if(field.contains("\"")) {
612
+ buffer = new StringBuffer(field.replaceAll("\"", "\\\"\\\""));
613
+ buffer.insert(0, "\"");
614
+ buffer.append("\"");
615
+ }
616
+ else {
617
+ // If the field contains either embedded separator or EOL
618
+ // characters, then escape the whole field by surrounding it
619
+ // with speech marks.
620
+ buffer = new StringBuffer(field);
621
+ if((buffer.indexOf(this.separator)) > -1 ||
622
+ (buffer.indexOf("\n")) > -1) {
623
+ buffer.insert(0, "\"");
624
+ buffer.append("\"");
625
+ }
626
+ }
627
+ return(buffer.toString().trim());
628
+ }
629
+ // The only other formatting convention this class obeys is the UNIX one
630
+ // where any occurrence of the field separator or EOL character will
631
+ // be escaped by preceding it with a backslash.
632
+ else {
633
+ if(field.contains(this.separator)) {
634
+ field = field.replaceAll(this.separator, ("\\\\" + this.separator));
635
+ }
636
+ if(field.contains("\n")) {
637
+ field = field.replaceAll("\n", "\\\\\n");
638
+ }
639
+ return(field);
640
+ }
641
+ }
642
+
643
+ /**
644
+ * The main() method contains code that demonstrates how to use the class.
645
+ *
646
+ * @param args An array containing zero, one or more elements all of type
647
+ * String. Each element will encapsulate an argument specified by the
648
+ * user when running the program from the command prompt.
649
+ */
650
+ public static void main(String[] args) {
651
+ // Check the number of arguments passed to the main method. There
652
+ // must be two, three or four; the name of and path to either the folder
653
+ // containing the Excel files or an individual Excel workbook that is/are
654
+ // to be converted, the name of and path to the folder to which the CSV
655
+ // files should be written, - optionally - the separator character
656
+ // that should be used to separate individual items (fields) on the
657
+ // lines (records) of the CSV file and - again optionally - an integer
658
+ // that indicates whether the CSV file ought to obey Excel's or UNIX
659
+ // conventions with regard to formatting fields that contain embedded
660
+ // separator, Speech mark or EOL character(s).
661
+ //
662
+ // Note that the names of the CSV files will be derived from those
663
+ // of the Excel file(s). Put simply the .xls or .xlsx extension will be
664
+ // replaced with .csv. Therefore, if the source folder contains files
665
+ // with matching names but different extensions - Test.xls and Test.xlsx
666
+ // for example - then the CSV file generated from one will overwrite
667
+ // that generated from the other.
668
+ ToCSV converter = null;
669
+ try {
670
+ converter = new ToCSV();
671
+ if(args.length == 2) {
672
+ // Just the Source File/Folder and Destination Folder were
673
+ // passed to the main method.
674
+ converter.convertExcelToCSV(args[0], args[1]);
675
+ }
676
+ else if(args.length == 3){
677
+ // The Source File/Folder, Destination Folder and Separator
678
+ // were passed to the main method.
679
+ converter.convertExcelToCSV(args[0], args[1], args[2]);
680
+ }
681
+ else if(args.length == 4) {
682
+ // The Source File/Folder, Destination Folder, Separator and
683
+ // Formatting Convention were passed to the main method.
684
+ converter.convertExcelToCSV(args[0], args[1],
685
+ args[2], Integer.parseInt(args[3]));
686
+ }
687
+ else {
688
+ // None or more than four parameters were passed so display
689
+ //a Usage message.
690
+ System.out.println("Usage: java ToCSV [Source File/Folder] " +
691
+ "[Destination Folder] [Separator] [Formatting Convention]\n" +
692
+ "\tSource File/Folder\tThis argument should contain the name of and\n" +
693
+ "\t\t\t\tpath to either a single Excel workbook or a\n" +
694
+ "\t\t\t\tfolder containing one or more Excel workbooks.\n" +
695
+ "\tDestination Folder\tThe name of and path to the folder that the\n" +
696
+ "\t\t\t\tCSV files should be written out into. The\n" +
697
+ "\t\t\t\tfolder must exist before running the ToCSV\n" +
698
+ "\t\t\t\tcode as it will not check for or create it.\n" +
699
+ "\tSeparator\t\tOptional. The character or characters that\n" +
700
+ "\t\t\t\tshould be used to separate fields in the CSV\n" +
701
+ "\t\t\t\trecord. If no value is passed then the comma\n" +
702
+ "\t\t\t\twill be assumed.\n" +
703
+ "\tFormatting Convention\tOptional. This argument can take one of two\n" +
704
+ "\t\t\t\tvalues. Passing 0 (zero) will result in a CSV\n" +
705
+ "\t\t\t\tfile that obeys Excel's formatting conventions\n" +
706
+ "\t\t\t\twhilst passing 1 (one) will result in a file\n" +
707
+ "\t\t\t\tthat obeys UNIX formatting conventions. If no\n" +
708
+ "\t\t\t\tvalue is passed, then the CSV file produced\n" +
709
+ "\t\t\t\twill obey Excel's formatting conventions.");
710
+ }
711
+ }
712
+ // It is not wise to have such a wide catch clause - Exception is very
713
+ // close to being at the top of the inheritance hierarchy - though it
714
+ // will suffice for this example as it is really not possible to recover
715
+ // easily from an exceptional set of circumstances at this point in the
716
+ // program. It should however, ideally be replaced with one or more
717
+ // catch clauses optimised to handle more specific problems.
718
+ catch(Exception ex) {
719
+ System.out.println("Caught an: " + ex.getClass().getName());
720
+ System.out.println("Message: " + ex.getMessage());
721
+ System.out.println("Stacktrace follows:.....");
722
+ ex.printStackTrace(System.out);
723
+ }
724
+ }
725
+
726
+ /**
727
+ * An instance of this class can be used to control the files returned
728
+ * by a call to the listFiles() method when made on an instance of the
729
+ * File class and that object refers to a folder/directory
730
+ */
731
+ class ExcelFilenameFilter implements FilenameFilter {
732
+
733
+ /**
734
+ * Determine those files that will be returned by a call to the
735
+ * listFiles() method. In this case, the name of the file must end with
736
+ * either of the following two extension; '.xls' or '.xlsx'. For the
737
+ * future, it is very possible to parameterise this and allow the
738
+ * containing class to pass, for example, an array of Strings to this
739
+ * class on instantiation. Each element in that array could encapsulate
740
+ * a valid file extension - '.xls', '.xlsx', '.xlt', '.xlst', etc. These
741
+ * could then be used to control which files were returned by the call
742
+ * to the listFiles() method.
743
+ *
744
+ * @param file An instance of the File class that encapsulates a handle
745
+ * referring to the folder/directory that contains the file.
746
+ * @param name An instance of the String class that encapsulates the
747
+ * name of the file.
748
+ * @return A boolean value that indicates whether the file should be
749
+ * included in the array returned by the call to the listFiles()
750
+ * method. In this case true will be returned if the name of the
751
+ * file ends with either '.xls' or '.xlsx' and false will be
752
+ * returned in all other instances.
753
+ */
754
+ public boolean accept(File file, String name) {
755
+ return(name.endsWith(".xls") || name.endsWith(".xlsx"));
756
+ }
757
+ }
758
+ }
metadata ADDED
@@ -0,0 +1,100 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: poi2csv
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Douglas English
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-07-25 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ! '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ! '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: Converts Excel .xls and .xlsx files to CSV.
42
+ email:
43
+ - douglas.english@gmail.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - .DS_Store
49
+ - .gitignore
50
+ - Gemfile
51
+ - LICENSE.txt
52
+ - README.md
53
+ - Rakefile
54
+ - build.xml
55
+ - classes/ToCSV$ExcelFilenameFilter.class
56
+ - classes/ToCSV.class
57
+ - lib/.DS_Store
58
+ - lib/commons-codec-1.5.jar
59
+ - lib/commons-logging-1.1.jar
60
+ - lib/dom4j-1.6.1.jar
61
+ - lib/junit-3.8.1.jar
62
+ - lib/log4j-1.2.13.jar
63
+ - lib/poi-3.9-20121203.jar
64
+ - lib/poi-examples-3.9-20121203.jar
65
+ - lib/poi-excelant-3.9-20121203.jar
66
+ - lib/poi-ooxml-3.9-20121203.jar
67
+ - lib/poi-ooxml-schemas-3.9-20121203.jar
68
+ - lib/poi-scratchpad-3.9-20121203.jar
69
+ - lib/poi2csv.rb
70
+ - lib/poi2csv/version.rb
71
+ - lib/stax-api-1.0.1.jar
72
+ - lib/xmlbeans-2.3.0.jar
73
+ - poi2csv.gemspec
74
+ - src/ToCSV.java
75
+ homepage: https://github.com/denglish/poi2csv
76
+ licenses:
77
+ - MIT
78
+ metadata: {}
79
+ post_install_message:
80
+ rdoc_options: []
81
+ require_paths:
82
+ - lib
83
+ required_ruby_version: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - ! '>='
86
+ - !ruby/object:Gem::Version
87
+ version: '0'
88
+ required_rubygems_version: !ruby/object:Gem::Requirement
89
+ requirements:
90
+ - - ! '>='
91
+ - !ruby/object:Gem::Version
92
+ version: '0'
93
+ requirements: []
94
+ rubyforge_project:
95
+ rubygems_version: 2.0.3
96
+ signing_key:
97
+ specification_version: 4
98
+ summary: This GEM provides a wrapper to the http://poi.apache.org/ library for converting
99
+ Excel (.xls and .xlsx) files to CSV.
100
+ test_files: []