poi2csv 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 3f419a351e1981e983019c5d6c94be41e8aa77d7
4
+ data.tar.gz: d469b9326fae229ffd4d460f5b9eff199e2455d7
5
+ SHA512:
6
+ metadata.gz: c8ee1be9dce6a77176aab26eae45577a93378fe6cb80b00ecb1fec2de915969dba906c0c267a8628350d52f59141b405d11babcb482a775c7c9a02feac86971b
7
+ data.tar.gz: adcd33315bd73bca8543afcb523c3547b206a914412847e18912e4ee1581a5cb98c094092924a7cbb6b1f6efb360b602fd4dab09b86397aa931998185bcfaf2c
data/.DS_Store ADDED
Binary file
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in poi2csv.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Douglas English
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # Poi2csv
2
+
3
+ This GEM provides a wrapper to the http://poi.apache.org/ library for converting Excel (.xls and .xlsx) files to CSV.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'poi2csv'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install poi2csv
18
+
19
+ ## Usage
20
+
21
+ Poi2csv::to_csv(input_file_path, output_folder_path, separator=nil, formating_convention=nil)
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
data/build.xml ADDED
@@ -0,0 +1,70 @@
1
+ <?xml version="1.0" encoding="UTF-8" standalone="no"?>
2
+ <project basedir="." default="build" name="poi2csv">
3
+ <property environment="env"/>
4
+ <property name="ECLIPSE_HOME" value="../../../../Applications/eclipse"/>
5
+ <property name="debuglevel" value="source,lines,vars"/>
6
+ <property name="target" value="1.6"/>
7
+ <property name="source" value="1.6"/>
8
+ <path id="poi2csv.classpath">
9
+ <pathelement location="classes"/>
10
+ <pathelement location="lib/commons-codec-1.5.jar"/>
11
+ <pathelement location="lib/commons-logging-1.1.jar"/>
12
+ <pathelement location="lib/dom4j-1.6.1.jar"/>
13
+ <pathelement location="lib/junit-3.8.1.jar"/>
14
+ <pathelement location="lib/log4j-1.2.13.jar"/>
15
+ <pathelement location="lib/poi-3.9-20121203.jar"/>
16
+ <pathelement location="lib/poi-examples-3.9-20121203.jar"/>
17
+ <pathelement location="lib/poi-excelant-3.9-20121203.jar"/>
18
+ <pathelement location="lib/poi-ooxml-3.9-20121203.jar"/>
19
+ <pathelement location="lib/poi-ooxml-schemas-3.9-20121203.jar"/>
20
+ <pathelement location="lib/poi-scratchpad-3.9-20121203.jar"/>
21
+ <pathelement location="lib/stax-api-1.0.1.jar"/>
22
+ <pathelement location="lib/xmlbeans-2.3.0.jar"/>
23
+ </path>
24
+ <target name="init">
25
+ <mkdir dir="classes"/>
26
+ <copy includeemptydirs="false" todir="classes">
27
+ <fileset dir="src">
28
+ <exclude name="**/*.launch"/>
29
+ <exclude name="**/*.java"/>
30
+ </fileset>
31
+ </copy>
32
+ </target>
33
+ <target name="clean">
34
+ <delete dir="classes"/>
35
+ </target>
36
+ <target depends="clean" name="cleanall"/>
37
+ <target depends="build-subprojects,build-project" name="build"/>
38
+ <target name="build-subprojects"/>
39
+ <target depends="init" name="build-project">
40
+ <echo message="${ant.project.name}: ${ant.file}"/>
41
+ <javac debug="true" debuglevel="${debuglevel}" destdir="classes" includeantruntime="false" source="${source}" target="${target}">
42
+ <src path="src"/>
43
+ <classpath refid="poi2csv.classpath"/>
44
+ </javac>
45
+ </target>
46
+ <target description="Build all projects which reference this project. Useful to propagate changes." name="build-refprojects"/>
47
+ <target description="copy Eclipse compiler jars to ant lib directory" name="init-eclipse-compiler">
48
+ <copy todir="${ant.library.dir}">
49
+ <fileset dir="${ECLIPSE_HOME}/plugins" includes="org.eclipse.jdt.core_*.jar"/>
50
+ </copy>
51
+ <unzip dest="${ant.library.dir}">
52
+ <patternset includes="jdtCompilerAdapter.jar"/>
53
+ <fileset dir="${ECLIPSE_HOME}/plugins" includes="org.eclipse.jdt.core_*.jar"/>
54
+ </unzip>
55
+ </target>
56
+ <target description="compile project with Eclipse compiler" name="build-eclipse-compiler">
57
+ <property name="build.compiler" value="org.eclipse.jdt.core.JDTCompilerAdapter"/>
58
+ <antcall target="build"/>
59
+ </target>
60
+ <target name="ExtractExcelToCSV">
61
+ <java classname="ExtractExcelToCSV" failonerror="true" fork="yes">
62
+ <classpath refid="poi2csv.classpath"/>
63
+ </java>
64
+ </target>
65
+ <target name="ToCSV">
66
+ <java classname="ToCSV" failonerror="true" fork="yes">
67
+ <classpath refid="poi2csv.classpath"/>
68
+ </java>
69
+ </target>
70
+ </project>
Binary file
data/lib/.DS_Store ADDED
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -0,0 +1,3 @@
1
+ module Poi2csv # Namespace of the poi2csv gem.
2
+ VERSION = "0.0.1" # Release version of the gem; poi2csv.gemspec assigns it to spec.version.
3
+ end
data/lib/poi2csv.rb ADDED
@@ -0,0 +1,12 @@
1
+ require "poi2csv/version"
2
+
3
+ module Poi2csv
4
+
5
+ def self.to_csv(input_file_path, output_folder_path, separator=nil, formating_convention=nil)
6
+ `java -cp #{classpath} ToCSV #{input_file_path} #{output_folder_path} #{separator} #{formating_convention}`
7
+ end
8
+
9
+ def self.classpath
10
+ @_classpath ||= File.expand_path(File.join(File.dirname(__FILE__),'*')) + File::PATH_SEPARATOR + File.expand_path(File.join(File.dirname(__FILE__),'..', 'classes'))
11
+ end
12
+ end
Binary file
Binary file
data/poi2csv.gemspec ADDED
@@ -0,0 +1,23 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'poi2csv/version'
5
+
6
+ Gem::Specification.new do |spec| # Package metadata for the poi2csv gem.
7
+ spec.name = "poi2csv"
8
+ spec.version = Poi2csv::VERSION
9
+ spec.authors = ["Douglas English"]
10
+ spec.email = ["douglas.english@gmail.com"]
11
+ spec.description = %q{Converts Excel .xls and .xlsx files to CSV.}
12
+ spec.summary = %q{This GEM provides a wrapper to the http://poi.apache.org/ library for converting Excel (.xls and .xlsx) files to CSV.}
13
+ spec.homepage = "https://github.com/denglish/poi2csv"
14
+ spec.license = "MIT"
15
+ # The file list is whatever `git ls-files` prints, so the gem has to be built from a git checkout; executables and test files are then picked out of that list by path prefix.
16
+ spec.files = `git ls-files`.split($/)
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib"]
20
+ # Development-only dependencies (declared via add_development_dependency).
21
+ spec.add_development_dependency "bundler", "~> 1.3"
22
+ spec.add_development_dependency "rake"
23
+ end
data/src/ToCSV.java ADDED
@@ -0,0 +1,758 @@
1
+ /* ====================================================================
2
+ Licensed to the Apache Software Foundation (ASF) under one or more
3
+ contributor license agreements. See the NOTICE file distributed with
4
+ this work for additional information regarding copyright ownership.
5
+ The ASF licenses this file to You under the Apache License, Version 2.0
6
+ (the "License"); you may not use this file except in compliance with
7
+ the License. You may obtain a copy of the License at
8
+
9
+ http://www.apache.org/licenses/LICENSE-2.0
10
+
11
+ Unless required by applicable law or agreed to in writing, software
12
+ distributed under the License is distributed on an "AS IS" BASIS,
13
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ See the License for the specific language governing permissions and
15
+ limitations under the License.
16
+ ==================================================================== */
17
+
18
+ import org.apache.poi.ss.usermodel.WorkbookFactory;
19
+ import org.apache.poi.ss.usermodel.Workbook;
20
+ import org.apache.poi.ss.usermodel.Sheet;
21
+ import org.apache.poi.ss.usermodel.Row;
22
+ import org.apache.poi.ss.usermodel.Cell;
23
+ import org.apache.poi.ss.usermodel.DataFormatter;
24
+ import org.apache.poi.ss.usermodel.FormulaEvaluator;
25
+ import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
26
+
27
+ import java.io.File;
28
+ import java.io.FileInputStream;
29
+ import java.io.FileWriter;
30
+ import java.io.BufferedWriter;
31
+ import java.io.FilenameFilter;
32
+ import java.io.IOException;
33
+ import java.io.FileNotFoundException;
34
+ import java.util.ArrayList;
35
+
36
+ /**
37
+ * Demonstrates <em>one</em> way to convert an Excel spreadsheet into a CSV
38
+ * file. This class makes the following assumptions;
39
+ * <list>
40
+ * <li>1. Where the Excel workbook contains more than one worksheet, then a single
41
+ * CSV file will contain the data from all of the worksheets.</li>
42
+ * <li>2. The data matrix contained in the CSV file will be square. This means that
43
+ * the number of fields in each record of the CSV file will match the number
44
+ * of cells in the longest row found in the Excel workbook. Any short records
45
+ * will be 'padded' with empty fields - an empty field is represented in the
46
+ * CSV file in this way - ,,.</li>
47
+ * <li>3. Empty fields will represent missing cells.</li>
48
+ * <li>4. A record consisting of empty fields will be used to represent an empty row
49
+ * in the Excel workbook.</li>
50
+ * </list>
51
+ * Therefore, if the worksheet looked like this;
52
+ *
53
+ * <pre>
54
+ * ___________________________________________
55
+ * | | | | | |
56
+ * | A | B | C | D | E |
57
+ * ___|_______|_______|_______|_______|_______|
58
+ * | | | | | |
59
+ * 1 | 1 | 2 | 3 | 4 | 5 |
60
+ * ___|_______|_______|_______|_______|_______|
61
+ * | | | | | |
62
+ * 2 | | | | | |
63
+ * ___|_______|_______|_______|_______|_______|
64
+ * | | | | | |
65
+ * 3 | | A | | B | |
66
+ * ___|_______|_______|_______|_______|_______|
67
+ * | | | | | |
68
+ * 4 | | | | | Z |
69
+ * ___|_______|_______|_______|_______|_______|
70
+ * | | | | | |
71
+ * 5 | 1,400 | | 250 | | |
72
+ * ___|_______|_______|_______|_______|_______|
73
+ *
74
+ * </pre>
75
+ *
76
+ * Then, the resulting CSV file will contain the following lines (records);
77
+ * <pre>
78
+ * 1,2,3,4,5
79
+ * ,,,,
80
+ * ,A,,B,
81
+ * ,,,,Z
82
+ * "1,400",,250,,
83
+ * </pre><p>
84
+ * Typically, the comma is used to separate each of the fields that, together,
85
+ * constitute a single record or line within the CSV file. This is not however
86
+ * a hard and fast rule and so this class allows the user to determine which
87
+ * character is used as the field separator and assumes the comma if none other
88
+ * is specified.
89
+ * </p><p>
90
+ * If a field contains the separator then it will be escaped. If the file should
91
+ * obey Excel's CSV formatting rules, then the field will be surrounded with
92
+ * speech marks whilst if it should obey UNIX conventions, each occurrence of
93
+ * the separator will be preceded by the backslash character.
94
+ * </p><p>
95
+ * If a field contains an end of line (EOL) character then it too will be
96
+ * escaped. If the file should obey Excel's CSV formatting rules then the field
97
+ * will again be surrounded by speech marks. On the other hand, if the file
98
+ * should follow UNIX conventions then a single backslash will precede the
99
+ * EOL character. There is no single applicable standard for UNIX and some
100
+ * applications replace the CR with \r and the LF with \n but this class will
101
+ * not do so.
102
+ * </p><p>
103
+ * If the field contains double quotes then that character will be escaped. It
104
+ * seems as though UNIX does not define a standard for this whilst Excel does.
105
+ * Should the CSV file have to obey Excel's formatting rules then the speech
106
+ * mark character will be escaped with a second set of speech marks. Finally, an
107
+ * enclosing set of speech marks will also surround the entire field. Thus, if
108
+ * the following line of text appeared in a cell - "Hello" he said - it would
109
+ * look like this when converted into a field within a CSV file - """Hello"" he
110
+ * said".
111
+ * </p><p>
112
+ * Finally, it is worth noting that talk of CSV 'standards' is really slightly
113
+ * misleading as there is no such thing. It may well be that the code in this
114
+ * class has to be modified to produce files to suit a specific application
115
+ * or requirement.
116
+ * </p>
117
+ * @author Mark B
118
+ * @version 1.00 9th April 2010
119
+ * 1.10 13th April 2010 - Added support for processing all Excel
120
+ * workbooks in a folder along with the ability
121
+ * to specify a field separator character.
122
+ * 2.00 14th April 2010 - Added support for embedded characters; the
123
+ * field separator, EOL and double quotes or
124
+ * speech marks. In addition, gave the client
125
+ * the ability to select how these are handled,
126
+ * either obeying Excel's or UNIX formatting
127
+ * conventions.
128
+ */
129
+ public class ToCSV {
130
+
131
+ private Workbook workbook = null;
132
+ private ArrayList<ArrayList<String>> csvData = null;
133
+ private int maxRowWidth = 0;
134
+ private int formattingConvention = 0;
135
+ private DataFormatter formatter = null;
136
+ private FormulaEvaluator evaluator = null;
137
+ private String separator = null;
138
+
139
+ private static final String CSV_FILE_EXTENSION = ".csv";
140
+ private static final String DEFAULT_SEPARATOR = ",";
141
+
142
+ /**
143
+ * Identifies that the CSV file should obey Excel's formatting conventions
144
+ * with regard to escaping certain embedded characters - the field separator,
145
+ * speech mark and end of line (EOL) character
146
+ */
147
+ public static final int EXCEL_STYLE_ESCAPING = 0;
148
+
149
+ /**
150
+ * Identifies that the CSV file should obey UNIX formatting conventions
151
+ * with regard to escaping certain embedded characters - the field separator
152
+ * and end of line (EOL) character
153
+ */
154
+ public static final int UNIX_STYLE_ESCAPING = 1;
155
+
156
+ /**
157
+ * Process the contents of a folder, convert the contents of each Excel
158
+ * workbook into CSV format and save the resulting file to the specified
159
+ * folder using the same name as the original workbook with the .xls or
160
+ * .xlsx extension replaced by .csv. This method will ensure that the
161
+ * CSV file created contains the comma field separator and that embedded
162
+ * characters such as the field separator, the EOL and double quotes are
163
+ * escaped in accordance with Excel's convention.
164
+ *
165
+ * @param strSource An instance of the String class that encapsulates the
166
+ * name of and path to either a folder containing those Excel
167
+ * workbook(s) or the name of and path to an individual Excel workbook
168
+ * that is/are to be converted.
169
+ * @param strDestination An instance of the String class encapsulating the
170
+ * name of and path to a folder that will contain the resulting CSV
171
+ * files.
172
+ * @throws java.io.FileNotFoundException Thrown if any file cannot be located
173
+ * on the file system during processing.
174
+ * @throws java.io.IOException Thrown if the file system encounters any
175
+ * problems during processing.
176
+ * @throws java.lang.IllegalArgumentException Thrown if the values passed
177
+ * to the strSource parameter refers to a file or folder that does not
178
+ * exist or if the value passed to the strDestination parameter refers
179
+ * to a folder that does not exist or simply does not refer to a
180
+ * folder.
181
+ * @throws org.apache.poi.openxml4j.exceptions.InvalidFormatException Thrown
182
+ * if the xml markup encountered whilst parsing a SpreadsheetML
183
+ * file (.xlsx) is invalid.
184
+ */
185
+ public void convertExcelToCSV(String strSource, String strDestination)
186
+ throws FileNotFoundException, IOException,
187
+ IllegalArgumentException, InvalidFormatException {
188
+ // Convenience overload: the caller supplies neither a separator nor a
189
+ // formatting convention, so fall back to the defaults - the comma and
190
+ // Excel style escaping - and let the four argument overload do the
191
+ // real work.
192
+ final String fieldSeparator = DEFAULT_SEPARATOR;
193
+ final int escapingStyle = EXCEL_STYLE_ESCAPING;
194
+ convertExcelToCSV(strSource, strDestination, fieldSeparator, escapingStyle);
195
+ }
196
+
197
+ /**
198
+ * Process the contents of a folder, convert the contents of each Excel
199
+ * workbook into CSV format and save the resulting file to the specified
200
+ * folder using the same name as the original workbook with the .xls or
201
+ * .xlsx extension replaced by .csv. This method allows the client to
202
+ * define the field separator but will ensure that embedded characters such
203
+ * as the field separator, the EOL and double quotes are escaped in
204
+ * accordance with Excel's convention.
205
+ *
206
+ * @param strSource An instance of the String class that encapsulates the
207
+ * name of and path to either a folder containing those Excel
208
+ * workbook(s) or the name of and path to an individual Excel workbook
209
+ * that is/are to be converted.
210
+ * @param strDestination An instance of the String class encapsulating the
211
+ * name of and path to a folder that will contain the resulting CSV
212
+ * files.
213
+ * @param separator An instance of the String class that encapsulates the
214
+ * character or characters the client wishes to use as the field
215
+ * separator.
216
+ * @throws java.io.FileNotFoundException Thrown if any file cannot be located
217
+ * on the file system during processing.
218
+ * @throws java.io.IOException Thrown if the file system encounters any
219
+ * problems during processing.
220
+ * @throws java.lang.IllegalArgumentException Thrown if the values passed
221
+ * to the strSource parameter refers to a file or folder that does not
222
+ * exist or if the value passed to the strDestination parameter refers
223
+ * to a folder that does not exist or simply does not refer to a
224
+ * folder.
225
+ * @throws org.apache.poi.openxml4j.exceptions.InvalidFormatException Thrown
226
+ * if the xml markup encountered whilst parsing a SpreadsheetML
227
+ * file (.xlsx) is invalid.
228
+ */
229
+ public void convertExcelToCSV(String strSource, String strDestination,
230
+ String separator)
231
+ throws FileNotFoundException, IOException,
232
+ IllegalArgumentException, InvalidFormatException {
233
+ // Convenience overload: the caller chooses the field separator but
234
+ // not the formatting convention, so Excel style escaping is assumed
235
+ // and everything else is left to the four argument overload.
236
+ final int escapingStyle = EXCEL_STYLE_ESCAPING;
237
+ convertExcelToCSV(strSource, strDestination, separator,
238
+ escapingStyle);
239
+ // Nothing further to do here; any failure propagates to the caller.
240
+ }
241
+
242
+ /**
243
+ * Process the contents of a folder, convert the contents of each Excel
244
+ * workbook into CSV format and save the resulting file to the specified
245
+ * folder using the same name as the original workbook with the .xls or
246
+ * .xlsx extension replaced by .csv
247
+ *
248
+ * @param strSource An instance of the String class that encapsulates the
249
+ * name of and path to either a folder containing those Excel
250
+ * workbook(s) or the name of and path to an individual Excel workbook
251
+ * that is/are to be converted.
252
+ * @param strDestination An instance of the String class encapsulating the name
253
+ * of and path to a folder that will contain the resulting CSV files.
254
+ * @param formattingConvention A primitive int whose value will determine
255
+ * whether certain embedded characters should be escaped in accordance
256
+ * with Excel's or UNIX formatting conventions. Two constants are
257
+ * defined to support this option; ToCSV.EXCEL_STYLE_ESCAPING and
258
+ * ToCSV.UNIX_STYLE_ESCAPING
259
+ * @param separator An instance of the String class encapsulating the
260
+ * characters or characters that should be used to separate items
261
+ * on a line within the CSV file.
262
+ * @throws java.io.FileNotFoundException Thrown if any file cannot be located
263
+ * on the file system during processing.
264
+ * @throws java.io.IOException Thrown if the file system encounters any
265
+ * problems during processing.
266
+ * @throws java.lang.IllegalArgumentException Thrown if the values passed
267
+ * to the strSource parameter refers to a file or folder that does not
268
+ * exist, if the value passed to the strDestination parameter refers
269
+ * to a folder that does not exist, if the value passed to the
270
+ * strDestination parameter does not refer to a folder or if the
271
+ * value passed to the formattingConvention parameter is other than
272
+ * one of the values defined by the constants ToCSV.EXCEL_STYLE_ESCAPING
273
+ * and ToCSV.UNIX_STYLE_ESCAPING.
274
+ * @throws org.apache.poi.openxml4j.exceptions.InvalidFormatException Thrown
275
+ * if the xml markup encountered whilst parsing a SpreadsheetML
276
+ * file (.xlsx) is invalid.
277
+ */
278
+ public void convertExcelToCSV(String strSource, String strDestination,
279
+ String separator, int formattingConvention)
280
+ throws FileNotFoundException, IOException,
281
+ IllegalArgumentException, InvalidFormatException {
282
+ File source = new File(strSource);
283
+ File destination = new File(strDestination);
284
+ File[] filesList = null;
285
+
286
+ // Check that the source file/folder exists.
287
+ if(!source.exists()) {
288
+ throw new IllegalArgumentException("The source for the Excel " +
289
+ "file(s) cannot be found.");
290
+ }
291
+
292
+ // Ensure that the folder the user has chosen to save the CSV files
293
+ // away into firstly exists and secondly is a folder rather than, for
294
+ // instance, a data file.
295
+ if(!destination.exists()) {
296
+ throw new IllegalArgumentException("The folder/directory for the " +
297
+ "converted CSV file(s) does not exist.");
298
+ }
299
+ if(!destination.isDirectory()) {
300
+ throw new IllegalArgumentException("The destination for the CSV " +
301
+ "file(s) is not a directory/folder.");
302
+ }
303
+
304
+ // Ensure the value passed to the formattingConvention parameter is
305
+ // within range.
306
+ if(formattingConvention != ToCSV.EXCEL_STYLE_ESCAPING &&
307
+ formattingConvention != ToCSV.UNIX_STYLE_ESCAPING) {
308
+ throw new IllegalArgumentException("The value passed to the " +
309
+ "formattingConvention parameter is out of range.");
310
+ }
311
+
312
+ // Copy the separator and formatting convention into instance
313
+ // variables for use in other methods. A null separator falls back to
314
+ // the default rather than being written out as the text "null".
315
+ this.separator = (separator != null) ? separator : ToCSV.DEFAULT_SEPARATOR;
316
+ this.formattingConvention = formattingConvention;
317
+
318
+ // Check to see if the source variable holds a reference to a file
319
+ // or to a folder full of files.
320
+ if(source.isDirectory()) {
321
+ // Get a list of all of the Excel spreadsheet files (workbooks) in
322
+ // the source folder/directory.
323
+ filesList = source.listFiles(new ExcelFilenameFilter());
324
+ // listFiles() returns null when the folder cannot be read.
325
+ if(filesList == null) {
326
+ throw new IOException("The folder holding the Excel " +
327
+ "file(s) could not be read.");
328
+ }
329
+ }
330
+ else {
331
+ // Assume that it must be a single file.
332
+ filesList = new File[]{source};
333
+ }
334
+
335
+ // Step through each of the files, convert its contents to CSV format
336
+ // and save the result into the destination folder. Note that one
337
+ // drawback of the naming scheme is that two workbooks whose names
338
+ // match but where one is a binary file (.xls) and the other a
339
+ // SpreadsheetML file (.xlsx) produce identical CSV file names and
340
+ // one CSV file will, therefore, over-write the other.
341
+ for(File excelFile : filesList) {
342
+ // Open the workbook and convert its contents into CSV data.
343
+ this.openWorkbook(excelFile);
344
+ this.convertToCSV();
345
+
346
+ // Build the name of the csv file from that of the Excel workbook
347
+ // by replacing the file extension with .csv. A name that has no
348
+ // extension at all simply has .csv appended to it.
349
+ String destinationFilename = excelFile.getName();
350
+ int extensionIndex = destinationFilename.lastIndexOf('.');
351
+ if(extensionIndex >= 0) {
352
+ destinationFilename = destinationFilename.substring(
353
+ 0, extensionIndex);
354
+ }
355
+ destinationFilename += ToCSV.CSV_FILE_EXTENSION;
356
+
357
+ // Save the CSV file away using the newly constructed file name
358
+ // and to the specified directory.
359
+ this.saveCSVFile(new File(destination, destinationFilename));
360
+ }
361
+ }
362
+
363
+ /**
364
+ * Open an Excel workbook ready for conversion.
365
+ *
366
+ * @param file An instance of the File class that encapsulates a handle
367
+ * to a valid Excel workbook. Note that the workbook can be in
368
+ * either binary (.xls) or SpreadsheetML (.xlsx) format.
369
+ * @throws java.io.FileNotFoundException Thrown if the file cannot be located.
370
+ * @throws java.io.IOException Thrown if a problem occurs in the file system.
371
+ * @throws org.apache.poi.openxml4j.exceptions.InvalidFormatException Thrown
372
+ * if invalid xml is found whilst parsing an input SpreadsheetML
373
+ * file.
374
+ */
375
+ private void openWorkbook(File file) throws FileNotFoundException,
376
+ IOException, InvalidFormatException {
377
+ FileInputStream fis = null;
378
+ try {
379
+ System.out.println("Opening workbook [" + file.getName() + "]");
380
+
381
+ fis = new FileInputStream(file);
382
+
383
+ // Open the workbook and then create the FormulaEvaluator and
384
+ // DataFormatter instances that will be needed to, respectively,
385
+ // force evaluation of formulae found in cells and create a
386
+ // formatted String encapsulating the cells contents.
387
+ this.workbook = WorkbookFactory.create(fis);
388
+ this.evaluator = this.workbook.getCreationHelper().createFormulaEvaluator();
389
+ this.formatter = new DataFormatter(true);
390
+ }
391
+ finally {
392
+ if(fis != null) {
393
+ fis.close();
394
+ }
395
+ }
396
+ }
397
+
398
+ /**
399
+ * Called to convert the contents of the currently opened workbook into
400
+ * a CSV file.
401
+ */
402
+ private void convertToCSV() {
403
+ Sheet sheet = null;
404
+ int lastRowNum = 0;
405
+
406
+ // Start from a clean slate. Resetting maxRowWidth as well as csvData
407
+ // matters when several workbooks are converted in turn; otherwise
408
+ // the widest row of an earlier workbook would pad every later file.
409
+ this.csvData = new ArrayList<ArrayList<String>>();
410
+ this.maxRowWidth = 0;
411
+
412
+ System.out.println("Converting files contents to CSV format.");
413
+
414
+ // Discover how many sheets there are in the workbook and then
415
+ // iterate through them.
416
+ int numSheets = this.workbook.getNumberOfSheets();
417
+ for(int i = 0; i < numSheets; i++) {
418
+
419
+ // Get a reference to a sheet and skip it if it has no rows.
420
+ sheet = this.workbook.getSheetAt(i);
421
+ if(sheet.getPhysicalNumberOfRows() > 0) {
422
+
423
+ // Walk every row index from the first up to the bottom-most
424
+ // one. A missing row comes back as null; rowToCSV() checks
425
+ // for that and records an empty line so that blank rows
426
+ // keep their place in the CSV file.
427
+ lastRowNum = sheet.getLastRowNum();
428
+ for(int j = 0; j <= lastRowNum; j++) {
429
+ this.rowToCSV(sheet.getRow(j));
430
+ }
431
+ }
432
+ }
433
+ // this.csvData now holds one list of Strings per row visited.
434
+ }
435
+
436
+ /**
437
+ * Called to actually save the data recovered from the Excel workbook
438
+ * as a CSV file.
439
+ *
440
+ * @param file An instance of the File class that encapsulates a handle
441
+ * referring to the CSV file.
442
+ * @throws java.io.FileNotFoundException Thrown if the file cannot be found.
443
+ * @throws java.io.IOException Thrown to indicate an error occurred in the
444
+ * underlying file system.
445
+ */
446
+ private void saveCSVFile(File file)
447
+ throws FileNotFoundException, IOException {
448
+ FileWriter fw = null;
449
+ BufferedWriter bw = null;
450
+ ArrayList<String> line = null;
451
+ StringBuilder buffer = null;
452
+ String csvLineElement = null;
453
+ try {
454
+
455
+ System.out.println("Saving the CSV file [" + file.getName() + "]");
456
+
457
+ // Open a writer onto the CSV file.
458
+ fw = new FileWriter(file);
459
+ bw = new BufferedWriter(fw);
460
+
461
+ // Step through the elements of the ArrayList that was used to hold
462
+ // all of the data recovered from the Excel workbooks' sheets, rows
463
+ // and cells.
464
+ for(int i = 0; i < this.csvData.size(); i++) {
465
+ buffer = new StringBuilder();
466
+
467
+ // Each element is itself an ArrayList of Strings, one String per
468
+ // cell. Every line is written with exactly maxRowWidth fields so
469
+ // that the data in the CSV file is square; a row that was empty
470
+ // or short has a shorter list, so check that the list actually
471
+ // holds an element before recovering it. A missing element
472
+ // simply produces an empty field.
473
+ line = this.csvData.get(i);
474
+ for(int j = 0; j < this.maxRowWidth; j++) {
475
+ if(line.size() > j) {
476
+ csvLineElement = line.get(j);
477
+ if(csvLineElement != null) {
478
+ buffer.append(this.escapeEmbeddedCharacters(
479
+ csvLineElement));
480
+ }
481
+ }
482
+ if(j < (this.maxRowWidth - 1)) {
483
+ buffer.append(this.separator);
484
+ }
485
+ }
486
+
487
+ // Once the line is built, write it away to the CSV file. The
488
+ // line must NOT be trimmed: trimming would strip leading or
489
+ // trailing white space that belongs to the first or last
490
+ // field and, with a white space separator such as the tab,
491
+ // would drop the trailing empty fields that keep the data
492
+ // square.
493
+ bw.write(buffer.toString());
494
+
495
+ // Condition the inclusion of new line characters so as to
496
+ // avoid an additional, superfluous, new line at the end of
497
+ // the file.
498
+ if(i < (this.csvData.size() - 1)) {
499
+ bw.newLine();
500
+ }
501
+ }
502
+ }
503
+ finally {
504
+ // Closing the BufferedWriter also closes the FileWriter it wraps.
505
+ if(bw != null) {
506
+ bw.flush();
507
+ bw.close();
508
+ }
509
+ }
510
+ }
511
+
512
+ /**
513
+ * Called to convert a row of cells into a line of data that can later be
514
+ * output to the CSV file.
515
+ *
516
+ * @param row An instance of either the HSSFRow or XSSFRow classes that
517
+ * encapsulates information about a row of cells recovered from
518
+ * an Excel workbook.
519
+ */
520
+ private void rowToCSV(Row row) {
521
+ Cell cell = null;
522
+ int lastCellNum = 0;
523
+ ArrayList<String> csvLine = new ArrayList<String>();
524
+
525
+ // Check to ensure that a row was recovered from the sheet as it is
526
+ // possible that one or more rows between other populated rows could be
527
+ // missing - blank. If the row does contain cells then...
528
+ if(row != null) {
529
+
530
+ // Get the index for the right most cell on the row and then
531
+ // step along the row from left to right recovering the contents
532
+ // of each cell, converting that into a formatted String and
533
+ // then storing the String into the csvLine ArrayList.
534
+ lastCellNum = row.getLastCellNum();
535
+ for(int i = 0; i <= lastCellNum; i++) {
536
+ cell = row.getCell(i);
537
+ if(cell == null) {
538
+ csvLine.add("");
539
+ }
540
+ else {
541
+ if(cell.getCellType() != Cell.CELL_TYPE_FORMULA) {
542
+ csvLine.add(this.formatter.formatCellValue(cell));
543
+ }
544
+ else {
545
+ csvLine.add(this.formatter.formatCellValue(cell, this.evaluator));
546
+ }
547
+ }
548
+ }
549
+ // Make a note of the index number of the right most cell. This value
550
+ // will later be used to ensure that the matrix of data in the CSV file
551
+ // is square.
552
+ if(lastCellNum > this.maxRowWidth) {
553
+ this.maxRowWidth = lastCellNum;
554
+ }
555
+ }
556
+ this.csvData.add(csvLine);
557
+ }
558
+
559
+ /**
560
+ * Checks to see whether the field - which consists of the formatted
561
+ * contents of an Excel worksheet cell encapsulated within a String - contains
562
+ * any embedded characters that must be escaped. The method is able to
563
+ * comply with either Excel's or UNIX formatting conventions in the
564
+ * following manner;
565
+ *
566
+ * With regard to UNIX conventions, if the field contains any embedded
567
+ * field separator or EOL characters they will each be escaped by prefixing
568
+ * a leading backspace character. These are the only changes that have yet
569
+ * emerged following some research as being required.
570
+ *
571
+ * Excel has other embedded character escaping requirements, some that emerged
572
+ * from empirical testing, other through research. Firstly, with regards to
573
+ * any embedded speech marks ("), each occurrence should be escaped with
574
+ * another speech mark and the whole field then surrounded with speech marks.
575
+ * Thus if a field holds <em>"Hello" he said</em> then it should be modified
576
+ * to appear as <em>"""Hello"" he said"</em>. Furthermore, if the field
577
+ * contains either embedded separator or EOL characters, it should also
578
+ * be surrounded with speech marks. As a result <em>1,400</em> would become
579
+ * <em>"1,400"</em> assuming that the comma is the required field separator.
580
+ * This has one consequence in, if a field contains embedded speech marks
581
+ * and embedded separator characters, checks for both are not required as the
582
+ * additional set of speech marks that should be placed around any field
583
+ * containing embedded speech marks will also account for the embedded
584
+ * separator.
585
+ *
586
+ * It is worth making one further note with regard to embedded EOL
587
+ * characters. If the data in a worksheet is exported as a CSV file using
588
+ * Excel itself, then the field will be surrounded with speech marks. If the
589
+ * resulting CSV file is then re-imports into another worksheet, the EOL
590
+ * character will result in the original single field occupying more than
591
+ * one cell. This same 'feature' is replicated in this classes behaviour.
592
+ *
593
+ * @param field An instance of the String class encapsulating the formatted
594
+ * contents of a cell on an Excel worksheet.
595
+ * @return A String that encapsulates the formatted contents of that
596
+ * Excel worksheet cell but with any embedded separator, EOL or
597
+ * speech mark characters correctly escaped.
598
+ */
599
+ private String escapeEmbeddedCharacters(String field) {
600
+ StringBuffer buffer = null;
601
+
602
+ // If the fields contents should be formatted to confirm with Excel's
603
+ // convention....
604
+ if(this.formattingConvention == ToCSV.EXCEL_STYLE_ESCAPING) {
605
+
606
+ // Firstly, check if there are any speech marks (") in the field;
607
+ // each occurrence must be escaped with another set of speech marks
608
+ // and then the entire field should be enclosed within another
609
+ // set of speech marks. Thus, "Yes" he said would become
610
+ // """Yes"" he said"
611
+ if(field.contains("\"")) {
612
+ buffer = new StringBuffer(field.replaceAll("\"", "\\\"\\\""));
613
+ buffer.insert(0, "\"");
614
+ buffer.append("\"");
615
+ }
616
+ else {
617
+ // If the field contains either embedded separator or EOL
618
+ // characters, then escape the whole field by surrounding it
619
+ // with speech marks.
620
+ buffer = new StringBuffer(field);
621
+ if((buffer.indexOf(this.separator)) > -1 ||
622
+ (buffer.indexOf("\n")) > -1) {
623
+ buffer.insert(0, "\"");
624
+ buffer.append("\"");
625
+ }
626
+ }
627
+ return(buffer.toString().trim());
628
+ }
629
+ // The only other formatting convention this class obeys is the UNIX one
630
+ // where any occurrence of the field separator or EOL character will
631
+ // be escaped by preceding it with a backslash.
632
+ else {
633
+ if(field.contains(this.separator)) {
634
+ field = field.replaceAll(this.separator, ("\\\\" + this.separator));
635
+ }
636
+ if(field.contains("\n")) {
637
+ field = field.replaceAll("\n", "\\\\\n");
638
+ }
639
+ return(field);
640
+ }
641
+ }
642
+
643
+ /**
644
+ * The main() method contains code that demonstrates how to use the class.
645
+ *
646
+ * @param args An array containing zero, one or more elements all of type
647
+ * String. Each element will encapsulate an argument specified by the
648
+ * user when running the program from the command prompt.
649
+ */
650
+ public static void main(String[] args) {
651
+ // Check the number of arguments passed to the main method. There
652
+ // must be two, three or four; the name of and path to either the folder
653
+ // containing the Excel files or an individual Excel workbook that is/are
654
+ // to be converted, the name of and path to the folder to which the CSV
655
+ // files should be written, - optionally - the separator character
656
+ // that should be used to separate individual items (fields) on the
657
+ // lines (records) of the CSV file and - again optionally - an integer
658
+ // that indicates whether the CSV file ought to obey Excel's or UNIX
659
+ // conventions with regard to formatting fields that contain embedded
660
+ // separator, Speech mark or EOL character(s).
661
+ //
662
+ // Note that the names of the CSV files will be derived from those
663
+ // of the Excel file(s). Put simply the .xls or .xlsx extension will be
664
+ // replaced with .csv. Therefore, if the source folder contains files
665
+ // with matching names but different extensions - Test.xls and Test.xlsx
666
+ // for example - then the CSV file generated from one will overwrite
667
+ // that generated from the other.
668
+ ToCSV converter = null;
669
+ try {
670
+ converter = new ToCSV();
671
+ if(args.length == 2) {
672
+ // Just the Source File/Folder and Destination Folder were
673
+ // passed to the main method.
674
+ converter.convertExcelToCSV(args[0], args[1]);
675
+ }
676
+ else if(args.length == 3){
677
+ // The Source File/Folder, Destination Folder and Separator
678
+ // were passed to the main method.
679
+ converter.convertExcelToCSV(args[0], args[1], args[2]);
680
+ }
681
+ else if(args.length == 4) {
682
+ // The Source File/Folder, Destination Folder, Separator and
683
+ // Formatting Convention were passed to the main method.
684
+ converter.convertExcelToCSV(args[0], args[1],
685
+ args[2], Integer.parseInt(args[3]));
686
+ }
687
+ else {
688
+ // None or more than four parameters were passed so display
689
+ //a Usage message.
690
+ System.out.println("Usage: java ToCSV [Source File/Folder] " +
691
+ "[Destination Folder] [Separator] [Formatting Convention]\n" +
692
+ "\tSource File/Folder\tThis argument should contain the name of and\n" +
693
+ "\t\t\t\tpath to either a single Excel workbook or a\n" +
694
+ "\t\t\t\tfolder containing one or more Excel workbooks.\n" +
695
+ "\tDestination Folder\tThe name of and path to the folder that the\n" +
696
+ "\t\t\t\tCSV files should be written out into. The\n" +
697
+ "\t\t\t\tfolder must exist before running the ToCSV\n" +
698
+ "\t\t\t\tcode as it will not check for or create it.\n" +
699
+ "\tSeparator\t\tOptional. The character or characters that\n" +
700
+ "\t\t\t\tshould be used to separate fields in the CSV\n" +
701
+ "\t\t\t\trecord. If no value is passed then the comma\n" +
702
+ "\t\t\t\twill be assumed.\n" +
703
+ "\tFormatting Convention\tOptional. This argument can take one of two\n" +
704
+ "\t\t\t\tvalues. Passing 0 (zero) will result in a CSV\n" +
705
+ "\t\t\t\tfile that obeys Excel's formatting conventions\n" +
706
+ "\t\t\t\twhilst passing 1 (one) will result in a file\n" +
707
+ "\t\t\t\tthat obeys UNIX formatting conventions. If no\n" +
708
+ "\t\t\t\tvalue is passed, then the CSV file produced\n" +
709
+ "\t\t\t\twill obey Excel's formatting conventions.");
710
+ }
711
+ }
712
+ // It is not wise to have such a wide catch clause - Exception is very
713
+ // close to being at the top of the inheritance hierarchy - though it
714
+ // will suffice for this example as it is really not possible to recover
715
+ // easily from an exceptional set of circumstances at this point in the
716
+ // program. It should however, ideally be replaced with one or more
717
+ // catch clauses optimised to handle more specific problems.
718
+ catch(Exception ex) {
719
+ System.out.println("Caught an: " + ex.getClass().getName());
720
+ System.out.println("Message: " + ex.getMessage());
721
+ System.out.println("Stacktrace follows:.....");
722
+ ex.printStackTrace(System.out);
723
+ }
724
+ }
725
+
726
/**
 * A FilenameFilter that restricts the files returned by a call to
 * File.listFiles() on a folder/directory to Excel workbooks, identified
 * purely by file name extension.
 */
class ExcelFilenameFilter implements FilenameFilter {

    /**
     * Decides whether a file should be included in the array returned by
     * listFiles(). A file is accepted when its name ends with '.xls' or
     * '.xlsx'; the comparison is case sensitive. For the future, the list
     * of accepted extensions could be passed in on instantiation -
     * '.xls', '.xlsx', '.xlt', etc. - rather than being fixed here.
     *
     * @param file An instance of the File class that encapsulates a handle
     *             referring to the folder/directory that contains the file.
     *             Not consulted by this implementation.
     * @param name An instance of the String class that encapsulates the
     *             name of the file.
     * @return true if the name of the file ends with either '.xls' or
     *         '.xlsx', false in all other instances.
     */
    public boolean accept(File file, String name) {
        if(name.endsWith(".xls")) {
            return true;
        }
        return name.endsWith(".xlsx");
    }
}
758
+ }
metadata ADDED
@@ -0,0 +1,100 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: poi2csv
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Douglas English
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-07-25 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ! '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ! '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: Converts Excel .xls and .xlsx files to CSV.
42
+ email:
43
+ - douglas.english@gmail.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - .DS_Store
49
+ - .gitignore
50
+ - Gemfile
51
+ - LICENSE.txt
52
+ - README.md
53
+ - Rakefile
54
+ - build.xml
55
+ - classes/ToCSV$ExcelFilenameFilter.class
56
+ - classes/ToCSV.class
57
+ - lib/.DS_Store
58
+ - lib/commons-codec-1.5.jar
59
+ - lib/commons-logging-1.1.jar
60
+ - lib/dom4j-1.6.1.jar
61
+ - lib/junit-3.8.1.jar
62
+ - lib/log4j-1.2.13.jar
63
+ - lib/poi-3.9-20121203.jar
64
+ - lib/poi-examples-3.9-20121203.jar
65
+ - lib/poi-excelant-3.9-20121203.jar
66
+ - lib/poi-ooxml-3.9-20121203.jar
67
+ - lib/poi-ooxml-schemas-3.9-20121203.jar
68
+ - lib/poi-scratchpad-3.9-20121203.jar
69
+ - lib/poi2csv.rb
70
+ - lib/poi2csv/version.rb
71
+ - lib/stax-api-1.0.1.jar
72
+ - lib/xmlbeans-2.3.0.jar
73
+ - poi2csv.gemspec
74
+ - src/ToCSV.java
75
+ homepage: https://github.com/denglish/poi2csv
76
+ licenses:
77
+ - MIT
78
+ metadata: {}
79
+ post_install_message:
80
+ rdoc_options: []
81
+ require_paths:
82
+ - lib
83
+ required_ruby_version: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - ! '>='
86
+ - !ruby/object:Gem::Version
87
+ version: '0'
88
+ required_rubygems_version: !ruby/object:Gem::Requirement
89
+ requirements:
90
+ - - ! '>='
91
+ - !ruby/object:Gem::Version
92
+ version: '0'
93
+ requirements: []
94
+ rubyforge_project:
95
+ rubygems_version: 2.0.3
96
+ signing_key:
97
+ specification_version: 4
98
+ summary: This GEM provides a wrapper to the http://poi.apache.org/ library for converting
99
+ Excel (.xls and .xlsx) files to CSV.
100
+ test_files: []