xls_to_csv-paperclip-processor 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -21,6 +21,7 @@ Jeweler::Tasks.new do |gem|
21
21
  gem.description = %Q{If you want to convert .xls to .csv simply and unwittingly, then this gem is for you!}
22
22
  gem.email = "igor.alexandrov@gmail.com"
23
23
  gem.authors = ["Igor Alexandrov"]
24
+ gem.executables = [ 'xls2csv', 'xlsx2csv']
24
25
  # dependencies defined in Gemfile
25
26
  end
26
27
  Jeweler::RubygemsDotOrgTasks.new
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.3.0
1
+ 0.4.0
data/bin/xls2csv ADDED
Binary file
data/bin/xlsx2csv ADDED
@@ -0,0 +1,446 @@
1
+ #!/usr/bin/env python
2
+ #
3
+ # Copyright information
4
+ #
5
+ # Copyright (C) 2010-2012 Dilshod Temirkhodjaev <tdilshod@gmail.com>
6
+ #
7
+ # License
8
+ #
9
+ # This program is free software; you can redistribute it and/or modify
10
+ # it under the terms of the GNU General Public License as published by
11
+ # the Free Software Foundation; either version 2 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # This program is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU General Public License
20
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
21
+
22
+ __author__ = "Dilshod Temirkhodjaev <tdilshod@gmail.com>"
23
+ __license__ = "GPL-2+"
24
+
25
+ import csv, datetime, zipfile, sys, os
26
+ import xml.parsers.expat
27
+ from xml.dom import minidom
28
+ from optparse import OptionParser
29
+
30
+ # see also ruby-roo lib at: http://github.com/hmcgowan/roo
31
# Maps a lower-cased Excel number-format code to the coarse value type the
# converter cares about ('float', 'percentage', 'date' or 'time').
#
# Custom format codes are looked up *after* Styles.parse() has lower-cased
# them and removed every backslash, so a code such as ``yyyy\-mm\-dd`` arrives
# here as ``yyyy-mm-dd``.  The backslash-escaped keys are kept for backward
# compatibility, and the backslash-free spellings are listed as well so those
# formats are actually recognised as dates.
FORMATS = {
    'general': 'float',
    '0': 'float',
    '0.00': 'float',
    '#,##0': 'float',
    '#,##0.00': 'float',
    '0%': 'percentage',
    '0.00%': 'percentage',
    '0.00e+00': 'float',
    'mm-dd-yy': 'date',
    'd-mmm-yy': 'date',
    'd-mmm': 'date',
    'mmm-yy': 'date',
    'h:mm am/pm': 'date',
    'h:mm:ss am/pm': 'date',
    'h:mm': 'time',
    'h:mm:ss': 'time',
    'm/d/yy h:mm': 'date',
    '#,##0 ;(#,##0)': 'float',
    '#,##0 ;[red](#,##0)': 'float',
    '#,##0.00;(#,##0.00)': 'float',
    '#,##0.00;[red](#,##0.00)': 'float',
    'mm:ss': 'time',
    '[h]:mm:ss': 'time',
    'mmss.0': 'time',
    '##0.0e+0': 'float',
    '@': 'float',
    'yyyy\\-mm\\-dd': 'date',
    'yyyy-mm-dd': 'date',  # backslash-free form of the key above
    'dd/mm/yy': 'date',
    'hh:mm:ss': 'time',
    "dd/mm/yy\\ hh:mm": 'date',
    'dd/mm/yy hh:mm': 'date',  # backslash-free form of the key above
    'dd/mm/yyyy hh:mm:ss': 'date',
    'yy-mm-dd': 'date',
    'd-mmm-yyyy': 'date',
    'm/d/yy': 'date',
    'm/d/yyyy': 'date',
    'dd-mmm-yyyy': 'date',
    'dd/mm/yyyy': 'date',
    'mm/dd/yy hh:mm am/pm': 'date',
    'mm/dd/yyyy hh:mm:ss': 'date',
    'yyyy-mm-dd hh:mm:ss': 'date',
}

# Built-in number-format ids (the ``numFmtId`` values that need no <numFmt>
# declaration in styles.xml) mapped to their format codes.
STANDARD_FORMATS = {
    0: 'general',
    1: '0',
    2: '0.00',
    3: '#,##0',
    4: '#,##0.00',
    9: '0%',
    10: '0.00%',
    11: '0.00e+00',
    12: '# ?/?',
    13: '# ??/??',
    14: 'mm-dd-yy',
    15: 'd-mmm-yy',
    16: 'd-mmm',
    17: 'mmm-yy',
    18: 'h:mm am/pm',
    19: 'h:mm:ss am/pm',
    20: 'h:mm',
    21: 'h:mm:ss',
    22: 'm/d/yy h:mm',
    37: '#,##0 ;(#,##0)',
    38: '#,##0 ;[red](#,##0)',
    39: '#,##0.00;(#,##0.00)',
    40: '#,##0.00;[red](#,##0.00)',
    45: 'mm:ss',
    46: '[h]:mm:ss',
    47: 'mmss.0',
    48: '##0.0e+0',
    49: '@',
}
103
+
104
+ #
105
+ # usage: xlsx2csv("test.xlsx", open("test.csv", "w+"))
106
+ # parameters:
107
+ # sheetid - sheet no to convert (0 for all sheets)
108
+ # dateformat - override date/time format
109
+ # delimiter - csv columns delimiter symbol
110
+ # sheetdelimiter - sheets delimiter used when processing all sheets
111
+ # skip_empty_lines - skip empty lines
112
+ #
113
def xlsx2csv(infilepath, outfile, sheetid=1, dateformat=None, delimiter=",", sheetdelimiter="--------", skip_empty_lines=False):
    """Convert one sheet, or every sheet, of an .xlsx workbook to CSV.

    Parameters:
        infilepath       - path (or file object) handed to zipfile.ZipFile
        outfile          - writable file object that receives the CSV text
        sheetid          - sheet number to convert (0 converts all sheets)
        dateformat       - strftime pattern overriding the cells' own format
        delimiter        - CSV column delimiter
        sheetdelimiter   - marker written before each sheet when converting
                           all sheets; pass "" to suppress the marker line
        skip_empty_lines - when true, rows whose cells are all empty are
                           not written

    Raises:
        zipfile.BadZipfile - infilepath is not a zip archive
        Exception          - sheetid > 0 and no sheet has that id
    """
    writer = csv.writer(outfile, quoting=csv.QUOTE_MINIMAL, delimiter=delimiter)
    ziphandle = zipfile.ZipFile(infilepath)
    try:
        shared_strings = parse(ziphandle, SharedStrings, "xl/sharedStrings.xml")
        styles = parse(ziphandle, Styles, "xl/styles.xml")
        workbook = parse(ziphandle, Workbook, "xl/workbook.xml")

        def convert(sheet_info):
            # Load the worksheet part for this sheet and stream it to the writer.
            sheet = Sheet(workbook, shared_strings, styles,
                          ziphandle.read("xl/worksheets/sheet%i.xml" % sheet_info['id']))
            sheet.set_dateformat(dateformat)
            sheet.set_skip_empty_lines(skip_empty_lines)
            sheet.to_csv(writer)

        if sheetid > 0:
            for s in workbook.sheets:
                if s['id'] == sheetid:
                    convert(s)
                    break
            else:
                # Loop finished without a match.
                raise Exception("Sheet %i Not Found" % sheetid)
        else:
            for s in workbook.sheets:
                if sheetdelimiter != "":
                    name = s['name']
                    # On Python 2 the name is a unicode object and must be
                    # encoded before being joined with byte strings; on
                    # Python 3 it is already str and encoding it would make
                    # the concatenation below raise TypeError.
                    if not isinstance(name, str):
                        name = name.encode('utf-8')
                    outfile.write(sheetdelimiter + " " + str(s['id']) + " - " + name + "\r\n")
                convert(s)
    finally:
        ziphandle.close()
142
+
143
def parse(ziphandle, klass, filename):
    """Build a ``klass()`` and feed it the archive member ``filename``.

    When the archive has no member with that name the freshly constructed,
    still-empty instance is returned unchanged.
    """
    parsed = klass()
    if filename not in ziphandle.namelist():
        return parsed
    payload = ziphandle.read(filename)
    parsed.parse(payload)
    return parsed
148
+
149
class Workbook:
    """Sheet list and date-system flag read from ``xl/workbook.xml``.

    Attributes:
        sheets   - list of ``{'name': ..., 'id': ...}`` dicts, in document order
        date1904 - True when the workbook declares the 1904 date system
        appName  - ``appName`` of the <fileVersion> element, or 'unknown'
    """

    def __init__(self):
        self.sheets = []
        self.date1904 = False
        self.appName = 'unknown'

    def parse(self, data):
        """Populate the instance from the workbook XML in ``data``."""
        # documentElement (rather than firstChild) so a leading comment or
        # processing instruction cannot be mistaken for the root element.
        root = minidom.parseString(data).documentElement

        file_versions = root.getElementsByTagName("fileVersion")
        if file_versions and file_versions[0].hasAttribute("appName"):
            self.appName = file_versions[0].getAttribute("appName")
        else:
            self.appName = 'unknown'

        workbook_pr = root.getElementsByTagName("workbookPr")
        if workbook_pr and workbook_pr[0].hasAttribute("date1904"):
            flag = workbook_pr[0].getAttribute("date1904").lower().strip()
            # XML booleans may be spelled "false" or "0"; both mean the
            # default 1900 date system.
            self.date1904 = flag not in ("false", "0")

        sheets = root.getElementsByTagName("sheets")[0]
        for sheet_node in sheets.getElementsByTagName("sheet"):
            name = sheet_node.getAttribute("name")
            # Files written by 'xl' are numbered by relationship id first,
            # everything else by sheetId first; the other attribute is the
            # fallback in each case.
            if self.appName == 'xl':
                preferred, fallback = "r:id", "sheetId"
            else:
                preferred, fallback = "sheetId", "r:id"
            if sheet_node.hasAttribute(preferred):
                key = preferred
            else:
                key = fallback
            value = sheet_node.getAttribute(key)
            if key == "r:id":
                value = value[3:]  # drop the leading "rId"
            self.sheets.append({'name': name, 'id': int(value)})
176
+
177
class Styles:
    """Number-format information read from ``xl/styles.xml``.

    Attributes:
        numFmts - custom formats: numFmtId (int) -> format code, lower-cased
                  with backslashes removed
        cellXfs - numFmtId (int) of every <xf> inside <cellXfs>, in document
                  order, so a cell's ``s`` attribute indexes this list
    """

    def __init__(self):
        self.numFmts = {}
        self.cellXfs = []

    def parse(self, data):
        """Populate ``numFmts`` and ``cellXfs`` from the styles XML in ``data``."""
        styles = minidom.parseString(data).documentElement

        # numFmts
        num_fmts_element = styles.getElementsByTagName("numFmts")
        if len(num_fmts_element) == 1:
            for num_fmt in num_fmts_element[0].childNodes:
                # Pretty-printed files put whitespace text nodes between the
                # <numFmt> elements; those carry no attributes.
                if num_fmt.nodeName != "numFmt":
                    continue
                num_fmt_id = int(num_fmt.getAttribute("numFmtId"))
                format_code = num_fmt.getAttribute("formatCode").lower().replace('\\', '')
                self.numFmts[num_fmt_id] = format_code

        # cellXfs
        cell_xfs_element = styles.getElementsByTagName("cellXfs")
        if len(cell_xfs_element) == 1:
            for xf in cell_xfs_element[0].childNodes:
                if xf.nodeName != "xf":
                    continue
                # An <xf> without numFmtId still occupies a slot in the list;
                # it is recorded as 0 (the 'general' entry of the built-in
                # format table) so later indexes stay aligned.
                self.cellXfs.append(int(xf.getAttribute("numFmtId") or 0))
199
+
200
class SharedStrings:
    """Expat-driven reader for the shared string table.

    After ``parse`` the ``strings`` list holds one entry per <si> element:
    the concatenated text of its <t> children, excluding any <t> that sits
    inside an <rPh> element.
    """

    def __init__(self):
        self.parser = None
        self.strings = []
        # Element-nesting flags maintained by the start/end handlers.
        self.si = False
        self.t = False
        self.rPh = False
        # Text collected for the <si> currently being read.
        self.value = ""

    def parse(self, data):
        """Run an expat parser over ``data`` with this object's handlers."""
        expat_parser = xml.parsers.expat.ParserCreate()
        expat_parser.StartElementHandler = self.handleStartElement
        expat_parser.EndElementHandler = self.handleEndElement
        expat_parser.CharacterDataHandler = self.handleCharData
        self.parser = expat_parser
        expat_parser.Parse(data)

    def handleCharData(self, data):
        """Append text to the current string while inside a counted <t>."""
        if not self.t:
            return
        self.value = self.value + data

    def handleStartElement(self, name, attrs):
        """Update the nesting flags for an opening tag."""
        if name == 'si':
            self.si = True
            self.value = ""
        elif name == 'rPh':
            self.rPh = True
        elif name == 't':
            if self.rPh:
                self.t = False
            elif self.si:
                self.t = True

    def handleEndElement(self, name):
        """Update the nesting flags for a closing tag; store a finished <si>."""
        if name == 'rPh':
            self.rPh = False
        elif name == 't':
            self.t = False
        elif name == 'si':
            self.si = False
            self.strings.append(self.value)
239
+
240
class Sheet:
    """Streams one worksheet XML document into CSV rows.

    The worksheet is parsed with expat; each completed <row> that carries an
    ``r`` attribute and at least one cell is turned into a list of cell
    strings and passed to ``writer.writerow``.
    """

    def __init__(self, workbook, sharedString, styles, data):
        """Remember the collaborators and the raw worksheet XML.

        workbook     - object exposing ``date1904``
        sharedString - object exposing ``strings`` (the shared string list)
        styles       - object exposing ``numFmts`` and ``cellXfs``
        data         - worksheet XML as read from the archive
        """
        self.parser = None
        self.writer = None
        self.sharedString = None  # unused; kept so existing attribute reads keep working

        # Element-nesting flags maintained by the start/end handlers.
        self.in_sheet = False
        self.in_row = False
        self.in_cell = False
        self.in_cell_value = False
        self.in_cell_formula = False

        self.columns = {}    # zero-based column index -> cell text of the current row
        self.rowNum = None   # ``r`` attribute of the current row
        self.colNum = ""     # column letters of the last cell that had an ``r`` attribute
        self.colIndex = 0    # offset from colNum for cells without an ``r`` attribute
        self.colType = None  # ``t`` attribute of the current cell
        self.s_attr = None   # ``s`` (style index) attribute of the current cell
        self.spans = None    # [first, last] taken from the row's ``spans`` attribute
        self._value_parts = []  # text chunks of the <v> element being read

        self.dateformat = None
        self.skip_empty_lines = False

        # ``data`` starts out as the worksheet XML and is reused as the
        # current cell's text once parsing begins, so keep a private copy of
        # the XML for to_csv().
        self._xml = data
        self.data = data
        self.workbook = workbook
        self.sharedStrings = sharedString.strings
        self.styles = styles

    def set_dateformat(self, dateformat):
        """Set the strftime pattern that overrides each date cell's own format."""
        self.dateformat = dateformat

    def set_skip_empty_lines(self, skip):
        """Choose whether rows with only empty cells are dropped."""
        self.skip_empty_lines = skip

    def to_csv(self, writer):
        """Parse the worksheet and write every row through ``writer.writerow``."""
        self.writer = writer
        self.parser = xml.parsers.expat.ParserCreate()
        self.parser.CharacterDataHandler = self.handleCharData
        self.parser.StartElementHandler = self.handleStartElement
        self.parser.EndElementHandler = self.handleEndElement
        self.parser.Parse(self._xml)

    def handleCharData(self, data):
        """Collect the text of the current <v> element.

        Expat may deliver one text node in several calls, so the chunks are
        only gathered here; conversion happens when the element closes.
        """
        if self.in_cell_value:
            self._value_parts.append(data)

    @staticmethod
    def _strftime_pattern(excel_format):
        """Translate a lower-cased Excel date format code to a strftime pattern."""
        # "am/pm" has to be handled before the single-letter "m" rule below,
        # otherwise it is rewritten to "a%m/p%m" and never recognised.  With
        # an am/pm marker the hour is rendered on the 12-hour clock.
        if "am/pm" in excel_format:
            hour = "%I"
        else:
            hour = "%H"
        pattern = excel_format.replace("am/pm", "%p")
        pattern = pattern.replace("yyyy", "%Y").replace("yy", "%y")
        pattern = pattern.replace("hh:mm", hour + ":%M").replace("h", hour).replace(hour + hour, hour)
        pattern = pattern.replace("ss", "%S")
        # NOTE(review): "%e" is not accepted by every platform's strftime;
        # where it is rejected the caller falls back to the raw cell value.
        pattern = pattern.replace("d", "%e").replace("%e%e", "%d")
        pattern = pattern.replace("mmmm", "%B").replace("mmm", "%b")
        pattern = pattern.replace(":mm", ":%M").replace("m", "%m").replace("%m%m", "%m")
        return pattern

    @staticmethod
    def _parse_spans(value):
        """Return [first, last] from a row ``spans`` attribute, or None.

        The attribute may hold several space-separated ``first:last`` ranges;
        the start of the first and the end of the last are used.
        """
        if not value:
            return None
        try:
            numbers = [int(part) for chunk in value.split() for part in chunk.split(":")]
        except ValueError:
            return None
        if len(numbers) < 2:
            return None
        return [numbers[0], numbers[-1]]

    @staticmethod
    def _encode(value):
        """Return ``value`` in the string type the csv module expects.

        Python 2's csv writer wants byte strings, so unicode text is encoded
        as UTF-8 there; on Python 3 the text is passed through unchanged.
        """
        if isinstance(value, str):
            return value
        return value.encode("utf-8")

    def _convert_value(self, raw):
        """Convert the raw <v> text of the current cell to its CSV text."""
        if self.colType == "s":  # shared string
            return self.sharedStrings[int(raw)]
        if self.colType == "b":  # boolean
            flag = int(raw)
            if flag == 1:
                return "TRUE"
            if flag == 0:
                return "FALSE"
            return raw
        if not self.s_attr:
            return raw

        # get cell format
        style_index = int(self.s_attr)
        cell_xfs = self.styles.cellXfs
        if not 0 <= style_index < len(cell_xfs):
            return raw  # style index points outside the parsed cellXfs list
        xfs_numfmt = cell_xfs[style_index]
        if xfs_numfmt in self.styles.numFmts:
            cell_format = self.styles.numFmts[xfs_numfmt]
        else:
            cell_format = STANDARD_FORMATS.get(xfs_numfmt)

        # get format type
        if not cell_format:
            return raw
        format_type = FORMATS.get(cell_format)

        if format_type == 'date':  # date/time
            try:
                if self.workbook.date1904:
                    epoch = datetime.datetime(1904, 1, 1)
                else:
                    epoch = datetime.datetime(1899, 12, 30)
                date = epoch + datetime.timedelta(float(raw))
                if self.dateformat:
                    # str(dateformat) - python2.5 bug, see: http://bugs.python.org/issue2782
                    return date.strftime(str(self.dateformat))
                return date.strftime(str(self._strftime_pattern(cell_format))).strip()
            except (ValueError, OverflowError):
                # not a number, out of range, or a pattern strftime rejects
                return raw
        if format_type == 'time':  # time, emitted as a number of seconds
            try:
                return str(float(raw) * 24*60*60)
            except ValueError:
                return raw
        return raw

    def handleStartElement(self, name, attrs):
        """Track entry into sheetData, rows, cells and cell values."""
        if self.in_row and name == 'c':
            self.colType = attrs.get("t")
            self.s_attr = attrs.get("s")
            cellId = attrs.get("r")
            if cellId:
                # "AB12" with rowNum "12" -> "AB"
                self.colNum = cellId[:len(cellId)-len(self.rowNum)]
                self.colIndex = 0
            else:
                # No reference: the cell sits right after the previous one.
                self.colIndex += 1
            self.data = ""
            self.in_cell = True
        elif self.in_cell and name == 'v':
            self.in_cell_value = True
            self._value_parts = []
        elif self.in_sheet and name == 'row' and 'r' in attrs:
            self.rowNum = attrs['r']
            self.in_row = True
            self.columns = {}
            # Start column tracking afresh so cells without an ``r``
            # attribute are counted from the first column of this row.
            self.colNum = ""
            self.colIndex = 0
            self.spans = self._parse_spans(attrs.get('spans'))
        elif name == 'sheetData':
            self.in_sheet = True

    def handleEndElement(self, name):
        """Finish cell values, cells and rows; write each completed row."""
        if self.in_cell and name == 'v':
            self.in_cell_value = False
            if self._value_parts:
                self.data = self._convert_value("".join(self._value_parts))
                self._value_parts = []
        elif self.in_cell and name == 'c':
            # Column letters to a 1-based number ("A" -> 1, "AB" -> 28).
            t = 0
            for letter in self.colNum:
                t = t*26 + ord(letter) - 64
            self.columns[t - 1 + self.colIndex] = self.data
            self.in_cell = False
        if self.in_row and name == 'row':
            if self.columns:
                d = [""] * (max(self.columns) + 1)
                for k, value in self.columns.items():
                    d[k] = self._encode(value)
                if self.spans:
                    # Pad the row with empty cells up to the declared width.
                    width = self.spans[0] + self.spans[1] - 1
                    if len(d) < width:
                        d += (width - len(d)) * ['']
                # write line to csv
                if not self.skip_empty_lines or d.count('') != len(d):
                    self.writer.writerow(d)
            self.in_row = False
        elif self.in_sheet and name == 'sheetData':
            self.in_sheet = False
379
+
380
def convert_recursive(path, kwargs):
    """Convert every ``*.xlsx`` file below directory ``path`` to CSV.

    Each workbook ``name.xlsx`` is written next to itself as ``name.csv``.
    ``kwargs`` is passed on to xlsx2csv() as keyword arguments.  A file that
    is not a zip archive is reported on stdout and skipped.
    """
    for name in os.listdir(path):
        fullpath = os.path.join(path, name)
        if os.path.isdir(fullpath):
            convert_recursive(fullpath, kwargs)
            continue
        if not fullpath.lower().endswith(".xlsx"):
            continue

        outfilepath = fullpath[:-4] + 'csv'
        print("Converting %s to %s" % (fullpath, outfilepath))
        # The csv module wants a binary file on Python 2 and a text file
        # opened with newline='' on Python 3.
        if sys.version_info[0] < 3:
            f = open(outfilepath, 'w+b')
        else:
            f = open(outfilepath, 'w+', newline='', encoding='utf-8')
        try:
            xlsx2csv(fullpath, f, **kwargs)
        except zipfile.BadZipfile:
            print("File is not a zip file")
        finally:
            # Close the output even when the conversion fails part-way.
            f.close()
395
+
396
# Command-line entry point: parse the options, then convert either a single
# workbook or (with -r) every workbook below a directory.
if __name__ == "__main__":
    parser = OptionParser(usage = "%prog [options] infile [outfile]", version="0.11")
    parser.add_option("-d", "--delimiter", dest="delimiter", default=",",
      help="delimiter - csv columns delimiter, 'tab' or 'x09' for tab (comma is default)")
    parser.add_option("-f", "--dateformat", dest="dateformat",
      help="override date/time format (ex. %Y/%m/%d)")
    parser.add_option("-i", "--ignoreempty", dest="skip_empty_lines", default=False, action="store_true",
      help="skip empty lines")
    parser.add_option("-p", "--sheetdelimiter", dest="sheetdelimiter", default="--------",
      help="sheets delimiter used to separate sheets, pass '' if you don't want delimiters (default '--------')")
    parser.add_option("-r", "--recursive", dest="recursive", default=False, action="store_true",
      help="convert recursively")
    parser.add_option("-s", "--sheet", dest="sheetid", default=1, type="int",
      help="sheet no to convert (0 for all sheets)")

    (options, args) = parser.parse_args()

    # Accepted delimiter spellings: a single character, the words 'tab' and
    # 'comma', or 'x' followed by the character code in decimal digits.
    if len(options.delimiter) == 1:
        delimiter = options.delimiter
    elif options.delimiter == 'tab':
        delimiter = '\t'
    elif options.delimiter == 'comma':
        delimiter = ','
    elif options.delimiter[:1] == 'x' and options.delimiter[1:].isdigit():
        delimiter = chr(int(options.delimiter[1:]))
    else:
        # Also reached for an empty or non-numeric value, which previously
        # ended in an IndexError / ValueError traceback.
        parser.error("Invalid delimiter")

    kwargs = {
      'sheetid' : options.sheetid,
      'delimiter' : delimiter,
      'sheetdelimiter' : options.sheetdelimiter,
      'dateformat' : options.dateformat,
      'skip_empty_lines' : options.skip_empty_lines
    }

    if options.recursive:
        if len(args) == 1:
            convert_recursive(args[0], kwargs)
        else:
            parser.print_help()
    elif len(args) < 1:
        parser.print_help()
    elif len(args) > 1:
        # The csv module wants a binary file on Python 2 and a text file
        # opened with newline='' on Python 3.
        if sys.version_info[0] < 3:
            outfile = open(args[1], 'w+b')
        else:
            outfile = open(args[1], 'w+', newline='', encoding='utf-8')
        try:
            xlsx2csv(args[0], outfile, **kwargs)
        finally:
            outfile.close()
    else:
        xlsx2csv(args[0], sys.stdout, **kwargs)
@@ -5,13 +5,14 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "xls_to_csv-paperclip-processor"
8
- s.version = "0.3.0"
8
+ s.version = "0.4.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Igor Alexandrov"]
12
- s.date = "2012-10-25"
12
+ s.date = "2012-11-27"
13
13
  s.description = "If you want to convert .xls to .csv simply and unwittingly, then this gem is for you!"
14
14
  s.email = "igor.alexandrov@gmail.com"
15
+ s.executables = ["xls2csv", "xlsx2csv"]
15
16
  s.extra_rdoc_files = [
16
17
  "README.md"
17
18
  ]
@@ -20,6 +21,8 @@ Gem::Specification.new do |s|
20
21
  "README.md",
21
22
  "Rakefile",
22
23
  "VERSION",
24
+ "bin/xls2csv",
25
+ "bin/xlsx2csv",
23
26
  "lib/xls_to_csv-paperclip-processor.rb",
24
27
  "xls_to_csv-paperclip-processor.gemspec"
25
28
  ]
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: xls_to_csv-paperclip-processor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-10-25 00:00:00.000000000 Z
12
+ date: 2012-11-27 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: paperclip
@@ -62,7 +62,9 @@ dependencies:
62
62
  description: If you want to convert .xls to .csv simply and unwittingly, then this
63
63
  gem is for you!
64
64
  email: igor.alexandrov@gmail.com
65
- executables: []
65
+ executables:
66
+ - xls2csv
67
+ - xlsx2csv
66
68
  extensions: []
67
69
  extra_rdoc_files:
68
70
  - README.md
@@ -71,6 +73,8 @@ files:
71
73
  - README.md
72
74
  - Rakefile
73
75
  - VERSION
76
+ - bin/xls2csv
77
+ - bin/xlsx2csv
74
78
  - lib/xls_to_csv-paperclip-processor.rb
75
79
  - xls_to_csv-paperclip-processor.gemspec
76
80
  homepage: http://github.com/igor-alexandrov/xls_to_csv-paperclip-processor
@@ -88,7 +92,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
88
92
  version: '0'
89
93
  segments:
90
94
  - 0
91
- hash: -4100455785649400648
95
+ hash: 2906243392911871525
92
96
  required_rubygems_version: !ruby/object:Gem::Requirement
93
97
  none: false
94
98
  requirements: