xls_to_csv-paperclip-processor 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -21,6 +21,7 @@ Jeweler::Tasks.new do |gem|
21
21
  gem.description = %Q{If you want to convert .xls to .csv simply and unwittingly, then this gem is for you!}
22
22
  gem.email = "igor.alexandrov@gmail.com"
23
23
  gem.authors = ["Igor Alexandrov"]
24
+ gem.executables = [ 'xls2csv', 'xlsx2csv']
24
25
  # dependencies defined in Gemfile
25
26
  end
26
27
  Jeweler::RubygemsDotOrgTasks.new
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.3.0
1
+ 0.4.0
data/bin/xls2csv ADDED
Binary file
data/bin/xlsx2csv ADDED
@@ -0,0 +1,446 @@
1
+ #!/usr/bin/env python
2
+ #
3
+ # Copyright information
4
+ #
5
+ # Copyright (C) 2010-2012 Dilshod Temirkhodjaev <tdilshod@gmail.com>
6
+ #
7
+ # License
8
+ #
9
+ # This program is free software; you can redistribute it and/or modify
10
+ # it under the terms of the GNU General Public License as published by
11
+ # the Free Software Foundation; either version 2 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # This program is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU General Public License
20
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
21
+
22
+ __author__ = "Dilshod Temirkhodjaev <tdilshod@gmail.com>"
23
+ __license__ = "GPL-2+"
24
+
25
+ import csv, datetime, zipfile, sys, os
26
+ import xml.parsers.expat
27
+ from xml.dom import minidom
28
+ from optparse import OptionParser
29
+
30
+ # see also ruby-roo lib at: http://github.com/hmcgowan/roo
31
# Maps a lower-cased Excel number-format code to a coarse value type.
# Sheet only acts on the 'date' and 'time' types; every other type leaves
# the raw cell text untouched.
#
# Styles.parse() strips backslashes from custom format codes before they are
# looked up here, so escaped spellings such as 'yyyy\\-mm\\-dd' can only be
# reached in their unescaped form.  Both spellings are listed: the escaped
# ones are kept for backward compatibility, the unescaped ones are the keys
# that actually match.
FORMATS = {
    'general' : 'float',
    '0' : 'float',
    '0.00' : 'float',
    '#,##0' : 'float',
    '#,##0.00' : 'float',
    '0%' : 'percentage',
    '0.00%' : 'percentage',
    '0.00e+00' : 'float',
    'mm-dd-yy' : 'date',
    'd-mmm-yy' : 'date',
    'd-mmm' : 'date',
    'mmm-yy' : 'date',
    'h:mm am/pm' : 'date',
    'h:mm:ss am/pm' : 'date',
    'h:mm' : 'time',
    'h:mm:ss' : 'time',
    'm/d/yy h:mm' : 'date',
    '#,##0 ;(#,##0)' : 'float',
    '#,##0 ;[red](#,##0)' : 'float',
    '#,##0.00;(#,##0.00)' : 'float',
    '#,##0.00;[red](#,##0.00)' : 'float',
    'mm:ss' : 'time',
    '[h]:mm:ss' : 'time',
    'mmss.0' : 'time',
    '##0.0e+0' : 'float',
    '@' : 'float',
    'yyyy\\-mm\\-dd' : 'date',
    'yyyy-mm-dd' : 'date',
    'dd/mm/yy' : 'date',
    'hh:mm:ss' : 'time',
    "dd/mm/yy\\ hh:mm" : 'date',
    'dd/mm/yy hh:mm' : 'date',
    'dd/mm/yyyy hh:mm:ss' : 'date',
    'yy-mm-dd' : 'date',
    'd-mmm-yyyy' : 'date',
    'm/d/yy' : 'date',
    'm/d/yyyy' : 'date',
    'dd-mmm-yyyy' : 'date',
    'dd/mm/yyyy' : 'date',
    'mm/dd/yy hh:mm am/pm' : 'date',
    'mm/dd/yyyy hh:mm:ss' : 'date',
    'yyyy-mm-dd hh:mm:ss' : 'date',
}

# Maps a built-in numFmtId to its format code.  Used as the fallback when a
# cell style's numFmtId has no custom entry in styles.xml.
STANDARD_FORMATS = {
    0 : 'general',
    1 : '0',
    2 : '0.00',
    3 : '#,##0',
    4 : '#,##0.00',
    9 : '0%',
    10 : '0.00%',
    11 : '0.00e+00',
    12 : '# ?/?',
    13 : '# ??/??',
    14 : 'mm-dd-yy',
    15 : 'd-mmm-yy',
    16 : 'd-mmm',
    17 : 'mmm-yy',
    18 : 'h:mm am/pm',
    19 : 'h:mm:ss am/pm',
    20 : 'h:mm',
    21 : 'h:mm:ss',
    22 : 'm/d/yy h:mm',
    37 : '#,##0 ;(#,##0)',
    38 : '#,##0 ;[red](#,##0)',
    39 : '#,##0.00;(#,##0.00)',
    40 : '#,##0.00;[red](#,##0.00)',
    45 : 'mm:ss',
    46 : '[h]:mm:ss',
    47 : 'mmss.0',
    48 : '##0.0e+0',
    49 : '@',
}
103
+
104
+ #
105
+ # usage: xlsx2csv("test.xlsx", open("test.csv", "w+"))
106
+ # parameters:
107
+ # sheetid - sheet no to convert (0 for all sheets)
108
+ # dateformat - override date/time format
109
+ # delimiter - csv columns delimiter symbol
110
+ # sheetdelimiter - sheets delimiter used when processing all sheets
111
+ # skip_empty_lines - skip empty lines
112
+ #
113
def xlsx2csv(infilepath, outfile, sheetid=1, dateformat=None, delimiter=",", sheetdelimiter="--------", skip_empty_lines=False):
    """Convert one sheet, or every sheet, of an .xlsx workbook to CSV.

    infilepath       -- workbook location, handed to zipfile.ZipFile
    outfile          -- destination object; wrapped in a csv.writer and also
                        written to directly for the sheet separator lines
    sheetid          -- id of the sheet to convert; any value that is not
                        greater than 0 converts all sheets
    dateformat       -- forwarded to Sheet.set_dateformat
    delimiter        -- csv column delimiter
    sheetdelimiter   -- prefix of the separator line emitted before each
                        sheet in all-sheets mode ("" disables the line)
    skip_empty_lines -- forwarded to Sheet.set_skip_empty_lines

    Raises Exception when a positive sheetid matches no sheet.  The zip
    archive is closed on every exit path.
    """
    writer = csv.writer(outfile, quoting=csv.QUOTE_MINIMAL, delimiter=delimiter)
    ziphandle = zipfile.ZipFile(infilepath)
    try:
        shared_strings = parse(ziphandle, SharedStrings, "xl/sharedStrings.xml")
        styles = parse(ziphandle, Styles, "xl/styles.xml")
        workbook = parse(ziphandle, Workbook, "xl/workbook.xml")

        def load(entry):
            # Read the worksheet part that belongs to this workbook entry.
            raw = ziphandle.read("xl/worksheets/sheet%i.xml" %entry['id'])
            return Sheet(workbook, shared_strings, styles, raw)

        def emit(sheet):
            # Apply the caller's options, then stream the rows out.
            sheet.set_dateformat(dateformat)
            sheet.set_skip_empty_lines(skip_empty_lines)
            sheet.to_csv(writer)

        if not sheetid > 0:
            # All-sheets mode: every sheet, each optionally announced by a
            # separator line written straight to outfile.
            for entry in workbook.sheets:
                if sheetdelimiter != "":
                    outfile.write(sheetdelimiter + " " + str(entry['id']) + " - " + entry['name'].encode('utf-8') + "\r\n")
                emit(load(entry))
            return

        # Single-sheet mode: the first entry with the requested id wins.
        matches = [entry for entry in workbook.sheets if entry['id'] == sheetid]
        if not matches:
            raise Exception("Sheet %i Not Found" %sheetid)
        emit(load(matches[0]))
    finally:
        ziphandle.close()
142
+
143
def parse(ziphandle, klass, filename):
    """Instantiate klass and feed it the archive member filename, if present.

    ziphandle -- object providing namelist() and read(name)
    klass     -- zero-argument callable whose result has a parse(data) method
    filename  -- archive member to look for

    Returns the new instance; when the member is missing from the archive
    the instance is returned without parse() having been called.
    """
    instance = klass()
    if filename not in ziphandle.namelist():
        return instance
    instance.parse(ziphandle.read(filename))
    return instance
148
+
149
class Workbook:
    """Sheet list and date system read from xl/workbook.xml.

    Attributes:
        sheets   -- list of {'name': ..., 'id': ...} dicts in document order
        date1904 -- True when <workbookPr date1904=...> enables the 1904
                    date system
        appName  -- appName attribute of <fileVersion>, or 'unknown'
    """
    def __init__(self):
        self.sheets = []
        self.date1904 = False
        self.appName = 'unknown'

    def parse(self, data):
        """Populate the workbook from the raw content of xl/workbook.xml.

        Raises IndexError when the document has no <sheets> element and
        ValueError when a sheet id is not numeric.
        """
        # documentElement is the root element even when a comment or
        # processing instruction precedes it.
        root = minidom.parseString(data).documentElement

        versions = root.getElementsByTagName("fileVersion")
        if versions and versions[0].hasAttribute("appName"):
            self.appName = versions[0].getAttribute("appName")
        else:
            self.appName = 'unknown'

        # date1904 is a boolean attribute: "0" must count as false just like
        # "false" does.  Any other value keeps enabling the 1904 system.
        for props in root.getElementsByTagName("workbookPr")[:1]:
            if props.hasAttribute("date1904"):
                flag = props.getAttribute("date1904").lower().strip()
                self.date1904 = flag not in ("false", "0")

        sheets = root.getElementsByTagName("sheets")[0]
        for sheet_node in sheets.getElementsByTagName("sheet"):
            name = sheet_node.getAttribute("name")
            # Files written by 'xl' prefer the relationship id, every other
            # producer prefers sheetId; each falls back to the other one.
            if self.appName == 'xl':
                use_rid = sheet_node.hasAttribute("r:id")
            else:
                use_rid = not sheet_node.hasAttribute("sheetId")
            if use_rid:
                # Drops the first three characters of the relationship id;
                # assumes values of the form "rId<N>" -- TODO confirm.
                sheet_id = int(sheet_node.getAttribute("r:id")[3:])
            else:
                sheet_id = int(sheet_node.getAttribute("sheetId"))
            self.sheets.append({'name': name, 'id': sheet_id})
176
+
177
class Styles:
    """Number-format information read from xl/styles.xml.

    Attributes:
        numFmts -- {numFmtId: format code} for the custom formats; codes are
                   lower-cased and have their backslashes removed
        cellXfs -- numFmtId of every <xf> under <cellXfs>, in document order
                   (a cell's "s" attribute indexes into this list)
    """
    def __init__(self):
        self.numFmts = {}
        self.cellXfs = []

    def parse(self, data):
        """Populate numFmts and cellXfs from the raw content of styles.xml."""
        styles = minidom.parseString(data).documentElement

        # numFmts
        numFmtsElement = styles.getElementsByTagName("numFmts")
        if len(numFmtsElement) == 1:
            for numFmt in numFmtsElement[0].childNodes:
                # Skip whitespace text and comment nodes, which carry no
                # attributes (present whenever the XML is pretty-printed).
                if numFmt.nodeName != "numFmt":
                    continue
                numFmtId = int(numFmt.getAttribute("numFmtId"))
                formatCode = numFmt.getAttribute("formatCode").lower().replace('\\', '')
                self.numFmts[numFmtId] = formatCode

        # cellXfs
        cellXfsElement = styles.getElementsByTagName("cellXfs")
        if len(cellXfsElement) == 1:
            for xf in cellXfsElement[0].childNodes:
                if xf.nodeName != "xf":
                    continue
                if xf.hasAttribute("numFmtId"):
                    self.cellXfs.append(int(xf.getAttribute("numFmtId")))
                else:
                    # Keep list positions aligned with style indexes.
                    # NOTE(review): assumes id 0 ('general') is the right
                    # fallback for an <xf> without numFmtId -- confirm.
                    self.cellXfs.append(0)
199
+
200
class SharedStrings:
    """Collects the entries of xl/sharedStrings.xml in document order.

    After parse(), strings holds one text value per <si> element: the
    concatenation of all <t> text inside it, excluding any <t> that appears
    inside an <rPh> element.
    """
    def __init__(self):
        self.parser = None
        self.strings = []
        # Position flags maintained by the element handlers.
        self.si = False
        self.t = False
        self.rPh = False
        # Text gathered so far for the <si> being read.
        self.value = ""

    def parse(self, data):
        """Run an expat parser over data, filling self.strings."""
        expat_parser = xml.parsers.expat.ParserCreate()
        self.parser = expat_parser
        expat_parser.CharacterDataHandler = self.handleCharData
        expat_parser.StartElementHandler = self.handleStartElement
        expat_parser.EndElementHandler = self.handleEndElement
        expat_parser.Parse(data)

    def handleCharData(self, data):
        # Only text seen while a collectable <t> is open is kept.
        if not self.t:
            return
        self.value = self.value + data

    def handleStartElement(self, name, attrs):
        if name == 'si':
            # A new entry starts with an empty accumulator.
            self.si = True
            self.value = ""
        elif name == 'rPh':
            self.rPh = True
        elif name == 't':
            if self.rPh:
                self.t = False
            elif self.si:
                self.t = True

    def handleEndElement(self, name):
        if name == 'si':
            self.si = False
            self.strings.append(self.value)
        elif name == 'rPh':
            self.rPh = False
        elif name == 't':
            self.t = False
239
+
240
class Sheet:
    """Streaming converter from one worksheet XML part to CSV rows.

    The worksheet is parsed with expat.  Each <row> that has an "r"
    attribute and at least one cell becomes one writer.writerow() call;
    cell values are UTF-8 encoded before being handed to the writer.

    Value conversion, applied to the text of a cell's <v> element:
      * t="s"  -- the text is an index into the shared string table
      * t="b"  -- 1/0 become "TRUE"/"FALSE"
      * otherwise, when the cell has a style ("s" attribute) whose number
        format is classified as 'date' or 'time' by the module-level
        FORMATS / STANDARD_FORMATS tables, the serial number is rendered as
        a date string or as a number of seconds
      * anything else is passed through unchanged
    """
    def __init__(self, workbook, sharedString, styles, data):
        self.parser = None
        self.writer = None
        self.sharedString = None  # never populated; kept for compatibility

        # Parser position flags.
        self.in_sheet = False
        self.in_row = False
        self.in_cell = False
        self.in_cell_value = False
        self.in_cell_formula = False

        # Per-row and per-cell state.
        self.columns = {}     # zero-based column index -> cell text
        self.rowNum = None    # "r" attribute of the current row
        self.spans = None     # parsed "spans" attribute of the current row
        self.colNum = ""      # column letters of the last cell that had "r"
        self.colIndex = 0     # cells seen since that cell
        self.colType = None   # "t" attribute of the current cell
        self.s_attr = None    # "s" (style index) attribute of the current cell
        self._raw_value = ""  # text gathered for the current <v>

        self.dateformat = None
        self.skip_empty_lines = False

        # self.data is reused for the current cell value while parsing, so
        # the sheet XML is kept separately for to_csv().
        self.data = data
        self._sheet_xml = data
        self.workbook = workbook
        self.sharedStrings = sharedString.strings
        self.styles = styles

    def set_dateformat(self, dateformat):
        """Set a strftime pattern that overrides every date cell's format."""
        self.dateformat = dateformat

    def set_skip_empty_lines(self, skip):
        """Choose whether rows whose cells are all empty are dropped."""
        self.skip_empty_lines = skip

    def to_csv(self, writer):
        """Parse the worksheet and send every row to writer.writerow()."""
        self.writer = writer
        self.parser = xml.parsers.expat.ParserCreate()
        self.parser.CharacterDataHandler = self.handleCharData
        self.parser.StartElementHandler = self.handleStartElement
        self.parser.EndElementHandler = self.handleEndElement
        self.parser.Parse(self._sheet_xml)

    def handleCharData(self, data):
        # expat may deliver the text of one element in several pieces (for
        # example around entity references), so the pieces are accumulated
        # here and converted once the closing </v> is seen.
        if self.in_cell_value:
            self._raw_value += data
        # does not support it
        #elif self.in_cell_formula:
        #    self.formula = data

    def handleStartElement(self, name, attrs):
        if self.in_row and name == 'c':
            self.colType = attrs.get("t")
            self.s_attr = attrs.get("s")
            cellId = attrs.get("r")
            if cellId:
                # Cell reference minus the row number leaves the letters.
                self.colNum = cellId[:len(cellId)-len(self.rowNum)]
                self.colIndex = 0
            else:
                # No reference: the cell follows the previous one.
                self.colIndex += 1
            #self.formula = None
            self.data = ""
            self.in_cell = True
        elif self.in_cell and name == 'v':
            self.in_cell_value = True
            self._raw_value = ""
        #elif self.in_cell and name == 'f':
        #    self.in_cell_formula = True
        elif self.in_sheet and name == 'row' and 'r' in attrs:
            self.rowNum = attrs['r']
            self.in_row = True
            self.columns = {}
            # Column tracking restarts with every row so that cells without
            # an "r" attribute are numbered from the first column.
            self.colNum = ""
            self.colIndex = 0
            self.spans = None
            if 'spans' in attrs:
                self.spans = [int(i) for i in attrs['spans'].split(":")]
        elif name == 'sheetData':
            self.in_sheet = True

    def handleEndElement(self, name):
        if self.in_cell and name == 'v':
            self.in_cell_value = False
            # An empty <v> leaves the cell at its "" default.
            if self._raw_value:
                self.data = self._convert_value(self._raw_value)
        #elif self.in_cell and name == 'f':
        #    self.in_cell_formula = False
        elif self.in_cell and name == 'c':
            # Column letters are read as a base-26 number (A=1 ... Z=26).
            t = 0
            for letter in self.colNum:
                t = t*26 + ord(letter) - 64
            self.columns[t - 1 + self.colIndex] = self.data
            self.in_cell = False
        if self.in_row and name == 'row':
            if self.columns:
                cells = [""] * (max(self.columns) + 1)
                for index, value in self.columns.items():
                    cells[index] = value
                if self.spans:
                    # Pad the row up to the width derived from "spans".
                    width = self.spans[0] + self.spans[1] - 1
                    if len(cells) < width:
                        cells += (width - len(cells)) * ['']
                # write line to csv
                if not self.skip_empty_lines or any(cell != "" for cell in cells):
                    self.writer.writerow([cell.encode("utf-8") for cell in cells])
            self.in_row = False
        elif self.in_sheet and name == 'sheetData':
            self.in_sheet = False

    def _convert_value(self, raw):
        """Return the CSV text for the complete <v> text raw of the current cell."""
        if self.colType == "s": # shared string
            return self.sharedStrings[int(raw)]
        if self.colType == "b": # boolean
            flag = int(raw)
            if flag == 1:
                return "TRUE"
            if flag == 0:
                return "FALSE"
            return raw
        if not self.s_attr:
            return raw

        # get cell format
        style_index = int(self.s_attr)
        if not 0 <= style_index < len(self.styles.cellXfs):
            # Unknown style: nothing to convert with.
            return raw
        xfs_numfmt = self.styles.cellXfs[style_index]
        if xfs_numfmt in self.styles.numFmts:
            fmt = self.styles.numFmts[xfs_numfmt]
        else:
            fmt = STANDARD_FORMATS.get(xfs_numfmt)

        # get format type
        format_type = None
        if fmt:
            format_type = FORMATS.get(fmt)
        if format_type == 'date': # date/time
            return self._format_date(raw, fmt)
        if format_type == 'time': # time
            try:
                return str(float(raw) * 24*60*60)
            except ValueError:
                # not a number (e.g. a string cell with a time style)
                return raw
        return raw

    def _format_date(self, raw, fmt):
        """Render the serial date raw with self.dateformat or, failing that, fmt."""
        try:
            if self.workbook.date1904:
                epoch = datetime.datetime(1904, 1, 1)
            else:
                epoch = datetime.datetime(1899, 12, 30)
            date = epoch + datetime.timedelta(float(raw))
            if self.dateformat:
                # str(dateformat) - python2.5 bug, see: http://bugs.python.org/issue2782
                return date.strftime(str(self.dateformat))
            return date.strftime(str(self._strftime_pattern(fmt))).strip()
        except (ValueError, OverflowError):
            # invalid date format
            return raw

    @staticmethod
    def _strftime_pattern(fmt):
        """Translate a lower-cased Excel date format code to a strftime pattern."""
        twelve_hour = "am/pm" in fmt
        # The meridiem marker goes first: its letter 'm' would otherwise be
        # rewritten by the month rules below and the marker would never match.
        pattern = fmt.replace("am/pm", "%p")
        pattern = pattern.replace("yyyy", "%Y").replace("yy", "%y")
        pattern = pattern.replace("hh:mm", "%H:%M").replace("h", "%H").replace("%H%H", "%H")
        pattern = pattern.replace("ss", "%S")
        pattern = pattern.replace("d", "%e").replace("%e%e", "%d")
        pattern = pattern.replace("mmmm", "%B").replace("mmm", "%b")
        pattern = pattern.replace(":mm", ":%M").replace("m", "%m").replace("%m%m", "%m")
        if twelve_hour:
            # An AM/PM marker implies a 12-hour clock.
            pattern = pattern.replace("%H", "%I")
        return pattern
379
+
380
def convert_recursive(path, kwargs):
    """Convert every .xlsx file below path to a sibling .csv file.

    path   -- directory to walk; sub-directories are processed recursively
    kwargs -- keyword arguments forwarded unchanged to xlsx2csv()

    A file that is not a valid zip archive is reported on stdout and
    skipped.  Any other error propagates, but the output file is closed
    first.
    """
    for name in os.listdir(path):
        fullpath = os.path.join(path, name)
        if os.path.isdir(fullpath):
            convert_recursive(fullpath, kwargs)
            continue
        if not fullpath.lower().endswith(".xlsx"):
            continue
        # Swap the trailing "xlsx" for "csv", keeping the dot.
        outfilepath = fullpath[:-4] + 'csv'
        print("Converting %s to %s" %(fullpath, outfilepath))
        f = open(outfilepath, 'w+b')
        try:
            xlsx2csv(fullpath, f, **kwargs)
        except zipfile.BadZipfile:
            print("File is not a zip file")
        finally:
            # Close the handle on every path, including unexpected errors.
            f.close()
395
+
396
# Command-line entry point: parse the options, normalise the delimiter and
# convert either a directory tree (-r) or a single workbook.
if __name__ == "__main__":
    parser = OptionParser(usage = "%prog [options] infile [outfile]", version="0.11")
    parser.add_option("-d", "--delimiter", dest="delimiter", default=",",
      help="delimiter - csv columns delimiter, 'tab' or 'x09' for tab (comma is default)")
    parser.add_option("-f", "--dateformat", dest="dateformat",
      help="override date/time format (ex. %Y/%m/%d)")
    parser.add_option("-i", "--ignoreempty", dest="skip_empty_lines", default=False, action="store_true",
      help="skip empty lines")
    parser.add_option("-p", "--sheetdelimiter", dest="sheetdelimiter", default="--------",
      help="sheets delimiter used to separate sheets, pass '' if you don't want delimiters (default '--------')")
    parser.add_option("-r", "--recursive", dest="recursive", default=False, action="store_true",
      help="convert recursively")
    parser.add_option("-s", "--sheet", dest="sheetid", default=1, type="int",
      help="sheet no to convert (0 for all sheets)")

    (options, args) = parser.parse_args()

    # Accepted delimiter spellings: a single character, the words 'tab' and
    # 'comma', or 'x' followed by a decimal character code (x09 -> tab).
    if len(options.delimiter) == 1:
        delimiter = options.delimiter
    elif options.delimiter == 'tab':
        delimiter = '\t'
    elif options.delimiter == 'comma':
        delimiter = ','
    elif options.delimiter.startswith('x'):
        # startswith() also copes with an empty -d value, which indexing
        # the first character would not.
        try:
            delimiter = chr(int(options.delimiter[1:]))
        except ValueError:
            raise Exception("Invalid delimiter")
    else:
        raise Exception("Invalid delimiter")

    kwargs = {
      'sheetid' : options.sheetid,
      'delimiter' : delimiter,
      'sheetdelimiter' : options.sheetdelimiter,
      'dateformat' : options.dateformat,
      'skip_empty_lines' : options.skip_empty_lines
    }

    if options.recursive:
        if len(args) == 1:
            convert_recursive(args[0], kwargs)
        else:
            parser.print_help()
    else:
        if len(args) < 1:
            parser.print_help()
        else:
            if len(args) > 1:
                outfile = open(args[1], 'w+b')
                try:
                    xlsx2csv(args[0], outfile, **kwargs)
                finally:
                    # Close the output file even when the conversion fails.
                    outfile.close()
            else:
                xlsx2csv(args[0], sys.stdout, **kwargs)
@@ -5,13 +5,14 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "xls_to_csv-paperclip-processor"
8
- s.version = "0.3.0"
8
+ s.version = "0.4.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Igor Alexandrov"]
12
- s.date = "2012-10-25"
12
+ s.date = "2012-11-27"
13
13
  s.description = "If you want to convert .xls to .csv simply and unwittingly, then this gem is for you!"
14
14
  s.email = "igor.alexandrov@gmail.com"
15
+ s.executables = ["xls2csv", "xlsx2csv"]
15
16
  s.extra_rdoc_files = [
16
17
  "README.md"
17
18
  ]
@@ -20,6 +21,8 @@ Gem::Specification.new do |s|
20
21
  "README.md",
21
22
  "Rakefile",
22
23
  "VERSION",
24
+ "bin/xls2csv",
25
+ "bin/xlsx2csv",
23
26
  "lib/xls_to_csv-paperclip-processor.rb",
24
27
  "xls_to_csv-paperclip-processor.gemspec"
25
28
  ]
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: xls_to_csv-paperclip-processor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-10-25 00:00:00.000000000 Z
12
+ date: 2012-11-27 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: paperclip
@@ -62,7 +62,9 @@ dependencies:
62
62
  description: If you want to convert .xls to .csv simply and unwittingly, then this
63
63
  gem is for you!
64
64
  email: igor.alexandrov@gmail.com
65
- executables: []
65
+ executables:
66
+ - xls2csv
67
+ - xlsx2csv
66
68
  extensions: []
67
69
  extra_rdoc_files:
68
70
  - README.md
@@ -71,6 +73,8 @@ files:
71
73
  - README.md
72
74
  - Rakefile
73
75
  - VERSION
76
+ - bin/xls2csv
77
+ - bin/xlsx2csv
74
78
  - lib/xls_to_csv-paperclip-processor.rb
75
79
  - xls_to_csv-paperclip-processor.gemspec
76
80
  homepage: http://github.com/igor-alexandrov/xls_to_csv-paperclip-processor
@@ -88,7 +92,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
88
92
  version: '0'
89
93
  segments:
90
94
  - 0
91
- hash: -4100455785649400648
95
+ hash: 2906243392911871525
92
96
  required_rubygems_version: !ruby/object:Gem::Requirement
93
97
  none: false
94
98
  requirements: