undatum 1.0.17__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,646 @@
1
+ # -*- coding: utf8 -*-
2
+ """File format conversion module."""
3
+ import csv
4
+ import logging
5
+ import xml.etree.ElementTree as etree
6
+ from collections import defaultdict
7
+
8
+ import bson
9
+ import orjson
10
+ import pandas
11
+ from bson import ObjectId
12
+ from iterable.helpers.detect import open_iterable
13
+ from tqdm import tqdm
14
+ from xlrd import open_workbook as load_xls
15
+
16
+ from ..utils import get_file_type, get_option, dict_generator
17
+
18
# Option keys that are forwarded to iterable.helpers.detect.open_iterable.
ITERABLE_OPTIONS_KEYS = ['tagname', 'delimiter', 'encoding', 'start_line', 'page']

# Number of records buffered before each bulk write in Converter.convert.
DEFAULT_BATCH_SIZE = 50000
21
+
22
def get_iterable_options(options):
    """Return the subset of *options* recognised by open_iterable."""
    return {key: options[key] for key in ITERABLE_OPTIONS_KEYS if key in options}
29
+
30
+
31
+
32
# NOTE(review): PREFIX_STRIP and PREFIX appear unused in this module — confirm before removal.
PREFIX_STRIP = True
PREFIX = ""

# Line terminator used when writing binary JSONL output.
LINEEND = '\n'.encode('utf8')
36
+
37
def df_to_pyorc_schema(df):
    """Extracts column information from pandas dataframe and generate pyorc schema"""
    # pandas dtype name -> pyorc type; anything unlisted falls back to string.
    dtype_to_orc = {
        'float64': 'float',
        'float32': 'float',
        'datetime64[ns]': 'timestamp',
        'int32': 'int',
        'int64': 'int',
    }
    schema = []
    for column, dtype in df.dtypes.to_dict().items():
        orc_type = dtype_to_orc.get(str(dtype), 'string')
        schema.append('%s:%s' % (column, orc_type))
    return schema
55
+
56
+
57
def __copy_options(user_options, default_options):
    """Fill missing keys of *user_options* from *default_options* (in place)."""
    for key, value in default_options.items():
        user_options.setdefault(key, value)
    return user_options
63
+
64
+
65
def etree_to_dict(t, prefix_strip=True):
    """Recursively convert an ElementTree element into a nested dict.

    Child elements become keys (lists when a tag repeats), attributes are
    stored with an '@' prefix, and mixed text content under '#text'.

    Args:
        t: xml.etree.ElementTree.Element to convert.
        prefix_strip: When True, drop XML namespace prefixes
            ('{uri}tag' -> 'tag') from element tags.

    Returns:
        Dict with a single top-level key: the element's tag.
    """
    tag = t.tag.rsplit('}', 1)[-1] if prefix_strip else t.tag
    d = {tag: {} if t.attrib else None}
    children = list(t)
    if children:
        dd = defaultdict(list)
        # Bug fix: propagate prefix_strip to recursive calls — previously
        # children were always converted with the default (True).
        for dc in (etree_to_dict(child, prefix_strip=prefix_strip)
                   for child in children):
            for k, v in dc.items():
                if prefix_strip:
                    # Remove XML namespace prefix (e.g., '{http://...}tagname' -> 'tagname')
                    k = k.rsplit('}', 1)[-1]
                dd[k].append(v)
        d = {tag: {k: v[0] if len(v) == 1 else v for k, v in dd.items()}}
    if t.attrib:
        # Attribute names always have their namespace prefix stripped,
        # matching the original behavior.
        d[tag].update(('@' + k.rsplit('}', 1)[-1], v) for k, v in t.attrib.items())
    if t.text:
        text = t.text.strip()
        if children or t.attrib:
            # Bug fix: the original re-stripped the tag here, which raised
            # KeyError for namespaced tags when prefix_strip=False.
            if text:
                d[tag]['#text'] = text
        else:
            d[tag] = text
    return d
90
+
91
+
92
def xml_to_jsonl(fromname, toname, options=None, default_options=None):
    """Stream-convert an XML file to JSON lines.

    Every element whose namespace-stripped tag equals options['tagname']
    (a required option) is converted with etree_to_dict and written as one
    JSON line to *toname*.
    """
    if options is None:
        options = {}
    if default_options is None:
        default_options = {'prefix_strip': True}
    options = __copy_options(options, default_options)
    written = 0
    with open(fromname, 'rb') as source, open(toname, 'wb') as sink:
        for _, elem in etree.iterparse(source):
            shorttag = elem.tag.rsplit('}', 1)[-1]
            if shorttag != options['tagname']:
                continue
            written += 1
            if options['prefix_strip']:
                record = etree_to_dict(elem, prefix_strip=options['prefix_strip'])
            else:
                record = etree_to_dict(elem)
            sink.write(orjson.dumps(record[shorttag]))
            sink.write(LINEEND)
            if written % 500 == 0:
                logging.info('xml2jsonl: processed %d xml tags', written)
    logging.info('xml2jsonl: processed %d xml tags finally', written)
115
+
116
+
117
def xls_to_csv(fromname, toname, options=None, default_options=None):
    """Convert the first sheet of an XLS workbook to a CSV file.

    Rows before 'start_line' and the last 'skip_end_rows' rows are dropped;
    cell values are stringified with newlines collapsed to spaces.
    """
    if options is None:
        options = {}
    if default_options is None:
        default_options = {'start_line': 0, 'skip_end_rows': 0,
                           'delimiter': ',', 'encoding': 'utf8'}
    options = __copy_options(options, default_options)
    workbook = load_xls(fromname)
    sheet = workbook.sheet_by_index(0)
    last_row = sheet.nrows - options['skip_end_rows']
    processed = 0
    with open(toname, 'w', encoding=options['encoding']) as out:
        writer = csv.writer(out, delimiter=options['delimiter'])
        for rownum in range(options['start_line'], last_row):
            processed += 1
            values = [str(sheet.cell_value(rownum, col)).replace('\n', ' ').strip()
                      for col in range(sheet.ncols)]
            writer.writerow(values)
            if processed % 10000 == 0:
                logging.info('xls2csv: processed %d records', processed)
141
+
142
+
143
def csv_to_bson(fromname, toname, options=None, default_options=None):
    """Convert a CSV file to a concatenated stream of BSON documents."""
    if options is None:
        options = {}
    if default_options is None:
        default_options = {'encoding': 'utf8', 'delimiter': ','}
    options = __copy_options(options, default_options)
    processed = 0
    with open(fromname, 'r', encoding=options['encoding']) as source, \
            open(toname, 'wb') as sink:
        for record in csv.DictReader(source, delimiter=options['delimiter']):
            processed += 1
            sink.write(bson.BSON.encode(record))
            if processed % 10000 == 0:
                logging.info('csv2bson: processed %d records', processed)
160
+
161
+
162
def csv_to_jsonl(fromname, toname, options=None, default_options=None):
    """Convert a CSV file to JSON lines.

    Each CSV row becomes one JSON object keyed by the header row.

    Bug fix: the original called ``json.dumps`` although ``json`` is never
    imported in this module (NameError at runtime). Serialization now uses
    orjson, which the module already imports; orjson emits UTF-8 bytes,
    matching the original's ``ensure_ascii=False`` intent.
    """
    if options is None:
        options = {}
    if default_options is None:
        default_options = {'encoding': 'utf8', 'delimiter': ','}
    options = __copy_options(options, default_options)
    with open(fromname, 'r', encoding=options['encoding']) as source:
        reader = csv.DictReader(source, delimiter=options['delimiter'])
        with open(toname, 'wb') as output:
            n = 0
            for row in reader:
                n += 1
                output.write(orjson.dumps(row))
                output.write(LINEEND)
                if n % 10000 == 0:
                    logging.info('csv2jsonl: processed %d records', n)
179
+
180
+
181
def xls_to_jsonl(fromname, toname, options=None, default_options=None):
    """Convert an XLS sheet to JSON lines.

    The sheet is selected with 'start_page'. Column names come from the
    comma-separated 'fields' option, or from the first processed row.
    """
    if options is None:
        options = {}
    if default_options is None:
        default_options = {'start_page': 0, 'start_line': 0, 'fields': None}
    options = __copy_options(options, default_options)
    workbook = load_xls(fromname)
    sheet = workbook.sheet_by_index(options['start_page'])
    fields = options['fields'].split(',') if options['fields'] is not None else None
    with open(toname, 'wb') as output:
        n = 0
        for rownum in range(options['start_line'], sheet.nrows):
            n += 1
            row_values = [sheet.row_values(rownum)[col] for col in range(sheet.ncols)]
            if n == 1 and fields is None:
                # First processed row is the header when 'fields' not given.
                fields = row_values
                continue
            output.write(orjson.dumps(dict(zip(fields, row_values))) + LINEEND)
            if n % 10000 == 0:
                logging.info('xls2jsonl: processed %d records', n)
206
+
207
+
208
+
209
+
210
def xlsx_to_jsonl(fromname, toname, options=None, default_options=None):
    """Convert an XLSX worksheet to JSON lines.

    The worksheet selected by 'start_page' is read row by row; column names
    come from the comma-separated 'fields' option, or else from the first
    non-skipped row.

    Bug fixes vs. the original:
    - 'fields' now defaults to None; previously options['fields'] raised
      KeyError whenever the caller did not supply it.
    - The header is taken from the first non-skipped row, so a non-zero
      'start_line' no longer loses the header (the original only treated
      row 1 as a header, which 'start_line' could skip).
    - Progress log tag corrected from 'xlsx2bson' to 'xlsx2jsonl'.

    Raises:
        ValueError: If 'start_page' exceeds the number of worksheets.
    """
    if options is None:
        options = {}
    if default_options is None:
        default_options = {'start_page': 0, 'start_line': 0, 'fields': None}
    from openpyxl import load_workbook as load_xlsx
    options = __copy_options(options, default_options)
    source = load_xlsx(fromname)
    # Use start_page to select the correct worksheet
    start_page = options.get('start_page', 0)
    if start_page >= len(source.worksheets):
        raise ValueError(f"start_page {start_page} exceeds available worksheets ({len(source.worksheets)})")
    sheet = source.worksheets[start_page]
    fields = (options['fields'].split(',')
              if options['fields'] is not None else None)
    with open(toname, 'wb') as output:
        n = 0
        for row in sheet.iter_rows():
            n += 1
            if n < options['start_line']:
                continue
            values = [cell.value for cell in row]
            if fields is None:
                fields = values
                continue
            output.write(orjson.dumps(dict(zip(fields, values))))
            output.write(LINEEND)
            if n % 10000 == 0:
                logging.info('xlsx2jsonl: processed %d records', n)
    source.close()
245
+
246
def xlsx_to_bson(fromname, toname, options=None, default_options=None):
    """Convert an XLSX worksheet to a stream of BSON documents.

    Bug fixes vs. the original:
    - 'fields' now defaults to None; previously options['fields'] raised
      KeyError whenever the caller did not supply it.
    - Honors the 'start_page' option to pick the worksheet (the original
      always used the active sheet and carried a FIXME for this).
    - The header is taken from the first non-skipped row, so a non-zero
      'start_line' no longer loses the header.

    Raises:
        ValueError: If 'start_page' exceeds the number of worksheets.
    """
    if options is None:
        options = {}
    if default_options is None:
        default_options = {'start_page': 0, 'start_line': 0, 'fields': None}
    from openpyxl import load_workbook as load_xlsx
    options = __copy_options(options, default_options)
    source = load_xlsx(fromname)
    start_page = options.get('start_page', 0)
    if start_page >= len(source.worksheets):
        raise ValueError(f"start_page {start_page} exceeds available worksheets ({len(source.worksheets)})")
    sheet = source.worksheets[start_page]
    fields = (options['fields'].split(',')
              if options['fields'] is not None else None)
    with open(toname, 'wb') as output:
        n = 0
        for row in sheet.iter_rows():
            n += 1
            if n < options['start_line']:
                continue
            values = [cell.value for cell in row]
            if fields is None:
                fields = values
                continue
            output.write(bson.BSON.encode(dict(zip(fields, values))))
            if n % 10000 == 0:
                logging.debug('xlsx2bson: processed %d records', n)
    source.close()
276
+
277
def xls_to_bson(fromname, toname, options=None, default_options=None):
    """Convert an XLS sheet to a stream of BSON documents.

    Bug fix: the original read options['fields'] without providing a
    default, raising KeyError whenever the caller omitted it. 'fields' now
    defaults to None, in which case the first processed row is used as the
    header (matching xls_to_jsonl). A comma-separated string or an explicit
    list of names is still accepted.
    """
    if options is None:
        options = {}
    if default_options is None:
        default_options = {'start_page': 0, 'start_line': 0, 'fields': None}
    options = __copy_options(options, default_options)
    source = load_xls(fromname)
    sheet = source.sheet_by_index(options['start_page'])
    fields = (options['fields'].split(',')
              if isinstance(options['fields'], str) else options['fields'])
    with open(toname, 'wb') as output:
        n = 0
        for rownum in range(options['start_line'], sheet.nrows):
            n += 1
            values = [sheet.row_values(rownum)[col] for col in range(sheet.ncols)]
            if n == 1 and fields is None:
                # First processed row becomes the header.
                fields = values
                continue
            output.write(bson.BSON.encode(dict(zip(fields, values))))
            if n % 10000 == 0:
                logging.info('xls2bson: processed %d records', n)
296
+
297
+
298
def _is_flat(item):
    """Return True when no value of *item* is a nested container."""
    return not any(isinstance(value, (dict, tuple, list))
                   for value in item.values())
304
+
305
+
306
def express_analyze_jsonl(filename, itemlimit=100):
    """Sample up to *itemlimit* records of a JSONL file.

    Returns a dict: 'isflat' — True if every sampled record is flat,
    'keys' — sorted list of all top-level keys seen.
    """
    isflat = True
    keys = set()
    n = 0
    with open(filename, 'r', encoding='utf8') as source:
        for line in source:
            n += 1
            if n > itemlimit:
                break
            record = orjson.loads(line)
            isflat = isflat and _is_flat(record)
            keys.update(record.keys())
    return {'isflat': isflat, 'keys': sorted(keys)}
327
+
328
+
329
def jsonl_to_csv(fromname, toname, options=None, default_options=None):
    """Convert a flat JSONL file to CSV.

    The column set is detected from the first 'useitems' records. Nested
    data is refused unless the 'force_flat' option is set; missing keys are
    written as empty strings.
    """
    if options is None:
        options = {}
    if default_options is None:
        default_options = {'force_flat': False, 'useitems': 100, 'delimiter': ','}
    options = __copy_options(options, default_options)
    analysis = express_analyze_jsonl(fromname, itemlimit=options['useitems'])
    if not options['force_flat'] and not analysis['isflat']:
        logging.error("File %s is not flat and 'force_flat' flag not set. "
                      "File not converted", fromname)
        return
    keys = analysis['keys']
    with open(toname, 'w', encoding='utf8') as out:
        writer = csv.writer(out, delimiter=options['delimiter'])
        writer.writerow(keys)
        with open(fromname, 'r', encoding='utf8') as source:
            n = 0
            for line in source:
                n += 1
                record = orjson.loads(line)
                writer.writerow([record.get(key, '') for key in keys])
                if n % 10000 == 0:
                    logging.info('jsonl2csv: processed %d records', n)
359
+
360
+
361
def default(obj):
    """orjson fallback serializer: BSON ObjectId -> str, anything else -> None."""
    return str(obj) if isinstance(obj, ObjectId) else None
366
+
367
def bson_to_jsonl(fromname, toname, options=None, default_options=None):
    """Convert a BSON document stream to JSON lines."""
    if options is None:
        options = {}
    if default_options is None:
        default_options = {}
    options = __copy_options(options, default_options)
    n = 0
    with open(fromname, 'rb') as source, open(toname, 'wb') as output:
        for record in bson.decode_file_iter(source):
            n += 1
            # ObjectId values are stringified by the module-level default().
            output.write(orjson.dumps(record, default=default) + LINEEND)
            if n % 10000 == 0:
                logging.info('bson2jsonl: processed %d records', n)
383
+
384
+
385
def json_to_jsonl(fromname, toname, options=None, default_options=None):
    """Simple implementation of JSON to JSON lines conversion.

    Assumes the JSON document is an array, or a dict whose
    options['tagname'] value holds the list of records.

    Bug fixes vs. the original:
    - ``json.load`` was called although ``json`` is never imported in this
      module (NameError at runtime); parsing now uses orjson, which the
      module already imports.
    - The source file is opened with a context manager so it is closed even
      when parsing raises (the original only closed it on success).
    """
    if options is None:
        options = {}
    if default_options is None:
        default_options = {}
    options = __copy_options(options, default_options)
    with open(fromname, 'rb') as source:
        data = orjson.loads(source.read())
    if 'tagname' in options.keys():
        if isinstance(data, dict) and options['tagname'] in data:
            data = data[options['tagname']]
    with open(toname, 'wb') as output:
        n = 0
        for record in data:
            n += 1
            output.write(orjson.dumps(record) + LINEEND)
            if n % 10000 == 0:
                logging.info('json2jsonl: processed %d records', n)
409
+
410
+
411
def csv_to_parquet(fromname, toname, options=None, default_options=None):
    """Convert a CSV file to Parquet via a pandas DataFrame."""
    if options is None:
        options = {}
    if default_options is None:
        default_options = {'encoding': 'utf8', 'delimiter': ',',
                           'compression': 'brotli'}
    options = __copy_options(options, default_options)
    frame = pandas.read_csv(fromname, delimiter=options['delimiter'],
                            encoding=options['encoding'])
    # The literal string 'None' disables compression.
    compression = options['compression']
    if compression == 'None':
        compression = None
    frame.to_parquet(toname, compression=compression)
424
+
425
+
426
def jsonl_to_parquet(fromname, toname, options=None, default_options=None):
    """Convert a JSONL file to Parquet via a pandas DataFrame.

    Bug fix: the original read options['encoding'] but never provided a
    default for it, raising KeyError whenever the caller omitted it;
    'encoding' now defaults to 'utf8' like the other converters.
    """
    if options is None:
        options = {}
    if default_options is None:
        default_options = {'force_flat': False, 'useitems': 100,
                           'compression': 'brotli', 'encoding': 'utf8'}
    options = __copy_options(options, default_options)
    df = pandas.read_json(fromname, lines=True, encoding=options['encoding'])
    # The literal string 'None' disables compression.
    comp = (options['compression']
            if options['compression'] != 'None' else None)
    df.to_parquet(toname, compression=comp)
438
+
439
+
440
# Maps compression option names to pyorc CompressionKind numeric codes
# (0 = NONE, 1 = ZLIB, 2 = SNAPPY, 3 = LZO, 4 = LZ4, 5 = ZSTD).
PYORC_COMPRESSION_MAP = {'zstd': 5, 'snappy' : 2, 'zlib' : 1, 'lzo' : 3, 'lz4' : 4, 'None' : 0}
441
+
442
def csv_to_orc(fromname, toname, options=None, default_options=None):
    """Converts CSV file to ORC file; every column is written as a string.

    Bug fixes vs. the original:
    - The pyorc Writer was never closed, which can leave the ORC footer
      unwritten; it is now used as a context manager.
    - The row-error message passed printf-style args to print(), emitting
      the literal placeholder; it now goes through logging with lazy args.
    """
    if options is None:
        options = {}
    if default_options is None:
        default_options = {'encoding': 'utf8', 'delimiter': ',',
                           'compression': 'zstd'}
    import pyorc
    options = __copy_options(options, default_options)
    # Unknown compression names fall back to 0 (no compression).
    compression = PYORC_COMPRESSION_MAP.get(options['compression'], 0)
    with open(fromname, 'r', encoding=options['encoding']) as source:
        reader = csv.DictReader(source, delimiter=options['delimiter'])
        schema_str = ','.join('%s:string' % field for field in reader.fieldnames)
        with open(toname, 'wb') as output:
            with pyorc.Writer(output, f"struct<{schema_str}>",
                              struct_repr=pyorc.StructRepr.DICT,
                              compression=compression,
                              compression_strategy=1) as writer:
                n = 0
                for row in reader:
                    n += 1
                    try:
                        writer.write(row)
                    except TypeError:
                        logging.error('Error processing row %d. Skip and continue', n)
472
+
473
def jsonl_to_orc(fromname, toname, options=None, default_options=None):
    """Converts JSONL file to ORC file, inferring the schema from pandas dtypes.

    Bug fixes vs. the original:
    - 'encoding' now has a default ('utf8'); previously reading
      options['encoding'] raised KeyError when the caller omitted it.
    - The pyorc Writer is used as a context manager so the ORC footer is
      always flushed.
    """
    if options is None:
        options = {}
    if default_options is None:
        default_options = {'force_flat': False, 'useitems': 100,
                           'compression': 'zstd', 'encoding': 'utf8'}
    import pyorc
    options = __copy_options(options, default_options)
    # Unknown compression names fall back to 0 (no compression).
    compression = PYORC_COMPRESSION_MAP.get(options['compression'], 0)
    df = pandas.read_json(fromname, lines=True, encoding=options['encoding'])
    df.info()
    schema_str = ','.join(df_to_pyorc_schema(df))
    with open(toname, 'wb') as output:
        with pyorc.Writer(output, f"struct<{schema_str}>",
                          struct_repr=pyorc.StructRepr.DICT,
                          compression=compression,
                          compression_strategy=1) as writer:
            writer.writerows(df.to_dict(orient="records"))
495
+
496
def csv_to_avro(fromname, toname, options=None, default_options=None):
    """Converts CSV file to AVRO file; every field is typed as string.

    Bug fixes vs. the original:
    - ``json.dumps`` was called although ``json`` is never imported in this
      module (NameError); json is now imported locally, alongside the other
      avro-related imports.
    - The DataFileWriter is closed in a finally block so the Avro container
      is flushed even on error.
    - The row-error message passed printf-style args to print(), emitting
      the literal placeholder; it now goes through logging with lazy args.
    """
    if options is None:
        options = {}
    if default_options is None:
        default_options = {'encoding': 'utf8', 'delimiter': ',',
                           'compression': 'deflate'}
    import json
    import avro.schema
    from avro.datafile import DataFileWriter
    from avro.io import DatumWriter

    options = __copy_options(options, default_options)
    with open(fromname, 'r', encoding=options['encoding']) as source:
        reader = csv.DictReader(source, delimiter=options['delimiter'])

        schema_dict = {"namespace": "data.avro", "type": "record",
                       "name": "Record",
                       "fields": [{'name': field, 'type': 'string'}
                                  for field in reader.fieldnames]}
        schema = avro.schema.parse(json.dumps(schema_dict))
        with open(toname, 'wb') as output:
            writer = DataFileWriter(output, DatumWriter(), schema,
                                    codec=options['compression'])
            try:
                n = 0
                for row in reader:
                    n += 1
                    try:
                        writer.append(row)
                    except TypeError:
                        logging.error('Error processing row %d. Skip and continue', n)
            finally:
                writer.close()
527
+
528
# Dispatch table mapping '<from>2<to>' conversion keys to converter functions;
# used by Converter.convert_old.
CONVERT_FUNC_MAP = {
    'xls2csv': xls_to_csv,
    'xls2jsonl': xls_to_jsonl,
    'xls2bson': xls_to_bson,
    'xlsx2jsonl': xlsx_to_jsonl,
    'xlsx2bson': xlsx_to_bson,
    'csv2jsonl': csv_to_jsonl,
    'csv2bson': csv_to_bson,
    'xml2jsonl': xml_to_jsonl,
    'jsonl2csv': jsonl_to_csv,
    'bson2jsonl': bson_to_jsonl,
    'json2jsonl': json_to_jsonl,
    'csv2parquet' : csv_to_parquet,
    'jsonl2parquet': jsonl_to_parquet,
    'jsonl2orc' : jsonl_to_orc,
    'csv2orc' : csv_to_orc,
    'csv2avro' : csv_to_avro,
}


# Maximum number of records sampled during schema/header detection.
DEFAULT_HEADERS_DETECT_LIMIT = 1000
549
+
550
def make_flat(item):
    """Return a copy of *item* with nested containers converted to strings."""
    return {key: str(value) if isinstance(value, (tuple, list, dict)) else value
            for key, value in item.items()}
559
+
560
class Converter:
    """File format converter handler: schema detection plus streaming conversion."""

    def __init__(self, batch_size=DEFAULT_BATCH_SIZE):
        # Number of records buffered between bulk writes in convert().
        self.batch_size = batch_size

    def convert(self, fromfile, tofile, options=None, limit=DEFAULT_HEADERS_DETECT_LIMIT):
        """Convert file from one format to another.

        Processes files in two phases:
        1. Schema extraction: samples records to determine field structure.
        2. Conversion: streams records from source to destination format.

        Args:
            fromfile: Path to input file.
            tofile: Path to output file.
            options: Dictionary of conversion options (encoding, delimiter, etc.).
            limit: Maximum records to sample for schema detection.

        Raises:
            ValueError: If file format is not supported.
            IOError: If file cannot be read or written.
        """
        if options is None:
            options = {}
        iterableargs = get_iterable_options(options)
        it_in = open_iterable(fromfile, mode='r', iterableargs=iterableargs)
        is_flatten = get_option(options, 'flatten')
        keys_set = set()  # set gives O(1) membership tests while sampling
        n = 0
        logging.info('Extracting schema')
        for item in tqdm(it_in, total=limit):
            if limit is not None and n > limit:
                break
            n += 1
            if not is_flatten:
                # Collect dotted key paths for nested records.
                for path in dict_generator(item):
                    keys_set.add(".".join(path[:-1]))
            else:
                item = make_flat(item)
                keys_set.update(item.keys())

        keys = list(keys_set)
        it_in.reset()
        it_out = open_iterable(tofile, mode='w', iterableargs={'keys': keys})

        logging.info('Converting data')
        n = 0
        batch = []
        for row in tqdm(it_in):
            n += 1
            if is_flatten:
                # Fill absent columns so every flattened row carries all keys.
                for k in keys:
                    if k not in row.keys():
                        row[k] = None
                batch.append(make_flat(row))
            else:
                batch.append(row)
            if n % self.batch_size == 0:
                it_out.write_bulk(batch)
                batch = []
        if len(batch) > 0:
            it_out.write_bulk(batch)
        it_in.close()
        it_out.close()

    def convert_old(self, fromfile, tofile, options=None):
        """Legacy conversion method dispatching through CONVERT_FUNC_MAP.

        Bug fix: 'format_in'/'format_out' are now read with dict.get, so a
        missing key falls back to file-extension detection instead of
        raising KeyError (the original indexed the dict directly). Logging
        also uses lazy %-args instead of eager formatting.
        """
        if options is None:
            options = {}
        fromtype = options.get('format_in')
        if fromtype is None:
            fromtype = get_file_type(fromfile)
        totype = options.get('format_out')
        if totype is None:
            totype = get_file_type(tofile)
        key = '%s2%s' % (fromtype, totype)
        func = CONVERT_FUNC_MAP.get(key, None)
        if func is None:
            logging.error('Conversion between %s and %s not supported', fromtype, totype)
        else:
            logging.info('Convert %s from %s to %s', key, fromfile, tofile)
            func(fromfile, tofile, options)