undatum-1.0.17-py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- undatum/__init__.py +9 -0
- undatum/__main__.py +25 -0
- undatum/ai/__init__.py +145 -0
- undatum/ai/base.py +85 -0
- undatum/ai/config.py +184 -0
- undatum/ai/perplexity.py +79 -0
- undatum/ai/providers.py +1002 -0
- undatum/ai/schemas.py +42 -0
- undatum/cmds/__init__.py +6 -0
- undatum/cmds/analyzer.py +697 -0
- undatum/cmds/converter.py +646 -0
- undatum/cmds/ingester.py +116 -0
- undatum/cmds/query.py +68 -0
- undatum/cmds/schemer.py +328 -0
- undatum/cmds/selector.py +437 -0
- undatum/cmds/statistics.py +158 -0
- undatum/cmds/textproc.py +59 -0
- undatum/cmds/transformer.py +81 -0
- undatum/cmds/validator.py +137 -0
- undatum/common/__init__.py +6 -0
- undatum/common/functions.py +81 -0
- undatum/common/iterable.py +222 -0
- undatum/common/scheme.py +261 -0
- undatum/constants.py +21 -0
- undatum/core.py +616 -0
- undatum/formats/__init__.py +6 -0
- undatum/formats/docx.py +160 -0
- undatum/utils.py +298 -0
- undatum/validate/__init__.py +11 -0
- undatum/validate/commonrules.py +15 -0
- undatum/validate/ruscodes.py +202 -0
- undatum-1.0.17.dist-info/METADATA +610 -0
- undatum-1.0.17.dist-info/RECORD +37 -0
- undatum-1.0.17.dist-info/WHEEL +6 -0
- undatum-1.0.17.dist-info/entry_points.txt +3 -0
- undatum-1.0.17.dist-info/licenses/LICENSE +21 -0
- undatum-1.0.17.dist-info/top_level.txt +1 -0
undatum/cmds/converter.py
@@ -0,0 +1,646 @@
# -*- coding: utf8 -*-
"""File format conversion module."""
import csv
import json  # used by csv_to_jsonl, json_to_jsonl and csv_to_avro below
import logging
import xml.etree.ElementTree as etree
from collections import defaultdict

import bson
import orjson
import pandas
from bson import ObjectId
from iterable.helpers.detect import open_iterable
from tqdm import tqdm
from xlrd import open_workbook as load_xls

from ..utils import get_file_type, get_option, dict_generator

ITERABLE_OPTIONS_KEYS = ['tagname', 'delimiter', 'encoding', 'start_line', 'page']

DEFAULT_BATCH_SIZE = 50000

def get_iterable_options(options):
    """Extract iterable-specific options from the options dictionary."""
    out = {}
    for k in ITERABLE_OPTIONS_KEYS:
        if k in options.keys():
            out[k] = options[k]
    return out


PREFIX_STRIP = True
PREFIX = ""

LINEEND = '\n'.encode('utf8')

def df_to_pyorc_schema(df):
    """Extract column dtypes from a pandas DataFrame and generate a pyorc struct schema."""
    struct_schema = []
    for k, v in df.dtypes.to_dict().items():
        v = str(v)
        if v == 'float64':
            struct_schema.append('%s:float' % (k))
        elif v == 'float32':
            struct_schema.append('%s:float' % (k))
        elif v == 'datetime64[ns]':
            struct_schema.append('%s:timestamp' % (k))
        elif v == 'int32':
            struct_schema.append('%s:int' % (k))
        elif v == 'int64':
            struct_schema.append('%s:int' % (k))
        else:
            struct_schema.append('%s:string' % (k))
    return struct_schema

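For instance, a frame with int64, float64 and object columns maps to int, float and string fields. A small illustrative sketch (the frame below is hypothetical, under recent pandas):

    import pandas
    df = pandas.DataFrame({'a': [1, 2], 'b': [0.5, 1.5], 'c': ['x', 'y']})
    df_to_pyorc_schema(df)
    # -> ['a:int', 'b:float', 'c:string']
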
def __copy_options(user_options, default_options):
    """Merge defaults into user options: a user-provided value wins; missing keys fall back to the default."""
    for k in default_options.keys():
        if k not in user_options.keys():
            user_options[k] = default_options[k]
    return user_options

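A minimal sketch of the merge behaviour; note that user_options is updated in place and also returned (the values shown are illustrative):

    __copy_options({'delimiter': ';'}, {'delimiter': ',', 'encoding': 'utf8'})
    # -> {'delimiter': ';', 'encoding': 'utf8'}
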
def etree_to_dict(t, prefix_strip=True):
    """Convert an XML element tree to a dictionary."""
    tag = t.tag if not prefix_strip else t.tag.rsplit('}', 1)[-1]
    d = {tag: {} if t.attrib else None}
    children = list(t)
    if children:
        dd = defaultdict(list)
        for dc in map(etree_to_dict, children):
            for k, v in dc.items():
                if prefix_strip:
                    # Remove XML namespace prefix (e.g., '{http://...}tagname' -> 'tagname')
                    k = k.rsplit('}', 1)[-1]
                dd[k].append(v)
        d = {tag: {k: v[0] if len(v) == 1 else v for k, v in dd.items()}}
    if t.attrib:
        d[tag].update(('@' + k.rsplit('}', 1)[-1], v) for k, v in t.attrib.items())
    if t.text:
        text = t.text.strip()
        if children or t.attrib:
            tag = tag.rsplit('}', 1)[-1]
            if text:
                d[tag]['#text'] = text
        else:
            d[tag] = text
    return d

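A minimal sketch of how etree_to_dict behaves on a small fragment (the XML content here is illustrative, not part of the module): attributes are prefixed with '@' and child elements become keys.

    import xml.etree.ElementTree as ET
    elem = ET.fromstring('<item id="1"><name>test</name></item>')
    etree_to_dict(elem)
    # -> {'item': {'name': 'test', '@id': '1'}}
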
def xml_to_jsonl(fromname, toname, options=None, default_options=None):
    """Convert an XML file to JSONL format.

    The 'tagname' option is required: it names the repeated XML element that
    becomes one JSON record per output line.
    """
    if options is None:
        options = {}
    if default_options is None:
        default_options = {'prefix_strip': True}
    options = __copy_options(options, default_options)
    with open(fromname, 'rb') as ins, open(toname, 'wb') as outf:
        n = 0
        for event, elem in etree.iterparse(ins):
            shorttag = elem.tag.rsplit('}', 1)[-1]
            if shorttag == options['tagname']:
                n += 1
                if options['prefix_strip']:
                    j = etree_to_dict(elem,
                                      prefix_strip=options['prefix_strip'])
                else:
                    j = etree_to_dict(elem)
                outf.write(orjson.dumps(j[shorttag]))
                outf.write(LINEEND)
                if n % 500 == 0:
                    logging.info('xml2jsonl: processed %d xml tags', n)
        logging.info('xml2jsonl: finished, processed %d xml tags', n)

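A minimal usage sketch (not part of the module; the file names and tag name are hypothetical):

    xml_to_jsonl('items.xml', 'items.jsonl', options={'tagname': 'item'})
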
def xls_to_csv(fromname, toname, options=None, default_options=None):
    """Convert an XLS file to CSV format."""
    if options is None:
        options = {}
    if default_options is None:
        default_options = {'start_line': 0, 'skip_end_rows': 0,
                           'delimiter': ',', 'encoding': 'utf8'}
    options = __copy_options(options, default_options)
    b = load_xls(fromname)
    s = b.sheet_by_index(0)
    with open(toname, 'w', encoding=options['encoding']) as bc:
        bcw = csv.writer(bc, delimiter=options['delimiter'])
        n = 0
        end_row = s.nrows - options['skip_end_rows']
        for row in range(options['start_line'], end_row):
            n += 1
            this_row = []
            for col in range(s.ncols):
                v = str(s.cell_value(row, col))
                v = v.replace('\n', ' ').strip()
                this_row.append(v)
            bcw.writerow(this_row)
            if n % 10000 == 0:
                logging.info('xls2csv: processed %d records', n)

def csv_to_bson(fromname, toname, options=None, default_options=None):
    """Convert a CSV file to BSON format."""
    if options is None:
        options = {}
    if default_options is None:
        default_options = {'encoding': 'utf8', 'delimiter': ','}
    options = __copy_options(options, default_options)
    with open(fromname, 'r', encoding=options['encoding']) as source:
        reader = csv.DictReader(source, delimiter=options['delimiter'])
        with open(toname, 'wb') as output:
            n = 0
            for j in reader:
                n += 1
                rec = bson.BSON.encode(j)
                output.write(rec)
                if n % 10000 == 0:
                    logging.info('csv2bson: processed %d records', n)

def csv_to_jsonl(fromname, toname, options=None, default_options=None):
    """Convert a CSV file to JSONL format."""
    if options is None:
        options = {}
    if default_options is None:
        default_options = {'encoding': 'utf8', 'delimiter': ','}
    options = __copy_options(options, default_options)
    with open(fromname, 'r', encoding=options['encoding']) as source:
        reader = csv.DictReader(source, delimiter=options['delimiter'])
        with open(toname, 'wb') as output:
            n = 0
            for j in reader:
                n += 1
                output.write(json.dumps(j, ensure_ascii=False).encode('utf8'))
                output.write('\n'.encode('utf8'))
                if n % 10000 == 0:
                    logging.info('csv2jsonl: processed %d records', n)

def xls_to_jsonl(fromname, toname, options=None, default_options=None):
    """Convert an XLS file to JSONL format."""
    if options is None:
        options = {}
    if default_options is None:
        default_options = {'start_page': 0, 'start_line': 0, 'fields': None}
    options = __copy_options(options, default_options)
    source = load_xls(fromname)
    sheet = source.sheet_by_index(options['start_page'])
    with open(toname, 'wb') as output:
        n = 0
        fields = (options['fields'].split(',')
                  if options['fields'] is not None else None)
        for rownum in range(options['start_line'], sheet.nrows):
            n += 1
            tmp = list()
            for i in range(0, sheet.ncols):
                tmp.append(sheet.row_values(rownum)[i])
            if n == 1 and fields is None:
                fields = tmp
                continue
            line = orjson.dumps(dict(zip(fields, tmp)))
            output.write(line + LINEEND)
            if n % 10000 == 0:
                logging.info('xls2jsonl: processed %d records', n)

def xlsx_to_jsonl(fromname, toname, options=None, default_options=None):
    """Convert an XLSX file to JSONL format."""
    if options is None:
        options = {}
    if default_options is None:
        default_options = {'start_page': 0, 'start_line': 0, 'fields': None}
    from openpyxl import load_workbook as load_xlsx
    options = __copy_options(options, default_options)
    source = load_xlsx(fromname)
    # Use start_page to select the correct worksheet
    start_page = options.get('start_page', 0)
    if start_page >= len(source.worksheets):
        raise ValueError(f"start_page {start_page} exceeds available worksheets ({len(source.worksheets)})")
    sheet = source.worksheets[start_page]
    with open(toname, 'wb') as output:
        n = 0
        fields = (options['fields'].split(',')
                  if options['fields'] is not None else None)
        for row in sheet.iter_rows():
            n += 1
            if n < options['start_line']:
                continue
            tmp = list()
            for cell in row:
                tmp.append(cell.value)
            if n == 1 and fields is None:
                fields = tmp
                continue
            line = orjson.dumps(dict(zip(fields, tmp)))
            output.write(line)
            output.write(LINEEND)
            if n % 10000 == 0:
                logging.debug('xlsx2jsonl: processed %d records', n)
    source.close()

def xlsx_to_bson(fromname, toname, options=None, default_options=None):
    """Convert an XLSX file to BSON format."""
    if options is None:
        options = {}
    if default_options is None:
        default_options = {'start_page': 0, 'start_line': 0, 'fields': None}
    from openpyxl import load_workbook as load_xlsx
    options = __copy_options(options, default_options)
    source = load_xlsx(fromname)
    sheet = source.active  # FIXME! Use start_page instead
    with open(toname, 'wb') as output:
        n = 0
        fields = (options['fields'].split(',')
                  if options['fields'] is not None else None)
        for row in sheet.iter_rows():
            n += 1
            if n < options['start_line']:
                continue
            tmp = list()
            for cell in row:
                tmp.append(cell.value)
            if n == 1 and fields is None:
                fields = tmp
                continue
            output.write(bson.BSON.encode(dict(zip(fields, tmp))))
            if n % 10000 == 0:
                logging.debug('xlsx2bson: processed %d records', n)
    source.close()

def xls_to_bson(fromname, toname, options=None, default_options=None):
    """Convert an XLS file to BSON format.

    Unlike xls_to_jsonl, there is no header-row detection here: a 'fields'
    list with the column names must be provided in options.
    """
    if options is None:
        options = {}
    if default_options is None:
        default_options = {'start_page': 0, 'start_line': 0}
    options = __copy_options(options, default_options)
    source = load_xls(fromname)
    sheet = source.sheet_by_index(options['start_page'])
    with open(toname, 'wb') as output:
        n = 0
        for rownum in range(options['start_line'], sheet.nrows):
            n += 1
            tmp = list()
            for i in range(0, sheet.ncols):
                tmp.append(sheet.row_values(rownum)[i])
            output.write(bson.BSON.encode(dict(zip(options['fields'], tmp))))
            if n % 10000 == 0:
                logging.info('xls2bson: processed %d records', n)

def _is_flat(item):
    """Check if a dictionary item is flat (no nested structures)."""
    for k, v in item.items():
        if isinstance(v, (dict, tuple, list)):
            return False
    return True

def express_analyze_jsonl(filename, itemlimit=100):
    """Quickly analyze JSONL file structure."""
    isflat = True
    n = 0
    keys = set()
    with open(filename, 'r', encoding='utf8') as f:
        for line in f:
            n += 1
            if n > itemlimit:
                break
            record = orjson.loads(line)
            if isflat:
                if not _is_flat(record):
                    isflat = False
            if len(keys) == 0:
                keys = set(record.keys())
            else:
                keys = keys.union(set(record.keys()))
    keys = list(keys)
    keys.sort()
    return {'isflat': isflat, 'keys': keys}

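An illustrative sketch of the analysis result (the file name and keys are hypothetical and depend on the data sampled):

    express_analyze_jsonl('people.jsonl', itemlimit=10)
    # -> {'isflat': True, 'keys': ['age', 'id', 'name']}
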
def jsonl_to_csv(fromname, toname, options=None, default_options=None):
    """Convert a JSONL file to CSV format."""
    if options is None:
        options = {}
    if default_options is None:
        default_options = {'force_flat': False, 'useitems': 100, 'delimiter': ','}
    options = __copy_options(options, default_options)
    analysis = express_analyze_jsonl(fromname, itemlimit=options['useitems'])
    if not options['force_flat'] and not analysis['isflat']:
        logging.error("File %s is not flat and 'force_flat' flag not set. "
                      "File not converted", fromname)
        return
    keys = analysis['keys']
    with open(toname, 'w', encoding='utf8') as out:
        writer = csv.writer(out, delimiter=options['delimiter'])
        writer.writerow(keys)
        with open(fromname, 'r', encoding='utf8') as f:
            n = 0
            for line in f:
                n += 1
                record = orjson.loads(line)
                item = []
                for k in keys:
                    if k in record:
                        item.append(record[k])
                    else:
                        item.append('')
                writer.writerow(item)
                if n % 10000 == 0:
                    logging.info('jsonl2csv: processed %d records', n)

def default(obj):
    """Default serializer for BSON ObjectId."""
    if isinstance(obj, ObjectId):
        return str(obj)
    return None


def bson_to_jsonl(fromname, toname, options=None, default_options=None):
    """Convert a BSON file to JSONL format."""
    if options is None:
        options = {}
    if default_options is None:
        default_options = {}
    options = __copy_options(options, default_options)
    with open(fromname, 'rb') as source:
        with open(toname, 'wb') as output:
            n = 0
            for r in bson.decode_file_iter(source):
                n += 1
                output.write(orjson.dumps(r, default=default))
                output.write(LINEEND)
                if n % 10000 == 0:
                    logging.info('bson2jsonl: processed %d records', n)

def json_to_jsonl(fromname, toname, options=None, default_options=None):
    """Simple implementation of JSON to JSON lines conversion.

    Assumes the JSON document is either an array of records or a dict whose
    first-level value (selected via the 'tagname' option) holds the data.
    """
    if options is None:
        options = {}
    if default_options is None:
        default_options = {}
    options = __copy_options(options, default_options)
    source = open(fromname, 'rb')
    source_data = json.load(source)
    data = source_data
    if 'tagname' in options.keys():
        if isinstance(source_data, dict) and options['tagname'] in source_data:
            data = data[options['tagname']]
    with open(toname, 'wb') as output:
        n = 0
        for r in data:
            n += 1
            output.write(orjson.dumps(r) + LINEEND)
            if n % 10000 == 0:
                logging.info('json2jsonl: processed %d records', n)
    source.close()

def csv_to_parquet(fromname, toname, options=None, default_options=None):
    """Convert a CSV file to Parquet format."""
    if options is None:
        options = {}
    if default_options is None:
        default_options = {'encoding': 'utf8', 'delimiter': ',',
                           'compression': 'brotli'}
    options = __copy_options(options, default_options)
    df = pandas.read_csv(fromname, delimiter=options['delimiter'],
                         encoding=options['encoding'])
    comp = (options['compression']
            if options['compression'] != 'None' else None)
    df.to_parquet(toname, compression=comp)

def jsonl_to_parquet(fromname, toname, options=None, default_options=None):
    """Convert a JSONL file to Parquet format."""
    if options is None:
        options = {}
    if default_options is None:
        default_options = {'force_flat': False, 'useitems': 100,
                           'encoding': 'utf8', 'compression': 'brotli'}
    options = __copy_options(options, default_options)
    df = pandas.read_json(fromname, lines=True, encoding=options['encoding'])
    comp = (options['compression']
            if options['compression'] != 'None' else None)
    df.to_parquet(toname, compression=comp)

PYORC_COMPRESSION_MAP = {'zstd': 5, 'snappy': 2, 'zlib': 1, 'lzo': 3, 'lz4': 4, 'None': 0}

def csv_to_orc(fromname, toname, options=None, default_options=None):
    """Convert a CSV file to an ORC file."""
    if options is None:
        options = {}
    if default_options is None:
        default_options = {'encoding': 'utf8', 'delimiter': ',',
                           'compression': 'zstd'}
    import pyorc
    options = __copy_options(options, default_options)
    comp_key = options['compression']
    compression = (PYORC_COMPRESSION_MAP[comp_key]
                   if comp_key in PYORC_COMPRESSION_MAP.keys() else 0)
    with open(fromname, 'r', encoding=options['encoding']) as source:
        reader = csv.DictReader(source, delimiter=options['delimiter'])
        struct_schema = []
        for field in reader.fieldnames:
            struct_schema.append('%s:string' % (field))
        schema_str = ','.join(struct_schema)
        with open(toname, 'wb') as output:
            writer = pyorc.Writer(output, f"struct<{schema_str}>",
                                  struct_repr=pyorc.StructRepr.DICT,
                                  compression=compression,
                                  compression_strategy=1)
            n = 0
            for row in reader:
                n += 1
                try:
                    writer.write(row)
                except TypeError:
                    logging.error('csv2orc: error processing row %d, skipped', n)
            writer.close()

def jsonl_to_orc(fromname, toname, options=None, default_options=None):
    """Convert a JSONL file to an ORC file."""
    if options is None:
        options = {}
    if default_options is None:
        default_options = {'force_flat': False, 'useitems': 100,
                           'encoding': 'utf8', 'compression': 'zstd'}
    import pyorc
    options = __copy_options(options, default_options)
    comp_key = options['compression']
    compression = (PYORC_COMPRESSION_MAP[comp_key]
                   if comp_key in PYORC_COMPRESSION_MAP.keys() else 0)
    df = pandas.read_json(fromname, lines=True, encoding=options['encoding'])
    df.info()
    struct_schema = df_to_pyorc_schema(df)
    schema_str = ','.join(struct_schema)
    with open(toname, 'wb') as output:
        writer = pyorc.Writer(output, f"struct<{schema_str}>",
                              struct_repr=pyorc.StructRepr.DICT,
                              compression=compression,
                              compression_strategy=1)
        writer.writerows(df.to_dict(orient="records"))
        writer.close()

def csv_to_avro(fromname, toname, options=None, default_options=None):
    """Convert a CSV file to an AVRO file."""
    if options is None:
        options = {}
    if default_options is None:
        default_options = {'encoding': 'utf8', 'delimiter': ',',
                           'compression': 'deflate'}
    import avro.schema
    from avro.datafile import DataFileWriter
    from avro.io import DatumWriter

    options = __copy_options(options, default_options)
    with open(fromname, 'r', encoding=options['encoding']) as source:
        reader = csv.DictReader(source, delimiter=options['delimiter'])

        schema_dict = {"namespace": "data.avro", "type": "record",
                       "name": "Record", "fields": []}

        for field in reader.fieldnames:
            schema_dict['fields'].append({'name': field, 'type': 'string'})
        schema = avro.schema.parse(json.dumps(schema_dict))
        with open(toname, 'wb') as output:
            writer = DataFileWriter(output, DatumWriter(), schema,
                                    codec=options['compression'])
            n = 0
            for row in reader:
                n += 1
                try:
                    writer.append(row)
                except TypeError:
                    logging.error('csv2avro: error processing row %d, skipped', n)
            writer.close()

CONVERT_FUNC_MAP = {
    'xls2csv': xls_to_csv,
    'xls2jsonl': xls_to_jsonl,
    'xls2bson': xls_to_bson,
    'xlsx2jsonl': xlsx_to_jsonl,
    'xlsx2bson': xlsx_to_bson,
    'csv2jsonl': csv_to_jsonl,
    'csv2bson': csv_to_bson,
    'xml2jsonl': xml_to_jsonl,
    'jsonl2csv': jsonl_to_csv,
    'bson2jsonl': bson_to_jsonl,
    'json2jsonl': json_to_jsonl,
    'csv2parquet': csv_to_parquet,
    'jsonl2parquet': jsonl_to_parquet,
    'jsonl2orc': jsonl_to_orc,
    'csv2orc': csv_to_orc,
    'csv2avro': csv_to_avro,
}


DEFAULT_HEADERS_DETECT_LIMIT = 1000

def make_flat(item):
    """Flatten nested structures in a dictionary by converting them to strings."""
    result = {}
    for k, v in item.items():
        if isinstance(v, (tuple, list, dict)):
            result[k] = str(v)
        else:
            result[k] = v
    return result

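A quick illustrative sketch of the flattening behaviour (the record below is hypothetical):

    make_flat({'id': 1, 'tags': ['a', 'b'], 'meta': {'x': 1}})
    # -> {'id': 1, 'tags': "['a', 'b']", 'meta': "{'x': 1}"}
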
class Converter:
    """File format converter handler."""
    def __init__(self, batch_size=DEFAULT_BATCH_SIZE):
        self.batch_size = batch_size

    def convert(self, fromfile, tofile, options=None, limit=DEFAULT_HEADERS_DETECT_LIMIT):
        """Convert file from one format to another.

        Processes files in two phases:
        1. Schema extraction: Samples records to determine field structure
        2. Conversion: Streams records from source to destination format

        Uses sets for efficient key tracking during schema extraction.

        Args:
            fromfile: Path to input file.
            tofile: Path to output file.
            options: Dictionary of conversion options (encoding, delimiter, etc.).
            limit: Maximum records to sample for schema detection.

        Raises:
            ValueError: If file format is not supported.
            IOError: If file cannot be read or written.
        """
        if options is None:
            options = {}
        iterableargs = get_iterable_options(options)
        it_in = open_iterable(fromfile, mode='r', iterableargs=iterableargs)
        is_flatten = get_option(options, 'flatten')
        keys_set = set()  # Use set for O(1) lookup instead of O(n) list operations
        n = 0
        logging.info('Extracting schema')
        for item in tqdm(it_in, total=limit):
            if limit is not None and n > limit:
                break
            n += 1
            if not is_flatten:
                dk = dict_generator(item)
                for i in dk:
                    k = ".".join(i[:-1])
                    keys_set.add(k)
            else:
                item = make_flat(item)
                for k in item.keys():
                    keys_set.add(k)

        keys = list(keys_set)  # Convert to list for backward compatibility
        it_in.reset()
        it_out = open_iterable(tofile, mode='w', iterableargs={'keys': keys})

        logging.info('Converting data')
        n = 0
        batch = []
        for row in tqdm(it_in):
            n += 1
            if is_flatten:
                for k in keys:
                    if k not in row.keys():
                        row[k] = None
                batch.append(make_flat(row))
            else:
                batch.append(row)
            if n % self.batch_size == 0:
                it_out.write_bulk(batch)
                batch = []
        if len(batch) > 0:
            it_out.write_bulk(batch)
        it_in.close()
        it_out.close()

    def convert_old(self, fromfile, tofile, options=None):
        """Legacy conversion method that dispatches via CONVERT_FUNC_MAP."""
        if options is None:
            options = {}
        fromtype = (options.get('format_in') if options.get('format_in') is not None
                    else get_file_type(fromfile))
        totype = (options.get('format_out') if options.get('format_out') is not None
                  else get_file_type(tofile))
        key = '%s2%s' % (fromtype, totype)
        func = CONVERT_FUNC_MAP.get(key, None)
        if func is None:
            logging.error('Conversion between %s and %s not supported', fromtype, totype)
        else:
            logging.info('Convert %s from %s to %s', key, fromfile, tofile)
            func(fromfile, tofile, options)
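
A minimal usage sketch for the Converter class above (not part of the module; the file names are hypothetical):

    conv = Converter()
    conv.convert('input.csv', 'output.jsonl', options={})
    # or the legacy dispatch through CONVERT_FUNC_MAP:
    conv.convert_old('input.csv', 'output.jsonl')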