undatum 1.0.17__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- undatum/__init__.py +9 -0
- undatum/__main__.py +25 -0
- undatum/ai/__init__.py +145 -0
- undatum/ai/base.py +85 -0
- undatum/ai/config.py +184 -0
- undatum/ai/perplexity.py +79 -0
- undatum/ai/providers.py +1002 -0
- undatum/ai/schemas.py +42 -0
- undatum/cmds/__init__.py +6 -0
- undatum/cmds/analyzer.py +697 -0
- undatum/cmds/converter.py +646 -0
- undatum/cmds/ingester.py +116 -0
- undatum/cmds/query.py +68 -0
- undatum/cmds/schemer.py +328 -0
- undatum/cmds/selector.py +437 -0
- undatum/cmds/statistics.py +158 -0
- undatum/cmds/textproc.py +59 -0
- undatum/cmds/transformer.py +81 -0
- undatum/cmds/validator.py +137 -0
- undatum/common/__init__.py +6 -0
- undatum/common/functions.py +81 -0
- undatum/common/iterable.py +222 -0
- undatum/common/scheme.py +261 -0
- undatum/constants.py +21 -0
- undatum/core.py +616 -0
- undatum/formats/__init__.py +6 -0
- undatum/formats/docx.py +160 -0
- undatum/utils.py +298 -0
- undatum/validate/__init__.py +11 -0
- undatum/validate/commonrules.py +15 -0
- undatum/validate/ruscodes.py +202 -0
- undatum-1.0.17.dist-info/METADATA +610 -0
- undatum-1.0.17.dist-info/RECORD +37 -0
- undatum-1.0.17.dist-info/WHEEL +6 -0
- undatum-1.0.17.dist-info/entry_points.txt +3 -0
- undatum-1.0.17.dist-info/licenses/LICENSE +21 -0
- undatum-1.0.17.dist-info/top_level.txt +1 -0
undatum/cmds/selector.py
ADDED
@@ -0,0 +1,437 @@
# -*- coding: utf8 -*-
"""Data selection and filtering module."""
import csv
import logging
import os
import sys
import zipfile

import bson
import dictquery as dq
import duckdb
import orjson
from iterable.helpers.detect import detect_file_type, open_iterable

from ..common.iterable import DataWriter, IterableData
from ..constants import DUCKABLE_CODECS, DUCKABLE_FILE_TYPES
from ..utils import (detect_encoding, dict_generator, get_dict_value,
                     get_file_type, get_option, strip_dict_fields)

LINEEND = '\n'.encode('utf8')


ITERABLE_OPTIONS_KEYS = ['tagname', 'delimiter', 'encoding', 'start_line', 'page']


def get_iterable_options(options):
    """Extract iterable-specific options from the options dictionary."""
    out = {}
    for k in ITERABLE_OPTIONS_KEYS:
        if k in options.keys():
            out[k] = options[k]
    return out


def _detect_engine(fromfile, engine, filetype):
    """Detect the appropriate engine for processing."""
    compression = 'raw'
    if filetype is None:
        ftype = detect_file_type(fromfile)
        if ftype['success']:
            filetype = ftype['datatype'].id()
            if ftype['codec'] is not None:
                compression = ftype['codec'].id()
    logging.info(f'File filetype {filetype} and compression {compression}')
    if engine == 'auto':
        if filetype in DUCKABLE_FILE_TYPES and compression in DUCKABLE_CODECS:
            return 'duckdb'
        return 'iterable'
    return engine


def get_iterable_fields_uniq(iterable, fields, dolog=False, dq_instance=None):  # pylint: disable=unused-argument
    """Returns all unique values of the given fields of an iterable of dictionaries."""
    # dq_instance kept for API compatibility
    n = 0
    uniqval = []
    for row in iterable:
        n += 1
        if dolog and n % 1000 == 0:
            logging.debug('uniq: processing %d records', n)
        try:
            allvals = []
            for field in fields:
                allvals.append(get_dict_value(row, field.split('.')))

            for n1, _ in enumerate(allvals[0]):
                k = []
                for n2, _ in enumerate(allvals):
                    k.append(str(allvals[n2][n1]))
                if k not in uniqval:
                    uniqval.append(k)
        except KeyError:
            pass
    return uniqval


def get_duckdb_fields_uniq(filename, fields, dolog=False, dq_instance=None):  # pylint: disable=unused-argument
    """Returns all unique values of the given fields of the file using DuckDB."""
    # dq_instance kept for API compatibility
    uniqval = []
    fieldstext = ','.join(fields)
    query = (f"select unnest(grp) from (select distinct({fieldstext}) "
             f"as grp from '{filename}')")
    if dolog:
        logging.info(query)
    uniqval = duckdb.sql(query).fetchall()
    return uniqval
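
Illustration (not part of the package source): the query string built by get_duckdb_fields_uniq above, reproduced for hypothetical inputs.

# Sketch reproducing the DuckDB uniq query; 'data.csv' and the field names are made up.
fields = ['region', 'city']
filename = 'data.csv'
fieldstext = ','.join(fields)
query = (f"select unnest(grp) from (select distinct({fieldstext}) "
         f"as grp from '{filename}')")
print(query)
# select unnest(grp) from (select distinct(region,city) as grp from 'data.csv')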


def get_iterable_fields_freq(iterable, fields, dolog=False, filter_expr=None, dq_instance=None):
    """Iterates over records and returns the most frequent values of the given fields."""
    n = 0
    valuedict = {}
    items = []
    for r in iterable:
        n += 1
        if dolog and n % 10000 == 0:
            logging.info('frequency: processing %d records', n)
        if filter_expr is not None:
            query_obj = dq_instance if dq_instance is not None else dq
            if not query_obj.match(r, filter_expr):
                continue
        try:
            allvals = []
            for field in fields:
                allvals.append(get_dict_value(r, field.split('.')))

            for n1, _ in enumerate(allvals[0]):
                k = []
                for n2, _ in enumerate(allvals):
                    k.append(str(allvals[n2][n1]))
                kx = '\t'.join(k)
                v = valuedict.get(kx, 0)
                valuedict[kx] = v + 1
        except KeyError:
            pass
    for k, v in valuedict.items():
        row = k.split('\t')
        row.append(v)
        items.append(row)
    items.sort(key=lambda x: x[-1], reverse=True)
    return items


def get_duckdb_fields_freq(filename, fields, dolog=False, dq_instance=None):  # pylint: disable=unused-argument
    """Returns value frequencies for the given fields of the file using DuckDB."""
    # dq_instance kept for API compatibility
    uniqval = []
    fieldstext = ','.join(fields)
    query = (f"select {fieldstext}, count(*) as c from '{filename}' "
             f"group by {fieldstext} order by c desc")
    if dolog:
        logging.info(query)
    uniqval = duckdb.sql(query).fetchall()
    return uniqval
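
Illustration (not part of the package source): the frequency query rendered by get_duckdb_fields_freq for the same hypothetical inputs.

# Sketch reproducing the DuckDB frequency query for made-up inputs.
fields = ['region', 'city']
filename = 'data.csv'
fieldstext = ','.join(fields)
print(f"select {fieldstext}, count(*) as c from '{filename}' "
      f"group by {fieldstext} order by c desc")
# select region,city, count(*) as c from 'data.csv' group by region,city order by c desc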


class Selector:
    """Data selection and filtering handler."""
    def __init__(self):
        pass

    def uniq(self, fromfile, options=None):
        """Extracts unique values of the selected fields."""
        if options is None:
            options = {}
        logging.debug('Processing %s', fromfile)
        iterableargs = get_iterable_options(options)
        filetype = get_option(options, 'filetype')
        to_file = get_option(options, 'output')
        engine = get_option(options, 'engine')
        if to_file:
            to_type = get_file_type(to_file)
            if not to_type:
                logging.debug('Output file type not supported')
                return
            out = open(to_file, 'w', encoding='utf8')
        else:
            to_type = 'csv'
            out = sys.stdout
        fields = options['fields'].split(',')
        detected_engine = _detect_engine(fromfile, engine, filetype)
        if detected_engine == 'duckdb':
            output_type = 'duckdb'
            uniqval = get_duckdb_fields_uniq(fromfile, fields, dolog=True)
        elif detected_engine == 'iterable':
            output_type = 'iterable'
            iterable = open_iterable(fromfile, mode='r', iterableargs=iterableargs)
            logging.info('uniq: looking for fields: %s', options['fields'])
            uniqval = get_iterable_fields_uniq(iterable, fields, dolog=True)
            iterable.close()
        else:
            logging.info('Engine not supported. Please choose duckdb or iterable')
            return
        logging.debug('%d unique values found', len(uniqval))
        writer = DataWriter(out, filetype=to_type, output_type=output_type, fieldnames=fields)
        writer.write_items(uniqval)
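
Illustration (not part of the package source): a minimal sketch of calling Selector.uniq(); the file name, fields and option values are hypothetical.

# Hypothetical usage sketch of Selector.uniq().
from undatum.cmds.selector import Selector

Selector().uniq('companies.jsonl', options={
    'fields': 'region,city',      # dotted paths such as 'address.city' also work
    'output': 'uniq_values.csv',  # a falsy value prints CSV to stdout instead
    'engine': 'auto',             # 'auto' picks duckdb for duckable file types/codecs
    'filetype': None,             # let _detect_engine() sniff the input format
})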


    def headers(self, fromfile, options=None):
        """Extracts header (key) values."""
        if options is None:
            options = {}
        limit = get_option(options, 'limit')
        iterableargs = get_iterable_options(options)

        iterable = open_iterable(fromfile, mode='r', iterableargs=iterableargs)
        keys_set = set()  # Use set for O(1) lookup instead of O(n) list operations
        n = 0
        for item in iterable:
            if limit and n > limit:
                break
            n += 1
            dk = dict_generator(item)
            for i in dk:
                k = ".".join(i[:-1])
                keys_set.add(k)
        iterable.close()
        keys = list(keys_set)  # Convert to list for backward compatibility
        output = get_option(options, 'output')
        if output:
            with open(output, 'w', encoding=get_option(options, 'encoding')) as f:
                f.write('\n'.join(keys))
        else:
            for x in keys:
                print(x.encode('utf8').decode('utf8', 'ignore'))
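
Illustration (not part of the package source): listing the dotted key paths seen in the first records of a hypothetical file.

# Hypothetical call: print every dotted key path seen in the first 1000 records.
from undatum.cmds.selector import Selector

Selector().headers('products.jsonl', options={'limit': 1000, 'output': None, 'encoding': 'utf8'})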

    def frequency(self, fromfile, options=None):
        """Calculates frequency of the values in the file."""
        if options is None:
            options = {}
        logging.debug('Processing %s', fromfile)
        iterableargs = get_iterable_options(options)
        filetype = get_option(options, 'filetype')
        to_file = get_option(options, 'output')
        engine = get_option(options, 'engine')
        if to_file:
            to_type = get_file_type(to_file)
            if not to_type:
                logging.debug('Output file type not supported')
                return
            out = open(to_file, 'w', encoding='utf8')
        else:
            to_type = 'csv'
            out = sys.stdout
        fields = options['fields'].split(',')
        detected_engine = _detect_engine(fromfile, engine, filetype)
        items = []
        output_type = 'iterable'
        if detected_engine == 'duckdb':
            items = get_duckdb_fields_freq(fromfile, fields=fields, dolog=True)
            output_type = 'duckdb'
        elif detected_engine == 'iterable':
            output_type = 'iterable'
            iterable = open_iterable(fromfile, mode='r', iterableargs=iterableargs)
            if iterable is not None:
                items = get_iterable_fields_freq(iterable, fields, dolog=True)
            else:
                logging.info('File type not supported')
                return
        else:
            logging.debug('Data processing engine is not set and not detected')
            return
        logging.debug('frequency: %d unique values found', len(items))
        fields.append('count')
        writer = DataWriter(out, filetype=to_type, output_type=output_type, fieldnames=fields)
        writer.write_items(items)

    def select(self, fromfile, options=None):
        """Select or re-order columns from the file."""
        if options is None:
            options = {}
        f_type = get_file_type(fromfile) if options['format_in'] is None else options['format_in']
        iterable = IterableData(fromfile, options=options)
        to_file = get_option(options, 'output')

        if to_file:
            to_type = get_file_type(to_file)
            if not to_type:
                print('Output file type not supported')
                return
            if to_type == 'bson':
                out = open(to_file, 'wb')
            elif to_type == 'jsonl':
                out = open(to_file, 'wb')
            else:
                out = open(to_file, 'w', encoding='utf8')
        else:
            to_type = f_type
            out = sys.stdout
        fields = options['fields'].split(',')
        writer = DataWriter(out, filetype=to_type, fieldnames=fields)
        if iterable:
            n = 0
            fields = [field.split('.') for field in fields]
            chunk = []
            for r in iterable.iter():
                n += 1
                if options['filter'] is not None:
                    res = dq.match(r, options['filter'])
                    # print(options['filter'], r)
                    if not res:
                        continue
                r_selected = strip_dict_fields(r, fields, 0)
                chunk.append(r_selected)
                if n % 1000 == 0:
                    logging.info('select: processing %d records of %s', n, fromfile)
                    # flush the accumulated chunk every 1000 records
                    if len(chunk) > 0:
                        writer.write_items(chunk)
                        chunk = []
            if len(chunk) > 0:
                writer.write_items(chunk)
        else:
            logging.info('File type not supported')
            return
        logging.debug('select: %d records processed', n)
        out.close()
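
Illustration (not part of the package source): a sketch of Selector.select() with a dictquery filter; the file name, fields and filter expression are made up, and the backtick-quoted key follows the dictquery library's query syntax.

# Hypothetical usage sketch of Selector.select().
from undatum.cmds.selector import Selector

Selector().select('companies.jsonl', options={
    'format_in': None,                 # None means: detect from the file extension
    'fields': 'name,address.city',     # keep only these (possibly nested) fields
    'filter': '`status` == "active"',  # dictquery expression; None disables filtering
    'output': 'selected.jsonl',        # a falsy value writes to stdout
})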


    def split_new(self, fromfile, options=None):
        """Splits the given data file into chunks based on chunk size or field value."""
        if options is None:
            options = {}
        iterableargs = get_iterable_options(options)
        # NOTE: placeholder implementation; the results of these calls are currently unused.
        open_iterable(fromfile, mode='r', iterableargs=iterableargs)
        get_option(options, 'output')

    def split(self, fromfile, options=None):
        """Splits the given data file into chunks based on chunk size or field value."""
        if options is None:
            options = {}
        f_type = get_file_type(fromfile) if options['format_in'] is None else options['format_in']
        if options['zipfile']:
            z = zipfile.ZipFile(fromfile, mode='r')
            fnames = z.namelist()
            finfilename = fnames[0]
            if f_type == 'bson':
                infile = z.open(fnames[0], 'rb')
            else:
                infile = z.open(fnames[0], 'r')
        elif options['gzipfile']:
            import gzip
            infile = gzip.open(fromfile, 'rb')
            finfilename = fromfile.split('.', 1)[0] + '.' + f_type
        else:
            finfilename = fromfile
            if f_type == 'bson':
                infile = open(fromfile, 'rb')
            else:
                if 'encoding' in options.keys():
                    infile = open(fromfile, 'r', encoding=get_option(options, 'encoding'))
                else:
                    detected_enc = detect_encoding(fromfile, limit=100000)
                    if detected_enc:
                        infile = open(fromfile, 'r', encoding=detected_enc['encoding'])
                    else:
                        infile = open(fromfile, 'r', encoding='utf8')
        fields = options['fields'].split(',') if options['fields'] is not None else None
        valuedict = {}
        delimiter = get_option(options, 'delimiter')
        if f_type == 'csv':
            reader = csv.DictReader(infile, delimiter=delimiter)
            n = 0
            chunknum = 1
            if options['fields'] is None:
                splitname = finfilename.rsplit('.', 1)[0] + '_%d.csv' % (chunknum)
                out = open(splitname, 'w', encoding=get_option(options, 'encoding'))
                writer = csv.DictWriter(out, fieldnames=reader.fieldnames, delimiter=delimiter)
                writer.writeheader()
                for r in reader:
                    n += 1
                    if n % 10000 == 0:
                        logging.info('split: processing %d records of %s', n, fromfile)
                    if options['filter'] is not None:
                        if not dq.match(r, options['filter']):
                            continue
                    writer.writerow(r)
                    if n % options['chunksize'] == 0:
                        out.close()
                        chunknum += 1
                        splitname = finfilename.rsplit('.', 1)[0] + '_%d.csv' % (chunknum)
                        out = open(splitname, 'w',
                                   encoding=get_option(options, 'encoding'))
                        writer = csv.DictWriter(out, fieldnames=reader.fieldnames,
                                                delimiter=delimiter)
                        writer.writeheader()
        elif f_type == 'jsonl':
            n = 0
            chunknum = 1
            if options['fields'] is None:
                splitname = finfilename.rsplit('.', 1)[0] + '_%d.jsonl' % (chunknum)
                out = open(splitname, 'wb')  # , encoding=get_option(options, 'encoding'))

                for l in infile:
                    n += 1
                    if n % 10000 == 0:
                        logging.info('split: processing %d records of %s', n, fromfile)
                    r = orjson.loads(l)
                    if options['filter'] is not None:
                        if not dq.match(r, options['filter']):
                            continue
                    out.write(orjson.dumps(r, option=orjson.OPT_APPEND_NEWLINE))
                    if n % options['chunksize'] == 0:
                        out.close()
                        chunknum += 1
                        splitname = finfilename.rsplit('.', 1)[0] + '_%d.jsonl' % (chunknum)
                        logging.info('split: new chunk %s', splitname)
                        out = open(splitname, 'wb')  # , encoding=get_option(options, 'encoding'))
            else:
                for l in infile:
                    n += 1
                    if n % 10000 == 0:
                        logging.info('split: processing %d records of %s', n, fromfile)
                    r = orjson.loads(l)
                    if options['filter'] is not None:
                        if not dq.match(r, options['filter']):
                            continue
                    try:
                        kx = get_dict_value(r, fields[0].split('.'))[0]
                    except IndexError:
                        continue
                    if kx is None:
                        continue
                    kx = (kx.replace('\\', '-').replace('/', '-')
                          .replace('?', '-').replace('<', '-')
                          .replace('>', '-').replace('\n', ''))
                    v = valuedict.get(kx, None)
                    if v is None:
                        # splitname = finfilename.rsplit('.', 1)[0] + '_%s.jsonl' % (kx)
                        splitname = '%s.jsonl' % (kx)
                        if options['dirname'] is not None:
                            splitname = os.path.join(options['dirname'],
                                                     splitname)
                        valuedict[kx] = open(splitname, 'w', encoding='utf8')
                    valuedict[kx].write(l)
                    # valuedict[kx].write(l.decode('utf8'))  # .decode('utf8')
                for opened in valuedict.values():
                    opened.close()
        elif f_type == 'bson':
            bson_iter = bson.decode_file_iter(infile)
            n = 0
            for r in bson_iter:
                n += 1
                # print(r)
                strip_dict_fields(r, fields, 0)
                # out.write(json.dumps(r_selected)+'\n')
                if n % 10000 == 0:
                    logging.info('split: processing %d records of %s', n, fromfile)

        else:
            logging.info('File type not supported')
            return
        logging.debug('split: %d records processed', n)
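
Illustration (not part of the package source): splitting a JSON lines file into fixed-size chunks with Selector.split(); all values are hypothetical, and every key shown is read by the method above.

# Hypothetical usage sketch: writes data_1.jsonl, data_2.jsonl, ... with up to 50000 records each.
from undatum.cmds.selector import Selector

Selector().split('data.jsonl', options={
    'format_in': 'jsonl',   # or None to detect from the extension
    'zipfile': False,       # True if fromfile is a zip archive
    'gzipfile': False,      # True if fromfile is gzip-compressed
    'fields': None,         # None = split by chunk size; e.g. 'region' = one file per value
    'filter': None,         # optional dictquery expression
    'chunksize': 50000,
    'dirname': None,        # target directory when splitting by field value
    'delimiter': ',',       # only used for CSV input
    'encoding': 'utf8',
})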
undatum/cmds/statistics.py
ADDED
@@ -0,0 +1,158 @@
# -*- coding: utf8 -*-
"""Statistical analysis module."""
from ..utils import get_option, dict_generator, guess_datatype
from ..constants import DEFAULT_DICT_SHARE
import logging
from qddate import DateParser
#from ..common.iterable import IterableData
from iterable.helpers.detect import open_iterable

#STAT_READY_DATA_FORMATS = ['jsonl', 'bson', 'csv']

ITERABLE_OPTIONS_KEYS = ['tagname', 'delimiter', 'encoding', 'start_line', 'page']


def get_iterable_options(options):
    """Extract iterable-specific options from the options dictionary."""
    out = {}
    for k in ITERABLE_OPTIONS_KEYS:
        if k in options.keys():
            out[k] = options[k]
    return out


class StatProcessor:
    """Statistical processing handler."""
    def __init__(self, nodates=True):
        if nodates:
            self.qd = None
        else:
            self.qd = DateParser(generate=True)

    def stats(self, fromfile, options):
        """Produces statistics and structure analysis of a JSON lines, BSON or CSV file."""
        from rich import print
        from rich.table import Table

        iterableargs = get_iterable_options(options)
        iterable = open_iterable(fromfile, mode='r', iterableargs=iterableargs)
        dictshare = get_option(options, 'dictshare')

        if dictshare and dictshare.isdigit():
            dictshare = int(dictshare)
        else:
            dictshare = DEFAULT_DICT_SHARE

        profile = {'version': 1.0}
        fielddata = {}
        fieldtypes = {}

        # data = json.load(open(profile['filename']))
        count = 0
        nfields = 0

        # process data items one by one
        logging.debug('Start processing %s', fromfile)
        for item in iterable:
            count += 1
            dk = dict_generator(item)
            if count % 1000 == 0:
                logging.debug('Processing %d records of %s', count, fromfile)
            for i in dk:
                # print(i)
                k = '.'.join(i[:-1])
                if len(i) == 0:
                    continue
                if i[0].isdigit():
                    continue
                if len(i[0]) == 1:
                    continue
                v = i[-1]
                if k not in fielddata:  # Use direct dict membership check instead of list()
                    fielddata[k] = {'key': k, 'uniq': {}, 'n_uniq': 0, 'total': 0, 'share_uniq': 0.0,
                                    'minlen': None, 'maxlen': 0, 'avglen': 0, 'totallen': 0}
                fd = fielddata[k]
                uniqval = fd['uniq'].get(v, 0)
                fd['uniq'][v] = uniqval + 1
                fd['total'] += 1
                if uniqval == 0:
                    fd['n_uniq'] += 1
                fd['share_uniq'] = (fd['n_uniq'] * 100.0) / fd['total']
                fl = len(str(v))
                if fd['minlen'] is None:
                    fd['minlen'] = fl
                else:
                    fd['minlen'] = fl if fl < fd['minlen'] else fd['minlen']
                fd['maxlen'] = fl if fl > fd['maxlen'] else fd['maxlen']
                fd['totallen'] += fl
                fielddata[k] = fd
                if k not in fieldtypes:  # Use direct dict membership check instead of list()
                    fieldtypes[k] = {'key': k, 'types': {}}
                fd = fieldtypes[k]
                thetype = guess_datatype(v, self.qd)['base']
                uniqval = fd['types'].get(thetype, 0)
                fd['types'][thetype] = uniqval + 1
                fieldtypes[k] = fd
        # print count
        for k, v in fielddata.items():  # Use dict.items() directly, no list() conversion
            fielddata[k]['share_uniq'] = (v['n_uniq'] * 100.0) / v['total']
            fielddata[k]['avglen'] = v['totallen'] / v['total']
        profile['count'] = count
        profile['num_fields'] = nfields

        # Determine field types first so we can use them when building dicts
        finfields = {}
        for fd in fieldtypes.values():  # Use dict.values() directly, no list() conversion
            fdt = list(fd['types'].keys())  # Keep list() here as we need to check membership and modify
            if 'empty' in fdt:
                del fd['types']['empty']
            types_keys = list(fd['types'].keys())  # Need list for len() and indexing
            if len(types_keys) != 1:
                ftype = 'str'
            else:
                ftype = types_keys[0]
            finfields[fd['key']] = ftype

        profile['fieldtypes'] = finfields

        dictkeys = []
        dicts = {}
        # print(profile)
        profile['fields'] = []
        for fd in fielddata.values():  # Use dict.values() directly, no list() conversion
            # print(fd['key'])  # , fd['n_uniq'], fd['share_uniq'], fieldtypes[fd['key']]
            field = {'key': fd['key'], 'is_uniq': 0 if fd['share_uniq'] < 100 else 1}
            profile['fields'].append(field)
            if fd['share_uniq'] < dictshare:
                dictkeys.append(fd['key'])
                # Use determined field type instead of defaulting to 'str'
                field_type = finfields.get(fd['key'], 'str')
                dicts[fd['key']] = {'items': fd['uniq'], 'count': fd['n_uniq'],
                                    'type': field_type}
            # for k, v in fd['uniq'].items():
            #     print fd['key'], k, v
        profile['dictkeys'] = dictkeys

        for k, v in fielddata.items():  # Use dict.items() directly, no list() conversion
            del v['uniq']
            fielddata[k] = v
        profile['debug'] = {'fieldtypes': fieldtypes.copy(), 'fielddata': fielddata}
        table = []
        for fd in fielddata.values():  # Use dict.values() directly, no list() conversion
            field = [fd['key'], ]
            field.append(finfields[fd['key']])
            field.append(True if fd['key'] in dictkeys else False)
            field.append(False if fd['share_uniq'] < 100 else True)
            field.append(fd['n_uniq'])
            field.append(fd['share_uniq'])
            field.append(fd['minlen'])
            field.append(fd['maxlen'])
            field.append(fd['avglen'])
            table.append(field)
        headers = ('key', 'ftype', 'is_dictkey', 'is_uniq', 'n_uniq', 'share_uniq', 'minlen', 'maxlen', 'avglen')
        reptable = Table(title="Statistics")
        reptable.add_column(headers[0], justify="left", style="magenta")
        for key in headers[1:-1]:
            reptable.add_column(key, justify="left", style="cyan", no_wrap=True)
        reptable.add_column(headers[-1], justify="right", style="cyan")
        for row in table:
            reptable.add_row(*map(str, row))
        print(reptable)
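
Illustration (not part of the package source): a minimal sketch of running the stats report above; the file name and dictshare value are hypothetical (dictshare is passed as a string because stats() converts digit strings itself).

# Hypothetical usage sketch of StatProcessor.stats(); prints a rich table of per-field statistics.
from undatum.cmds.statistics import StatProcessor

proc = StatProcessor(nodates=True)  # True skips qddate-based date detection
proc.stats('orders.jsonl', options={'dictshare': '10'})  # fields with <10% unique values become dictionary keys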
undatum/cmds/textproc.py
ADDED
@@ -0,0 +1,59 @@
# -*- coding: utf8 -*-
"""Text processing module."""
from ..utils import get_file_type, get_option
from iterable.helpers.detect import open_iterable

#STAT_READY_DATA_FORMATS = ['jsonl', 'bson', 'csv']

ITERABLE_OPTIONS_KEYS = ['tagname', 'delimiter', 'encoding', 'start_line', 'page']


def get_iterable_options(options):
    """Extract iterable-specific options from the options dictionary."""
    out = {}
    for k in ITERABLE_OPTIONS_KEYS:
        if k in options.keys():
            out[k] = options[k]
    return out


def get_keys(adict, prefix=None):
    """Extract all keys from a nested dictionary."""
    keys = {}
    for k, v in adict.items():
        fullk = '.'.join([prefix, k]) if prefix else k
        keys[fullk] = 1
        if isinstance(v, dict):
            for ak in get_keys(v, fullk):
                keys[ak] = 1
        elif isinstance(v, list):
            for item in v:
                if isinstance(item, dict):
                    for ak in get_keys(item, fullk):
                        keys[ak] = 1
        else:
            print(('%s\t%s' % (fullk, str(v))))
    return keys
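
Illustration (not part of the package source): how get_keys() walks a nested record; the sample record is made up.

# Scalar leaves are printed as "<key>\t<value>" while walking; the returned
# dict holds every dotted key path found in the record.
from undatum.cmds.textproc import get_keys

record = {'id': 1, 'address': {'city': 'Kazan', 'geo': {'lat': 55.8}}, 'tags': [{'name': 'capital'}]}
print(sorted(get_keys(record)))
# ['address', 'address.city', 'address.geo', 'address.geo.lat', 'id', 'tags', 'tags.name']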


class TextProcessor:
    """Text processing handler."""
    def __init__(self):
        pass

    def flatten(self, filename, options):
        """Flatten the data: one field per line."""
        # NOTE: the detected format and the output option are currently unused.
        get_file_type(filename) if options['format_in'] is None else options['format_in']
        iterableargs = get_iterable_options(options)
        iterable = open_iterable(filename, mode='r', iterableargs=iterableargs)
        get_option(options, 'output')
        i = 0
        for rec in iterable:
            allkeys = {}
            i += 1
            for k in get_keys(rec):
                v = allkeys.get(k, 0)
                allkeys[k] = v + 1
            for k, v in allkeys.items():
                print('\t'.join([k, str(v)]))