undatum-1.0.17-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,437 @@
+ # -*- coding: utf8 -*-
+ """Data selection and filtering module."""
+ import csv
+ import logging
+ import os
+ import sys
+ import zipfile
+
+ import bson
+ import dictquery as dq
+ import duckdb
+ import orjson
+ from iterable.helpers.detect import detect_file_type, open_iterable
+
+ from ..common.iterable import DataWriter, IterableData
+ from ..constants import DUCKABLE_CODECS, DUCKABLE_FILE_TYPES
+ from ..utils import (detect_encoding, dict_generator, get_dict_value,
+                      get_file_type, get_option, strip_dict_fields)
+
+ LINEEND = '\n'.encode('utf8')
+
+
+ ITERABLE_OPTIONS_KEYS = ['tagname', 'delimiter', 'encoding', 'start_line', 'page']
+
+
+ def get_iterable_options(options):
+     """Extract iterable-specific options from the options dictionary."""
+     out = {}
+     for k in ITERABLE_OPTIONS_KEYS:
+         if k in options.keys():
+             out[k] = options[k]
+     return out
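+
+ # Usage sketch (hypothetical values): keys outside ITERABLE_OPTIONS_KEYS are
+ # dropped, so get_iterable_options({'delimiter': ';', 'limit': 100}) returns
+ # {'delimiter': ';'}.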
+
+
+ def _detect_engine(fromfile, engine, filetype):
+     """Detect the appropriate engine for processing."""
+     compression = 'raw'
+     if filetype is None:
+         ftype = detect_file_type(fromfile)
+         if ftype['success']:
+             filetype = ftype['datatype'].id()
+             if ftype['codec'] is not None:
+                 compression = ftype['codec'].id()
+     logging.info(f'File filetype {filetype} and compression {compression}')
+     if engine == 'auto':
+         if filetype in DUCKABLE_FILE_TYPES and compression in DUCKABLE_CODECS:
+             return 'duckdb'
+         return 'iterable'
+     return engine
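+
+ # Example of the decision logic (assuming detect_file_type recognizes the file):
+ # for a hypothetical 'data.csv.gz' with engine='auto', 'duckdb' is returned only
+ # if the detected file type is in DUCKABLE_FILE_TYPES and the codec is in
+ # DUCKABLE_CODECS; otherwise the fallback is 'iterable'.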
+
+
+ def get_iterable_fields_uniq(iterable, fields, dolog=False, dq_instance=None):  # pylint: disable=unused-argument
+     """Return all unique value combinations of the given fields of an iterable of dictionaries."""
+     # dq_instance kept for API compatibility
+     n = 0
+     uniqval = []
+     for row in iterable:
+         n += 1
+         if dolog and n % 1000 == 0:
+             logging.debug('uniq: processing %d records', n)
+         try:
+             allvals = []
+             for field in fields:
+                 allvals.append(get_dict_value(row, field.split('.')))
+
+             for n1, _ in enumerate(allvals[0]):
+                 k = []
+                 for n2, _ in enumerate(allvals):
+                     k.append(str(allvals[n2][n1]))
+                 if k not in uniqval:
+                     uniqval.append(k)
+         except KeyError:
+             pass
+     return uniqval
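+
+ # Usage sketch (hypothetical records, assuming get_dict_value returns matches
+ # as a list):
+ #   get_iterable_fields_uniq([{'a': 1}, {'a': 2}, {'a': 1}], ['a'])
+ # would return [['1'], ['2']]: one list of stringified values per combination.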
+
+
+ def get_duckdb_fields_uniq(filename, fields, dolog=False, dq_instance=None):  # pylint: disable=unused-argument
+     """Return all unique values of the given fields of the file using DuckDB."""
+     # dq_instance kept for API compatibility
+     fieldstext = ','.join(fields)
+     query = (f"select unnest(grp) from (select distinct({fieldstext}) "
+              f"as grp from '{filename}')")
+     if dolog:
+         logging.info(query)
+     uniqval = duckdb.sql(query).fetchall()
+     return uniqval
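+
+ # For illustrative arguments fields=['region', 'year'] and filename='data.parquet'
+ # the generated query is:
+ #   select unnest(grp) from (select distinct(region,year) as grp from 'data.parquet')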
+
+
+ def get_iterable_fields_freq(iterable, fields, dolog=False, filter_expr=None, dq_instance=None):
+     """Iterate over records and return field value combinations with counts, most frequent first."""
+     n = 0
+     valuedict = {}
+     items = []
+     for r in iterable:
+         n += 1
+         if dolog and n % 10000 == 0:
+             logging.info('frequency: processing %d records', n)
+         if filter_expr is not None:
+             query_obj = dq_instance if dq_instance is not None else dq
+             if not query_obj.match(r, filter_expr):
+                 continue
+         try:
+             allvals = []
+             for field in fields:
+                 allvals.append(get_dict_value(r, field.split('.')))
+
+             for n1, _ in enumerate(allvals[0]):
+                 k = []
+                 for n2, _ in enumerate(allvals):
+                     k.append(str(allvals[n2][n1]))
+                 kx = '\t'.join(k)
+                 valuedict[kx] = valuedict.get(kx, 0) + 1
+         except KeyError:
+             pass
+     for k, v in valuedict.items():
+         row = k.split('\t')
+         row.append(v)
+         items.append(row)
+     items.sort(key=lambda x: x[-1], reverse=True)
+     return items
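+
+ # Usage sketch (hypothetical records, same get_dict_value assumption as above):
+ #   get_iterable_fields_freq([{'t': 'a'}, {'t': 'b'}, {'t': 'a'}], ['t'])
+ # would return [['a', 2], ['b', 1]] -- value combinations plus a count,
+ # most frequent first.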
+
+
+ def get_duckdb_fields_freq(filename, fields, dolog=False, dq_instance=None):  # pylint: disable=unused-argument
+     """Return frequencies of the given fields of the file using DuckDB."""
+     # dq_instance kept for API compatibility
+     fieldstext = ','.join(fields)
+     query = (f"select {fieldstext}, count(*) as c from '{filename}' "
+              f"group by {fieldstext} order by c desc")
+     if dolog:
+         logging.info(query)
+     freqval = duckdb.sql(query).fetchall()
+     return freqval
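+
+ # For illustrative arguments fields=['region'] and filename='data.parquet'
+ # the generated query is:
+ #   select region, count(*) as c from 'data.parquet' group by region order by c desc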
+
+
+ class Selector:
+     """Data selection and filtering handler."""
+     def __init__(self):
+         pass
+
+     def uniq(self, fromfile, options=None):
+         """Extracts unique values of the selected fields."""
+         if options is None:
+             options = {}
+         logging.debug('Processing %s', fromfile)
+         iterableargs = get_iterable_options(options)
+         filetype = get_option(options, 'filetype')
+         to_file = get_option(options, 'output')
+         engine = get_option(options, 'engine')
+         if to_file:
+             to_type = get_file_type(to_file)
+             if not to_type:
+                 logging.debug('Output file type not supported')
+                 return
+             out = open(to_file, 'w', encoding='utf8')
+         else:
+             to_type = 'csv'
+             out = sys.stdout
+         fields = options['fields'].split(',')
+         detected_engine = _detect_engine(fromfile, engine, filetype)
+         if detected_engine == 'duckdb':
+             output_type = 'duckdb'
+             uniqval = get_duckdb_fields_uniq(fromfile, fields, dolog=True)
+         elif detected_engine == 'iterable':
+             output_type = 'iterable'
+             iterable = open_iterable(fromfile, mode='r', iterableargs=iterableargs)
+             logging.info('uniq: looking for fields: %s', options['fields'])
+             uniqval = get_iterable_fields_uniq(iterable, fields, dolog=True)
+             iterable.close()
+         else:
+             logging.info('Engine not supported. Please choose duckdb or iterable')
+             return
+         logging.debug('%d unique values found', len(uniqval))
+         writer = DataWriter(out, filetype=to_type, output_type=output_type, fieldnames=fields)
+         writer.write_items(uniqval)
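+
+     # Usage sketch (hypothetical file and options):
+     #   Selector().uniq('data.jsonl', options={'fields': 'country,city',
+     #                                          'engine': 'auto'})
+     # prints every distinct (country, city) pair found in data.jsonl as CSV.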
+
+     def headers(self, fromfile, options=None):
+         """Extracts header (key path) values."""
+         if options is None:
+             options = {}
+         limit = get_option(options, 'limit')
+         iterableargs = get_iterable_options(options)
+
+         iterable = open_iterable(fromfile, mode='r', iterableargs=iterableargs)
+         keys_set = set()  # a set keeps key lookups O(1)
+         n = 0
+         for item in iterable:
+             if limit and n >= limit:
+                 break
+             n += 1
+             dk = dict_generator(item)
+             for i in dk:
+                 k = ".".join(i[:-1])
+                 keys_set.add(k)
+         iterable.close()
+         keys = list(keys_set)
+         output = get_option(options, 'output')
+         if output:
+             with open(output, 'w', encoding=get_option(options, 'encoding')) as f:
+                 f.write('\n'.join(keys))
+         else:
+             for x in keys:
+                 print(x)
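+
+     # Example: for records like {'a': {'b': 1}, 'c': 2} the collected key paths
+     # would include 'a.b' and 'c' (assuming dict_generator yields one path per
+     # leaf value; ordering is arbitrary because a set is used).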
+
+     def frequency(self, fromfile, options=None):
+         """Calculates frequency of the values in the file."""
+         if options is None:
+             options = {}
+         logging.debug('Processing %s', fromfile)
+         iterableargs = get_iterable_options(options)
+         filetype = get_option(options, 'filetype')
+         to_file = get_option(options, 'output')
+         engine = get_option(options, 'engine')
+         if to_file:
+             to_type = get_file_type(to_file)
+             if not to_type:
+                 logging.debug('Output file type not supported')
+                 return
+             out = open(to_file, 'w', encoding='utf8')
+         else:
+             to_type = 'csv'
+             out = sys.stdout
+         fields = options['fields'].split(',')
+         detected_engine = _detect_engine(fromfile, engine, filetype)
+         items = []
+         output_type = 'iterable'
+         if detected_engine == 'duckdb':
+             items = get_duckdb_fields_freq(fromfile, fields=fields, dolog=True)
+             output_type = 'duckdb'
+         elif detected_engine == 'iterable':
+             iterable = open_iterable(fromfile, mode='r', iterableargs=iterableargs)
+             if iterable is not None:
+                 items = get_iterable_fields_freq(iterable, fields, dolog=True)
+                 iterable.close()
+             else:
+                 logging.info('File type not supported')
+                 return
+         else:
+             logging.debug('Data processing engine is not set and not detected')
+             return
+         logging.debug('frequency: %d unique values found', len(items))
+         fields.append('count')
+         writer = DataWriter(out, filetype=to_type, output_type=output_type, fieldnames=fields)
+         writer.write_items(items)
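+
+     # Usage sketch (hypothetical):
+     #   Selector().frequency('data.csv', options={'fields': 'status', 'engine': 'auto'})
+     # prints each status value with its occurrence count, most frequent first.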
+
+     def select(self, fromfile, options=None):
+         """Select or re-order columns from file."""
+         if options is None:
+             options = {}
+         f_type = get_file_type(fromfile) if options['format_in'] is None else options['format_in']
+         iterable = IterableData(fromfile, options=options)
+         to_file = get_option(options, 'output')
+
+         if to_file:
+             to_type = get_file_type(to_file)
+             if not to_type:
+                 print('Output file type not supported')
+                 return
+             if to_type in ('bson', 'jsonl'):
+                 out = open(to_file, 'wb')
+             else:
+                 out = open(to_file, 'w', encoding='utf8')
+         else:
+             to_type = f_type
+             out = sys.stdout
+         fields = options['fields'].split(',')
+         writer = DataWriter(out, filetype=to_type, fieldnames=fields)
+         if iterable:
+             n = 0
+             fields = [field.split('.') for field in fields]
+             chunk = []
+             for r in iterable.iter():
+                 n += 1
+                 if options['filter'] is not None:
+                     if not dq.match(r, options['filter']):
+                         continue
+                 r_selected = strip_dict_fields(r, fields, 0)
+                 chunk.append(r_selected)
+                 # flush accumulated records every 1000 rows
+                 if n % 1000 == 0:
+                     logging.info('select: processing %d records of %s', n, fromfile)
+                     writer.write_items(chunk)
+                     chunk = []
+             if len(chunk) > 0:
+                 writer.write_items(chunk)
+         else:
+             logging.info('File type not supported')
+             return
+         logging.debug('select: %d records processed', n)
+         if to_file:
+             out.close()
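+
+     # Usage sketch (hypothetical file, fields and dictquery filter expression):
+     #   Selector().select('data.jsonl', options={'fields': 'id,name',
+     #       'filter': 'status == "active"', 'format_in': None, 'output': 'out.jsonl'})
+     # keeps only matching records and writes just their id and name fields.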
+
+     def split_new(self, fromfile, options=None):
+         """Splits the given file into chunks by chunk size or field value (not implemented yet)."""
+         if options is None:
+             options = {}
+         iterableargs = get_iterable_options(options)
+         # stub: the input is opened and the output option is read,
+         # but no splitting logic exists yet
+         open_iterable(fromfile, mode='r', iterableargs=iterableargs)
+         get_option(options, 'output')
+
+     def split(self, fromfile, options=None):
+         """Splits the given file with data into chunks based on chunk size or field value."""
+         if options is None:
+             options = {}
+         f_type = get_file_type(fromfile) if options['format_in'] is None else options['format_in']
+         if options['zipfile']:
+             z = zipfile.ZipFile(fromfile, mode='r')
+             fnames = z.namelist()
+             finfilename = fnames[0]
+             # ZipFile.open only accepts mode 'r' and always returns a binary stream
+             infile = z.open(fnames[0], 'r')
+         elif options['gzipfile']:
+             import gzip
+             infile = gzip.open(fromfile, 'rb')
+             finfilename = fromfile.split('.', 1)[0] + '.' + f_type
+         else:
+             finfilename = fromfile
+             if f_type == 'bson':
+                 infile = open(fromfile, 'rb')
+             else:
+                 if 'encoding' in options.keys():
+                     infile = open(fromfile, 'r', encoding=get_option(options, 'encoding'))
+                 else:
+                     detected_enc = detect_encoding(fromfile, limit=100000)
+                     if detected_enc:
+                         infile = open(fromfile, 'r', encoding=detected_enc['encoding'])
+                     else:
+                         infile = open(fromfile, 'r', encoding='utf8')
+         fields = options['fields'].split(',') if options['fields'] is not None else None
+         valuedict = {}
+         delimiter = get_option(options, 'delimiter')
+         if f_type == 'csv':
+             reader = csv.DictReader(infile, delimiter=delimiter)
+             n = 0
+             chunknum = 1
+             # splitting by field value is only implemented for jsonl below,
+             # so csv files are split by chunk size only
+             if options['fields'] is None:
+                 splitname = finfilename.rsplit('.', 1)[0] + '_%d.csv' % (chunknum)
+                 out = open(splitname, 'w', encoding=get_option(options, 'encoding'))
+                 writer = csv.DictWriter(out, fieldnames=reader.fieldnames, delimiter=delimiter)
+                 writer.writeheader()
+                 for r in reader:
+                     n += 1
+                     if n % 10000 == 0:
+                         logging.info('split: processing %d records of %s', n, fromfile)
+                     if options['filter'] is not None:
+                         if not dq.match(r, options['filter']):
+                             continue
+                     writer.writerow(r)
+                     if n % options['chunksize'] == 0:
+                         out.close()
+                         chunknum += 1
+                         splitname = finfilename.rsplit('.', 1)[0] + '_%d.csv' % (chunknum)
+                         out = open(splitname, 'w', encoding=get_option(options, 'encoding'))
+                         writer = csv.DictWriter(out, fieldnames=reader.fieldnames, delimiter=delimiter)
+                         writer.writeheader()
+         elif f_type == 'jsonl':
+             n = 0
+             chunknum = 1
+             if options['fields'] is None:
+                 splitname = finfilename.rsplit('.', 1)[0] + '_%d.jsonl' % (chunknum)
+                 out = open(splitname, 'wb')
+                 for l in infile:
+                     n += 1
+                     if n % 10000 == 0:
+                         logging.info('split: processing %d records of %s', n, fromfile)
+                     r = orjson.loads(l)
+                     if options['filter'] is not None:
+                         if not dq.match(r, options['filter']):
+                             continue
+                     out.write(orjson.dumps(r, option=orjson.OPT_APPEND_NEWLINE))
+                     if n % options['chunksize'] == 0:
+                         out.close()
+                         chunknum += 1
+                         splitname = finfilename.rsplit('.', 1)[0] + '_%d.jsonl' % (chunknum)
+                         logging.info('split: new chunk %s', splitname)
+                         out = open(splitname, 'wb')
+             else:
+                 for l in infile:
+                     n += 1
+                     if n % 10000 == 0:
+                         logging.info('split: processing %d records of %s', n, fromfile)
+                     r = orjson.loads(l)
+                     if options['filter'] is not None:
+                         if not dq.match(r, options['filter']):
+                             continue
+                     try:
+                         kx = get_dict_value(r, fields[0].split('.'))[0]
+                     except IndexError:
+                         continue
+                     if kx is None:
+                         continue
+                     # sanitize the value so it can be used as a file name
+                     kx = (kx.replace('\\', '-').replace('/', '-')
+                           .replace('?', '-').replace('<', '-')
+                           .replace('>', '-').replace('\n', ''))
+                     v = valuedict.get(kx, None)
+                     if v is None:
+                         splitname = '%s.jsonl' % (kx)
+                         if options['dirname'] is not None:
+                             splitname = os.path.join(options['dirname'], splitname)
+                         valuedict[kx] = open(splitname, 'w', encoding='utf8')
+                     valuedict[kx].write(l)
+                 for opened in valuedict.values():
+                     opened.close()
+         elif f_type == 'bson':
+             bson_iter = bson.decode_file_iter(infile)
+             n = 0
+             # NOTE: splitting of bson files is not implemented yet; records are
+             # decoded and counted but no chunks are written
+             for r in bson_iter:
+                 n += 1
+                 strip_dict_fields(r, fields, 0)
+                 if n % 10000 == 0:
+                     logging.info('split: processing %d records of %s', n, fromfile)
+         else:
+             logging.info('File type not supported')
+             return
+         logging.debug('split: %d records processed', n)
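+
+     # Usage sketch (hypothetical): splitting by chunk size,
+     #   Selector().split('data.jsonl', options={'chunksize': 50000, 'fields': None,
+     #       'filter': None, 'format_in': None, 'zipfile': False, 'gzipfile': False})
+     # would write data_1.jsonl, data_2.jsonl, ...; passing fields='country'
+     # (with 'dirname' set) would instead write one <value>.jsonl per country.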
@@ -0,0 +1,158 @@
+ # -*- coding: utf8 -*-
+ """Statistical analysis module."""
+ import logging
+
+ from qddate import DateParser
+ from iterable.helpers.detect import open_iterable
+
+ from ..constants import DEFAULT_DICT_SHARE
+ from ..utils import get_option, dict_generator, guess_datatype
+
+ ITERABLE_OPTIONS_KEYS = ['tagname', 'delimiter', 'encoding', 'start_line', 'page']
+
+
+ def get_iterable_options(options):
+     """Extract iterable-specific options from the options dictionary."""
+     out = {}
+     for k in ITERABLE_OPTIONS_KEYS:
+         if k in options.keys():
+             out[k] = options[k]
+     return out
+
+
+ class StatProcessor:
+     """Statistical processing handler."""
+     def __init__(self, nodates=True):
+         if nodates:
+             self.qd = None
+         else:
+             self.qd = DateParser(generate=True)
+
+     def stats(self, fromfile, options):
+         """Produce statistics and structure analysis of a JSON lines, BSON or CSV file."""
+         from rich import print
+         from rich.table import Table
+
+         iterableargs = get_iterable_options(options)
+         iterable = open_iterable(fromfile, mode='r', iterableargs=iterableargs)
+         dictshare = get_option(options, 'dictshare')
+
+         if dictshare and dictshare.isdigit():
+             dictshare = int(dictshare)
+         else:
+             dictshare = DEFAULT_DICT_SHARE
+
+         profile = {'version': 1.0}
+         fielddata = {}
+         fieldtypes = {}
+         count = 0
+
+         # process data items one by one
+         logging.debug('Start processing %s', fromfile)
+         for item in iterable:
+             count += 1
+             dk = dict_generator(item)
+             if count % 1000 == 0:
+                 logging.debug('Processing %d records of %s', count, fromfile)
+             for i in dk:
+                 if len(i) == 0:
+                     continue
+                 if i[0].isdigit():
+                     continue
+                 if len(i[0]) == 1:
+                     continue
+                 k = '.'.join(i[:-1])
+                 v = i[-1]
+                 if k not in fielddata:
+                     fielddata[k] = {'key': k, 'uniq': {}, 'n_uniq': 0, 'total': 0, 'share_uniq': 0.0,
+                                     'minlen': None, 'maxlen': 0, 'avglen': 0, 'totallen': 0}
+                 fd = fielddata[k]
+                 uniqval = fd['uniq'].get(v, 0)
+                 fd['uniq'][v] = uniqval + 1
+                 fd['total'] += 1
+                 if uniqval == 0:
+                     fd['n_uniq'] += 1
+                 fd['share_uniq'] = (fd['n_uniq'] * 100.0) / fd['total']
+                 fl = len(str(v))
+                 if fd['minlen'] is None:
+                     fd['minlen'] = fl
+                 else:
+                     fd['minlen'] = fl if fl < fd['minlen'] else fd['minlen']
+                 fd['maxlen'] = fl if fl > fd['maxlen'] else fd['maxlen']
+                 fd['totallen'] += fl
+                 fielddata[k] = fd
+                 if k not in fieldtypes:
+                     fieldtypes[k] = {'key': k, 'types': {}}
+                 fd = fieldtypes[k]
+                 thetype = guess_datatype(v, self.qd)['base']
+                 fd['types'][thetype] = fd['types'].get(thetype, 0) + 1
+                 fieldtypes[k] = fd
+         for k, v in fielddata.items():
+             fielddata[k]['share_uniq'] = (v['n_uniq'] * 100.0) / v['total']
+             fielddata[k]['avglen'] = v['totallen'] / v['total']
+         profile['count'] = count
+         profile['num_fields'] = len(fielddata)
+
+         # determine field types first so they are available when building dictionaries
+         finfields = {}
+         for fd in fieldtypes.values():
+             if 'empty' in fd['types']:
+                 del fd['types']['empty']
+             types_keys = list(fd['types'].keys())
+             ftype = types_keys[0] if len(types_keys) == 1 else 'str'
+             finfields[fd['key']] = ftype
+
+         profile['fieldtypes'] = finfields
+
+         dictkeys = []
+         dicts = {}
+         profile['fields'] = []
+         for fd in fielddata.values():
+             field = {'key': fd['key'], 'is_uniq': 0 if fd['share_uniq'] < 100 else 1}
+             profile['fields'].append(field)
+             if fd['share_uniq'] < dictshare:
+                 dictkeys.append(fd['key'])
+                 dicts[fd['key']] = {'items': fd['uniq'], 'count': fd['n_uniq'],
+                                     'type': finfields.get(fd['key'], 'str')}
+         profile['dictkeys'] = dictkeys
+
+         for k, v in fielddata.items():
+             del v['uniq']
+             fielddata[k] = v
+         profile['debug'] = {'fieldtypes': fieldtypes.copy(), 'fielddata': fielddata}
+         table = []
+         for fd in fielddata.values():
+             row = [fd['key'], finfields[fd['key']], fd['key'] in dictkeys,
+                    fd['share_uniq'] >= 100, fd['n_uniq'], fd['share_uniq'],
+                    fd['minlen'], fd['maxlen'], fd['avglen']]
+             table.append(row)
+         headers = ('key', 'ftype', 'is_dictkey', 'is_uniq', 'n_uniq', 'share_uniq', 'minlen', 'maxlen', 'avglen')
+         reptable = Table(title="Statistics")
+         reptable.add_column(headers[0], justify="left", style="magenta")
+         for key in headers[1:-1]:
+             reptable.add_column(key, justify="left", style="cyan", no_wrap=True)
+         reptable.add_column(headers[-1], justify="right", style="cyan")
+         for row in table:
+             reptable.add_row(*map(str, row))
+         print(reptable)
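+
+ # Usage sketch (hypothetical): StatProcessor(nodates=True).stats('data.jsonl',
+ # options={'dictshare': '5'}) prints a per-field table with the detected type,
+ # uniqueness share and min/max/average value lengths.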
@@ -0,0 +1,59 @@
+ # -*- coding: utf8 -*-
+ """Text processing module."""
+ from ..utils import get_file_type, get_option
+ from iterable.helpers.detect import open_iterable
+
+ ITERABLE_OPTIONS_KEYS = ['tagname', 'delimiter', 'encoding', 'start_line', 'page']
+
+
+ def get_iterable_options(options):
+     """Extract iterable-specific options from the options dictionary."""
+     out = {}
+     for k in ITERABLE_OPTIONS_KEYS:
+         if k in options.keys():
+             out[k] = options[k]
+     return out
+
+
+ def get_keys(adict, prefix=None):
+     """Extract all keys from a nested dictionary; leaf values are printed
+     as tab-separated key/value pairs as a side effect."""
+     keys = {}
+     for k, v in adict.items():
+         fullk = '.'.join([prefix, k]) if prefix else k
+         keys[fullk] = 1
+         if isinstance(v, dict):
+             for ak in get_keys(v, fullk):
+                 keys[ak] = 1
+         elif isinstance(v, list):
+             for item in v:
+                 if isinstance(item, dict):
+                     for ak in get_keys(item, fullk):
+                         keys[ak] = 1
+         else:
+             print('%s\t%s' % (fullk, str(v)))
+     return keys
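+
+ # Example: get_keys({'a': {'b': 1}, 'c': [{'d': 2}]}) returns
+ # {'a': 1, 'a.b': 1, 'c': 1, 'c.d': 1} and prints the leaf values
+ # 'a.b\t1' and 'c.d\t2' as a side effect.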
+
+
+ class TextProcessor:
+     """Text processing handler."""
+     def __init__(self):
+         pass
+
+     def flatten(self, filename, options):
+         """Flatten the data: one field per line."""
+         # format detection and the output option are read but not used yet
+         f_type = get_file_type(filename) if options['format_in'] is None else options['format_in']
+         iterableargs = get_iterable_options(options)
+         iterable = open_iterable(filename, mode='r', iterableargs=iterableargs)
+         to_file = get_option(options, 'output')
+         i = 0
+         for rec in iterable:
+             allkeys = {}
+             i += 1
+             for k in get_keys(rec):
+                 v = allkeys.get(k, 0)
+                 allkeys[k] = v + 1
+             for k, v in allkeys.items():
+                 print('\t'.join([k, str(v)]))
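+
+     # Usage sketch (hypothetical): TextProcessor().flatten('data.jsonl',
+     # options={'format_in': None}) prints every leaf field of every record as a
+     # tab-separated key/value line, followed by per-record key counts.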