undatum-1.0.17-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,81 @@
+ # -*- coding: utf8 -*-
+ """Data transformation module."""
+ import logging
+ import sys
+ from runpy import run_path
+
+ import orjson
+ from iterable.helpers.detect import open_iterable
+
+ from ..utils import get_option, dict_generator
+
+ ITERABLE_OPTIONS_KEYS = ['tagname', 'delimiter', 'encoding', 'start_line', 'page']
+
+
+ def get_iterable_options(options):
+     """Extract iterable-specific options from options dictionary."""
+     out = {}
+     for k in ITERABLE_OPTIONS_KEYS:
+         if k in options:
+             out[k] = options[k]
+     return out
+
+
+ DEFAULT_HEADERS_DETECT_LIMIT = 1000
+
+
+ class Transformer:
+     """Data transformation handler."""
+     def __init__(self):
+         pass
+
+     def script(self, fromfile, options=None):
+         """Runs a user-provided script against the selected file."""
+         if options is None:
+             options = {}
+         script = run_path(options['script'])
+         process_func = script['process']
+
+         iterableargs = get_iterable_options(options)
+         read_iterable = open_iterable(fromfile, mode='r', iterableargs=iterableargs)
+
+         limit = DEFAULT_HEADERS_DETECT_LIMIT
+
+         # First pass: collect output keys from a sample of processed records
+         keys_set = set()  # set gives O(1) membership checks instead of O(n) list scans
+         n = 0
+         for item in read_iterable:
+             if limit and n >= limit:
+                 break
+             item = process_func(item)
+             n += 1
+             dk = dict_generator(item)
+             for i in dk:
+                 k = ".".join(i[:-1])
+                 keys_set.add(k)
+         keys = list(keys_set)
+
+         # Second pass: reopen the source and stream transformed records out
+         read_iterable.close()
+         read_iterable = open_iterable(fromfile, mode='r', iterableargs=iterableargs)
+
+         write_to_iterable = False
+         to_file = get_option(options, 'output')
+         if to_file:
+             write_to_iterable = True
+             write_iterable = open_iterable(to_file, mode='w', iterableargs={'keys': keys})
+         n = 0
+         for r in read_iterable:
+             n += 1
+             if n % 10000 == 0:
+                 logging.info('apply script: processing %d records of %s', n, fromfile)
+             item = process_func(r)
+             if write_to_iterable:
+                 write_iterable.write(item)
+             else:
+                 sys.stdout.write(orjson.dumps(item, option=orjson.OPT_APPEND_NEWLINE).decode('utf8'))
+
+         logging.debug('script: %d records processed', n)
+         read_iterable.close()
+         if write_to_iterable:
+             write_iterable.close()
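
The script command loads the user-supplied file with runpy.run_path and calls its process function once per record, so the script only needs to define process(item) -> item. A minimal sketch of such a script (the field name is invented for illustration):

    # transform.py -- a hypothetical script passed via options['script']
    def process(item):
        # 'name' is a made-up field; normalize it when present
        if 'name' in item:
            item['name'] = item['name'].strip().title()
        return item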
@@ -0,0 +1,142 @@
+ # -*- coding: utf8 -*-
+ """Data validation module."""
+ import csv
+ import io
+ import logging
+ import sys
+ import zipfile
+
+ import bson
+ import dictquery as dq
+ import orjson
+
+ from ..utils import get_file_type, get_option, get_dict_value
+ from ..validate import VALIDATION_RULEMAP
+
+
+ class Validator:
+     """Data validation handler."""
+     def __init__(self):
+         pass
+
+     def validate(self, fromfile, options=None):
+         """Validates selected field against validation rule."""
+         if options is None:
+             options = {}
+         logging.debug('Processing %s', fromfile)
+         f_type = get_file_type(fromfile) if options.get('format_in') is None else options['format_in']
+         if options.get('zipfile'):
+             z = zipfile.ZipFile(fromfile, mode='r')
+             fnames = z.namelist()
+             # ZipFile.open always returns a binary stream; wrap text formats
+             infile = z.open(fnames[0], 'r')
+             if f_type != 'bson':
+                 infile = io.TextIOWrapper(infile, encoding=get_option(options, 'encoding'))
+         else:
+             if f_type == 'bson':
+                 infile = open(fromfile, 'rb')
+             else:
+                 infile = open(fromfile, 'r', encoding=get_option(options, 'encoding'))
+         to_file = get_option(options, 'output')
+         if to_file:
+             to_file_type = get_file_type(to_file)
+             if not to_file_type:
+                 logging.debug('Output file type not supported')
+                 return
+             out = open(to_file, 'w', encoding='utf8')
+         else:
+             out = sys.stdout
+         fields = options['fields'].split(',')
+         val_func = VALIDATION_RULEMAP[options['rule']]
+         logging.info('validate: looking for fields: %s', options['fields'])
+         validated = []
+         stats = {'total': 0, 'invalid': 0, 'novalue': 0}
+         if f_type == 'csv':
+             delimiter = get_option(options, 'delimiter')
+             reader = csv.DictReader(infile, delimiter=delimiter)
+             n = 0
+             for r in reader:
+                 n += 1
+                 if n % 1000 == 0:
+                     logging.info('validate: processing %d records of %s', n, fromfile)
+                 if options.get('filter') is not None:
+                     if not dq.match(r, options['filter']):
+                         continue
+                 res = val_func(r[fields[0]])
+                 stats['total'] += 1
+                 if not res:
+                     stats['invalid'] += 1
+                 validated.append({fields[0]: r[fields[0]], fields[0] + '_valid': res})
+
+         elif f_type == 'jsonl':
+             n = 0
+             for line in infile:
+                 n += 1
+                 if n % 10000 == 0:
+                     logging.info('validate: processing %d records of %s', n, fromfile)
+                 r = orjson.loads(line)
+                 if options.get('filter') is not None:
+                     if not dq.match(r, options['filter']):
+                         continue
+                 stats['total'] += 1
+                 values = get_dict_value(r, fields[0].split('.'))
+                 if len(values) > 0:
+                     res = val_func(values[0])
+                     if not res:
+                         stats['invalid'] += 1
+                     validated.append({fields[0]: values[0], fields[0] + '_valid': res})
+                 else:
+                     stats['novalue'] += 1
+
+         elif f_type == 'bson':
+             bson_iter = bson.decode_file_iter(infile)
+             n = 0
+             for r in bson_iter:
+                 n += 1
+                 if n % 1000 == 0:
+                     logging.info('validate: processing %d records of %s', n, fromfile)
+                 if options.get('filter') is not None:
+                     if not dq.match(r, options['filter']):
+                         continue
+                 stats['total'] += 1
+                 values = get_dict_value(r, fields[0].split('.'))
+                 if len(values) > 0:
+                     res = val_func(values[0])
+                     if not res:
+                         stats['invalid'] += 1
+                     validated.append({fields[0]: values[0], fields[0] + '_valid': res})
+                 else:
+                     stats['novalue'] += 1
+         else:
+             logging.error('Invalid file format provided')
+             if not options.get('zipfile'):
+                 infile.close()
+             return
+         if not options.get('zipfile'):
+             infile.close()
+         if stats['total'] > 0:
+             stats['share'] = 100.0 * stats['invalid'] / stats['total']
+             novalue_share = 100.0 * stats['novalue'] / stats['total']
+         else:
+             stats['share'] = 0.0
+             novalue_share = 0.0
+         logging.debug('validate: complete, %d records (%.2f%%) not valid and %d '
+                       '(%.2f%%) not found of %d against %s',
+                       stats['invalid'], stats['share'], stats['novalue'],
+                       novalue_share, stats['total'], options['rule'])
+         if options['mode'] != 'stats':
+             fieldnames = [fields[0], fields[0] + '_valid']
+             writer = csv.DictWriter(out, fieldnames=fieldnames,
+                                     delimiter=get_option(options, 'delimiter'))
+             for row in validated:
+                 if options['mode'] == 'invalid':
+                     if not row[fields[0] + '_valid']:
+                         writer.writerow(row)
+                 elif options['mode'] == 'all':
+                     writer.writerow(row)
+         else:
+             out.write(orjson.dumps(stats, option=orjson.OPT_INDENT_2).decode('utf8'))
+         if to_file:
+             out.close()
+         if options.get('zipfile'):
+             z.close()
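
For orientation, a hedged sketch of calling Validator.validate directly. The option keys mirror the ones the method reads; the rule name 'common.email' is an assumption, since the contents of VALIDATION_RULEMAP are not shown in this diff:

    # hypothetical invocation; 'common.email' assumes such a rule is registered
    options = {
        'format_in': None,          # autodetect from the file extension
        'zipfile': False,
        'encoding': 'utf8',
        'delimiter': ',',
        'fields': 'contact.email',  # dot-notation path into each record
        'rule': 'common.email',
        'filter': None,
        'mode': 'stats',            # print aggregate stats as JSON
        'output': None,             # None writes to stdout
    }
    Validator().validate('data.jsonl', options=options)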
@@ -0,0 +1,6 @@
+ # -*- coding: utf8 -*-
+ """Common utilities and helper functions for data processing.
+
+ This module provides shared functionality used across the undatum package,
+ including iterable data handling, schema management, and common functions.
+ """
@@ -0,0 +1,84 @@
+ # -*- coding: utf8 -*-
+ """Common utility functions for dictionary operations.
+
+ This module provides helper functions for accessing nested dictionary
+ values using dot-notation keys.
+ """
+
+
+ def get_dict_value(adict, key, prefix=None):
+     """Get value from dictionary using dot-notation key.
+
+     Args:
+         adict: Dictionary to search.
+         key: Dot-separated key path (e.g., 'field.subfield').
+         prefix: Pre-split key parts (used internally for recursion).
+
+     Returns:
+         Value at the specified key path.
+
+     Raises:
+         KeyError: If any key in the path doesn't exist.
+     """
+     if prefix is None:
+         prefix = key.split('.')
+     if len(prefix) == 1:
+         return adict[prefix[0]]
+     return get_dict_value(adict[prefix[0]], key, prefix=prefix[1:])
+
+
+ def get_dict_value_deep(adict, key, prefix=None, as_array=False, splitter='.'):
+     """Get value from hierarchical dictionaries with deep traversal.
+
+     Supports nested dictionaries and lists, with optional array collection
+     of values from multiple sources.
+
+     Args:
+         adict: Dictionary or list to search.
+         key: Dot-separated key path (e.g., 'field.subfield').
+         prefix: Pre-split key parts (used internally for recursion).
+         as_array: If True, collect all matching values into an array.
+         splitter: Character used to split key path (default: '.').
+
+     Returns:
+         Value at the specified key path, or list of values if as_array=True.
+         Returns None if key path not found.
+     """
+     if prefix is None:
+         prefix = key.split(splitter)
+     if len(prefix) == 1:
+         if isinstance(adict, dict):
+             if prefix[0] not in adict:
+                 return None
+             if as_array:
+                 return [adict[prefix[0]]]
+             return adict[prefix[0]]
+         if isinstance(adict, list):
+             if as_array:
+                 result = []
+                 for v in adict:
+                     if prefix[0] in v:
+                         result.append(v[prefix[0]])
+                 return result
+             if len(adict) > 0 and prefix[0] in adict[0]:
+                 return adict[0][prefix[0]]
+         return None
+     if isinstance(adict, dict):
+         if prefix[0] in adict:
+             return get_dict_value_deep(adict[prefix[0]], key, prefix=prefix[1:],
+                                        as_array=as_array)
+     elif isinstance(adict, list):
+         if as_array:
+             result = []
+             for v in adict:
+                 if prefix[0] not in v:
+                     continue
+                 res = get_dict_value_deep(v[prefix[0]], key, prefix=prefix[1:],
+                                           as_array=as_array)
+                 if res:
+                     result.extend(res)
+             return result
+         if len(adict) > 0 and prefix[0] in adict[0]:
+             return get_dict_value_deep(adict[0][prefix[0]], key, prefix=prefix[1:],
+                                        as_array=as_array)
+     return None
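
To make the traversal rules concrete, a small illustration of both helpers on an invented nested record:

    record = {'user': {'emails': [{'addr': 'a@example.org'}, {'addr': 'b@example.org'}]}}

    get_dict_value(record, 'user.emails')
    # -> the list of email dicts; a missing path raises KeyError

    get_dict_value_deep(record, 'user.emails.addr', as_array=True)
    # -> ['a@example.org', 'b@example.org']; list elements are fanned out

    get_dict_value_deep(record, 'user.phone')
    # -> None; the deep lookup swallows missing keys instead of raising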
@@ -0,0 +1,181 @@
+ # -*- coding: utf-8 -*-
+ """Iterable data handling module."""
+ import csv
+ import gzip
+ import io
+ import logging
+ from bz2 import BZ2File
+ from lzma import LZMAFile
+ from zipfile import ZipFile
+
+ import bson
+ import jsonlines
+
+ from ..constants import BINARY_FILE_TYPES
+ from ..utils import get_file_type, get_option, detect_encoding, detect_delimiter
+
+ # gzip, zip, xz and bz2 come from the standard library; lz4 and 7z need extras
+ SUPPORTED_COMPRESSION = {'gz': True, 'zip': True, 'xz': True, '7z': False, 'lz4': False, 'bz2': True}
+
+ try:
+     import lz4
+     SUPPORTED_COMPRESSION['lz4'] = True
+ except ImportError:
+     pass
+
+ try:
+     import py7zr
+     SUPPORTED_COMPRESSION['7z'] = True
+ except ImportError:
+     pass
+
+
+ DEFAULT_ENCODING = 'utf8'
+ DEFAULT_DELIMITER = ','
+
+
+ class IterableData:
+     """Iterable data reader (CSV, JSON lines, BSON)."""
+     def __init__(self, filename, options=None, autodetect=True, autodetect_limit=100000):
+         """Creates iterable object from CSV, JSON lines and other iterable files."""
+         if options is None:
+             options = {}
+         self.autodetect = autodetect
+         self.autodetect_limit = autodetect_limit
+         self.options = options
+         self.archiveobj = None
+         self.fileobj = None
+         self.binary = False
+         self.delimiter = get_option(options, 'delimiter')
+         self.init(filename, options)
+
+     def init(self, filename, options):
+         f_type = get_file_type(filename) if options.get('format_in') is None else options['format_in']
+         self.encoding = get_option(options, 'encoding')
+         self.filetype = f_type
+         ext = filename.rsplit('.', 1)[-1].lower()
+         self.ext = ext
+         if ext in SUPPORTED_COMPRESSION:
+             self.binary = True
+             self.mode = 'rb' if self.filetype in BINARY_FILE_TYPES else 'r'
+             if ext == 'gz':
+                 self.fileobj = gzip.open(filename, self.mode)
+             elif ext == 'bz2':
+                 self.fileobj = BZ2File(filename, self.mode)
+             elif ext == 'xz':
+                 self.fileobj = LZMAFile(filename, self.mode)
+             elif ext == 'zip':
+                 self.archiveobj = ZipFile(filename, mode='r')
+                 fnames = self.archiveobj.namelist()
+                 # ZipFile.open only accepts 'r' or 'w' and always returns a binary stream
+                 self.fileobj = self.archiveobj.open(fnames[0], 'r')
+             else:
+                 raise NotImplementedError
+         else:
+             if f_type in BINARY_FILE_TYPES:
+                 self.fileobj = open(filename, 'rb')
+             else:
+                 if 'encoding' in options and options['encoding']:
+                     encoding = get_option(options, 'encoding')
+                 elif self.autodetect:
+                     detected_enc = detect_encoding(filename, limit=self.autodetect_limit)
+                     encoding = detected_enc['encoding'] if detected_enc else DEFAULT_ENCODING
+                     logging.debug('Detected encoding %s', encoding)
+                     if f_type == 'csv':
+                         detected_del = detect_delimiter(filename, encoding)
+                         self.delimiter = detected_del if detected_del else DEFAULT_DELIMITER
+                 else:
+                     encoding = DEFAULT_ENCODING
+                     if f_type == 'csv':
+                         self.delimiter = DEFAULT_DELIMITER
+                 self.encoding = encoding
+                 self.fileobj = open(filename, 'r', encoding=encoding)
+
+     def iter(self):
+         if self.filetype == 'csv':
+             if self.binary:
+                 obj = io.TextIOWrapper(self.fileobj, encoding=self.encoding)
+                 reader = csv.DictReader(obj, delimiter=self.delimiter)
+             else:
+                 reader = csv.DictReader(self.fileobj, delimiter=self.delimiter)
+             return iter(reader)
+         elif self.filetype == 'jsonl':
+             return jsonlines.Reader(self.fileobj)
+         elif self.filetype == 'bson':
+             return bson.decode_file_iter(self.fileobj)
+         raise NotImplementedError('Unsupported file type: %s' % self.filetype)
+
+     def close(self):
+         """Closes file object and archive file object if it exists"""
+         if self.fileobj is not None:
+             self.fileobj.close()
+         if self.archiveobj is not None:
+             self.archiveobj.close()
+
+
+ class BSONWriter:
+     """BSON file writer."""
+     def __init__(self, fileobj):
+         self.fo = fileobj
+
+     def write(self, item):
+         rec = bson.BSON.encode(item)
+         self.fo.write(rec)
+
+
+ class DataWriter:
+     """Data writer (CSV, JSON lines, BSON)."""
+     def __init__(self, fileobj, filetype, output_type: str = 'iterable', delimiter: str = ',', fieldnames: list = None):
+         """Creates a writer for CSV, JSON lines or BSON output."""
+         self.output_type = output_type
+         self.filetype = filetype
+         self.fieldnames = fieldnames
+         self.fileobj = fileobj
+         if self.filetype == 'csv':
+             self.writer = csv.DictWriter(self.fileobj, delimiter=delimiter, fieldnames=fieldnames)
+         elif self.filetype == 'jsonl':
+             self.writer = jsonlines.Writer(self.fileobj)
+         elif self.filetype == 'bson':
+             self.writer = BSONWriter(self.fileobj)
+
+     def write_items(self, outdata):
+         if len(outdata) == 0:
+             return
+         if self.filetype == 'csv':
+             self.writer.writeheader()
+             if isinstance(outdata[0], str):
+                 # An array of plain strings maps onto the first field name
+                 for rawitem in outdata:
+                     item = {self.fieldnames[0]: rawitem}
+                     self.writer.writerow(item)
+             elif isinstance(outdata[0], (list, tuple)):
+                 for rawitem in outdata:
+                     item = dict(zip(self.fieldnames, rawitem))
+                     self.writer.writerow(item)
+             else:
+                 self.writer.writerows(outdata)
+         elif self.filetype in ['jsonl', 'bson']:
+             # If our data is just an array of strings, transform each value to a dict
+             if isinstance(outdata[0], str):
+                 for rawitem in outdata:
+                     item = {self.fieldnames[0]: rawitem}
+                     self.writer.write(item)
+             elif isinstance(outdata[0], (list, tuple)):
+                 for rawitem in outdata:
+                     item = dict(zip(self.fieldnames, rawitem))
+                     self.writer.write(item)
+             else:
+                 if self.output_type == 'iterable':
+                     for item in outdata:
+                         self.writer.write(item)
+                 elif self.output_type == 'duckdb':
+                     # Rows from a database cursor arrive as tuples; pair them with field names
+                     for rawitem in outdata:
+                         item = dict(zip(self.fieldnames, rawitem))
+                         self.writer.write(item)
+
+
+ if __name__ == "__main__":
+     with open('outtest.jsonl', 'w') as f:
+         writer = DataWriter(f, filetype='jsonl', fieldnames=['name', 'value'])
+         writer.write_items([{'name': 'Cat', 'value': 15}])
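
A short usage sketch tying the reader and writer together; the file names are invented, and the options dict carries only the keys that IterableData.init actually consults:

    # hypothetical round trip: read a gzipped CSV, re-emit it as JSON lines
    data = IterableData('input.csv.gz', options={'format_in': 'csv', 'encoding': 'utf8', 'delimiter': ','})
    rows = list(data.iter())
    data.close()

    with open('output.jsonl', 'w') as out:
        writer = DataWriter(out, filetype='jsonl', fieldnames=list(rows[0].keys()) if rows else [])
        writer.write_items(rows)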