undatum 1.0.17__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- undatum/__init__.py +9 -0
- undatum/__main__.py +25 -0
- undatum/ai/__init__.py +145 -0
- undatum/ai/base.py +85 -0
- undatum/ai/config.py +184 -0
- undatum/ai/perplexity.py +79 -0
- undatum/ai/providers.py +1002 -0
- undatum/ai/schemas.py +42 -0
- undatum/cmds/__init__.py +6 -0
- undatum/cmds/analyzer.py +697 -0
- undatum/cmds/converter.py +646 -0
- undatum/cmds/ingester.py +116 -0
- undatum/cmds/query.py +68 -0
- undatum/cmds/schemer.py +328 -0
- undatum/cmds/selector.py +437 -0
- undatum/cmds/statistics.py +158 -0
- undatum/cmds/textproc.py +59 -0
- undatum/cmds/transformer.py +81 -0
- undatum/cmds/validator.py +137 -0
- undatum/common/__init__.py +6 -0
- undatum/common/functions.py +81 -0
- undatum/common/iterable.py +222 -0
- undatum/common/scheme.py +261 -0
- undatum/constants.py +21 -0
- undatum/core.py +616 -0
- undatum/formats/__init__.py +6 -0
- undatum/formats/docx.py +160 -0
- undatum/utils.py +298 -0
- undatum/validate/__init__.py +11 -0
- undatum/validate/commonrules.py +15 -0
- undatum/validate/ruscodes.py +202 -0
- undatum-1.0.17.dist-info/METADATA +610 -0
- undatum-1.0.17.dist-info/RECORD +37 -0
- undatum-1.0.17.dist-info/WHEEL +6 -0
- undatum-1.0.17.dist-info/entry_points.txt +3 -0
- undatum-1.0.17.dist-info/licenses/LICENSE +21 -0
- undatum-1.0.17.dist-info/top_level.txt +1 -0
undatum/common/scheme.py
ADDED
@@ -0,0 +1,261 @@
# -*- coding: utf8 -*-
"""Schema definition and type mapping module."""
import csv
import datetime
import logging

import bson
import orjson

from .functions import get_dict_value_deep

# Maps Python (and BSON) types to schema type names. Order matters only in
# that a later, more specific match overrides an earlier one (bool after int).
OTYPES_MAP = [[str, 'string'],
              [datetime.datetime, 'datetime'],
              [int, 'integer'],
              [bool, 'boolean'],
              [float, 'float'],
              [bson.int64.Int64, 'integer'],
              [bson.objectid.ObjectId, 'string'],
              [list, 'array']]


def merge_schemes(alist, novalue=True):
    """Merges the schemas of a list of objects into a single data schema."""
    if len(alist) == 0:
        return None
    obj = alist[0]
    okeys = obj.keys()
    for item in alist[1:]:
        for k in item.keys():
            if k not in okeys:
                obj[k] = item[k]
            elif obj[k]['type'] in ['integer', 'float', 'string', 'datetime']:
                if not novalue:
                    obj[k]['value'] += item[k]['value']
            elif obj[k]['type'] == 'dict':
                if not novalue:
                    obj[k]['value'] += item[k]['value']
                if 'schema' in item[k].keys():
                    obj[k]['schema'] = merge_schemes([obj[k]['schema'], item[k]['schema']])
            elif obj[k]['type'] == 'array':
                if 'subtype' in obj[k].keys() and obj[k]['subtype'] == 'dict':
                    if not novalue:
                        obj[k]['value'] += item[k]['value']
                    if 'schema' in item[k].keys():
                        obj[k]['schema'] = merge_schemes([obj[k]['schema'], item[k]['schema']])
                else:
                    if not novalue:
                        obj[k]['value'] += item[k]['value']
    return obj
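
For orientation, a minimal sketch (with invented field names) of how merge_schemes folds the per-record schemas produced by get_schema into one:

    s1 = {'id': {'type': 'integer', 'value': 1},
          'name': {'type': 'string', 'value': 1}}
    s2 = {'id': {'type': 'integer', 'value': 1},
          'tags': {'type': 'array', 'subtype': 'string', 'value': 1}}

    merged = merge_schemes([s1, s2], novalue=False)
    # merged covers all three fields; for scalar fields 'value' accumulates
    # how many records carried the field:
    # {'id': {'type': 'integer', 'value': 2}, 'name': {...}, 'tags': {...}}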


def get_schemes(alist):
    """Generates a schema for each object in the list."""
    results = []
    for o in alist:
        results.append(get_schema(o))
    return results


def get_schema(obj, novalue=True):
    """Generates a schema from a single object."""
    result = {}
    for k in obj.keys():
        tt = type(obj[k])
        if obj[k] is None:
            result[k] = {'type': 'string', 'value': 1}
        elif isinstance(obj[k], str):
            result[k] = {'type': 'string', 'value': 1}
        elif tt == datetime.datetime:
            result[k] = {'type': 'datetime', 'value': 1}
        elif tt == bool:
            result[k] = {'type': 'boolean', 'value': 1}
        elif tt == float:
            result[k] = {'type': 'float', 'value': 1}
        elif tt == int:
            result[k] = {'type': 'integer', 'value': 1}
        elif tt == bson.int64.Int64:
            result[k] = {'type': 'integer', 'value': 1}
        elif tt == bson.objectid.ObjectId:
            result[k] = {'type': 'string', 'value': 1}
        elif tt == dict:
            result[k] = {'type': 'dict', 'value': 1, 'schema': get_schema(obj[k])}
        elif tt == list:
            result[k] = {'type': 'array', 'value': 1}
            if len(obj[k]) == 0:
                result[k]['subtype'] = 'string'
            else:
                found = False
                # No break here: a later, more specific OTYPES_MAP match
                # (e.g. bool after int) overrides an earlier one.
                for otype, oname in OTYPES_MAP:
                    if isinstance(obj[k][0], otype):
                        result[k]['subtype'] = oname
                        found = True
                if not found:
                    if isinstance(obj[k][0], dict):
                        result[k]['subtype'] = 'dict'
                        result[k]['schema'] = merge_schemes(get_schemes(obj[k]))
                    else:
                        logging.info("Unknown object %s type %s" % (k, str(type(obj[k][0]))))
        else:
            logging.info("Unknown object %s type %s" % (k, str(type(obj[k]))))
            result[k] = {'type': 'string', 'value': 1}
        if novalue:
            del result[k]['value']
    return result
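
As an illustration, a hypothetical record and the shape of the schema that get_schema derives from it (with the default novalue=True, the 'value' counters are stripped):

    record = {'id': 1, 'name': 'Anna', 'emails': ['a@example.com'],
              'address': {'city': 'Moscow'}}
    get_schema(record)
    # {'id': {'type': 'integer'},
    #  'name': {'type': 'string'},
    #  'emails': {'type': 'array', 'subtype': 'string'},
    #  'address': {'type': 'dict', 'schema': {'city': {'type': 'string'}}}}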


def extract_keys(obj, parent=None, text=None, level=1):
    """Renders the keys of a nested object as a schema definition string."""
    # The incoming ``text`` argument is ignored; the string is rebuilt on each call.
    text = ''
    if not parent:
        text = "'schema': {\n"
    for k in obj.keys():
        if isinstance(obj[k], dict):
            text += "\t" * level + "'%s' : {'type' : 'dict', 'schema' : {\n" % (k)
            text += extract_keys(obj[k], k, text, level + 1)
            text += "\t" * level + "}},\n"
        elif isinstance(obj[k], list):
            text += "\t" * level + "'%s' : {'type' : 'list', 'schema' : { 'type' : 'dict', 'schema' : {\n" % (k)
            if len(obj[k]) > 0:
                item = obj[k][0]
                if isinstance(item, dict):
                    text += extract_keys(item, k, text, level + 1)
                else:
                    text += "\t" * level + "'%s' : {'type' : 'string'},\n" % (k)
            text += "\t" * level + "}}},\n"
        else:
            logging.info(str(type(obj[k])))
            text += "\t" * level + "'%s' : {'type' : 'string'},\n" % (k)
    if not parent:
        text += "}"
    return text
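
For example (an invented object; tab indentation in the output is shown here as spaces):

    print(extract_keys({'name': 'Anna', 'address': {'city': 'Moscow'}}))
    # 'schema': {
    #     'name' : {'type' : 'string'},
    #     'address' : {'type' : 'dict', 'schema' : {
    #         'city' : {'type' : 'string'},
    #     }},
    # }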


def __get_filetype_by_ext(filename):
    """Guesses the file type from the filename extension."""
    ext = filename.rsplit('.', 1)[-1].lower()
    if ext in ['bson', 'json', 'csv', 'jsonl']:
        return ext
    # Falls back to the raw filename when the extension is unrecognized.
    return filename


def generate_scheme_from_file(filename=None, fileobj=None, filetype='bson', alimit=1000, verbose=0, encoding='utf8', delimiter=",", quotechar='"'):
    """Generates a data schema from a BSON, JSON lines or CSV file."""
    if not filetype and filename is not None:
        filetype = __get_filetype_by_ext(filename)
    datacache = []
    if filetype == 'bson':
        if filename:
            source = open(filename, 'rb')
        else:
            source = fileobj
        n = 0
        for r in bson.decode_file_iter(source):
            n += 1
            if n > alimit:
                break
            datacache.append(r)
        if filename:
            source.close()
    elif filetype == 'jsonl':
        if filename:
            source = open(filename, 'r', encoding=encoding)
        else:
            source = fileobj
        n = 0
        for r in source:
            n += 1
            if n > alimit:
                break
            datacache.append(orjson.loads(r))
        if filename:
            source.close()
    elif filetype == 'csv':
        if filename:
            source = open(filename, 'r', encoding=encoding)
        else:
            source = fileobj
        n = 0
        reader = csv.DictReader(source, quotechar=quotechar, delimiter=delimiter, quoting=csv.QUOTE_ALL)
        for r in reader:
            n += 1
            if n > alimit:
                break
            datacache.append(r)
        if filename:
            source.close()
    n = 0
    scheme = None
    for r in datacache:
        n += 1
        if scheme is None:
            scheme = get_schema(r)
        else:
            scheme = merge_schemes([scheme, get_schema(r)])
    return scheme
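
A usage sketch; 'data.jsonl' is an invented example file:

    # Sample up to the first 100 JSON lines records and derive a merged schema.
    scheme = generate_scheme_from_file(filename='data.jsonl', filetype='jsonl',
                                       alimit=100)
    # e.g. {'id': {'type': 'integer'}, 'name': {'type': 'string'}, ...}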


def schema2fieldslist(schema, prefix=None, predefined=None, sample=None):
    """Converts a data schema to a flat list of field descriptions."""
    fieldslist = []
    for k in schema.keys():
        if prefix is None:
            name = k
        else:
            name = prefix + '.' + k
        try:
            sampledata = get_dict_value_deep(sample, name) if sample else ''
        except Exception:
            sampledata = ''
        if 'schema' not in schema[k].keys():
            if schema[k]['type'] != 'array':
                field = {'name': name, 'type': schema[k]['type'], 'description': '', 'sample': sampledata, 'class': ''}
            else:
                field = {'name': name, 'type': 'list of [%s]' % schema[k]['type'], 'description': '', 'sample': sampledata, 'class': ''}
            if predefined:
                if name in predefined.keys():
                    field['description'] = predefined[name]['text']
                    if predefined[name]['class']:
                        field['class'] = predefined[name]['class']
                elif k in predefined.keys():
                    field['description'] = predefined[k]['text']
                    if predefined[k]['class']:
                        field['class'] = predefined[k]['class']
            if field['type'] == 'datetime':
                field['class'] = 'datetime'
            fieldslist.append(field)
        else:
            if prefix is not None:
                subprefix = prefix + '.' + k
            else:
                subprefix = k
            if schema[k]['type'] == 'dict':
                field = {'name': name, 'type': schema[k]['type'], 'description': '', 'sample': '', 'class': ''}
                if predefined:
                    if name in predefined.keys():
                        field['description'] = predefined[name]['text']
                        if predefined[name]['class']:
                            field['class'] = predefined[name]['class']
                    elif k in predefined.keys():
                        field['description'] = predefined[k]['text']
                        if predefined[k]['class']:
                            field['class'] = predefined[k]['class']
                fieldslist.append(field)
                fieldslist.extend(schema2fieldslist(schema[k]['schema'], prefix=subprefix, predefined=predefined, sample=sample))
            elif schema[k]['type'] == 'array':
                field = {'name': name, 'type': 'list of [%s]' % schema[k]['type'], 'description': '', 'sample': '', 'class': ''}
                if predefined:
                    if name in predefined.keys():
                        field['description'] = predefined[name]['text']
                        if predefined[name]['class']:
                            field['class'] = predefined[name]['class']
                    elif k in predefined.keys():
                        field['description'] = predefined[k]['text']
                        if predefined[k]['class']:
                            field['class'] = predefined[k]['class']
                fieldslist.append(field)
                fieldslist.extend(schema2fieldslist(schema[k]['schema'], prefix=subprefix, sample=sample))
    return fieldslist
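
A small invented example of the flattening; nested keys are emitted with dotted names:

    schema = {'id': {'type': 'integer'},
              'address': {'type': 'dict', 'schema': {'city': {'type': 'string'}}}}
    schema2fieldslist(schema)
    # [{'name': 'id', 'type': 'integer', 'description': '', 'sample': '', 'class': ''},
    #  {'name': 'address', 'type': 'dict', ...},
    #  {'name': 'address.city', 'type': 'string', ...}]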
undatum/constants.py
ADDED
@@ -0,0 +1,21 @@
# -*- coding: utf8 -*-
"""Constants and configuration values for the undatum package."""
DATE_PATTERNS = ["%d.%m.%Y", "%Y-%m-%d", "%y-%m-%d", "%Y-%m-%dT%H:%M:%S",
                 "%Y-%m-%d %H:%M:%S",
                 "%d.%m.%Y %H:%M"]
DEFAULT_DICT_SHARE = 70


SUPPORTED_FILE_TYPES = ['xls', 'xlsx', 'csv', 'xml', 'json', 'jsonl', 'yaml',
                        'tsv', 'sql', 'bson', 'parquet', 'orc', 'avro']
COMPRESSED_FILE_TYPES = ['gz', 'xz', 'zip', 'lz4', '7z', 'bz2']
BINARY_FILE_TYPES = ['xls', 'xlsx', 'bson', 'parquet', 'orc'] + \
                    COMPRESSED_FILE_TYPES

DEFAULT_OPTIONS = {'encoding': 'utf8',
                   'delimiter': ',',
                   'limit': 1000
                   }

DUCKABLE_FILE_TYPES = ['csv', 'jsonl', 'json', 'parquet']
DUCKABLE_CODECS = ['zst', 'gzip', 'raw']
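
A minimal sketch of how date patterns like these are typically applied when sniffing string values; parse_date is a hypothetical helper written for this example, not part of the package:

    import datetime

    def parse_date(value, patterns=DATE_PATTERNS):
        # Hypothetical helper: try each known pattern until one parses.
        for pattern in patterns:
            try:
                return datetime.datetime.strptime(value, pattern)
            except ValueError:
                continue
        return None

    parse_date('31.12.2024')   # datetime.datetime(2024, 12, 31, 0, 0)
    parse_date('not a date')   # None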