undatum 1.0.17__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,261 @@
1
+ # -*- coding: utf8 -*-
2
+ """Schema definition and type mapping module."""
3
+ import datetime
4
+ import bson
5
+ import orjson
6
+ import csv
7
+ import logging
8
+ from copy import copy
9
+ from .functions import get_dict_value_deep
10
+
11
# Mapping of Python/BSON value types to schema type names.
# Used by get_schema() to classify the element type ('subtype') of arrays
# via isinstance() checks. NOTE: the scan in get_schema() has no early
# break, so the LAST matching entry wins (e.g. True matches both [int]
# and [bool] and ends up classified as 'boolean').
OTYPES_MAP = [[type(""), 'string'],
              [type(u""), 'string'],
              [datetime.datetime, 'datetime'],
              [int, 'integer'],
              [bool, 'boolean'],
              [float, 'float'],
              [str, 'string'],
              [bson.int64.Int64, 'integer'],
              [bson.objectid.ObjectId, 'string'],
              [type([]), 'array']
              ]
22
+
23
+
24
def merge_schemes(alist, novalue=True):
    """Merges schemes of a list of objects and generates the final data schema.

    NOTE: mutates and returns the FIRST schema in *alist*; all following
    schemes are folded into it.  When *novalue* is False the per-key
    'value' occurrence counters are accumulated as well; by default they
    are left untouched.

    :param alist: list of schema dicts as produced by get_schema()
    :param novalue: skip accumulation of 'value' counters when True
    :return: the merged schema dict, or None for an empty list
    """
    if not alist:
        return None
    obj = alist[0]
    # dict.keys() is a live view: keys added while merging earlier items
    # are visible when processing later items.
    okeys = obj.keys()
    for item in alist[1:]:
        for k in item.keys():
            if k not in okeys:
                obj[k] = item[k]
            elif obj[k]['type'] in ['integer', 'float', 'string', 'datetime']:
                if not novalue:
                    obj[k]['value'] += item[k]['value']
            elif obj[k]['type'] == 'dict':
                if not novalue:
                    obj[k]['value'] += item[k]['value']
                if 'schema' in item[k].keys():
                    obj[k]['schema'] = merge_schemes([obj[k]['schema'], item[k]['schema']])
            elif obj[k]['type'] == 'array':
                if 'subtype' in obj[k].keys() and obj[k]['subtype'] == 'dict':
                    if not novalue:
                        obj[k]['value'] += item[k]['value']
                    if 'schema' in item[k].keys():
                        obj[k]['schema'] = merge_schemes([obj[k]['schema'], item[k]['schema']])
                else:
                    if not novalue:
                        # BUG FIX: was `item['value']` which raised KeyError for
                        # arrays of scalars when novalue=False
                        obj[k]['value'] += item[k]['value']
            # other types (e.g. 'boolean') need no merging: the existing
            # entry already describes them
    return obj
55
+
56
+
57
def get_schemes(alist):
    """Builds an individual schema (via get_schema) for every object in *alist*."""
    return [get_schema(entry) for entry in alist]
63
+
64
def get_schema(obj, novalue=True):
    """Generates a schema from a single object (dict).

    Each key maps to {'type': <schema type name>}.  Nested dicts and lists
    of dicts additionally carry a nested 'schema'; lists carry a 'subtype'
    derived from their first element.  Unknown value types are logged and
    fall back to 'string'.

    :param obj: the dict to derive a schema from
    :param novalue: when True (default) strip the per-key 'value'
        occurrence counter; when False keep it ('value': 1) so counters
        can be accumulated by merge_schemes()
    :return: schema dict
    """
    result = {}
    for k in obj.keys():
        tt = type(obj[k])
        if obj[k] is None:
            # None carries no type information; default to string
            result[k] = {'type': 'string', 'value': 1}
        elif isinstance(obj[k], str):
            # covers str and any str subclass (collapses the former
            # duplicated, unreachable second str branch)
            result[k] = {'type': 'string', 'value': 1}
        elif tt == datetime.datetime:
            result[k] = {'type': 'datetime', 'value': 1}
        elif tt == bool:
            # bool must be tested before int: type(True) is bool
            result[k] = {'type': 'boolean', 'value': 1}
        elif tt == float:
            result[k] = {'type': 'float', 'value': 1}
        elif tt == int:
            result[k] = {'type': 'integer', 'value': 1}
        elif tt == bson.int64.Int64:
            result[k] = {'type': 'integer', 'value': 1}
        elif tt == bson.objectid.ObjectId:
            result[k] = {'type': 'string', 'value': 1}
        elif tt == dict:
            result[k] = {'type': 'dict', 'value': 1, 'schema': get_schema(obj[k])}
        elif tt == list:
            result[k] = {'type': 'array', 'value': 1}
            if len(obj[k]) == 0:
                # empty list: element type unknown, assume string
                result[k]['subtype'] = 'string'
            else:
                found = False
                # deliberately no break: the LAST matching OTYPES_MAP entry
                # wins (e.g. bool matches int first, then bool)
                for otype, oname in OTYPES_MAP:
                    if isinstance(obj[k][0], otype):
                        result[k]['subtype'] = oname
                        found = True
                if not found:
                    if isinstance(obj[k][0], dict):
                        result[k]['subtype'] = 'dict'
                        result[k]['schema'] = merge_schemes(get_schemes(obj[k]))
                    else:
                        logging.info("Unknown object %s type %s" % (k, str(type(obj[k][0]))))
        else:
            logging.info("Unknown object %s type %s" % (k, str(type(obj[k]))))
            result[k] = {'type': 'string', 'value': 1}
        if novalue:
            del result[k]['value']
    return result
111
+
112
def extract_keys(obj, parent=None, text=None, level=1):
    """Renders a textual, Cerberus-like schema skeleton for *obj*.

    Dicts recurse as nested 'dict' schemas, lists of dicts recurse as
    'list' schemas, everything else is emitted as 'string'.

    NOTE(review): the *text* argument is accepted (and passed on recursion)
    but immediately discarded — the output is rebuilt locally at every
    level; kept for interface compatibility.

    :param obj: dict to render
    :param parent: parent key; None marks the top-level call, which adds
        the surrounding "'schema': { ... }" wrapper
    :param level: indentation depth (tabs)
    :return: the rendered schema text
    """
    indent = "\t" * level
    out = "'schema': {\n" if not parent else ''
    for key, value in obj.items():
        if isinstance(value, dict):
            out += indent + "'%s' : {'type' : 'dict', 'schema' : {\n" % (key)
            out += extract_keys(value, key, out, level + 1)
            out += indent + "}},\n"
        elif isinstance(value, list):
            out += indent + "'%s' : {'type' : 'list', 'schema' : { 'type' : 'dict', 'schema' : {\n" % (key)
            if value:
                first = value[0]
                if isinstance(first, dict):
                    out += extract_keys(first, key, out, level + 1)
                else:
                    out += indent + "'%s' : {'type' : 'string'},\n" % (key)
            out += indent + "}}},\n"
        else:
            logging.info(str(type(value)))
            out += indent + "'%s' : {'type' : 'string'},\n" % (key)
    if not parent:
        out += "}"
    return out
137
+
138
def __get_filetype_by_ext(filename):
    """Derives the file type from *filename*'s extension.

    Returns the lowercase extension when it is one of the recognized
    types; otherwise returns the filename itself unchanged.
    """
    ext = filename.rsplit('.', 1)[-1].lower()
    return ext if ext in ('bson', 'json', 'csv', 'jsonl') else filename
143
+
144
+
145
def generate_scheme_from_file(filename=None, fileobj=None, filetype='bson', alimit=1000, verbose=0, encoding='utf8', delimiter=",", quotechar='"'):
    """Generates a data schema by sampling up to *alimit* records from a file.

    Exactly one of *filename* / *fileobj* should be given; when *filename*
    is set the file is opened (and closed) here, otherwise *fileobj* is
    read as-is and left open for the caller.

    :param filename: path of the source file (opened in the right mode)
    :param fileobj: already-open source file object (used when filename is None)
    :param filetype: 'bson', 'jsonl' or 'csv'; only consulted from the
        extension when explicitly passed as a falsy value
    :param alimit: maximum number of records to sample
    :param verbose: unused, kept for interface compatibility
    :param encoding: text encoding for jsonl/csv sources
    :param delimiter: CSV field delimiter
    :param quotechar: CSV quote character
    :return: merged schema dict, or None when no records were read
    """
    if not filetype and filename is not None:
        filetype = __get_filetype_by_ext(filename)
    datacache = []
    if filetype == 'bson':
        source = open(filename, 'rb') if filename else fileobj
        # try/finally so a decode error can no longer leak the file handle
        try:
            for n, record in enumerate(bson.decode_file_iter(source), start=1):
                if n > alimit:
                    break
                datacache.append(record)
        finally:
            if filename:
                source.close()
    elif filetype == 'jsonl':
        source = open(filename, 'r', encoding=encoding) if filename else fileobj
        try:
            for n, line in enumerate(source, start=1):
                if n > alimit:
                    break
                datacache.append(orjson.loads(line))
        finally:
            if filename:
                source.close()
    elif filetype == 'csv':
        source = open(filename, 'r', encoding=encoding) if filename else fileobj
        try:
            reader = csv.DictReader(source, quotechar=quotechar, delimiter=delimiter, quoting=csv.QUOTE_ALL)
            for n, row in enumerate(reader, start=1):
                if n > alimit:
                    break
                datacache.append(row)
        finally:
            if filename:
                source.close()
    # fold the sampled records into a single schema
    scheme = None
    for record in datacache:
        if scheme is None:
            scheme = get_schema(record)
        else:
            scheme = merge_schemes([scheme, get_schema(record)])
    return scheme
199
+
200
def _apply_predefined(field, name, key, predefined):
    """Fills *field*'s description/class from the *predefined* mapping.

    Looks up the fully-qualified *name* first, then the bare *key*; the
    'class' is only overwritten when the predefined entry has a truthy one.
    """
    if not predefined:
        return
    if name in predefined:
        entry = predefined[name]
    elif key in predefined:
        entry = predefined[key]
    else:
        return
    field['description'] = entry['text']
    if entry['class']:
        field['class'] = entry['class']


def schema2fieldslist(schema, prefix=None, predefined=None, sample=None):
    """Converts data schema to a flat list of field descriptor dicts.

    Each descriptor has 'name' (dotted path), 'type', 'description',
    'sample' and 'class'.  Nested 'dict'/'array' schemas are expanded
    recursively, keeping the parent entry in the list as well.

    :param schema: schema dict as produced by get_schema()/merge_schemes()
    :param prefix: dotted path prefix for nested calls
    :param predefined: optional mapping of name/key -> {'text', 'class'}
    :param sample: optional sample object to pull example values from
    :return: list of field descriptor dicts
    """
    fieldslist = []
    for k in schema.keys():
        name = k if prefix is None else prefix + '.' + k
        try:
            sampledata = get_dict_value_deep(sample, name) if sample else ''
        except Exception:
            # sample lookup is best-effort only
            sampledata = ''
        if 'schema' not in schema[k]:
            if schema[k]['type'] != 'array':
                ftype = schema[k]['type']
            else:
                ftype = 'list of [%s]' % schema[k]['type']
            field = {'name': name, 'type': ftype, 'description': '', 'sample': sampledata, 'class': ''}
            _apply_predefined(field, name, k, predefined)
            if field['type'] == 'datetime':
                field['class'] = 'datetime'
            fieldslist.append(field)
        else:
            subprefix = k if prefix is None else prefix + '.' + k
            if schema[k]['type'] in ('dict', 'array'):
                if schema[k]['type'] == 'dict':
                    ftype = schema[k]['type']
                else:
                    ftype = 'list of [%s]' % schema[k]['type']
                field = {'name': name, 'type': ftype, 'description': '', 'sample': '', 'class': ''}
                _apply_predefined(field, name, k, predefined)
                fieldslist.append(field)
                # BUG FIX: the array branch used to drop predefined=...,
                # losing descriptions for fields nested in arrays
                fieldslist.extend(schema2fieldslist(schema[k]['schema'], prefix=subprefix, predefined=predefined, sample=sample))
    return fieldslist
undatum/constants.py ADDED
@@ -0,0 +1,21 @@
1
+ # -*- coding: utf8 -*-
2
+ """Constants and configuration values for the undatum package."""
3
# Date/datetime formats tried when sniffing date-typed columns.
DATE_PATTERNS = ["%d.%m.%Y", "%Y-%m-%d", "%y-%m-%d", "%Y-%m-%dT%H:%M:%S",
                 "%Y-%m-%d %H:%M:%S",
                 "%d.%m.%Y %H:%M"]
# Default share threshold (percent) — presumably the minimum share of
# matching values for a dictionary-like column; TODO confirm against callers.
DEFAULT_DICT_SHARE = 70


SUPPORTED_FILE_TYPES = ['xls', 'xlsx', 'csv', 'xml', 'json', 'jsonl', 'yaml',
                        'tsv', 'sql', 'bson', 'parquet', 'orc', 'avro']
COMPRESSED_FILE_TYPES = ['gz', 'xz', 'zip', 'lz4', '7z', 'bz2']
# File types that must be opened in binary mode.
# BUG FIX: 'irc' was a typo for 'orc' (Apache ORC — listed in
# SUPPORTED_FILE_TYPES above; 'irc' appears nowhere else).
BINARY_FILE_TYPES = ['xls', 'xlsx', 'bson', 'parquet', 'orc'] + \
    COMPRESSED_FILE_TYPES

# Fallback options applied when the caller supplies none.
DEFAULT_OPTIONS = {'encoding': 'utf8',
                   'delimiter': ',',
                   'limit': 1000
                   }

# File types and codecs readable through DuckDB.
DUCKABLE_FILE_TYPES = ['csv', 'jsonl', 'json', 'parquet']
DUCKABLE_CODECS = ['zst', 'gzip', 'raw']