undatum 1.0.17__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,116 @@
+ # -*- coding: utf8 -*-
+ """Data ingestion module for databases."""
+ import logging
+
+ import duckdb
+ from iterable.helpers.detect import open_iterable
+ from tqdm import tqdm
+
+ from pymongo import MongoClient
+ from elasticsearch import Elasticsearch
+
+ ITERABLE_OPTIONS_KEYS = ['tagname', 'delimiter', 'encoding', 'start_line', 'page']
+
+ DUCKABLE_FILE_TYPES = ['parquet', 'csv', 'jsonl', 'json', 'jsonl.gz']
+ DUCKABLE_CODECS = ['gz', 'zst']
+
+
+ DEFAULT_BATCH_SIZE = 50000
+
+ def get_iterable_options(options):
+     """Extract iterable-specific options from the options dictionary."""
+     out = {}
+     for k in ITERABLE_OPTIONS_KEYS:
+         if k in options:
+             out[k] = options[k]
+     return out
+
+
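
A quick illustration of how get_iterable_options() narrows a larger options dict down to the keys open_iterable understands; the option values here are invented:

    options = {'delimiter': ';', 'encoding': 'cp1251', 'dbtype': 'mongodb', 'skip': 0}
    print(get_iterable_options(options))
    # {'delimiter': ';', 'encoding': 'cp1251'}
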
+ class BasicIngester:
+     """Base class for data ingestion."""
+     def __init__(self):
+         pass
+
+     def ingest(self, batch):
+         raise NotImplementedError
+
+ class ElasticIngester(BasicIngester):
+     """Elasticsearch data ingester."""
+     def __init__(self, uri: str, api_key: str, search_index: str, document_id: str = "id"):
+         self.client = Elasticsearch(uri, api_key=api_key, verify_certs=False,
+                                     ssl_show_warn=False, timeout=60,
+                                     max_retries=10, retry_on_timeout=True)
+         self._index = search_index
+         self._item_id = document_id
+
+     def ingest(self, batch):
+         documents = []
+         for doc in batch:
+             documents.append({"index": {"_index": self._index, "_id": doc[self._item_id]}})
+             documents.append(doc)
+         result = self.client.bulk(operations=documents, pipeline="ent-search-generic-ingestion")
+
+
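
For reference, a small sketch (not from the package) of the flat action/document list that ElasticIngester.ingest() hands to client.bulk(); the index name and records are made up:

    batch = [{"id": "1", "name": "alpha"}, {"id": "2", "name": "beta"}]
    operations = []
    for doc in batch:
        # one action line naming the target index and document id, then the document itself
        operations.append({"index": {"_index": "people", "_id": doc["id"]}})
        operations.append(doc)
    # ElasticIngester then sends this list in a single call: client.bulk(operations=operations, ...)
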
+ class MongoIngester:
+     """MongoDB data ingester."""
+     def __init__(self, uri, db, table, do_drop=False):
+         self.client = MongoClient(uri)
+         self.db = self.client[db]
+         if do_drop:
+             self.db.drop_collection(table)
+         self.coll = self.db[table]
+
+     def ingest(self, batch):
+         result = self.coll.insert_many(batch)
+
+
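
A minimal usage sketch, assuming a MongoDB instance on localhost; database, collection and records are illustrative:

    ingester = MongoIngester('mongodb://localhost:27017', 'testdb', 'people', do_drop=True)
    ingester.ingest([{'id': 1, 'name': 'alpha'}, {'id': 2, 'name': 'beta'}])
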
+ class Ingester:
+     """Main data ingestion handler."""
+     def __init__(self, batch_size=DEFAULT_BATCH_SIZE):
+         self.batch_size = batch_size
+
+     def ingest(self, fromfiles, uri, db, table, options=None):
+         """Loads a list of files into the target database."""
+         for filename in fromfiles:
+             self.ingest_single(filename, uri, db, table, options=options)
+
+     def ingest_single(self, fromfile, uri, db, table, options=None):
+         """Loads a single file's contents into a schemaless database like MongoDB."""
+         if options is None:
+             options = {}
+         dbtype = options['dbtype']
+         processor = None
+         totals = None
+         skip = options['skip']
+         use_totals = options['totals'] if 'totals' in options.keys() else False
+         do_drop = options['drop'] if 'drop' in options.keys() else False
+         print(f'Ingesting {fromfile} to {uri} with db {db} table {table}')
+         if use_totals:
+             parts = fromfile.rsplit('.', 2)
+             if len(parts) == 2:
+                 if parts[-1].lower() in DUCKABLE_FILE_TYPES:
+                     totals = duckdb.sql(f"select count(*) from '{fromfile}'").fetchone()[0]
+             elif len(parts) == 3:
+                 if parts[-2].lower() in DUCKABLE_FILE_TYPES and parts[-1].lower() in DUCKABLE_CODECS:
+                     totals = duckdb.sql(f"select count(*) from '{fromfile}'").fetchone()[0]
+         if dbtype == 'mongodb':
+             processor = MongoIngester(uri, db, table, do_drop=do_drop)
+         elif dbtype == 'elastic':
+             api_key = options['api_key']
+             id_key = options['doc_id']
+             processor = ElasticIngester(uri=uri, api_key=api_key, search_index=table,
+                                         document_id=id_key)
+         if processor is None:
+             raise ValueError(f'Unsupported dbtype: {dbtype}')
+         iterableargs = get_iterable_options(options)
+         it_in = open_iterable(fromfile, mode='r', iterableargs=iterableargs)
+         logging.info(f'Ingesting data: filename {fromfile}, uri: {uri}, db {db}, table {table}')
+         n = 0
+         batch = []
+         for row in tqdm(it_in, total=totals):
+             n += 1
+             if skip is not None and skip > 0 and n <= skip:
+                 continue
+             batch.append(row)
+             if n % self.batch_size == 0:
+                 processor.ingest(batch)
+                 batch = []
+         if len(batch) > 0:
+             processor.ingest(batch)
+
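
A usage sketch of the options dict that ingest_single() reads ('dbtype' and 'skip' are required by the code above, the rest are optional); the file path and MongoDB URI are illustrative:

    ingester = Ingester(batch_size=10000)
    options = {
        'dbtype': 'mongodb',   # or 'elastic', which additionally reads 'api_key' and 'doc_id'
        'skip': 0,             # number of leading records to skip
        'totals': True,        # pre-count records with DuckDB for the tqdm progress bar
        'drop': False,         # drop the target collection before loading
    }
    ingester.ingest_single('data.jsonl.gz', 'mongodb://localhost:27017', 'testdb', 'people',
                           options=options)
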
undatum/cmds/query.py ADDED
@@ -0,0 +1,68 @@
+ # -*- coding: utf8 -*-
+ """Data query module using mistql."""
+ # import json
+ import logging
+ import sys
+
+ # from xmlr import xmliter
+ from ..utils import get_file_type, get_option, strip_dict_fields
+ from ..common.iterable import IterableData
+
+ LINEEND = '\n'.encode('utf8')
+
+ DEFAULT_CHUNK_SIZE = 50
+
+
+ class DataQuery:
+     """Data query handler using mistql."""
+     def __init__(self):
+         pass
+
+
+     def query(self, fromfile, options=None):
+         """Use mistql to query data."""
+         if options is None:
+             options = {}
+         from mistql import query
+         f_type = get_file_type(fromfile) if options.get('format_in') is None else options['format_in']
+         iterable = IterableData(fromfile, options=options)
+         to_file = get_option(options, 'output')
+
+         if to_file:
+             to_type = get_file_type(to_file)
+             if not to_type:
+                 print('Output file type not supported')
+                 return
+             if to_type == 'bson':
+                 out = open(to_file, 'wb')
+             elif to_type == 'jsonl':
+                 out = open(to_file, 'wb')
+             else:
+                 out = open(to_file, 'w', encoding='utf8')
+         else:
+             to_type = f_type
+             out = sys.stdout
+         fields = options['fields'].split(',') if options.get('fields') else None
+         # writer = DataWriter(out, filetype=to_type, fieldnames=fields)
+         if iterable:
+             n = 0
+             fields = [field.split('.') for field in fields] if fields else None
+             for r in iterable.iter():
+                 n += 1
+                 if fields:
+                     r_selected = strip_dict_fields(r, fields, 0)
+                 else:
+                     r_selected = r
+                 if options.get('query') is not None:
+                     res = query(options['query'], r_selected)
+                     # print(options['filter'], r)
+                     if not res:
+                         continue
+                 else:
+                     res = r_selected
+                 print(res)
+         else:
+             logging.info('File type not supported')
+             return
+         logging.debug('query: %d records processed', n)
+         if to_file:
+             out.close()
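
To make the per-record behaviour concrete, a small mistql sketch with an invented record: a falsy query result drops the record, anything else is what gets printed.

    from mistql import query

    record = {'name': 'Anna', 'age': 30}
    print(query('age > 18', record))   # True -> the record is kept
    print(query('name', record))       # Anna -> a projecting query prints the selected value
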
@@ -0,0 +1,328 @@
+ # -*- coding: utf8 -*-
+ """Schema generation and extraction module."""
+ import csv
+ import io
+ import json
+ import logging
+ import os
+ import tempfile
+ import zipfile
+ from typing import Optional
+
+ import duckdb
+ import orjson
+ import pandas as pd
+ import tqdm
+ import xxhash
+ import yaml
+ from pydantic import BaseModel
+ from pyzstd import ZstdFile
+ from qddate import DateParser
+
+ from ..common.scheme import generate_scheme_from_file
+ from ..utils import get_file_type, get_option
+ from ..ai import get_fields_info, get_description
+
+
+ def column_type_parse(column_type):
29
+ """Parse column type string to extract array flag and base type."""
30
+ is_array = (column_type[-2:] == '[]')
31
+ if is_array:
32
+ text = column_type[:-2]
33
+ else:
34
+ text = column_type
35
+ if text[:6] == 'STRUCT':
36
+ atype = text[:6]
37
+ elif text[:4] == 'JSON':
38
+ atype = 'VARCHAR'
39
+ else:
40
+ atype = text
41
+ return [atype, str(is_array)]
42
+
43
+
44
+ def get_schema_key(fields):
45
+ """Generate hash key for schema based on field names."""
46
+ return xxhash.xxh64('|'.join(sorted(fields))).hexdigest()
47
+
48
+
49
+ def duckdb_decompose(filename: str = None, frame: pd.DataFrame = None,
50
+ filetype: str = None, path: str = "*",
51
+ limit: int = 10000000, recursive: bool = True,
52
+ root: str = "", ignore_errors: bool = True):
53
+ """Decomposes file or data frame structure."""
54
+ text_ignore = ', ignore_errors=true' if ignore_errors else ''
55
+ if filetype in ['csv', 'tsv']:
56
+ read_func = f"read_csv('{filename}'{text_ignore}, sample_size={limit})"
57
+ elif filetype in ['json', 'jsonl']:
58
+ read_func = f"read_json('{filename}'{text_ignore})"
59
+ else:
60
+ read_func = f"'{filename}'"
61
+ if path == '*':
62
+ if filename is not None:
63
+ query_str = f"describe select {path} from {read_func} limit {limit}"
64
+ data = duckdb.sql(query_str).fetchall()
65
+ else:
66
+ query_str = f"describe select {path} from frame limit {limit}"
67
+ data = duckdb.sql(query_str).fetchall()
68
+ else:
69
+ path_parts = path.split('.')
70
+ query = None
71
+ if len(path_parts) == 1:
72
+ if filename is not None:
73
+ query = (f"describe select unnest(\"{path}\", recursive:=true) "
74
+ f"from {read_func} limit {limit}")
75
+ else:
76
+ query = (f"describe select unnest(\"{path}\", recursive:=true) "
77
+ f"from frame limit {limit}")
78
+ elif len(path_parts) == 2:
79
+ if filename is not None:
80
+ query = (f"describe select unnest(\"{path_parts[1]}\", "
81
+ f"recursive:=true) from (select unnest(\"{path_parts[0]}\", "
82
+ f"recursive:=true) from {read_func} limit {limit})")
83
+ else:
84
+ query = (f"describe select unnest(\"{path_parts[1]}\", "
85
+ f"recursive:=true) from (select unnest(\"{path_parts[0]}\", "
86
+ f"recursive:=true) from frame limit {limit})")
87
+ elif len(path_parts) == 3:
88
+ if filename is not None:
89
+ query = (f"describe select unnest(\"{path_parts[2]}\", "
90
+ f"recursive:=true) from (select unnest(\"{path_parts[1]}\", "
91
+ f"recursive:=true) from (select unnest(\"{path_parts[0]}\", "
92
+ f"recursive:=true) from {read_func} limit {limit}))")
93
+ else:
94
+ query = (f"describe select unnest(\"{path_parts[2]}\", "
95
+ f"recursive:=true) from (select unnest(\"{path_parts[1]}\", "
96
+ f"recursive:=true) from (select unnest(\"{path_parts[0]}\", "
97
+ f"recursive:=true) from frame limit {limit}))")
98
+ elif len(path_parts) == 4:
99
+ if filename is not None:
100
+ query = (f"describe select unnest(\"{path_parts[2]}.{path_parts[3]}\", "
101
+ f"recursive:=true) from (select unnest(\"{path_parts[1]}\", "
102
+ f"recursive:=true) from (select unnest(\"{path_parts[0]}\", "
103
+ f"recursive:=true) from {read_func} limit {limit}))")
104
+ else:
105
+ query = (f"describe select unnest(\"{path_parts[2]}.{path_parts[3]}\", "
106
+ f"recursive:=true) from (select unnest(\"{path_parts[1]}\", "
107
+ f"recursive:=true) from (select unnest(\"{path_parts[0]}\", "
108
+ f"recursive:=true) from frame limit {limit}))")
109
+ data = duckdb.sql(query).fetchall()
110
+ table = []
111
+ for row in data:
112
+ item = [row[0] if len(root) == 0 else root + '.' + row[0]]
113
+ item.extend(column_type_parse(row[1]))
114
+ table.append(item)
115
+ if recursive and item[1] == 'STRUCT':
116
+ sub_path = row[0] if len(root) == 0 else item[0]
117
+ subtable = duckdb_decompose(filename, frame, filetype=filetype,
118
+ path=sub_path, limit=limit,
119
+ recursive=recursive, root=item[0],
120
+ ignore_errors=ignore_errors)
121
+ for subitem in subtable:
122
+ table.append(subitem)
123
+ return table
124
+
125
+
126
+ class FieldSchema(BaseModel):
127
+ """Schema definition for a data field."""
128
+ name: str
129
+ ftype: str
130
+ is_array: bool = False
131
+ description: Optional[str] = None
132
+ sem_type: str = None
133
+ sem_url: str = None
134
+
135
+
136
+ class TableSchema(BaseModel):
137
+ """Table schema definition."""
138
+ key: Optional[str] = None
139
+ num_cols: int = -1
140
+ is_flat: bool = True
141
+ id: Optional[str] = None
142
+ fields: Optional[list[FieldSchema]] = []
143
+ description: Optional[str] = None
144
+ files: Optional[list[str]] = []
145
+
146
+
147
+ MAX_SAMPLE_SIZE = 200
148
+ DELIMITED_FILES = ['csv', 'tsv']
149
+ DUCKABLE_FILE_TYPES = ['csv', 'jsonl', 'json', 'parquet']
150
+ DUCKABLE_CODECS = ['zst', 'gzip', 'raw']
151
+
152
+
153
+ def table_from_objects(objects:list, id:str, objects_limit:int, use_pandas:bool=False, filetype='csv', autodoc:bool=False, lang:str='English'):
154
+ """Reconstructs table schema from list of objects"""
155
+ table = TableSchema(id=id)
156
+ table.num_records = len(objects)
157
+ if autodoc:
158
+ f = io.StringIO()
159
+ writer = csv.writer(f)
160
+ writer.writerows(objects[:MAX_SAMPLE_SIZE])
161
+ table.description = get_description(f.getvalue(), language=lang)
162
+ if use_pandas:
163
+ df = pd.DataFrame(objects)
164
+ columns_raw = duckdb_decompose(frame=df, path='*', limit=objects_limit)
165
+ else:
166
+ tfile = tempfile.NamedTemporaryFile(suffix='.' + filetype, mode='w', encoding='utf8', delete=False)
167
+ tfile.close()
168
+ tfile_real = ZstdFile(tfile.name, mode='w', level_or_option=9)
169
+ wrapper = io.TextIOWrapper(tfile_real, encoding='utf8', write_through=True)
170
+ if filetype == 'csv':
171
+ writer = csv.writer(wrapper)
172
+ writer.writerows(objects[:objects_limit])
173
+ elif filetype == 'jsonl':
174
+ for row in objects[:objects_limit]:
175
+ wrapper.write(json.dumps(row) + '\n')
176
+ tfile_real.close()
177
+ # Getting structure
178
+ columns_raw = duckdb_decompose(tfile.name, filetype=filetype, path='*', limit=objects_limit)
179
+ os.remove(tfile.name)
180
+ is_flat = True
181
+ table.num_cols = len(columns_raw)
182
+
183
+ for column in columns_raw:
184
+ field = FieldSchema(name=column[0], ftype=column[1], is_array=column[2])
185
+ table.fields.append(field)
186
+ if field.ftype == 'STRUCT' or field.is_array:
187
+ is_flat = False
188
+ table.is_flat = is_flat
189
+ table.num_records = len(objects)
190
+ return table
191
+
192
+
193
+ def build_schema(filename:str, objects_limit:int=100000):
194
+ """Build schema from file by analyzing sample of objects."""
195
+ fileext = filename.rsplit('.', 1)[-1].lower()
196
+ filetype = fileext
197
+ # Getting total count
198
+ table = TableSchema(id=os.path.basename(filename))
199
+ # Getting structure
200
+ columns_raw = duckdb_decompose(filename, filetype=filetype, path='*', limit=objects_limit)
201
+ is_flat = True
202
+ table.num_cols = len(columns_raw)
203
+ fieldsnames = []
204
+ for column in columns_raw:
205
+ field = FieldSchema(name=column[0], ftype=column[1], is_array=column[2])
206
+ fieldsnames.append(column[0])
207
+ table.fields.append(field)
208
+ if field.ftype == 'STRUCT' or field.is_array:
209
+ is_flat = False
210
+ table.is_flat = is_flat
211
+ table.key = get_schema_key(fieldsnames)
212
+ return table
213
+
214
+
215
+
216
+ class Schemer:
217
+ """Schema generation handler."""
218
+ def __init__(self, nodates=True):
219
+ if nodates:
220
+ self.qd = None
221
+ else:
222
+ self.qd = DateParser(generate=True)
223
+ pass
224
+
225
+ def extract_schema(self, fromfile, options):
226
+ """Extract schema from file and output as YAML."""
227
+ table = build_schema(fromfile)
228
+ print(yaml.dump(table.model_dump(), Dumper=yaml.Dumper))
229
+
230
+
231
+ def extract_schema_bulk(self, fromdir, options):
232
+ """Extracts schemes from all data files and writes schema structures"""
233
+ filenames = os.listdir(fromdir)
234
+ files = []
235
+ tables = {}
236
+ supported_exts = ['csv', 'json', 'jsonl', 'parquet', 'csv.gz',
237
+ 'csv.zstd', 'jsonl.zstd']
238
+ for filename in filenames:
239
+ ext = filename.rsplit('.', 1)[-1]
240
+ if ext in supported_exts:
241
+ files.append(os.path.join(fromdir, filename))
242
+ mode = options['mode']
243
+ print(f'Found {len(files)} files. Processing mode {mode}')
244
+ for filename in tqdm.tqdm(files):
245
+ table = build_schema(filename)
246
+ fbase = os.path.basename(filename)
247
+ table.id = table.key
248
+ if mode == 'distinct':
249
+ if table.key not in tables.keys():
250
+ tables[table.key] = table
251
+ tables[table.key].files.append(fbase)
252
+ if ('autodoc' in options.keys() and options['autodoc'] and
253
+ 'lang' in options.keys()):
254
+ fields = []
255
+ for column in table.fields:
256
+ fields.append(column.name)
257
+ descriptions = get_fields_info(fields,
258
+ language=options['lang'])
259
+ for column in table.fields:
260
+ if column.name in descriptions.keys():
261
+ column.description = descriptions[column.name]
262
+ else:
263
+ tables[table.key].files.append(fbase)
264
+ elif mode == 'perfile':
265
+ table.files.append(fbase)
266
+ if ('autodoc' in options.keys() and options['autodoc'] and
267
+ 'lang' in options.keys()):
268
+ fields = []
269
+ for column in table.fields:
270
+ fields.append(column.name)
271
+ descriptions = get_fields_info(fields,
272
+ language=options['lang'])
273
+ for column in table.fields:
274
+ if column.name in descriptions.keys():
275
+ column.description = descriptions[column.name]
276
+ output_path = os.path.join(options['output'], fbase + '.yaml')
277
+ with open(output_path, 'w', encoding='utf8') as f:
278
+ f.write(yaml.dump(table.model_dump(), Dumper=yaml.Dumper))
279
+ if mode == 'distinct':
280
+ print(f'Total schemas {len(tables)}, files {len(files)}')
281
+ elif mode == 'perfile':
282
+ print(f'Total schemas {len(files)}, files {len(files)}')
283
+ if 'output' in options.keys():
284
+ if mode == 'distinct':
285
+ print('Writing schemas')
286
+ for table in tables.values():
287
+ output_path = os.path.join(options['output'],
288
+ table.key + '.yaml')
289
+ with open(output_path, 'w', encoding='utf8') as f:
290
+ f.write(yaml.dump(table.model_dump(),
291
+ Dumper=yaml.Dumper))
292
+ # print(yaml.dump(table.model_dump(), Dumper=yaml.Dumper))
293
+
294
+
295
+ def generate_scheme(self, fromfile, options):
296
+ """Generates cerberus scheme from JSON lines or BSON file"""
297
+ f_type = get_file_type(fromfile) if options['format_in'] is None else options['format_in']
298
+ if f_type not in ['jsonl', 'bson', 'csv']:
299
+ print('Only JSON lines, CSV and BSON (.jsonl, .csv, .bson) files supported now')
300
+ return
301
+ if options['zipfile']:
302
+ z = zipfile.ZipFile(fromfile, mode='r')
303
+ fnames = z.namelist()
304
+ fnames[0]
305
+ if f_type == 'bson':
306
+ infile = z.open(fnames[0], 'rb')
307
+ else:
308
+ infile = z.open(fnames[0], 'r')
309
+ else:
310
+ if f_type == 'bson':
311
+ infile = open(fromfile, 'rb')
312
+ else:
313
+ infile = open(fromfile, 'r', encoding=get_option(options, 'encoding'))
314
+
315
+ logging.debug('Start identifying scheme for %s', fromfile)
316
+ scheme = generate_scheme_from_file(fileobj=infile, filetype=f_type,
317
+ delimiter=options['delimiter'],
318
+ encoding=options['encoding'])
319
+ if options['output']:
320
+ with open(options['output'], 'w', encoding='utf8') as f:
321
+ f.write(orjson.dumps(scheme,
322
+ option=orjson.OPT_INDENT_2).decode('utf8'))
323
+ if not options['zipfile']:
324
+ infile.close()
325
+ if options['zipfile']:
326
+ z.close()
327
+ else:
328
+ print(str(orjson.dumps(scheme, option=orjson.OPT_INDENT_2).decode('utf8')))
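
And a final sketch for generate_scheme(); the option keys mirror what the method reads, the values and file path are illustrative:

    schemer = Schemer()
    options = {'format_in': None, 'zipfile': False, 'encoding': 'utf8',
               'delimiter': ',', 'output': 'scheme.json'}
    schemer.generate_scheme('data.jsonl', options)
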