undatum-1.0.17-py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- undatum/__init__.py +9 -0
- undatum/__main__.py +25 -0
- undatum/ai/__init__.py +145 -0
- undatum/ai/base.py +85 -0
- undatum/ai/config.py +184 -0
- undatum/ai/perplexity.py +79 -0
- undatum/ai/providers.py +1002 -0
- undatum/ai/schemas.py +42 -0
- undatum/cmds/__init__.py +6 -0
- undatum/cmds/analyzer.py +697 -0
- undatum/cmds/converter.py +646 -0
- undatum/cmds/ingester.py +116 -0
- undatum/cmds/query.py +68 -0
- undatum/cmds/schemer.py +328 -0
- undatum/cmds/selector.py +437 -0
- undatum/cmds/statistics.py +158 -0
- undatum/cmds/textproc.py +59 -0
- undatum/cmds/transformer.py +81 -0
- undatum/cmds/validator.py +137 -0
- undatum/common/__init__.py +6 -0
- undatum/common/functions.py +81 -0
- undatum/common/iterable.py +222 -0
- undatum/common/scheme.py +261 -0
- undatum/constants.py +21 -0
- undatum/core.py +616 -0
- undatum/formats/__init__.py +6 -0
- undatum/formats/docx.py +160 -0
- undatum/utils.py +298 -0
- undatum/validate/__init__.py +11 -0
- undatum/validate/commonrules.py +15 -0
- undatum/validate/ruscodes.py +202 -0
- undatum-1.0.17.dist-info/METADATA +610 -0
- undatum-1.0.17.dist-info/RECORD +37 -0
- undatum-1.0.17.dist-info/WHEEL +6 -0
- undatum-1.0.17.dist-info/entry_points.txt +3 -0
- undatum-1.0.17.dist-info/licenses/LICENSE +21 -0
- undatum-1.0.17.dist-info/top_level.txt +1 -0
undatum/cmds/ingester.py
ADDED
@@ -0,0 +1,116 @@
# -*- coding: utf8 -*-
"""Data ingestion module for databases."""
import duckdb
import logging
from iterable.helpers.detect import open_iterable
from tqdm import tqdm

from pymongo import MongoClient
from elasticsearch import Elasticsearch

ITERABLE_OPTIONS_KEYS = ['tagname', 'delimiter', 'encoding', 'start_line', 'page']

DUCKABLE_FILE_TYPES = ['parquet', 'csv', 'jsonl', 'json', 'jsonl.gz']
DUCKABLE_CODECS = ['gz', 'zst']


DEFAULT_BATCH_SIZE = 50000


def get_iterable_options(options):
    """Extract iterable-specific options from options dictionary."""
    out = {}
    for k in ITERABLE_OPTIONS_KEYS:
        if k in options.keys():
            out[k] = options[k]
    return out


class BasicIngester:
    """Base class for data ingestion."""
    def __init__(self):
        pass

    def ingest(self, batch):
        raise NotImplementedError


class ElasticIngester(BasicIngester):
    """Elasticsearch data ingester."""
    def __init__(self, uri: str, api_key: str, search_index: str, document_id: str = "id"):
        self.client = Elasticsearch(uri, api_key=api_key, verify_certs=False, ssl_show_warn=False,
                                    timeout=60, max_retries=10, retry_on_timeout=True)
        self._index = search_index
        self._item_id = document_id

    def ingest(self, batch):
        # Build the bulk API payload: an action line followed by the document itself
        documents = []
        for doc in batch:
            documents.append({"index": {"_index": self._index, "_id": doc[self._item_id]}})
            documents.append(doc)
        result = self.client.bulk(operations=documents, pipeline="ent-search-generic-ingestion")


class MongoIngester:
    """MongoDB data ingester."""
    def __init__(self, uri, db, table, do_drop=False):
        self.client = MongoClient(uri)
        self.db = self.client[db]
        if do_drop:
            self.db.drop_collection(table)
        self.coll = self.db[table]

    def ingest(self, batch):
        result = self.coll.insert_many(batch)


class Ingester:
    """Main data ingestion handler."""
    def __init__(self, batch_size=DEFAULT_BATCH_SIZE):
        self.batch_size = batch_size

    def ingest(self, fromfiles, uri, db, table, options={}):
        for filename in fromfiles:
            self.ingest_single(filename, uri, db, table, options=options)

    def ingest_single(self, fromfile, uri, db, table, options={}):
        """Loads a single file's contents into a schemaless database such as MongoDB."""
        dbtype = options['dbtype']
        processor = None
        totals = -1
        skip = options['skip']
        use_totals = options['totals'] if 'totals' in options.keys() else False
        do_drop = options['drop'] if 'drop' in options.keys() else False
        print(f'Ingesting {fromfile} to {uri} with db {db} table {table}')
        # When requested, pre-count records with DuckDB so tqdm can display progress
        if use_totals:
            parts = fromfile.rsplit('.', 2)
            if len(parts) == 2:
                if parts[-1].lower() in DUCKABLE_FILE_TYPES:
                    totals = duckdb.sql(f"select count(*) from '{fromfile}'").fetchone()[0]
            elif len(parts) == 3:
                if parts[-2].lower() in DUCKABLE_FILE_TYPES and parts[-1].lower() in DUCKABLE_CODECS:
                    totals = duckdb.sql(f"select count(*) from '{fromfile}'").fetchone()[0]
        if dbtype == 'mongodb':
            processor = MongoIngester(uri, db, table, do_drop=do_drop)
        elif dbtype == 'elastic':
            api_key = options['api_key']
            id_key = options['doc_id']
            processor = ElasticIngester(uri=uri, api_key=api_key, search_index=table, document_id=id_key)
        iterableargs = get_iterable_options(options)
        it_in = open_iterable(fromfile, mode='r', iterableargs=iterableargs)
        logging.info(f'Ingesting data: filename {fromfile}, uri: {uri}, db {db}, table {table}')
        n = 0
        batch = []
        for row in tqdm(it_in, total=totals):
            n += 1
            if skip is not None and skip > 0:
                if n < skip:
                    continue
            batch.append(row)
            if n % self.batch_size == 0:
                processor.ingest(batch)
                batch = []
        # Flush the remaining partial batch
        if len(batch) > 0:
            processor.ingest(batch)
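A minimal usage sketch for the ingester above, assuming it is called directly rather than through the undatum CLI; the file path, MongoDB URI, database and collection names are placeholders, and the option keys mirror the ones ingest_single reads ('dbtype', 'skip', 'totals', 'drop'):

# Hypothetical example: load a local JSONL file into MongoDB in 10k-record batches.
from undatum.cmds.ingester import Ingester

options = {
    'dbtype': 'mongodb',  # or 'elastic', which additionally requires 'api_key' and 'doc_id'
    'skip': 0,            # number of leading records to skip
    'totals': True,       # pre-count records with DuckDB so tqdm can show a total
    'drop': False,        # drop the target collection before ingesting
}

ingester = Ingester(batch_size=10000)
ingester.ingest(['data/records.jsonl'], 'mongodb://localhost:27017', 'mydb', 'records',
                options=options)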
undatum/cmds/query.py
ADDED
@@ -0,0 +1,68 @@
# -*- coding: utf8 -*-
"""Data query module using mistql."""
# import json
import logging
import sys


# from xmlr import xmliter
from ..utils import get_file_type, get_option, strip_dict_fields
from ..common.iterable import IterableData

LINEEND = '\n'.encode('utf8')

DEFAULT_CHUNK_SIZE = 50


class DataQuery:
    """Data query handler using mistql."""
    def __init__(self):
        pass

    def query(self, fromfile, options=None):
        """Use mistql to query data."""
        if options is None:
            options = {}
        from mistql import query
        f_type = get_file_type(fromfile) if options['format_in'] is None else options['format_in']
        iterable = IterableData(fromfile, options=options)
        to_file = get_option(options, 'output')

        if to_file:
            to_type = get_file_type(to_file)
            if not to_type:
                print('Output file type not supported')
                return
            if to_type == 'bson':
                out = open(to_file, 'wb')
            elif to_type == 'jsonl':
                out = open(to_file, 'wb')
            else:
                out = open(to_file, 'w', encoding='utf8')
        else:
            to_type = f_type
            out = sys.stdout
        fields = options['fields'].split(',') if options['fields'] else None
        # writer = DataWriter(out, filetype=to_type, fieldnames=fields)
        if iterable:
            n = 0
            fields = [field.split('.') for field in fields] if fields else None
            for r in iterable.iter():
                n += 1
                if fields:
                    r_selected = strip_dict_fields(r, fields, 0)
                else:
                    r_selected = r
                if options['query'] is not None:
                    # Evaluate the mistql expression per record; falsy results are skipped
                    res = query(options['query'], r_selected)
                    # print(options['filter'], r)
                    if not res:
                        continue
                else:
                    res = r_selected
                print(res)
        else:
            logging.info('File type not supported')
            return
        logging.debug('query: %d records processed', n)
        if to_file:
            out.close()
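A minimal sketch of calling DataQuery directly, under the assumption that the options dict supplies every key the query method reads ('format_in', 'output', 'fields', 'query'); the file name and the mistql expression are purely illustrative:

# Hypothetical example: print selected fields of records whose status is "active".
from undatum.cmds.query import DataQuery

options = {
    'format_in': None,               # None lets get_file_type detect the format
    'output': None,                  # None sends results to stdout
    'fields': 'id,name,status',      # comma-separated fields to keep, or None for all
    'query': 'status == "active"',   # mistql expression evaluated per record
}

DataQuery().query('data/records.jsonl', options=options)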
undatum/cmds/schemer.py
ADDED
@@ -0,0 +1,328 @@
# -*- coding: utf8 -*-
"""Schema generation and extraction module."""
import csv
import io
import json
import logging
import os
import tempfile
import zipfile
from typing import Optional

import duckdb
import orjson
import pandas as pd
import tqdm
import xxhash
import yaml
from pydantic import BaseModel
from pyzstd import ZstdFile
from qddate import DateParser

from ..common.scheme import generate_scheme_from_file
from ..utils import get_file_type, get_option
from ..ai import get_fields_info, get_description


def column_type_parse(column_type):
    """Parse column type string to extract array flag and base type."""
    is_array = (column_type[-2:] == '[]')
    if is_array:
        text = column_type[:-2]
    else:
        text = column_type
    if text[:6] == 'STRUCT':
        atype = text[:6]
    elif text[:4] == 'JSON':
        atype = 'VARCHAR'
    else:
        atype = text
    return [atype, str(is_array)]


def get_schema_key(fields):
    """Generate hash key for schema based on field names."""
    return xxhash.xxh64('|'.join(sorted(fields))).hexdigest()


def duckdb_decompose(filename: str = None, frame: pd.DataFrame = None,
                     filetype: str = None, path: str = "*",
                     limit: int = 10000000, recursive: bool = True,
                     root: str = "", ignore_errors: bool = True):
    """Decomposes file or data frame structure."""
    text_ignore = ', ignore_errors=true' if ignore_errors else ''
    if filetype in ['csv', 'tsv']:
        read_func = f"read_csv('{filename}'{text_ignore}, sample_size={limit})"
    elif filetype in ['json', 'jsonl']:
        read_func = f"read_json('{filename}'{text_ignore})"
    else:
        read_func = f"'{filename}'"
    if path == '*':
        if filename is not None:
            query_str = f"describe select {path} from {read_func} limit {limit}"
            data = duckdb.sql(query_str).fetchall()
        else:
            query_str = f"describe select {path} from frame limit {limit}"
            data = duckdb.sql(query_str).fetchall()
    else:
        # Nested paths are described by unnesting each level in a chain of subqueries
        path_parts = path.split('.')
        query = None
        if len(path_parts) == 1:
            if filename is not None:
                query = (f"describe select unnest(\"{path}\", recursive:=true) "
                         f"from {read_func} limit {limit}")
            else:
                query = (f"describe select unnest(\"{path}\", recursive:=true) "
                         f"from frame limit {limit}")
        elif len(path_parts) == 2:
            if filename is not None:
                query = (f"describe select unnest(\"{path_parts[1]}\", "
                         f"recursive:=true) from (select unnest(\"{path_parts[0]}\", "
                         f"recursive:=true) from {read_func} limit {limit})")
            else:
                query = (f"describe select unnest(\"{path_parts[1]}\", "
                         f"recursive:=true) from (select unnest(\"{path_parts[0]}\", "
                         f"recursive:=true) from frame limit {limit})")
        elif len(path_parts) == 3:
            if filename is not None:
                query = (f"describe select unnest(\"{path_parts[2]}\", "
                         f"recursive:=true) from (select unnest(\"{path_parts[1]}\", "
                         f"recursive:=true) from (select unnest(\"{path_parts[0]}\", "
                         f"recursive:=true) from {read_func} limit {limit}))")
            else:
                query = (f"describe select unnest(\"{path_parts[2]}\", "
                         f"recursive:=true) from (select unnest(\"{path_parts[1]}\", "
                         f"recursive:=true) from (select unnest(\"{path_parts[0]}\", "
                         f"recursive:=true) from frame limit {limit}))")
        elif len(path_parts) == 4:
            if filename is not None:
                query = (f"describe select unnest(\"{path_parts[2]}.{path_parts[3]}\", "
                         f"recursive:=true) from (select unnest(\"{path_parts[1]}\", "
                         f"recursive:=true) from (select unnest(\"{path_parts[0]}\", "
                         f"recursive:=true) from {read_func} limit {limit}))")
            else:
                query = (f"describe select unnest(\"{path_parts[2]}.{path_parts[3]}\", "
                         f"recursive:=true) from (select unnest(\"{path_parts[1]}\", "
                         f"recursive:=true) from (select unnest(\"{path_parts[0]}\", "
                         f"recursive:=true) from frame limit {limit}))")
        data = duckdb.sql(query).fetchall()
    table = []
    for row in data:
        item = [row[0] if len(root) == 0 else root + '.' + row[0]]
        item.extend(column_type_parse(row[1]))
        table.append(item)
        # Recurse into STRUCT columns to flatten nested structure descriptions
        if recursive and item[1] == 'STRUCT':
            sub_path = row[0] if len(root) == 0 else item[0]
            subtable = duckdb_decompose(filename, frame, filetype=filetype,
                                        path=sub_path, limit=limit,
                                        recursive=recursive, root=item[0],
                                        ignore_errors=ignore_errors)
            for subitem in subtable:
                table.append(subitem)
    return table


class FieldSchema(BaseModel):
    """Schema definition for a data field."""
    name: str
    ftype: str
    is_array: bool = False
    description: Optional[str] = None
    sem_type: Optional[str] = None
    sem_url: Optional[str] = None


class TableSchema(BaseModel):
    """Table schema definition."""
    key: Optional[str] = None
    num_cols: int = -1
    num_records: int = -1
    is_flat: bool = True
    id: Optional[str] = None
    fields: Optional[list[FieldSchema]] = []
    description: Optional[str] = None
    files: Optional[list[str]] = []


MAX_SAMPLE_SIZE = 200
DELIMITED_FILES = ['csv', 'tsv']
DUCKABLE_FILE_TYPES = ['csv', 'jsonl', 'json', 'parquet']
DUCKABLE_CODECS = ['zst', 'gzip', 'raw']


def table_from_objects(objects: list, id: str, objects_limit: int, use_pandas: bool = False,
                       filetype='csv', autodoc: bool = False, lang: str = 'English'):
    """Reconstructs table schema from a list of objects."""
    table = TableSchema(id=id)
    table.num_records = len(objects)
    if autodoc:
        f = io.StringIO()
        writer = csv.writer(f)
        writer.writerows(objects[:MAX_SAMPLE_SIZE])
        table.description = get_description(f.getvalue(), language=lang)
    if use_pandas:
        df = pd.DataFrame(objects)
        columns_raw = duckdb_decompose(frame=df, path='*', limit=objects_limit)
    else:
        # Spill the sample to a temporary zstd-compressed file so DuckDB can describe it
        tfile = tempfile.NamedTemporaryFile(suffix='.' + filetype, mode='w', encoding='utf8', delete=False)
        tfile.close()
        tfile_real = ZstdFile(tfile.name, mode='w', level_or_option=9)
        wrapper = io.TextIOWrapper(tfile_real, encoding='utf8', write_through=True)
        if filetype == 'csv':
            writer = csv.writer(wrapper)
            writer.writerows(objects[:objects_limit])
        elif filetype == 'jsonl':
            for row in objects[:objects_limit]:
                wrapper.write(json.dumps(row) + '\n')
        tfile_real.close()
        # Getting structure
        columns_raw = duckdb_decompose(tfile.name, filetype=filetype, path='*', limit=objects_limit)
        os.remove(tfile.name)
    is_flat = True
    table.num_cols = len(columns_raw)

    for column in columns_raw:
        field = FieldSchema(name=column[0], ftype=column[1], is_array=column[2])
        table.fields.append(field)
        if field.ftype == 'STRUCT' or field.is_array:
            is_flat = False
    table.is_flat = is_flat
    table.num_records = len(objects)
    return table


def build_schema(filename: str, objects_limit: int = 100000):
    """Build schema from file by analyzing a sample of objects."""
    fileext = filename.rsplit('.', 1)[-1].lower()
    filetype = fileext
    # Getting total count
    table = TableSchema(id=os.path.basename(filename))
    # Getting structure
    columns_raw = duckdb_decompose(filename, filetype=filetype, path='*', limit=objects_limit)
    is_flat = True
    table.num_cols = len(columns_raw)
    fieldsnames = []
    for column in columns_raw:
        field = FieldSchema(name=column[0], ftype=column[1], is_array=column[2])
        fieldsnames.append(column[0])
        table.fields.append(field)
        if field.ftype == 'STRUCT' or field.is_array:
            is_flat = False
    table.is_flat = is_flat
    table.key = get_schema_key(fieldsnames)
    return table


class Schemer:
    """Schema generation handler."""
    def __init__(self, nodates=True):
        if nodates:
            self.qd = None
        else:
            self.qd = DateParser(generate=True)

    def extract_schema(self, fromfile, options):
        """Extract schema from file and output as YAML."""
        table = build_schema(fromfile)
        print(yaml.dump(table.model_dump(), Dumper=yaml.Dumper))

    def extract_schema_bulk(self, fromdir, options):
        """Extracts schemas from all data files in a directory and writes schema structures."""
        filenames = os.listdir(fromdir)
        files = []
        tables = {}
        supported_exts = ['csv', 'json', 'jsonl', 'parquet', 'csv.gz',
                          'csv.zstd', 'jsonl.zstd']
        for filename in filenames:
            ext = filename.rsplit('.', 1)[-1]
            if ext in supported_exts:
                files.append(os.path.join(fromdir, filename))
        mode = options['mode']
        print(f'Found {len(files)} files. Processing mode {mode}')
        # 'distinct' keeps one schema per unique field-set key; 'perfile' writes a schema for every file
        for filename in tqdm.tqdm(files):
            table = build_schema(filename)
            fbase = os.path.basename(filename)
            table.id = table.key
            if mode == 'distinct':
                if table.key not in tables.keys():
                    tables[table.key] = table
                    tables[table.key].files.append(fbase)
                    if ('autodoc' in options.keys() and options['autodoc'] and
                            'lang' in options.keys()):
                        fields = []
                        for column in table.fields:
                            fields.append(column.name)
                        descriptions = get_fields_info(fields,
                                                       language=options['lang'])
                        for column in table.fields:
                            if column.name in descriptions.keys():
                                column.description = descriptions[column.name]
                else:
                    tables[table.key].files.append(fbase)
            elif mode == 'perfile':
                table.files.append(fbase)
                if ('autodoc' in options.keys() and options['autodoc'] and
                        'lang' in options.keys()):
                    fields = []
                    for column in table.fields:
                        fields.append(column.name)
                    descriptions = get_fields_info(fields,
                                                   language=options['lang'])
                    for column in table.fields:
                        if column.name in descriptions.keys():
                            column.description = descriptions[column.name]
                output_path = os.path.join(options['output'], fbase + '.yaml')
                with open(output_path, 'w', encoding='utf8') as f:
                    f.write(yaml.dump(table.model_dump(), Dumper=yaml.Dumper))
        if mode == 'distinct':
            print(f'Total schemas {len(tables)}, files {len(files)}')
        elif mode == 'perfile':
            print(f'Total schemas {len(files)}, files {len(files)}')
        if 'output' in options.keys():
            if mode == 'distinct':
                print('Writing schemas')
                for table in tables.values():
                    output_path = os.path.join(options['output'],
                                               table.key + '.yaml')
                    with open(output_path, 'w', encoding='utf8') as f:
                        f.write(yaml.dump(table.model_dump(),
                                          Dumper=yaml.Dumper))
                    # print(yaml.dump(table.model_dump(), Dumper=yaml.Dumper))

    def generate_scheme(self, fromfile, options):
        """Generates cerberus scheme from JSON lines or BSON file"""
        f_type = get_file_type(fromfile) if options['format_in'] is None else options['format_in']
        if f_type not in ['jsonl', 'bson', 'csv']:
            print('Only JSON lines, CSV and BSON (.jsonl, .csv, .bson) files supported now')
            return
        if options['zipfile']:
            z = zipfile.ZipFile(fromfile, mode='r')
            fnames = z.namelist()
            # ZipFile.open accepts only 'r' or 'w' and always returns a binary stream
            infile = z.open(fnames[0], 'r')
        else:
            if f_type == 'bson':
                infile = open(fromfile, 'rb')
            else:
                infile = open(fromfile, 'r', encoding=get_option(options, 'encoding'))

        logging.debug('Start identifying scheme for %s', fromfile)
        scheme = generate_scheme_from_file(fileobj=infile, filetype=f_type,
                                           delimiter=options['delimiter'],
                                           encoding=options['encoding'])
        if options['output']:
            with open(options['output'], 'w', encoding='utf8') as f:
                f.write(orjson.dumps(scheme,
                                     option=orjson.OPT_INDENT_2).decode('utf8'))
            if not options['zipfile']:
                infile.close()
            if options['zipfile']:
                z.close()
        else:
            print(str(orjson.dumps(scheme, option=orjson.OPT_INDENT_2).decode('utf8')))
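A minimal sketch of the two Schemer entry points above; the paths are placeholders and the options dict only sets the keys extract_schema_bulk actually reads ('mode', 'output', 'autodoc', 'lang'):

# Hypothetical example: print one schema, then write one YAML per distinct structure.
from undatum.cmds.schemer import Schemer

schemer = Schemer(nodates=True)

# Single file: dumps the TableSchema (fields, types, flatness, schema key) as YAML.
schemer.extract_schema('data/records.parquet', options={})

# Bulk mode: group files by schema key and write <key>.yaml files into 'schemas/'.
options = {
    'mode': 'distinct',   # 'perfile' would instead write one schema per input file
    'output': 'schemas',  # directory receiving the YAML schema files
    'autodoc': False,     # True would call the AI helpers to describe fields
    'lang': 'English',
}
schemer.extract_schema_bulk('data/', options=options)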