undatum-1.0.17-py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- undatum/__init__.py +9 -0
- undatum/__main__.py +25 -0
- undatum/ai/__init__.py +145 -0
- undatum/ai/base.py +85 -0
- undatum/ai/config.py +184 -0
- undatum/ai/perplexity.py +79 -0
- undatum/ai/providers.py +1002 -0
- undatum/ai/schemas.py +42 -0
- undatum/cmds/__init__.py +6 -0
- undatum/cmds/analyzer.py +697 -0
- undatum/cmds/converter.py +646 -0
- undatum/cmds/ingester.py +116 -0
- undatum/cmds/query.py +68 -0
- undatum/cmds/schemer.py +328 -0
- undatum/cmds/selector.py +437 -0
- undatum/cmds/statistics.py +158 -0
- undatum/cmds/textproc.py +59 -0
- undatum/cmds/transformer.py +81 -0
- undatum/cmds/validator.py +137 -0
- undatum/common/__init__.py +6 -0
- undatum/common/functions.py +81 -0
- undatum/common/iterable.py +222 -0
- undatum/common/scheme.py +261 -0
- undatum/constants.py +21 -0
- undatum/core.py +616 -0
- undatum/formats/__init__.py +6 -0
- undatum/formats/docx.py +160 -0
- undatum/utils.py +298 -0
- undatum/validate/__init__.py +11 -0
- undatum/validate/commonrules.py +15 -0
- undatum/validate/ruscodes.py +202 -0
- undatum-1.0.17.dist-info/METADATA +610 -0
- undatum-1.0.17.dist-info/RECORD +37 -0
- undatum-1.0.17.dist-info/WHEEL +6 -0
- undatum-1.0.17.dist-info/entry_points.txt +3 -0
- undatum-1.0.17.dist-info/licenses/LICENSE +21 -0
- undatum-1.0.17.dist-info/top_level.txt +1 -0
undatum/cmds/analyzer.py
ADDED
@@ -0,0 +1,697 @@
# -*- coding: utf8 -*-
"""Data analysis and insights module.

This module provides data analysis capabilities including schema detection,
field type inference, and AI-powered documentation generation.

Note: Some functions have been optimized for performance (e.g., using sets
for key tracking), but further optimizations may be possible for very large datasets.
"""
import csv
import io
import json
import os
import sys
import tempfile
from collections import OrderedDict
from typing import Optional

import duckdb
import pandas as pd
import xlrd
import xmltodict
import yaml
from iterable.helpers.detect import (detect_encoding_any, detect_file_type,
                                     TEXT_DATA_TYPES)
from openpyxl import load_workbook
from pydantic import BaseModel
from pyzstd import ZstdFile

from ..ai import get_fields_info, get_description, get_ai_service, AIService
from ..formats.docx import analyze_docx
from ..utils import get_dict_value

OBJECTS_ANALYZE_LIMIT = 10000


DUCKDB_TYPES = ['VARCHAR', 'DATE', 'JSON', 'BIGINT', 'DOUBLE', 'BOOLEAN']

def column_type_parse(column_type):
    """Parse column type string to extract array flag and base type."""
    is_array = (column_type[-2:] == '[]')
    if is_array:
        text = column_type[:-2]
    else:
        text = column_type
    if text[:6] == 'STRUCT':
        atype = text[:6]
    elif text[:4] == 'JSON':
        atype = 'VARCHAR'
    else:
        atype = text
    return [atype, str(is_array)]

def duckdb_decompose(filename: str = None, frame: pd.DataFrame = None,
                     filetype: str = None, path: str = "*", limit: int = 10000000,
                     recursive: bool = True, root: str = "", ignore_errors: bool = True):
    """Decompose file or DataFrame structure using DuckDB.

    This function uses DuckDB's summarize and unnest functions to extract
    schema information from nested data structures. It handles up to 4 levels
    of nesting by constructing recursive SQL queries.

    The function builds SQL queries dynamically based on the nesting depth:
    - Level 1: Direct field access
    - Level 2-4: Nested unnest operations
    - Recursive: Processes STRUCT types by calling itself recursively

    Args:
        filename: Path to input file. If None, frame must be provided.
        frame: Pandas DataFrame. Used when filename is None.
        filetype: File type ('csv', 'tsv', 'json', 'jsonl'). Determines read function.
        path: Path expression for nested fields (default: '*' for all fields).
        limit: Maximum records to process (default: 10000000).
        recursive: Whether to recursively process STRUCT types (default: True).
        root: Root path prefix for nested queries (used internally for recursion).
        ignore_errors: Whether to ignore parsing errors in DuckDB (default: True).

    Returns:
        List of lists containing field information:
        [field_path, base_type, is_array, unique_count, total_count, uniqueness_percentage]

    Raises:
        ValueError: If both filename and frame are None.
        duckdb.Error: If DuckDB query fails.

    Example:
        >>> result = duckdb_decompose('data.jsonl', filetype='json')
        >>> print(result[0]) # ['field1', 'VARCHAR', 'False', '100', '1000', '10.00']
    """
    text_ignore = ', ignore_errors=true' if ignore_errors else ''
    if filetype in ['csv', 'tsv']:
        read_func = f"read_csv('{filename}'{text_ignore})"
    elif filetype in ['json', 'jsonl']:
        read_func = f"read_json('{filename}'{text_ignore})"
    else:
        read_func = f"'{filename}'"
    if path == '*':
        if filename is not None:
            query_str = f"summarize select {path} from {read_func} limit {limit}"
            data = duckdb.sql(query_str).fetchall()
        else:
            query_str = f"summarize select {path} from frame limit {limit}"
            data = duckdb.sql(query_str).fetchall()
    else:
        path_parts = path.split('.')
        query = None
        if len(path_parts) == 1:
            if filename is not None:
                query = (f"summarize select unnest(\"{path}\", recursive:=true) "
                         f"from {read_func} limit {limit}")
            else:
                query = (f"summarize select unnest(\"{path}\", recursive:=true) "
                         f"from frame limit {limit}")
        elif len(path_parts) == 2:
            if filename is not None:
                query = (f"summarize select unnest(\"{path_parts[1]}\", "
                         f"recursive:=true) from (select unnest(\"{path_parts[0]}\", "
                         f"recursive:=true) from {read_func} limit {limit})")
            else:
                query = (f"summarize select unnest(\"{path_parts[1]}\", "
                         f"recursive:=true) from (select unnest(\"{path_parts[0]}\", "
                         f"recursive:=true) from frame limit {limit})")
        elif len(path_parts) == 3:
            if filename is not None:
                query = (f"summarize select unnest(\"{path_parts[2]}\", "
                         f"recursive:=true) from (select unnest(\"{path_parts[1]}\", "
                         f"recursive:=true) from (select unnest(\"{path_parts[0]}\", "
                         f"recursive:=true) from {read_func} limit {limit}))")
            else:
                query = (f"summarize select unnest(\"{path_parts[2]}\", "
                         f"recursive:=true) from (select unnest(\"{path_parts[1]}\", "
                         f"recursive:=true) from (select unnest(\"{path_parts[0]}\", "
                         f"recursive:=true) from frame limit {limit}))")
        elif len(path_parts) == 4:
            if filename is not None:
                query = (f"summarize select unnest(\"{path_parts[2]}.{path_parts[3]}\", "
                         f"recursive:=true) from (select unnest(\"{path_parts[1]}\", "
                         f"recursive:=true) from (select unnest(\"{path_parts[0]}\", "
                         f"recursive:=true) from {read_func} limit {limit}))")
            else:
                query = (f"summarize select unnest(\"{path_parts[2]}.{path_parts[3]}\", "
                         f"recursive:=true) from (select unnest(\"{path_parts[1]}\", "
                         f"recursive:=true) from (select unnest(\"{path_parts[0]}\", "
                         f"recursive:=true) from frame limit {limit}))")
        data = duckdb.sql(query).fetchall()
    table = []
    for row in data:
        item = [row[0] if len(root) == 0 else root + '.' + row[0]]
        item.extend(column_type_parse(row[1]))
        item.append(str(row[4]))
        item.append(str(row[10]))
        uniq_share = row[4] * 100.0 / row[10] if row[10] > 0 else 0
        item.append(f'{uniq_share:0.2f}')
        table.append(item)
        if recursive and item[1] == 'STRUCT':
            sub_path = row[0] if len(root) == 0 else item[0]
            subtable = duckdb_decompose(filename, frame, filetype=filetype,
                                        path=sub_path, limit=limit,
                                        recursive=recursive, root=item[0],
                                        ignore_errors=ignore_errors)
            for subitem in subtable:
                table.append(subitem)
    return table

def _seek_dict_lists(data, level=0, path=None, candidates=None):
    """Seek list structures in dictionary recursively."""
    if candidates is None:
        candidates = OrderedDict()
    # print(level, path, candidates)
    for key, value in data.items():
        if isinstance(value, list):
            isobjectlist = False
            for listitem in value[:20]:
                if isinstance(listitem, (dict, OrderedDict)):
                    isobjectlist = True
                    break
            if not isobjectlist:
                continue
            key = f'{path}.{key}' if path is not None else key
            if key not in candidates:
                candidates[key] = {'key' : key, 'num' : len(value)}
        elif isinstance(value, (OrderedDict, dict)):
            res = _seek_dict_lists(value, level + 1, path + '.' + key if path else key, candidates)
            for k, v in res.items():
                if k not in candidates.keys():
                    candidates[k] = v
        else:
            continue
    return candidates



def _seek_xml_lists(data, level=0, path=None, candidates=None):
    """Seek list structures in XML data recursively."""
    if candidates is None:
        candidates = OrderedDict()
    for key, value in data.items():
        if isinstance(value, list):
            key = f'{path}.{key}' if path is not None else key
            if key not in candidates:
                candidates[key] = {'key' : key, 'num' : len(value)}
        elif isinstance(value, (OrderedDict, dict)):
            res = _seek_xml_lists(value, level + 1, path + '.' + key if path else key, candidates)
            for k, v in res.items():
                if k not in candidates.keys():
                    candidates[k] = v
        else:
            continue
    return candidates


def _process_json_data(data, report, fullkey, objects_limit, use_pandas,
                       autodoc, lang, ai_service: Optional[AIService] = None):
    """Process JSON data and add tables to report."""
    candidates = _seek_dict_lists(data, level=0)
    if len(candidates) == 1:
        fullkey = str(next(iter(candidates)))
        table = TableSchema(id=fullkey)
        objects = get_dict_value(data, keys=fullkey.split('.'))[0]
        table = table_from_objects(objects, table_id=fullkey,
                                   objects_limit=objects_limit,
                                   use_pandas=use_pandas,
                                   filetype='jsonl',
                                   autodoc=autodoc, lang=lang,
                                   ai_service=ai_service)
        report.tables.append(table)
        report.total_tables = len(report.tables)
        report.total_records = table.num_records
    elif len(candidates) > 1:
        total = 0
        for fullkey in candidates:
            table = TableSchema(id=fullkey)
            objects = get_dict_value(data, keys=fullkey.split('.'))[0]
            table = table_from_objects(objects, table_id=fullkey,
                                       objects_limit=objects_limit,
                                       use_pandas=use_pandas,
                                       filetype='jsonl',
                                       autodoc=autodoc, lang=lang,
                                       ai_service=ai_service)
            total += table.num_records
            report.tables.append(table)
        report.total_records = total
        report.total_tables = len(report.tables)


class FieldSchema(BaseModel):
    """Schema definition for a data field."""
    name: str
    ftype: str
    is_array:bool = False
    description: Optional[str] = None
    sem_type:str = None
    sem_url:str = None


class TableSchema(BaseModel):
    """Table schema definition."""
    num_records: int = -1
    num_cols: int = -1
    is_flat: bool = True
    id: Optional[str] = None
    fields: Optional[list[FieldSchema]] = []
    description: Optional[str] = None

class ReportSchema(BaseModel):
    """Schema of the data file analysis results."""
    filename: str
    file_size: int
    file_type: str
    compression: str = None
    total_tables: int = 1
    total_records: int = -1
    tables: Optional[list[TableSchema]] = []
    metadata: dict = {}
    success: bool = False
    error: str = None


MAX_SAMPLE_SIZE = 200
DELIMITED_FILES = ['csv', 'tsv']
DUCKABLE_FILE_TYPES = ['csv', 'jsonl', 'json', 'parquet']
DUCKABLE_CODECS = ['zst', 'gzip', 'raw']


def table_from_objects(objects: list, table_id: str, objects_limit: int,
                       use_pandas: bool = False, filetype='csv',
                       autodoc: bool = False, lang: str = 'English',
                       ai_service: Optional[AIService] = None):
    """Reconstructs table schema from list of objects."""
    table = TableSchema(id=table_id)
    table.num_records = len(objects)
    if autodoc:
        f = io.StringIO()
        writer = csv.writer(f)
        writer.writerows(objects[:MAX_SAMPLE_SIZE])
        table.description = get_description(f.getvalue(), language=lang, ai_service=ai_service)
    if use_pandas:
        df = pd.DataFrame(objects)
        columns_raw = duckdb_decompose(frame=df, path='*',
                                       limit=objects_limit)
    else:
        suffix = '.' + filetype
        tfile = tempfile.NamedTemporaryFile(suffix=suffix, mode='w',
                                            encoding='utf8', delete=False)
        tfile.close()
        with ZstdFile(tfile.name, mode='w', level_or_option=9) as tfile_real:
            wrapper = io.TextIOWrapper(tfile_real, encoding='utf8',
                                       write_through=True)
            if filetype == 'csv':
                writer = csv.writer(wrapper)
                writer.writerows(objects[:objects_limit])
            elif filetype == 'jsonl':
                for row in objects[:objects_limit]:
                    wrapper.write(json.dumps(row) + '\n')
        # Getting structure
        columns_raw = duckdb_decompose(tfile.name, filetype=filetype,
                                       path='*', limit=objects_limit)
        os.remove(tfile.name)
    is_flat = True
    table.num_cols = len(columns_raw)

    for column in columns_raw:
        field = FieldSchema(name=column[0], ftype=column[1],
                            is_array=column[2])
        table.fields.append(field)
        if field.ftype == 'STRUCT' or field.is_array:
            is_flat = False
    table.is_flat = is_flat
    table.num_records = len(objects)
    return table




def analyze(filename: str, filetype: str = None, compression: str = 'raw',
            objects_limit: int = OBJECTS_ANALYZE_LIMIT, encoding: str = None,
            scan: bool = True, stats: bool = True, engine: str = "auto", # noqa: ARG001
            use_pandas: bool = False, ignore_errors: bool = True,
            autodoc: bool = False, lang: str = 'English',
            ai_provider: Optional[str] = None, ai_config: Optional[dict] = None):
    """Analyzes any type of data file and provides meaningful insights.

    Args:
        ai_provider: AI provider name (openai, openrouter, ollama, lmstudio, perplexity)
        ai_config: Optional AI configuration dictionary
    """
    fileext = filename.rsplit('.', 1)[-1].lower()
    filesize = os.path.getsize(filename)
    if filetype is None:
        ftype = detect_file_type(filename)
        if ftype['success']:
            filetype = ftype['datatype'].id()
            if ftype['codec'] is not None:
                compression = ftype['codec'].id()
    # Handling special cases
    if filetype is None and fileext == 'docx':
        filetype = 'docx'

    report = ReportSchema(filename=filename, file_size=filesize,
                          file_type=filetype, compression=compression)

    # Initialize AI service if autodoc is enabled
    ai_service = None
    if autodoc:
        try:
            config = ai_config or {}
            if ai_provider:
                config['provider'] = ai_provider
            ai_service = get_ai_service(provider=ai_provider, config=config)
        except Exception as e:
            # If AI service fails to initialize, disable autodoc
            import warnings
            warnings.warn(f"Failed to initialize AI service: {e}. Disabling autodoc.")
            autodoc = False

    if filetype in TEXT_DATA_TYPES:
        if encoding is None:
            encoding = detect_encoding_any(filename)
            enc_key = 'encoding' if 'encoding' in encoding else None
            report.metadata['encoding'] = encoding.get(enc_key) if enc_key else None
        else:
            report.metadata['encoding'] = encoding
    if scan:
        duckable_cond = (report.file_type in DUCKABLE_FILE_TYPES and
                         report.compression in DUCKABLE_CODECS and
                         engine in ['auto', 'duckdb'])
        if duckable_cond:
            # Getting total count
            text_ignore = ', ignore_errors=true' if ignore_errors else ''
            if filetype in ['json', 'jsonl']:
                query_str = f"select count(*) from read_json('{filename}'{text_ignore})"
                num_records = duckdb.sql(query_str).fetchall()[0][0]
            elif filetype in ['csv', 'tsv']:
                query_str = f"select count(*) from read_csv('{filename}'{text_ignore})"
                num_records = duckdb.sql(query_str).fetchall()[0][0]
            else:
                query_str = f"select count(*) from '{filename}'"
                num_records = duckdb.sql(query_str).fetchall()[0][0]
            table = TableSchema(id=os.path.basename(filename))
            table.num_records = num_records
            report.tables = [table]
            report.total_records = table.num_records
            report.total_tables = 1

            # Getting structure
            columns_raw = duckdb_decompose(filename, filetype=filetype,
                                           path='*', limit=objects_limit)
            is_flat = True
            table.num_cols = len(columns_raw)
            for column in columns_raw:
                field = FieldSchema(name=column[0], ftype=column[1],
                                    is_array=column[2])
                table.fields.append(field)
                if field.ftype == 'STRUCT' or field.is_array:
                    is_flat = False
            table.is_flat = is_flat
            query_str = f"select * from '{filename}' limit {MAX_SAMPLE_SIZE}"
            sample = duckdb.sql(query_str).fetchall()
            f = io.StringIO()
            writer = csv.writer(f)
            writer.writerows(sample[:MAX_SAMPLE_SIZE])
            if autodoc:
                table.description = get_description(f.getvalue(), language=lang, ai_service=ai_service)
        else:
            if engine == 'duckdb':
                report.success = False
                report.error = (f"Not supported file type {report.file_type} "
                                f"or compression {report.compression}")
            else:
                # Processing MS Word XML files
                if fileext == 'docx':
                    docx_tables = analyze_docx(filename, extract_data=True)
                    total = 0
                    for dtable in docx_tables:
                        table = table_from_objects(dtable['data'],
                                                   table_id=str(dtable['id']),
                                                   objects_limit=objects_limit,
                                                   use_pandas=use_pandas,
                                                   filetype='csv',
                                                   autodoc=autodoc, lang=lang,
                                                   ai_service=ai_service)
                        total += table.num_records
                        report.tables.append(table)
                    report.total_records = total
                    report.total_tables = len(report.tables)
                elif filetype == 'xlsx':
                    wb = load_workbook(filename)
                    total = 0
                    for sheetname in wb.sheetnames:
                        sheet = wb.get_sheet_by_name(sheetname)
                        objects = []
                        max_num = (objects_limit if objects_limit < sheet.max_row
                                   else sheet.max_row)
                        for n in range(0, max_num):
                            row = next(sheet.iter_rows())
                            tmp = []
                            for cell in row:
                                tmp.append(str(cell.value))
                            objects.append(tmp)
                        table = table_from_objects(objects, table_id=sheetname,
                                                   objects_limit=objects_limit,
                                                   use_pandas=use_pandas,
                                                   filetype='csv',
                                                   autodoc=autodoc, lang=lang,
                                                   ai_service=ai_service)
                        total += table.num_records
                        report.tables.append(table)
                    report.total_records = total
                    report.total_tables = len(report.tables)
                elif filetype == 'xls':
                    wb = xlrd.open_workbook(filename)
                    total = 0
                    for sheetname in wb.sheet_names():
                        sheet = wb.sheet_by_name(sheetname)
                        objects = []
                        max_num = (objects_limit if objects_limit < sheet.nrows
                                   else sheet.nrows)
                        for n in range(0, max_num):
                            tmp = []
                            for i in range(0, sheet.ncols):
                                cell_value = sheet.cell_value(n, i)
                                get_col = str(cell_value)
                                tmp.append(get_col)
                            objects.append(tmp)
                        table = table_from_objects(objects, table_id=sheetname,
                                                   objects_limit=objects_limit,
                                                   use_pandas=use_pandas,
                                                   filetype='csv',
                                                   autodoc=autodoc, lang=lang,
                                                   ai_service=ai_service)
                        report.tables.append(table)
                        total += table.num_records
                    report.total_records = total
                    report.total_tables = len(report.tables)
                elif filetype == 'xml':
                    fileobj = None
                    codec = None
                    if ftype['codec'] is not None:
                        codec = ftype['codec'](filename, open_it=True)
                        fileobj = codec.fileobj()
                    if fileobj is None:
                        with open(filename, 'rb') as f:
                            data = xmltodict.parse(f, process_namespaces=False)
                    else:
                        data = xmltodict.parse(fileobj, process_namespaces=False)
                    candidates = _seek_xml_lists(data, level=0)
                    if len(candidates) == 1:
                        fullkey = str(next(iter(candidates)))
                        table = TableSchema(id=fullkey)
                        objects = get_dict_value(data,
                                                 keys=fullkey.split('.'))[0]
                        table = table_from_objects(objects, table_id=fullkey,
                                                   objects_limit=objects_limit,
                                                   use_pandas=use_pandas,
                                                   filetype='jsonl',
                                                   autodoc=autodoc, lang=lang,
                                                   ai_service=ai_service)
                        report.tables.append(table)
                        report.total_tables = len(report.tables)
                        report.total_records = table.num_records
                    elif len(candidates) > 1:
                        total = 0
                        for fullkey in candidates:
                            table = TableSchema(id=fullkey)
                            objects = get_dict_value(data,
                                                     keys=fullkey.split('.'))[0]
                            table = table_from_objects(objects, table_id=fullkey,
                                                       objects_limit=objects_limit,
                                                       use_pandas=use_pandas,
                                                       filetype='jsonl',
                                                       autodoc=autodoc, lang=lang)
                            total += table.num_records
                            report.tables.append(table)
                        report.total_records = total
                        report.total_tables = len(report.tables)
                    if codec is not None:
                        codec.close()
                    else:
                        fileobj.close()
                elif filetype == 'json':
                    fileobj = None
                    codec = None
                    if ftype['codec'] is not None:
                        codec = ftype['codec'](filename, open_it=True)
                        fileobj = codec.fileobj()
                    if fileobj is None:
                        with open(filename, 'rb') as f:
                            data = json.load(f)
                    else:
                        data = json.load(fileobj)
                    _process_json_data(data, report, fullkey,
                                       objects_limit, use_pandas,
                                       autodoc, lang, ai_service)
                    if codec is not None:
                        codec.close()
                    elif fileobj is not None:
                        fileobj.close()

    if autodoc and report.total_tables > 0:
        for table in report.tables:
            fields = []
            for column in table.fields:
                fields.append(column.name)
            descriptions = get_fields_info(fields, language=lang, ai_service=ai_service)
            for column in table.fields:
                if column.name in descriptions:
                    column.description = descriptions[column.name]
    return report





def _format_file_size(size_bytes):
    """Format file size in human-readable format."""
    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
        if size_bytes < 1024.0:
            return f"{size_bytes:.2f} {unit}"
        size_bytes /= 1024.0
    return f"{size_bytes:.2f} PB"


def _format_number(num):
    """Format number with commas for readability."""
    if num is None or num == -1:
        return "N/A"
    return f"{num:,}"


def _write_analysis_output(report, options, output_stream):
    """Write analysis report to output stream in the specified format."""
    from tabulate import tabulate

    if options['outtype'] == 'json':
        json_output = json.dumps(report.model_dump(), indent=4, ensure_ascii=False)
        output_stream.write(json_output)
        output_stream.write('\n')
    elif options['outtype'] == 'yaml':
        yaml_output = yaml.dump(report.model_dump(), Dumper=yaml.Dumper)
        output_stream.write(yaml_output)
    elif options['outtype'] == 'markdown':
        raise NotImplementedError("Markdown output not implemented")
    else:
        # Text output format
        # Print header
        print("=" * 70, file=output_stream)
        print("ANALYSIS REPORT", file=output_stream)
        print("=" * 70, file=output_stream)
        print(file=output_stream)

        # File information section
        print("File Information", file=output_stream)
        print("-" * 70, file=output_stream)
        headers = ['Attribute', 'Value']
        reptable = []
        reptable.append(['Filename', str(report.filename)])
        reptable.append(['File size', _format_file_size(report.file_size)])
        reptable.append(['File type', report.file_type or 'N/A'])
        reptable.append(['Compression', str(report.compression) if report.compression else 'None'])
        reptable.append(['Total tables', _format_number(report.total_tables)])
        reptable.append(['Total records', _format_number(report.total_records)])
        for k, v in report.metadata.items():
            reptable.append([k.replace('_', ' ').title(), str(v)])
        print(tabulate(reptable, headers=headers, tablefmt='grid'), file=output_stream)
        print(file=output_stream)

        # Tables section
        if report.tables:
            print("=" * 70, file=output_stream)
            print("TABLE STRUCTURES", file=output_stream)
            print("=" * 70, file=output_stream)
            print(file=output_stream)

            tabheaders = ['Field Name', 'Type', 'Is Array', 'Description']
            for idx, rtable in enumerate(report.tables, 1):
                if len(report.tables) > 1:
                    print(f"Table {idx}: {rtable.id}", file=output_stream)
                else:
                    print(f"Table: {rtable.id}", file=output_stream)
                print("-" * 70, file=output_stream)
                print(f" Records: {_format_number(rtable.num_records)}", file=output_stream)
                print(f" Columns: {_format_number(rtable.num_cols)}", file=output_stream)
                print(f" Structure: {'Flat' if rtable.is_flat else 'Nested'}", file=output_stream)
                print(file=output_stream)

                table = []
                for field in rtable.fields:
                    desc = field.description if field.description else '-'
                    table.append([
                        field.name,
                        field.ftype,
                        'Yes' if field.is_array else 'No',
                        desc
                    ])
                print(tabulate(table, headers=tabheaders, tablefmt='grid'), file=output_stream)

                if rtable.description:
                    print(file=output_stream)
                    print("Summary:", file=output_stream)
                    print("-" * 70, file=output_stream)
                    # Wrap description text for better readability
                    desc_lines = rtable.description.split('\n')
                    for line in desc_lines:
                        if line.strip():
                            print(f" {line.strip()}", file=output_stream)

                if idx < len(report.tables):
                    print(file=output_stream)
                    print(file=output_stream)


class Analyzer:
    """Data analysis handler."""
    def __init__(self):
        pass


    def analyze(self, filename, options):
        """Analyzes given data file and returns it's parameters"""
        encoding = options.get('encoding')
        report = analyze(filename, encoding=encoding,
                         engine=options['engine'],
                         use_pandas=options['use_pandas'],
                         autodoc=options['autodoc'], lang=options['lang'],
                         ai_provider=options.get('ai_provider'),
                         ai_config=options.get('ai_config'))

        # Determine output destination
        output_file = options.get('output')

        if output_file:
            # Use context manager for file output
            with open(output_file, 'w', encoding='utf8') as output_stream:
                _write_analysis_output(report, options, output_stream)
        else:
            # Write to stdout
            _write_analysis_output(report, options, sys.stdout)
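For orientation, the sketch below shows one way the analyze() entry point and the _write_analysis_output() helper from the file above might be driven from Python. It is not part of the packaged module: the file name, the option values, and the choice to call the private writer directly are illustrative assumptions.

# Hypothetical usage sketch; assumes undatum 1.0.17 is installed and a local
# 'data.jsonl' file exists. Not part of the packaged analyzer.py shown above.
import sys
from undatum.cmds.analyzer import analyze, _write_analysis_output

# Build a schema report without AI-generated documentation.
report = analyze('data.jsonl', engine='auto', use_pandas=False,
                 autodoc=False, lang='English')

# ReportSchema is a pydantic model, so it can be serialized directly.
print(report.model_dump())

# Or reuse the module's own writer; 'outtype' selects json, yaml, or text output.
_write_analysis_output(report, {'outtype': 'text'}, sys.stdout)

Enabling autodoc=True would additionally require a configured AI provider (ai_provider plus ai_config), as handled in the analyze() function above.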