undatum 1.0.17__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,697 @@
+ # -*- coding: utf8 -*-
+ """Data analysis and insights module.
+ 
+ This module provides data analysis capabilities including schema detection,
+ field type inference, and AI-powered documentation generation.
+ 
+ Note: Some functions have been optimized for performance (e.g., using sets
+ for key tracking), but further optimizations may be possible for very large datasets.
+ """
+ import csv
+ import io
+ import json
+ import os
+ import sys
+ import tempfile
+ from collections import OrderedDict
+ from typing import Optional
+ 
+ import duckdb
+ import pandas as pd
+ import xlrd
+ import xmltodict
+ import yaml
+ from iterable.helpers.detect import (detect_encoding_any, detect_file_type,
+                                      TEXT_DATA_TYPES)
+ from openpyxl import load_workbook
+ from pydantic import BaseModel
+ from pyzstd import ZstdFile
+ 
+ from ..ai import get_fields_info, get_description, get_ai_service, AIService
+ from ..formats.docx import analyze_docx
+ from ..utils import get_dict_value
+ 
+ OBJECTS_ANALYZE_LIMIT = 10000
+ 
+ DUCKDB_TYPES = ['VARCHAR', 'DATE', 'JSON', 'BIGINT', 'DOUBLE', 'BOOLEAN']
+ 
+ 
+ def column_type_parse(column_type):
+     """Parse a DuckDB column type string into its base type and an array flag."""
+     is_array = (column_type[-2:] == '[]')
+     if is_array:
+         text = column_type[:-2]
+     else:
+         text = column_type
+     if text[:6] == 'STRUCT':
+         atype = text[:6]
+     elif text[:4] == 'JSON':
+         atype = 'VARCHAR'
+     else:
+         atype = text
+     # The array flag is returned as the string 'True'/'False' because the
+     # result rows are consumed as text columns downstream.
+     return [atype, str(is_array)]
+ 
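+ # Illustrative behaviour of column_type_parse (follows directly from the
+ # branches above):
+ #
+ #     >>> column_type_parse('VARCHAR[]')
+ #     ['VARCHAR', 'True']
+ #     >>> column_type_parse('STRUCT(a VARCHAR)')
+ #     ['STRUCT', 'False']
+ #     >>> column_type_parse('JSON')
+ #     ['VARCHAR', 'False']
+ 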
+ def duckdb_decompose(filename: Optional[str] = None, frame: Optional[pd.DataFrame] = None,
+                      filetype: Optional[str] = None, path: str = "*", limit: int = 10000000,
+                      recursive: bool = True, root: str = "", ignore_errors: bool = True):
+     """Decompose file or DataFrame structure using DuckDB.
+ 
+     This function uses DuckDB's SUMMARIZE and UNNEST to extract schema
+     information from nested data structures. It handles up to 4 levels
+     of nesting by constructing nested SQL queries.
+ 
+     The SQL is built dynamically based on the nesting depth:
+     - Level 1: direct field access
+     - Levels 2-4: nested unnest subqueries
+     - Recursive mode: STRUCT-typed fields are decomposed by calling this
+       function again with the field path as the new root
+ 
+     Args:
+         filename: Path to input file. If None, frame must be provided.
+         frame: Pandas DataFrame. Used when filename is None.
+         filetype: File type ('csv', 'tsv', 'json', 'jsonl'). Determines read function.
+         path: Path expression for nested fields (default: '*' for all fields).
+         limit: Maximum records to process (default: 10000000).
+         recursive: Whether to recursively process STRUCT types (default: True).
+         root: Root path prefix for nested queries (used internally for recursion).
+         ignore_errors: Whether to ignore parsing errors in DuckDB (default: True).
+ 
+     Returns:
+         List of lists containing field information:
+         [field_path, base_type, is_array, unique_count, total_count, uniqueness_percentage]
+ 
+     Raises:
+         ValueError: If both filename and frame are None.
+         duckdb.Error: If the DuckDB query fails.
+ 
+     Example:
+         >>> result = duckdb_decompose('data.jsonl', filetype='jsonl')
+         >>> print(result[0])  # ['field1', 'VARCHAR', 'False', '100', '1000', '10.00']
+     """
+     text_ignore = ', ignore_errors=true' if ignore_errors else ''
+     if filetype in ['csv', 'tsv']:
+         read_func = f"read_csv('{filename}'{text_ignore})"
+     elif filetype in ['json', 'jsonl']:
+         read_func = f"read_json('{filename}'{text_ignore})"
+     else:
+         read_func = f"'{filename}'"
+     # When no filename is given, query the in-scope DataFrame `frame`,
+     # which DuckDB resolves by name via its replacement scans.
+     source = read_func if filename is not None else 'frame'
+     query = None
+     if path == '*':
+         query = f"summarize select {path} from {source} limit {limit}"
+     else:
+         path_parts = path.split('.')
+         if len(path_parts) == 1:
+             query = (f"summarize select unnest(\"{path}\", recursive:=true) "
+                      f"from {source} limit {limit}")
+         elif len(path_parts) == 2:
+             query = (f"summarize select unnest(\"{path_parts[1]}\", "
+                      f"recursive:=true) from (select unnest(\"{path_parts[0]}\", "
+                      f"recursive:=true) from {source} limit {limit})")
+         elif len(path_parts) == 3:
+             query = (f"summarize select unnest(\"{path_parts[2]}\", "
+                      f"recursive:=true) from (select unnest(\"{path_parts[1]}\", "
+                      f"recursive:=true) from (select unnest(\"{path_parts[0]}\", "
+                      f"recursive:=true) from {source} limit {limit}))")
+         elif len(path_parts) == 4:
+             query = (f"summarize select unnest(\"{path_parts[2]}.{path_parts[3]}\", "
+                      f"recursive:=true) from (select unnest(\"{path_parts[1]}\", "
+                      f"recursive:=true) from (select unnest(\"{path_parts[0]}\", "
+                      f"recursive:=true) from {source} limit {limit}))")
+     data = duckdb.sql(query).fetchall()
+     table = []
+     for row in data:
+         # SUMMARIZE rows: row[0] = column_name, row[1] = column_type,
+         # row[4] = unique count, row[10] = total count
+         item = [row[0] if len(root) == 0 else root + '.' + row[0]]
+         item.extend(column_type_parse(row[1]))
+         item.append(str(row[4]))
+         item.append(str(row[10]))
+         uniq_share = row[4] * 100.0 / row[10] if row[10] > 0 else 0
+         item.append(f'{uniq_share:0.2f}')
+         table.append(item)
+         if recursive and item[1] == 'STRUCT':
+             sub_path = row[0] if len(root) == 0 else item[0]
+             subtable = duckdb_decompose(filename, frame, filetype=filetype,
+                                         path=sub_path, limit=limit,
+                                         recursive=recursive, root=item[0],
+                                         ignore_errors=ignore_errors)
+             for subitem in subtable:
+                 table.append(subitem)
+     return table
+ 
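+ # Shape of the generated SQL for a two-level path such as 'a.b' (assuming a
+ # JSON Lines input; follows from the query templates above):
+ #
+ #     summarize select unnest("b", recursive:=true)
+ #     from (select unnest("a", recursive:=true)
+ #           from read_json('data.jsonl', ignore_errors=true) limit 10000000)
+ 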
+ def _seek_dict_lists(data, level=0, path=None, candidates=None):
+     """Recursively collect dotted paths to lists of objects inside a nested dict."""
+     if candidates is None:
+         candidates = OrderedDict()
+     for key, value in data.items():
+         if isinstance(value, list):
+             # Only lists that contain at least one dict-like item qualify
+             isobjectlist = False
+             for listitem in value[:20]:
+                 if isinstance(listitem, (dict, OrderedDict)):
+                     isobjectlist = True
+                     break
+             if not isobjectlist:
+                 continue
+             key = f'{path}.{key}' if path is not None else key
+             if key not in candidates:
+                 candidates[key] = {'key': key, 'num': len(value)}
+         elif isinstance(value, (OrderedDict, dict)):
+             res = _seek_dict_lists(value, level + 1, path + '.' + key if path else key, candidates)
+             for k, v in res.items():
+                 if k not in candidates:
+                     candidates[k] = v
+     return candidates
+ 
+ 
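+ # Illustrative: for a document with one top-level list of objects and one
+ # nested list, candidate paths and lengths come back keyed by dotted path:
+ #
+ #     >>> dict(_seek_dict_lists({'items': [{'a': 1}], 'meta': {'rows': [{'b': 2}]}}))
+ #     {'items': {'key': 'items', 'num': 1}, 'meta.rows': {'key': 'meta.rows', 'num': 1}}
+ 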
+ def _seek_xml_lists(data, level=0, path=None, candidates=None):
+     """Seek list structures in XML-derived data recursively."""
+     if candidates is None:
+         candidates = OrderedDict()
+     for key, value in data.items():
+         if isinstance(value, list):
+             key = f'{path}.{key}' if path is not None else key
+             if key not in candidates:
+                 candidates[key] = {'key': key, 'num': len(value)}
+         elif isinstance(value, (OrderedDict, dict)):
+             res = _seek_xml_lists(value, level + 1, path + '.' + key if path else key, candidates)
+             for k, v in res.items():
+                 if k not in candidates:
+                     candidates[k] = v
+     return candidates
+ 
+ 
+ def _process_json_data(data, report, objects_limit, use_pandas,
+                        autodoc, lang, ai_service: Optional[AIService] = None):
+     """Process JSON data and add detected tables to the report."""
+     candidates = _seek_dict_lists(data, level=0)
+     if len(candidates) == 1:
+         fullkey = str(next(iter(candidates)))
+         objects = get_dict_value(data, keys=fullkey.split('.'))[0]
+         table = table_from_objects(objects, table_id=fullkey,
+                                    objects_limit=objects_limit,
+                                    use_pandas=use_pandas,
+                                    filetype='jsonl',
+                                    autodoc=autodoc, lang=lang,
+                                    ai_service=ai_service)
+         report.tables.append(table)
+         report.total_tables = len(report.tables)
+         report.total_records = table.num_records
+     elif len(candidates) > 1:
+         total = 0
+         for fullkey in candidates:
+             objects = get_dict_value(data, keys=fullkey.split('.'))[0]
+             table = table_from_objects(objects, table_id=fullkey,
+                                        objects_limit=objects_limit,
+                                        use_pandas=use_pandas,
+                                        filetype='jsonl',
+                                        autodoc=autodoc, lang=lang,
+                                        ai_service=ai_service)
+             total += table.num_records
+             report.tables.append(table)
+         report.total_records = total
+         report.total_tables = len(report.tables)
+ 
+ 
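+ # Illustrative flow (hypothetical document): a JSON body with exactly one
+ # object-list candidate, e.g. {'items': [...]}, yields a single table whose
+ # id is the dotted path 'items'; multiple candidates yield one table each.
+ 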
+ class FieldSchema(BaseModel):
+     """Schema definition for a single data field."""
+     name: str
+     ftype: str
+     is_array: bool = False
+     description: Optional[str] = None
+     sem_type: Optional[str] = None
+     sem_url: Optional[str] = None
+ 
+ 
+ class TableSchema(BaseModel):
+     """Table schema definition."""
+     num_records: int = -1
+     num_cols: int = -1
+     is_flat: bool = True
+     id: Optional[str] = None
+     fields: Optional[list[FieldSchema]] = []
+     description: Optional[str] = None
+ 
+ 
+ class ReportSchema(BaseModel):
+     """Schema of the data file analysis results."""
+     filename: str
+     file_size: int
+     file_type: Optional[str] = None
+     compression: Optional[str] = None
+     total_tables: int = 1
+     total_records: int = -1
+     tables: Optional[list[TableSchema]] = []
+     metadata: dict = {}
+     success: bool = False
+     error: Optional[str] = None
+ 
+ 
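+ # Illustrative round-trip of the pydantic models above (names and values are
+ # made up for the example):
+ #
+ #     >>> t = TableSchema(id='people', fields=[FieldSchema(name='id', ftype='BIGINT')])
+ #     >>> t.model_dump()['fields'][0]['ftype']
+ #     'BIGINT'
+ 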
+ MAX_SAMPLE_SIZE = 200
+ DELIMITED_FILES = ['csv', 'tsv']
+ DUCKABLE_FILE_TYPES = ['csv', 'jsonl', 'json', 'parquet']
+ DUCKABLE_CODECS = ['zst', 'gzip', 'raw']
+ 
+ 
+ def table_from_objects(objects: list, table_id: str, objects_limit: int,
+                        use_pandas: bool = False, filetype='csv',
+                        autodoc: bool = False, lang: str = 'English',
+                        ai_service: Optional[AIService] = None):
+     """Reconstruct a table schema from a list of row objects."""
+     table = TableSchema(id=table_id)
+     table.num_records = len(objects)
+     if autodoc:
+         f = io.StringIO()
+         writer = csv.writer(f)
+         writer.writerows(objects[:MAX_SAMPLE_SIZE])
+         table.description = get_description(f.getvalue(), language=lang, ai_service=ai_service)
+     if use_pandas:
+         df = pd.DataFrame(objects)
+         columns_raw = duckdb_decompose(frame=df, path='*',
+                                        limit=objects_limit)
+     else:
+         # Spool the objects into a zstd-compressed temporary file and let
+         # DuckDB infer the structure from it.
+         suffix = '.' + filetype
+         tfile = tempfile.NamedTemporaryFile(suffix=suffix, mode='w',
+                                             encoding='utf8', delete=False)
+         tfile.close()
+         with ZstdFile(tfile.name, mode='w', level_or_option=9) as tfile_real:
+             wrapper = io.TextIOWrapper(tfile_real, encoding='utf8',
+                                        write_through=True)
+             if filetype == 'csv':
+                 writer = csv.writer(wrapper)
+                 writer.writerows(objects[:objects_limit])
+             elif filetype == 'jsonl':
+                 for row in objects[:objects_limit]:
+                     wrapper.write(json.dumps(row) + '\n')
+         # Getting structure
+         columns_raw = duckdb_decompose(tfile.name, filetype=filetype,
+                                        path='*', limit=objects_limit)
+         os.remove(tfile.name)
+     is_flat = True
+     table.num_cols = len(columns_raw)
+ 
+     for column in columns_raw:
+         field = FieldSchema(name=column[0], ftype=column[1],
+                             is_array=column[2])
+         table.fields.append(field)
+         if field.ftype == 'STRUCT' or field.is_array:
+             is_flat = False
+     table.is_flat = is_flat
+     return table
+ 
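+ # Illustrative call (hypothetical rows; dict rows imply filetype='jsonl' so
+ # the temporary spool file is written as JSON Lines):
+ #
+ #     >>> t = table_from_objects([{'id': 1, 'name': 'a'}], table_id='sample',
+ #     ...                        objects_limit=100, filetype='jsonl')   # doctest: +SKIP
+ #     >>> t.num_records                                                # doctest: +SKIP
+ #     1
+ 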
+ 
+ def analyze(filename: str, filetype: Optional[str] = None, compression: str = 'raw',
+             objects_limit: int = OBJECTS_ANALYZE_LIMIT, encoding: Optional[str] = None,
+             scan: bool = True, stats: bool = True, engine: str = "auto",  # noqa: ARG001
+             use_pandas: bool = False, ignore_errors: bool = True,
+             autodoc: bool = False, lang: str = 'English',
+             ai_provider: Optional[str] = None, ai_config: Optional[dict] = None):
+     """Analyze a data file of any supported type and provide meaningful insights.
+ 
+     Args:
+         ai_provider: AI provider name (openai, openrouter, ollama, lmstudio, perplexity)
+         ai_config: Optional AI configuration dictionary
+     """
+     fileext = filename.rsplit('.', 1)[-1].lower()
+     filesize = os.path.getsize(filename)
+     # ftype is only populated when the type is auto-detected; the format
+     # branches below must handle it being None.
+     ftype = None
+     if filetype is None:
+         ftype = detect_file_type(filename)
+         if ftype['success']:
+             filetype = ftype['datatype'].id()
+             if ftype['codec'] is not None:
+                 compression = ftype['codec'].id()
+     # Handling special cases
+     if filetype is None and fileext == 'docx':
+         filetype = 'docx'
+ 
+     report = ReportSchema(filename=filename, file_size=filesize,
+                           file_type=filetype, compression=compression)
+ 
+     # Initialize AI service if autodoc is enabled
+     ai_service = None
+     if autodoc:
+         try:
+             config = ai_config or {}
+             if ai_provider:
+                 config['provider'] = ai_provider
+             ai_service = get_ai_service(provider=ai_provider, config=config)
+         except Exception as e:
+             # If the AI service fails to initialize, disable autodoc
+             import warnings
+             warnings.warn(f"Failed to initialize AI service: {e}. Disabling autodoc.")
+             autodoc = False
+ 
+     if filetype in TEXT_DATA_TYPES:
+         if encoding is None:
+             detected = detect_encoding_any(filename)
+             report.metadata['encoding'] = detected.get('encoding')
+         else:
+             report.metadata['encoding'] = encoding
+     if scan:
+         duckable_cond = (report.file_type in DUCKABLE_FILE_TYPES and
+                          report.compression in DUCKABLE_CODECS and
+                          engine in ['auto', 'duckdb'])
+         if duckable_cond:
+             # Getting total count
+             text_ignore = ', ignore_errors=true' if ignore_errors else ''
+             if filetype in ['json', 'jsonl']:
+                 source = f"read_json('{filename}'{text_ignore})"
+             elif filetype in ['csv', 'tsv']:
+                 source = f"read_csv('{filename}'{text_ignore})"
+             else:
+                 source = f"'{filename}'"
+             num_records = duckdb.sql(f"select count(*) from {source}").fetchall()[0][0]
+             table = TableSchema(id=os.path.basename(filename))
+             table.num_records = num_records
+             report.tables = [table]
+             report.total_records = table.num_records
+             report.total_tables = 1
+ 
+             # Getting structure
+             columns_raw = duckdb_decompose(filename, filetype=filetype,
+                                            path='*', limit=objects_limit)
+             is_flat = True
+             table.num_cols = len(columns_raw)
+             for column in columns_raw:
+                 field = FieldSchema(name=column[0], ftype=column[1],
+                                     is_array=column[2])
+                 table.fields.append(field)
+                 if field.ftype == 'STRUCT' or field.is_array:
+                     is_flat = False
+             table.is_flat = is_flat
+             if autodoc:
+                 # Sample rows are only needed when generating a description
+                 query_str = f"select * from '{filename}' limit {MAX_SAMPLE_SIZE}"
+                 sample = duckdb.sql(query_str).fetchall()
+                 f = io.StringIO()
+                 writer = csv.writer(f)
+                 writer.writerows(sample)
+                 table.description = get_description(f.getvalue(), language=lang, ai_service=ai_service)
+         else:
+             if engine == 'duckdb':
+                 report.success = False
+                 report.error = (f"Unsupported file type {report.file_type} "
+                                 f"or compression {report.compression}")
+             else:
+                 # Processing MS Word documents (tables extracted via analyze_docx)
+                 if fileext == 'docx':
+                     docx_tables = analyze_docx(filename, extract_data=True)
+                     total = 0
+                     for dtable in docx_tables:
+                         table = table_from_objects(dtable['data'],
+                                                    table_id=str(dtable['id']),
+                                                    objects_limit=objects_limit,
+                                                    use_pandas=use_pandas,
+                                                    filetype='csv',
+                                                    autodoc=autodoc, lang=lang,
+                                                    ai_service=ai_service)
+                         total += table.num_records
+                         report.tables.append(table)
+                     report.total_records = total
+                     report.total_tables = len(report.tables)
+                 elif filetype == 'xlsx':
+                     wb = load_workbook(filename)
+                     total = 0
+                     for sheetname in wb.sheetnames:
+                         sheet = wb[sheetname]
+                         objects = []
+                         max_num = (objects_limit if objects_limit < sheet.max_row
+                                    else sheet.max_row)
+                         # Create the row iterator once so successive next()
+                         # calls advance through the sheet
+                         rows = sheet.iter_rows()
+                         for n in range(0, max_num):
+                             row = next(rows)
+                             tmp = []
+                             for cell in row:
+                                 tmp.append(str(cell.value))
+                             objects.append(tmp)
+                         table = table_from_objects(objects, table_id=sheetname,
+                                                    objects_limit=objects_limit,
+                                                    use_pandas=use_pandas,
+                                                    filetype='csv',
+                                                    autodoc=autodoc, lang=lang,
+                                                    ai_service=ai_service)
+                         total += table.num_records
+                         report.tables.append(table)
+                     report.total_records = total
+                     report.total_tables = len(report.tables)
+                 elif filetype == 'xls':
+                     wb = xlrd.open_workbook(filename)
+                     total = 0
+                     for sheetname in wb.sheet_names():
+                         sheet = wb.sheet_by_name(sheetname)
+                         objects = []
+                         max_num = (objects_limit if objects_limit < sheet.nrows
+                                    else sheet.nrows)
+                         for n in range(0, max_num):
+                             tmp = []
+                             for i in range(0, sheet.ncols):
+                                 tmp.append(str(sheet.cell_value(n, i)))
+                             objects.append(tmp)
+                         table = table_from_objects(objects, table_id=sheetname,
+                                                    objects_limit=objects_limit,
+                                                    use_pandas=use_pandas,
+                                                    filetype='csv',
+                                                    autodoc=autodoc, lang=lang,
+                                                    ai_service=ai_service)
+                         report.tables.append(table)
+                         total += table.num_records
+                     report.total_records = total
+                     report.total_tables = len(report.tables)
+                 elif filetype == 'xml':
+                     fileobj = None
+                     codec = None
+                     if ftype is not None and ftype['codec'] is not None:
+                         codec = ftype['codec'](filename, open_it=True)
+                         fileobj = codec.fileobj()
+                     if fileobj is None:
+                         with open(filename, 'rb') as f:
+                             data = xmltodict.parse(f, process_namespaces=False)
+                     else:
+                         data = xmltodict.parse(fileobj, process_namespaces=False)
+                     candidates = _seek_xml_lists(data, level=0)
+                     if len(candidates) == 1:
+                         fullkey = str(next(iter(candidates)))
+                         objects = get_dict_value(data,
+                                                  keys=fullkey.split('.'))[0]
+                         table = table_from_objects(objects, table_id=fullkey,
+                                                    objects_limit=objects_limit,
+                                                    use_pandas=use_pandas,
+                                                    filetype='jsonl',
+                                                    autodoc=autodoc, lang=lang,
+                                                    ai_service=ai_service)
+                         report.tables.append(table)
+                         report.total_tables = len(report.tables)
+                         report.total_records = table.num_records
+                     elif len(candidates) > 1:
+                         total = 0
+                         for fullkey in candidates:
+                             objects = get_dict_value(data,
+                                                      keys=fullkey.split('.'))[0]
+                             table = table_from_objects(objects, table_id=fullkey,
+                                                        objects_limit=objects_limit,
+                                                        use_pandas=use_pandas,
+                                                        filetype='jsonl',
+                                                        autodoc=autodoc, lang=lang,
+                                                        ai_service=ai_service)
+                             total += table.num_records
+                             report.tables.append(table)
+                         report.total_records = total
+                         report.total_tables = len(report.tables)
+                     if codec is not None:
+                         codec.close()
+                     elif fileobj is not None:
+                         fileobj.close()
+                 elif filetype == 'json':
+                     fileobj = None
+                     codec = None
+                     if ftype is not None and ftype['codec'] is not None:
+                         codec = ftype['codec'](filename, open_it=True)
+                         fileobj = codec.fileobj()
+                     if fileobj is None:
+                         with open(filename, 'rb') as f:
+                             data = json.load(f)
+                     else:
+                         data = json.load(fileobj)
+                     _process_json_data(data, report,
+                                        objects_limit, use_pandas,
+                                        autodoc, lang, ai_service)
+                     if codec is not None:
+                         codec.close()
+                     elif fileobj is not None:
+                         fileobj.close()
+ 
+     if autodoc and report.total_tables > 0:
+         for table in report.tables:
+             fields = [column.name for column in table.fields]
+             descriptions = get_fields_info(fields, language=lang, ai_service=ai_service)
+             for column in table.fields:
+                 if column.name in descriptions:
+                     column.description = descriptions[column.name]
+     return report
+ 
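+ # Illustrative top-level usage (hypothetical path; heavy I/O, so skipped as a
+ # doctest):
+ #
+ #     >>> report = analyze('data.csv')                       # doctest: +SKIP
+ #     >>> report.total_records, report.total_tables          # doctest: +SKIP
+ 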
+ 
+ def _format_file_size(size_bytes):
+     """Format file size in human-readable format."""
+     for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
+         if size_bytes < 1024.0:
+             return f"{size_bytes:.2f} {unit}"
+         size_bytes /= 1024.0
+     return f"{size_bytes:.2f} PB"
+ 
+ 
+ def _format_number(num):
+     """Format a number with thousands separators; None or -1 prints as N/A."""
+     if num is None or num == -1:
+         return "N/A"
+     return f"{num:,}"
+ 
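+ # Illustrative behaviour of the two formatting helpers above:
+ #
+ #     >>> _format_file_size(1536)
+ #     '1.50 KB'
+ #     >>> _format_number(1234567)
+ #     '1,234,567'
+ #     >>> _format_number(-1)
+ #     'N/A'
+ 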
+ 
+ def _write_analysis_output(report, options, output_stream):
+     """Write analysis report to output stream in the specified format."""
+     from tabulate import tabulate
+ 
+     if options['outtype'] == 'json':
+         json_output = json.dumps(report.model_dump(), indent=4, ensure_ascii=False)
+         output_stream.write(json_output)
+         output_stream.write('\n')
+     elif options['outtype'] == 'yaml':
+         yaml_output = yaml.dump(report.model_dump(), Dumper=yaml.Dumper)
+         output_stream.write(yaml_output)
+     elif options['outtype'] == 'markdown':
+         raise NotImplementedError("Markdown output not implemented")
+     else:
+         # Text output format
+         print("=" * 70, file=output_stream)
+         print("ANALYSIS REPORT", file=output_stream)
+         print("=" * 70, file=output_stream)
+         print(file=output_stream)
+ 
+         # File information section
+         print("File Information", file=output_stream)
+         print("-" * 70, file=output_stream)
+         headers = ['Attribute', 'Value']
+         reptable = []
+         reptable.append(['Filename', str(report.filename)])
+         reptable.append(['File size', _format_file_size(report.file_size)])
+         reptable.append(['File type', report.file_type or 'N/A'])
+         reptable.append(['Compression', str(report.compression) if report.compression else 'None'])
+         reptable.append(['Total tables', _format_number(report.total_tables)])
+         reptable.append(['Total records', _format_number(report.total_records)])
+         for k, v in report.metadata.items():
+             reptable.append([k.replace('_', ' ').title(), str(v)])
+         print(tabulate(reptable, headers=headers, tablefmt='grid'), file=output_stream)
+         print(file=output_stream)
+ 
+         # Tables section
+         if report.tables:
+             print("=" * 70, file=output_stream)
+             print("TABLE STRUCTURES", file=output_stream)
+             print("=" * 70, file=output_stream)
+             print(file=output_stream)
+ 
+             tabheaders = ['Field Name', 'Type', 'Is Array', 'Description']
+             for idx, rtable in enumerate(report.tables, 1):
+                 if len(report.tables) > 1:
+                     print(f"Table {idx}: {rtable.id}", file=output_stream)
+                 else:
+                     print(f"Table: {rtable.id}", file=output_stream)
+                 print("-" * 70, file=output_stream)
+                 print(f"  Records: {_format_number(rtable.num_records)}", file=output_stream)
+                 print(f"  Columns: {_format_number(rtable.num_cols)}", file=output_stream)
+                 print(f"  Structure: {'Flat' if rtable.is_flat else 'Nested'}", file=output_stream)
+                 print(file=output_stream)
+ 
+                 table = []
+                 for field in rtable.fields:
+                     desc = field.description if field.description else '-'
+                     table.append([
+                         field.name,
+                         field.ftype,
+                         'Yes' if field.is_array else 'No',
+                         desc
+                     ])
+                 print(tabulate(table, headers=tabheaders, tablefmt='grid'), file=output_stream)
+ 
+                 if rtable.description:
+                     print(file=output_stream)
+                     print("Summary:", file=output_stream)
+                     print("-" * 70, file=output_stream)
+                     # Print the description line by line, skipping blank lines
+                     desc_lines = rtable.description.split('\n')
+                     for line in desc_lines:
+                         if line.strip():
+                             print(f"  {line.strip()}", file=output_stream)
+ 
+                 if idx < len(report.tables):
+                     print(file=output_stream)
+                     print(file=output_stream)
+ 
+ 
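+ # Illustrative call (assumes a populated ReportSchema named `report`):
+ #
+ #     >>> _write_analysis_output(report, {'outtype': 'json'}, sys.stdout)   # doctest: +SKIP
+ 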
+ class Analyzer:
+     """Data analysis handler."""
+ 
+     def analyze(self, filename, options):
+         """Analyze the given data file and write out its parameters."""
+         encoding = options.get('encoding')
+         report = analyze(filename, encoding=encoding,
+                          engine=options['engine'],
+                          use_pandas=options['use_pandas'],
+                          autodoc=options['autodoc'], lang=options['lang'],
+                          ai_provider=options.get('ai_provider'),
+                          ai_config=options.get('ai_config'))
+ 
+         # Determine output destination
+         output_file = options.get('output')
+ 
+         if output_file:
+             # Use a context manager for file output
+             with open(output_file, 'w', encoding='utf8') as output_stream:
+                 _write_analysis_output(report, options, output_stream)
+         else:
+             # Write to stdout
+             _write_analysis_output(report, options, sys.stdout)
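+ 
+ # Illustrative driver usage (option keys follow the accesses above; values
+ # are hypothetical):
+ #
+ #     >>> Analyzer().analyze('data.csv', {                  # doctest: +SKIP
+ #     ...     'outtype': 'text', 'engine': 'auto', 'use_pandas': False,
+ #     ...     'autodoc': False, 'lang': 'English', 'output': None})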