undatum-1.0.17-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
undatum/core.py ADDED
@@ -0,0 +1,616 @@
+ #!/usr/bin/env python
+ # -*- coding: utf8 -*-
+ """Core module providing CLI commands for the undatum package.
+
+ This module defines the main CLI interface using Typer, including all
+ command handlers for data conversion, analysis, validation, and more.
+ """
+ import glob
+ import logging
+ from typing import Annotated
+
+ import typer
+
+ from .cmds.analyzer import Analyzer
+ from .cmds.converter import Converter
+ from .cmds.ingester import Ingester
+ from .cmds.query import DataQuery
+ from .cmds.schemer import Schemer
+ from .cmds.selector import Selector
+ from .cmds.statistics import StatProcessor
+ from .cmds.textproc import TextProcessor
+ from .cmds.transformer import Transformer
+ from .cmds.validator import Validator
+
+ DEFAULT_BATCH_SIZE = 1000
+
+ app = typer.Typer()
+
+ logging.basicConfig(
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+     level=logging.INFO)
+
+
+ def enable_verbose():
+     """Enable verbose logging."""
+     # basicConfig() is a no-op once the root logger is configured at import
+     # time above, so raise the level on the root logger directly.
+     logging.getLogger().setLevel(logging.DEBUG)
+
+ @app.command()
+ def convert(
+     input_file: Annotated[str, typer.Argument(help="Path to input file to convert.")],
+     output: Annotated[str, typer.Argument(help="Path to output file.")],
+     delimiter: Annotated[str, typer.Option(help="CSV delimiter character.")] = ',',
+     compression: Annotated[str, typer.Option(help="Compression type (e.g., 'brotli', 'gzip', 'xz').")] = 'brotli',
+     encoding: Annotated[str, typer.Option(help="File encoding (e.g., 'utf8', 'latin1').")] = 'utf8',
+     verbose: Annotated[bool, typer.Option(help="Enable verbose logging output.")] = False,
+     flatten_data: Annotated[bool, typer.Option(help="Flatten nested data structures into flat records.")] = False,
+     prefix_strip: Annotated[bool, typer.Option(help="Strip XML namespace prefixes from element names.")] = True,
+     fields: Annotated[str, typer.Option(help="Comma-separated list of field names to include in output.")] = None,
+     start_line: Annotated[int, typer.Option(help="Line number (0-based) to start reading from.")] = 0,
+     skip_end_rows: Annotated[int, typer.Option(help="Number of rows to skip at the end of the file.")] = 0,
+     start_page: Annotated[int, typer.Option(help="Page number (0-based) to start from for Excel files.")] = 0,
+     tagname: Annotated[str, typer.Option(help="XML tag name that contains individual records.")] = None,
+     format_in: Annotated[str, typer.Option(help="Override input file format detection (e.g., 'csv', 'jsonl', 'xml').")] = None,
+     format_out: Annotated[str, typer.Option(help="Override output file format (e.g., 'csv', 'jsonl', 'parquet').")] = None,
+     zipfile: Annotated[bool, typer.Option(help="Treat input file as a ZIP archive.")] = False
+ ):
+     """Convert one file to another format.
+
+     Supports conversion between XML, CSV, JSON, JSONL, BSON, Parquet, ORC, and AVRO formats.
+     """
+     if verbose:
+         enable_verbose()
+     options = {
+         'delimiter': delimiter,
+         'compression': compression,
+         'flatten': flatten_data,
+         'encoding': encoding,
+         'prefix_strip': prefix_strip,
+         'start_line': start_line,
+         'skip_end_rows': skip_end_rows,
+         'start_page': start_page,
+         'tagname': tagname,
+         'fields': fields,
+         'format_in': format_in,
+         'format_out': format_out,
+         'zipfile': zipfile
+     }
+     acmd = Converter()
+     acmd.convert(input_file, output, options)
+
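The `options` dict above is handed straight to `Converter.convert`, so the CLI is a thin wrapper over a programmatic API. A minimal sketch of driving the same conversion from Python, assuming undatum is installed; the file names are hypothetical:

    from undatum.cmds.converter import Converter

    # Mirror the CLI defaults shown above; only format_out deviates here.
    options = {
        'delimiter': ',', 'compression': 'brotli', 'flatten': False,
        'encoding': 'utf8', 'prefix_strip': True, 'start_line': 0,
        'skip_end_rows': 0, 'start_page': 0, 'tagname': None, 'fields': None,
        'format_in': None, 'format_out': 'parquet', 'zipfile': False,
    }
    Converter().convert('data.csv', 'data.parquet', options)
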
+ @app.command()
+ def convertold(
+     input_file: Annotated[str, typer.Argument(help="Path to input file to convert.")],
+     output: Annotated[str, typer.Argument(help="Path to output file.")],
+     delimiter: Annotated[str, typer.Option(help="CSV delimiter character.")] = ',',
+     compression: Annotated[str, typer.Option(help="Compression type (e.g., 'brotli', 'gzip', 'xz').")] = 'brotli',
+     encoding: Annotated[str, typer.Option(help="File encoding (e.g., 'utf8', 'latin1').")] = 'utf8',
+     verbose: Annotated[bool, typer.Option(help="Enable verbose logging output.")] = False,
+     flatten_data: Annotated[bool, typer.Option(help="Flatten nested data structures into flat records.")] = False,
+     prefix_strip: Annotated[bool, typer.Option(help="Strip XML namespace prefixes from element names.")] = True,
+     fields: Annotated[str, typer.Option(help="Comma-separated list of field names to include in output.")] = None,
+     start_line: Annotated[int, typer.Option(help="Line number (0-based) to start reading from.")] = 0,
+     skip_end_rows: Annotated[int, typer.Option(help="Number of rows to skip at the end of the file.")] = 0,
+     start_page: Annotated[int, typer.Option(help="Page number (0-based) to start from for Excel files.")] = 0,
+     tagname: Annotated[str, typer.Option(help="XML tag name that contains individual records.")] = None,
+     format_in: Annotated[str, typer.Option(help="Override input file format detection (e.g., 'csv', 'jsonl', 'xml').")] = None,
+     format_out: Annotated[str, typer.Option(help="Override output file format (e.g., 'csv', 'jsonl', 'parquet').")] = None,
+     zipfile: Annotated[bool, typer.Option(help="Treat input file as a ZIP archive.")] = False
+ ):
+     """Convert one file to another using legacy conversion method.
+
+     .. deprecated:: 1.0.15
+         This command uses the old conversion implementation. Use 'convert' instead.
+     """
+     if verbose:
+         enable_verbose()
+     options = {
+         'delimiter': delimiter,
+         'compression': compression,
+         'flatten': flatten_data,
+         'encoding': encoding,
+         'prefix_strip': prefix_strip,
+         'start_line': start_line,
+         'skip_end_rows': skip_end_rows,
+         'start_page': start_page,
+         'tagname': tagname,
+         'fields': fields,
+         'format_in': format_in,
+         'format_out': format_out,
+         'zipfile': zipfile
+     }
+     acmd = Converter()
+     acmd.convert_old(input_file, output, options)
+
+ @app.command()
+ def uniq(
+     input_file: Annotated[str, typer.Argument(help="Path to input file.")],
+     output: Annotated[str, typer.Option(help="Optional output file path. If not specified, prints to stdout.")] = None,
+     fields: Annotated[str, typer.Option(help="Comma-separated list of field names to extract unique values from.")] = None,
+     delimiter: Annotated[str, typer.Option(help="CSV delimiter character.")] = ',',
+     encoding: Annotated[str, typer.Option(help="File encoding (e.g., 'utf8', 'latin1').")] = None,
+     verbose: Annotated[bool, typer.Option(help="Enable verbose logging output.")] = False,
+     filetype: Annotated[str, typer.Option(help="Override file type detection (e.g., 'csv', 'jsonl').")] = None,
+     engine: Annotated[str, typer.Option(help="Processing engine: 'auto' (default), 'duckdb', or 'iterable'.")] = "auto"
+ ):
+     """Extract all unique values from specified field(s).
+
+     Returns unique values or unique combinations if multiple fields are specified.
+     """
+     if verbose:
+         enable_verbose()
+     options = {
+         'output': output,
+         'fields': fields,
+         'delimiter': delimiter,
+         'encoding': encoding,
+         'filetype': filetype,
+         'engine': engine
+     }
+     acmd = Selector()
+     acmd.uniq(input_file, options)
+
+
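Because each handler is a plain Typer command registered on `app`, the CLI can also be exercised in-process with Typer's test runner instead of a subprocess. A sketch, where `data.jsonl` and the `country` field are hypothetical:

    from typer.testing import CliRunner
    from undatum.core import app

    runner = CliRunner()
    # One field gives unique values; comma-separated fields give unique combinations.
    result = runner.invoke(app, ['uniq', 'data.jsonl', '--fields', 'country'])
    print(result.stdout)
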
+ @app.command()
+ def headers(
+     input_file: Annotated[str, typer.Argument(help="Path to input file.")],
+     output: Annotated[str, typer.Option(help="Optional output file path. If not specified, prints to stdout.")] = None,
+     fields: Annotated[str, typer.Option(help="Field filter (kept for API compatibility, not currently used).")] = None,  # pylint: disable=unused-argument
+     delimiter: Annotated[str, typer.Option(help="CSV delimiter character.")] = ',',
+     encoding: Annotated[str, typer.Option(help="File encoding (e.g., 'utf8', 'latin1').")] = None,
+     limit: Annotated[int, typer.Option(help="Maximum number of records to scan for field detection.")] = 10000,
+     verbose: Annotated[bool, typer.Option(help="Enable verbose logging output.")] = False,
+     format_in: Annotated[str, typer.Option(help="Override input file format detection (e.g., 'csv', 'jsonl', 'xml').")] = None,
+     format_out: Annotated[str, typer.Option(help="Override output format (e.g., 'csv', 'json').")] = None,
+     zipfile: Annotated[bool, typer.Option(help="Treat input file as a ZIP archive.")] = False,
+     filter_expr: Annotated[str, typer.Option(help="Filter expression (kept for API compatibility, not currently used).")] = None  # pylint: disable=unused-argument
+ ):
+     """Returns fieldnames of the file. Supports XML, CSV, JSON, BSON.
+
+     Scans the input file and returns all detected field/column names.
+     """
+     if verbose:
+         enable_verbose()
+     # fields and filter_expr kept for API compatibility but not currently used
+     options = {
+         'output': output,
+         'delimiter': delimiter,
+         'encoding': encoding,
+         'limit': limit,
+         'format_in': format_in,
+         'format_out': format_out,
+         'zipfile': zipfile
+     }
+     acmd = Selector()
+     acmd.headers(input_file, options)
+
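Field detection stops after `--limit` records, which matters for sparse JSONL where new keys can first appear late in the file. A hedged example that widens the scan on a hypothetical file:

    from typer.testing import CliRunner
    from undatum.core import app

    # Raise the scan window so late-appearing keys are still detected.
    result = CliRunner().invoke(app, ['headers', 'sparse.jsonl', '--limit', '100000'])
    print(result.stdout)
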
+ @app.command()
+ def stats(
+     input_file: Annotated[str, typer.Argument(help="Path to input file.")],
+     output: Annotated[str, typer.Option(help="Optional output file path. If not specified, prints to stdout.")] = None,
+     dictshare: Annotated[int, typer.Option(help="Dictionary share threshold (0-100) for type detection.")] = None,
+     format_in: Annotated[str, typer.Option(help="Override input file format detection (e.g., 'csv', 'jsonl').")] = None,
+     format_out: Annotated[str, typer.Option(help="Override output format (e.g., 'json', 'yaml').")] = None,
+     delimiter: Annotated[str, typer.Option(help="CSV delimiter character.")] = None,
+     verbose: Annotated[bool, typer.Option(help="Enable verbose logging output.")] = False,
+     zipfile: Annotated[bool, typer.Option(help="Treat input file as a ZIP archive.")] = False,
+     checkdates: Annotated[bool, typer.Option(help="Enable automatic date field detection.")] = True,
+     encoding: Annotated[str, typer.Option(help="File encoding (e.g., 'utf8', 'latin1').")] = None
+ ):
+     """Generate detailed statistics about a dataset.
+
+     Provides field types, uniqueness counts, min/max/average lengths,
+     and optional date field detection.
+     """
+     if verbose:
+         enable_verbose()
+     options = {
+         'output': output,
+         'dictshare': dictshare,
+         'zipfile': zipfile,
+         'format_in': format_in,
+         'format_out': format_out,
+         'delimiter': delimiter,
+         'checkdates': checkdates,
+         'encoding': encoding,
+         'verbose': verbose
+     }
+     acmd = StatProcessor(nodates=not checkdates)
+     acmd.stats(input_file, options)
+
+
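Note that `checkdates` is consumed twice: it is forwarded inside `options` and also inverted into the `nodates` constructor flag, so a single CLI switch controls both. A sketch of the equivalent programmatic call, with the file name hypothetical:

    from undatum.cmds.statistics import StatProcessor

    options = {
        'output': None, 'dictshare': None, 'zipfile': False,
        'format_in': None, 'format_out': None, 'delimiter': None,
        'checkdates': False, 'encoding': None, 'verbose': False,
    }
    # nodates mirrors the CLI wiring: nodates = not checkdates
    StatProcessor(nodates=True).stats('data.csv', options)
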
+ @app.command()
+ def flatten(
+     input_file: Annotated[str, typer.Argument(help="Path to input file.")],
+     output: Annotated[str, typer.Option(help="Optional output file path. If not specified, prints to stdout.")] = None,
+     delimiter: Annotated[str, typer.Option(help="CSV delimiter character.")] = ',',
+     encoding: Annotated[str, typer.Option(help="File encoding (e.g., 'utf8', 'latin1').")] = 'utf8',
+     format_in: Annotated[str, typer.Option(help="Override input file format detection (e.g., 'jsonl', 'xml').")] = None,
+     filter_expr: Annotated[str, typer.Option(help="Filter expression to apply before flattening.")] = None,
+     verbose: Annotated[bool, typer.Option(help="Enable verbose logging output.")] = False
+ ):
+     """Flatten nested data records into one value per row.
+
+     Converts nested structures (arrays, objects) into flat records.
+     """
+     if verbose:
+         enable_verbose()
+     options = {
+         'delimiter': delimiter,
+         'output': output,
+         'encoding': encoding,
+         'format_in': format_in,
+         'filter': filter_expr
+     }
+     acmd = TextProcessor()
+     acmd.flatten(input_file, options)
+
+
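Flattening emits one value per row, which is a common preprocessing step before loading nested JSONL into flat stores. A hedged invocation; the file name is hypothetical:

    from typer.testing import CliRunner
    from undatum.core import app

    result = CliRunner().invoke(app, ['flatten', 'nested.jsonl', '--format-in', 'jsonl'])
    print(result.stdout)
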
+ @app.command()
+ def frequency(
+     input_file: Annotated[str, typer.Argument(help="Path to input file.")],
+     output: Annotated[str, typer.Option(help="Optional output file path. If not specified, prints to stdout.")] = None,
+     fields: Annotated[str, typer.Option(help="Comma-separated list of field names to calculate frequency for.")] = None,
+     delimiter: Annotated[str, typer.Option(help="CSV delimiter character.")] = ",",
+     encoding: Annotated[str, typer.Option(help="File encoding (e.g., 'utf8', 'latin1').")] = None,
+     verbose: Annotated[bool, typer.Option(help="Enable verbose logging output.")] = False,
+     filetype: Annotated[str, typer.Option(help="Override file type detection (e.g., 'csv', 'jsonl').")] = None,
+     engine: Annotated[str, typer.Option(help="Processing engine: 'auto' (default), 'duckdb', or 'iterable'.")] = "auto"
+ ):
+     """Calculate frequency distribution for specified fields.
+
+     Counts occurrences of each unique value in the specified field(s).
+     """
+     if verbose:
+         enable_verbose()
+     options = {
+         'delimiter': delimiter,
+         'fields': fields,
+         'output': output,
+         'encoding': encoding,
+         'filetype': filetype,
+         'engine': engine
+     }
+     acmd = Selector()
+     acmd.frequency(input_file, options)
+
+
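The `--engine` switch lets large frequency counts run through DuckDB rather than the row-by-row iterable path. A sketch with hypothetical file and field names:

    from typer.testing import CliRunner
    from undatum.core import app

    result = CliRunner().invoke(
        app, ['frequency', 'data.csv', '--fields', 'region', '--engine', 'duckdb'])
    print(result.stdout)
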
+ @app.command()
+ def select(
+     input_file: Annotated[str, typer.Argument(help="Path to input file.")],
+     output: Annotated[str, typer.Option(help="Optional output file path. If not specified, prints to stdout.")] = None,
+     fields: Annotated[str, typer.Option(help="Comma-separated list of field names to select and reorder.")] = None,
+     delimiter: Annotated[str, typer.Option(help="CSV delimiter character.")] = ",",
+     encoding: Annotated[str, typer.Option(help="File encoding (e.g., 'utf8', 'latin1').")] = None,
+     verbose: Annotated[bool, typer.Option(help="Enable verbose logging output.")] = False,
+     format_in: Annotated[str, typer.Option(help="Override input file format detection (e.g., 'csv', 'jsonl').")] = None,
+     format_out: Annotated[str, typer.Option(help="Override output format (e.g., 'csv', 'jsonl').")] = None,
+     zipfile: Annotated[bool, typer.Option(help="Treat input file as a ZIP archive.")] = False,
+     filter_expr: Annotated[str, typer.Option(help="Filter expression to apply (e.g., \"`status` == 'active'\").")] = None
+ ):
+     """Select or reorder columns from file.
+
+     Supports CSV, JSONL, and BSON formats. Can also filter records.
+     """
+     if verbose:
+         enable_verbose()
+     options = {
+         'delimiter': delimiter,
+         'fields': fields,
+         'output': output,
+         'encoding': encoding,
+         'format_in': format_in,
+         'format_out': format_out,
+         'zipfile': zipfile,
+         'filter': filter_expr
+     }
+     acmd = Selector()
+     acmd.select(input_file, options)
+
+
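Column selection composes with the filter expression from the help text above, so one pass can both narrow columns and drop rows. A sketch with hypothetical field names:

    from typer.testing import CliRunner
    from undatum.core import app

    result = CliRunner().invoke(app, [
        'select', 'data.jsonl',
        '--fields', 'id,status',
        '--filter-expr', "`status` == 'active'",
    ])
    print(result.stdout)
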
+ @app.command()
+ def split(
+     input_file: Annotated[str, typer.Argument(help="Path to input file.")],
+     output: Annotated[str, typer.Option(help="Optional output file path prefix. If not specified, uses input filename.")] = None,
+     fields: Annotated[str, typer.Option(help="Comma-separated field names to split by (creates one file per unique value combination).")] = None,
+     delimiter: Annotated[str, typer.Option(help="CSV delimiter character.")] = ',',
+     encoding: Annotated[str, typer.Option(help="File encoding (e.g., 'utf8', 'latin1').")] = "utf8",
+     verbose: Annotated[bool, typer.Option(help="Enable verbose logging output.")] = False,
+     format_in: Annotated[str, typer.Option(help="Override input file format detection (e.g., 'csv', 'jsonl').")] = None,
+     zipfile: Annotated[bool, typer.Option(help="Treat input file as a ZIP archive.")] = False,
+     gzipfile: Annotated[str, typer.Option(help="Gzip compression option for output files.")] = None,
+     chunksize: Annotated[int, typer.Option(help="Number of records per chunk when splitting by size (default: 10000).")] = 10000,
+     filter_expr: Annotated[str, typer.Option(help="Filter expression to apply before splitting.")] = None,
+     dirname: Annotated[str, typer.Option(help="Directory path to write output files to.")] = None
+ ):
+     """Split a data file into multiple chunks.
+
+     Can split by chunk size or by unique field values.
+     """
+     if verbose:
+         enable_verbose()
+     options = {
+         'delimiter': delimiter,
+         'fields': fields,
+         'output': output,
+         'encoding': encoding,
+         'format_in': format_in,
+         'zipfile': zipfile,
+         'gzipfile': gzipfile,
+         'chunksize': chunksize,
+         'filter': filter_expr,
+         'dirname': dirname
+     }
+     acmd = Selector()
+     acmd.split(input_file, options)
+
+
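Without `--fields` the split is size-based via `--chunksize`; with fields, one file is written per unique value combination. A hedged size-based example with hypothetical paths:

    from typer.testing import CliRunner
    from undatum.core import app

    result = CliRunner().invoke(app, [
        'split', 'big.jsonl', '--chunksize', '50000', '--dirname', 'chunks'])
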
+ @app.command()
+ def validate(
+     input_file: Annotated[str, typer.Argument(help="Path to input file.")],
+     output: Annotated[str, typer.Option(help="Optional output file path. If not specified, prints to stdout.")] = None,
+     fields: Annotated[str, typer.Option(help="Comma-separated list of field names to validate.")] = None,
+     delimiter: Annotated[str, typer.Option(help="CSV delimiter character.")] = ',',
+     encoding: Annotated[str, typer.Option(help="File encoding (e.g., 'utf8', 'latin1').")] = 'utf8',
+     verbose: Annotated[bool, typer.Option(help="Enable verbose logging output.")] = False,
+     format_in: Annotated[str, typer.Option(help="Override input file format detection (e.g., 'csv', 'jsonl').")] = None,
+     zipfile: Annotated[bool, typer.Option(help="Treat input file as a ZIP archive.")] = False,
+     rule: Annotated[str, typer.Option(help="Validation rule name (e.g., 'common.email', 'common.url', 'ru.org.inn', 'ru.org.ogrn').")] = None,
+     filter_expr: Annotated[str, typer.Option(help="Filter expression to apply before validation.")] = None,
+     mode: Annotated[str, typer.Option(help="Output mode: 'invalid' (default, show invalid records), 'stats' (show statistics), or 'valid' (show valid records).")] = "invalid"
+ ):
+     """Validate fields against built-in or custom validation rules.
+
+     Available rules: common.email, common.url, ru.org.inn, ru.org.ogrn
+     """
+     if verbose:
+         enable_verbose()
+     options = {
+         'delimiter': delimiter,
+         'fields': fields,
+         'output': output,
+         'encoding': encoding,
+         'format_in': format_in,
+         'zipfile': zipfile,
+         'filter': filter_expr,
+         'rule': rule,
+         'mode': mode
+     }
+     acmd = Validator()
+     acmd.validate(input_file, options)
+
+
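Pointing a rule at a field and switching `--mode` to 'stats' yields pass/fail counts rather than the failing records themselves. A sketch, with the file and field hypothetical:

    from typer.testing import CliRunner
    from undatum.core import app

    result = CliRunner().invoke(app, [
        'validate', 'contacts.csv',
        '--fields', 'email', '--rule', 'common.email', '--mode', 'stats',
    ])
    print(result.stdout)
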
+ @app.command()
+ def apply(
+     input_file: Annotated[str, typer.Argument(help="Path to input file.")],
+     output: Annotated[str, typer.Option(help="Optional output file path. If not specified, prints to stdout.")] = None,
+     fields: Annotated[str, typer.Option(help="Comma-separated list of field names (kept for compatibility).")] = None,
+     delimiter: Annotated[str, typer.Option(help="CSV delimiter character.")] = ",",
+     encoding: Annotated[str, typer.Option(help="File encoding (e.g., 'utf8', 'latin1').")] = 'utf8',
+     verbose: Annotated[bool, typer.Option(help="Enable verbose logging output.")] = False,
+     format_in: Annotated[str, typer.Option(help="Override input file format detection (e.g., 'csv', 'jsonl').")] = None,
+     zipfile: Annotated[bool, typer.Option(help="Treat input file as a ZIP archive.")] = False,
+     script: Annotated[str, typer.Option(help="Path to Python script file containing transformation function.")] = None,
+     filter_expr: Annotated[str, typer.Option(help="Filter expression to apply before transformation.")] = None
+ ):
+     """Apply a transformation script to each record in the file.
+
+     Executes a Python script that transforms each record.
+     """
+     if verbose:
+         enable_verbose()
+     options = {
+         'delimiter': delimiter,
+         'fields': fields,
+         'output': output,
+         'encoding': encoding,
+         'format_in': format_in,
+         'zipfile': zipfile,
+         'filter': filter_expr,
+         'script': script
+     }
+     acmd = Transformer()
+     acmd.script(input_file, options)
+
+
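The contract the `--script` file must satisfy lives in `Transformer.script`, which is not part of this diff, so only the invocation shape can be shown; `fix_records.py` is assumed to be such a script:

    from typer.testing import CliRunner
    from undatum.core import app

    # Hypothetical: fix_records.py is a transformation script accepted by Transformer.
    result = CliRunner().invoke(app, [
        'apply', 'data.jsonl', '--script', 'fix_records.py', '--output', 'fixed.jsonl'])
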
+ @app.command()
+ def scheme(
+     input_file: Annotated[str, typer.Argument(help="Path to input file.")],
+     output: Annotated[str, typer.Option(help="Optional output file path. If not specified, prints to stdout.")] = None,
+     delimiter: Annotated[str, typer.Option(help="CSV delimiter character.")] = ',',
+     encoding: Annotated[str, typer.Option(help="File encoding (e.g., 'utf8', 'latin1').")] = 'utf8',
+     verbose: Annotated[bool, typer.Option(help="Enable verbose logging output.")] = False,
+     format_in: Annotated[str, typer.Option(help="Override input file format detection (e.g., 'csv', 'jsonl').")] = None,
+     zipfile: Annotated[bool, typer.Option(help="Treat input file as a ZIP archive.")] = False,
+     stype: Annotated[str, typer.Option(help="Schema type: 'cerberus' (default) or other schema formats.")] = 'cerberus'
+ ):
+     """Generate data schema from file.
+
+     Creates a schema definition based on the structure of the input data.
+     """
+     if verbose:
+         enable_verbose()
+     options = {
+         'delimiter': delimiter,
+         'output': output,
+         'encoding': encoding,
+         'format_in': format_in,
+         'zipfile': zipfile,
+         'stype': stype
+     }
+     acmd = Schemer()
+     acmd.generate_scheme(input_file, options)
+
+
+ @app.command()
+ def analyze(
+     input_file: Annotated[str, typer.Argument(help="Path to input file to analyze.")],
+     verbose: Annotated[bool, typer.Option(help="Enable verbose logging output.")] = False,
+     engine: Annotated[str, typer.Option(help="Processing engine: 'auto' (default), 'duckdb', or 'iterable'.")] = "auto",
+     use_pandas: Annotated[bool, typer.Option(help="Use pandas for data processing (may use more memory).")] = False,
+     outtype: Annotated[str, typer.Option(help="Output format: 'text' (default), 'json', or 'yaml'.")] = "text",
+     output: Annotated[str, typer.Option(help="Optional output file path. If not specified, prints to stdout.")] = None,
+     autodoc: Annotated[bool, typer.Option(help="Enable AI-powered automatic field and dataset documentation.")] = False,
+     lang: Annotated[str, typer.Option(help="Language for AI-generated documentation (default: 'English').")] = "English",
+     ai_provider: Annotated[str, typer.Option(help="AI provider to use: 'openai', 'openrouter', 'ollama', 'lmstudio', or 'perplexity'.")] = None,
+     ai_model: Annotated[str, typer.Option(help="Model name to use (provider-specific, e.g., 'gpt-4o-mini' for OpenAI).")] = None,
+     ai_base_url: Annotated[str, typer.Option(help="Base URL for AI API (optional, uses provider-specific defaults if not specified).")] = None
+ ):
+     """Analyzes given data file and returns human readable insights.
+
+     Provides detailed analysis of file structure, encoding, fields, data types,
+     and optionally AI-generated field descriptions and dataset summaries.
+     """
+     if verbose:
+         enable_verbose()
+
+     # Build AI configuration
+     ai_config = {}
+     if ai_model:
+         ai_config['model'] = ai_model
+     if ai_base_url:
+         ai_config['base_url'] = ai_base_url
+
+     options = {
+         'engine': engine,
+         'use_pandas': use_pandas,
+         'outtype': outtype,
+         'output': output,
+         'autodoc': autodoc,
+         'lang': lang,
+         'ai_provider': ai_provider,
+         'ai_config': ai_config if ai_config else None
+     }
+     acmd = Analyzer()
+     acmd.analyze(input_file, options)
+
+
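The AI switches only assemble configuration here; the provider's default endpoint applies when `--ai-base-url` is omitted, and credentials are assumed to be configured out of band. A hedged autodoc run using values from the help text, with hypothetical file paths:

    from typer.testing import CliRunner
    from undatum.core import app

    result = CliRunner().invoke(app, [
        'analyze', 'data.csv', '--autodoc',
        '--ai-provider', 'openai', '--ai-model', 'gpt-4o-mini',
        '--outtype', 'json', '--output', 'report.json',
    ])
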
+ @app.command()
+ def schema(
+     input_file: Annotated[str, typer.Argument(help="Path to input file.")],
+     verbose: Annotated[bool, typer.Option(help="Enable verbose logging output.")] = False,
+     outtype: Annotated[str, typer.Option(help="Output format: 'text' (default), 'json', or 'yaml'.")] = "text",
+     output: Annotated[str, typer.Option(help="Optional output file path. If not specified, prints to stdout.")] = None,
+     autodoc: Annotated[bool, typer.Option(help="Enable AI-powered automatic field documentation.")] = False,
+     lang: Annotated[str, typer.Option(help="Language for AI-generated documentation (default: 'English').")] = "English"
+ ):
+     """Extract schema from a data file.
+
+     Generates a schema definition describing the structure and types of fields in the data.
+     """
+     if verbose:
+         enable_verbose()
+     options = {
+         'outtype': outtype,
+         'output': output,
+         'autodoc': autodoc,
+         'lang': lang
+     }
+     acmd = Schemer()
+     acmd.extract_schema(input_file, options)
+
+
+ @app.command()
+ def schema_bulk(
+     input_file: Annotated[str, typer.Argument(help="Glob pattern or directory path for input files (e.g., 'data/*.csv' or 'data/').")],
+     verbose: Annotated[bool, typer.Option(help="Enable verbose logging output.")] = False,
+     outtype: Annotated[str, typer.Option(help="Output format: 'text' (default), 'json', or 'yaml'.")] = "text",
+     output: Annotated[str, typer.Option(help="Output directory path for schema files.")] = None,
+     mode: Annotated[str, typer.Option(help="Extraction mode: 'distinct' (extract unique schemas, default) or 'perfile' (one schema per file).")] = "distinct",
+     autodoc: Annotated[bool, typer.Option(help="Enable AI-powered automatic field documentation.")] = False,
+     lang: Annotated[str, typer.Option(help="Language for AI-generated documentation (default: 'English').")] = "English"
+ ):
+     """Extract schemas from multiple files.
+
+     Processes multiple files and extracts their schemas, either as distinct unique schemas
+     or one schema per file.
+     """
+     if verbose:
+         enable_verbose()
+     options = {
+         'outtype': outtype,
+         'output': output,
+         'mode': mode,
+         'autodoc': autodoc,
+         'lang': lang
+     }
+     acmd = Schemer()
+     acmd.extract_schema_bulk(input_file, options)
+
+
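In 'distinct' mode, files sharing a structure collapse into one schema, which is the cheap way to inventory a directory of similar exports. A sketch with a hypothetical glob and output directory (Typer exposes `schema_bulk` as `schema-bulk`):

    from typer.testing import CliRunner
    from undatum.core import app

    result = CliRunner().invoke(app, [
        'schema-bulk', 'exports/*.csv', '--outtype', 'json', '--output', 'schemas/'])
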
+ @app.command()
+ def ingest(
+     input_file: Annotated[str, typer.Argument(help="Path to input file or glob pattern (e.g., 'data/*.jsonl').")],
+     uri: Annotated[str, typer.Argument(help="Database connection URI (e.g., 'mongodb://localhost:27017' or 'https://elasticsearch:9200').")],
+     db: Annotated[str, typer.Argument(help="Database name.")],
+     table: Annotated[str, typer.Argument(help="Collection or table name.")],
+     verbose: Annotated[bool, typer.Option(help="Enable verbose logging output.")] = False,
+     batch: Annotated[int, typer.Option(help="Batch size for ingestion (number of records per batch, default: 1000).")] = DEFAULT_BATCH_SIZE,
+     dbtype: Annotated[str, typer.Option(help="Database type: 'mongodb' (default) or 'elasticsearch'.")] = "mongodb",
+     totals: Annotated[bool, typer.Option(help="Show total record counts during ingestion.")] = False,
+     drop: Annotated[bool, typer.Option(help="Drop existing collection/table before ingestion.")] = False,
+     timeout: Annotated[int, typer.Option(help="Connection timeout in seconds (default: -30).")] = -30,
+     skip: Annotated[int, typer.Option(help="Number of records to skip at the beginning.")] = None,
+     api_key: Annotated[str, typer.Option(help="API key for database authentication.")] = None,
+     doc_id: Annotated[str, typer.Option(help="Field name to use as document ID (for MongoDB).")] = None
+ ):
+     """Ingest data into a database.
+
+     Supports MongoDB and Elasticsearch databases. Reads data from files and inserts
+     them into the specified database collection or table.
+     """
+     if verbose:
+         enable_verbose()
+     options = {
+         'dbtype': dbtype,
+         'skip': skip,
+         'drop': drop,
+         'totals': totals,
+         'doc_id': doc_id,
+         'api_key': api_key,
+         'timeout': timeout
+     }
+     acmd = Ingester(batch)
+     files = glob.glob(input_file.strip("'"))
+     acmd.ingest(files, uri, db, table, options)
+
+
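The handler expands the glob itself via `glob.glob(input_file.strip("'"))`, so on a shell the pattern must be quoted to arrive unexpanded; in-process the quoting concern disappears. A sketch against a hypothetical local MongoDB:

    from typer.testing import CliRunner
    from undatum.core import app

    result = CliRunner().invoke(app, [
        'ingest', 'data/*.jsonl', 'mongodb://localhost:27017', 'mydb', 'records',
        '--batch', '5000', '--drop',
    ])
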
+ @app.command()
+ def query(
+     input_file: Annotated[str, typer.Argument(help="Path to input file.")],
+     output: Annotated[str, typer.Option(help="Optional output file path. If not specified, prints to stdout.")] = None,
+     fields: Annotated[str, typer.Option(help="Comma-separated list of field names (kept for compatibility).")] = None,
+     delimiter: Annotated[str, typer.Option(help="CSV delimiter character.")] = ',',
+     encoding: Annotated[str, typer.Option(help="File encoding (e.g., 'utf8', 'latin1').")] = None,
+     verbose: Annotated[bool, typer.Option(help="Enable verbose logging output.")] = False,
+     format_in: Annotated[str, typer.Option(help="Override input file format detection (e.g., 'csv', 'jsonl').")] = None,
+     format_out: Annotated[str, typer.Option(help="Override output format (e.g., 'csv', 'jsonl').")] = None,
+     zipfile: Annotated[bool, typer.Option(help="Treat input file as a ZIP archive.")] = False,
+     query_expr: Annotated[str, typer.Option(help="MistQL query expression to execute on the data.")] = None
+ ):
+     """Query data using MistQL query language.
+
+     .. note:: Experimental feature. Requires 'mistql' package: pip install mistql
+
+     Executes MistQL queries on the input data and returns the results.
+     """
+     if verbose:
+         enable_verbose()
+     options = {
+         'delimiter': delimiter,
+         'fields': fields,
+         'output': output,
+         'encoding': encoding,
+         'format_in': format_in,
+         'format_out': format_out,
+         'zipfile': zipfile,
+         'query': query_expr
+     }
+     acmd = DataQuery()
+     acmd.query(input_file, options)
+
+
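Since MistQL is an optional dependency, the command fails without `pip install mistql`. A hedged sketch; the file is hypothetical and the expression (`count @`, counting all records) follows MistQL's documented syntax rather than anything shown in this diff:

    from typer.testing import CliRunner
    from undatum.core import app

    result = CliRunner().invoke(app, [
        'query', 'data.jsonl', '--query-expr', 'count @'])
    print(result.stdout)
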
+ if __name__ == '__main__':
+     app()
@@ -0,0 +1,6 @@
+ # -*- coding: utf8 -*-
+ """File format handlers and converters.
+
+ This module provides format-specific handlers for various file types
+ including DOCX, XLSX, and other document formats.
+ """