etlplus 0.3.23__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,686 @@
1
+ """
2
+ :mod:`etlplus.cli.handlers` module.
3
+
4
+ Command handler functions for the ``etlplus`` command-line interface (CLI).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import argparse
10
+ import csv
11
+ import io
12
+ import json
13
+ import os
14
+ import sys
15
+ from pathlib import Path
16
+ from typing import Any
17
+ from typing import Literal
18
+ from typing import cast
19
+
20
+ from ..config import PipelineConfig
21
+ from ..config import load_pipeline_config
22
+ from ..enums import FileFormat
23
+ from ..extract import extract
24
+ from ..file import File
25
+ from ..load import load
26
+ from ..run import run
27
+ from ..transform import transform
28
+ from ..types import JSONData
29
+ from ..utils import json_type
30
+ from ..utils import print_json
31
+ from ..validate import validate
32
+
33
+ # SECTION: EXPORTS ========================================================== #
34
+
35
+
36
# Public API of this module: one handler per CLI sub-command. Every handler
# listed here is defined further down in this file and takes the parsed
# ``argparse.Namespace`` as its only argument.
__all__ = [
    # Functions
    'cmd_extract',
    'cmd_list',
    'cmd_load',
    'cmd_pipeline',
    'cmd_run',
    'cmd_transform',
    'cmd_validate',
]
46
+
47
+
48
+ # SECTION: INTERNAL CONSTANTS =============================================== #
49
+
50
+
51
# Standard output/error format behavior states
# ``_emit_behavioral_notice`` raises ``ValueError`` when the effective
# behavior is one of the error states.
_FORMAT_ERROR_STATES = {'error', 'fail', 'strict'}
# ``_emit_behavioral_notice`` returns without printing when the effective
# behavior is one of the silent states.
_FORMAT_SILENT_STATES = {'ignore', 'silent'}
54
+
55
+
56
+ # SECTION: CONSTANTS ======================================================== #
57
+
58
+
59
# Name of the environment variable that ``_format_behavior`` reads when strict
# mode is off; an unset or empty value falls back to ``'warn'``.
FORMAT_ENV_KEY = 'ETLPLUS_FORMAT_BEHAVIOR'
60
+
61
+
62
+ # SECTION: INTERNAL FUNCTIONS =============================================== #
63
+
64
+
65
def _emit_behavioral_notice(
    message: str,
    behavior: str,
    *,
    quiet: bool,
) -> None:
    """
    Raise, swallow, or print a format-behavior notice.

    Parameters
    ----------
    message : str
        Text describing the ignored ``--format`` flag.
    behavior : str
        Effective format-behavior mode.
    quiet : bool
        When ``True`` the warning is not printed.

    Raises
    ------
    ValueError
        If ``behavior`` is a member of ``_FORMAT_ERROR_STATES``.
    """
    # Error states are checked first, so ``quiet`` never hides a failure.
    if behavior in _FORMAT_ERROR_STATES:
        raise ValueError(message)

    suppressed = behavior in _FORMAT_SILENT_STATES or quiet
    if not suppressed:
        print(f'Warning: {message}', file=sys.stderr)
93
+
94
+
95
+ def _emit_json(
96
+ data: Any,
97
+ *,
98
+ pretty: bool,
99
+ ) -> None:
100
+ """
101
+ Emit JSON to stdout honoring the pretty/compact preference.
102
+
103
+ Parameters
104
+ ----------
105
+ data : Any
106
+ Arbitrary JSON-serializable payload.
107
+ pretty : bool
108
+ When ``True`` pretty-print via :func:`print_json`; otherwise emit a
109
+ compact JSON string.
110
+ """
111
+ if pretty:
112
+ print_json(data)
113
+ return
114
+
115
+ dumped = json.dumps(
116
+ data,
117
+ ensure_ascii=False,
118
+ separators=(',', ':'),
119
+ )
120
+ print(dumped)
121
+
122
+
123
+ def _format_behavior(
124
+ strict: bool,
125
+ ) -> str:
126
+ """
127
+ Return the effective format-behavior mode.
128
+
129
+ Parameters
130
+ ----------
131
+ strict : bool
132
+ Whether to enforce strict format behavior.
133
+
134
+ Returns
135
+ -------
136
+ str
137
+ The effective format-behavior mode.
138
+ """
139
+ if strict:
140
+ return 'error'
141
+ env_value = os.getenv(FORMAT_ENV_KEY, 'warn')
142
+ return (env_value or 'warn').strip().lower()
143
+
144
+
145
+ def _handle_format_guard(
146
+ *,
147
+ io_context: Literal['source', 'target'],
148
+ resource_type: str,
149
+ format_explicit: bool,
150
+ strict: bool,
151
+ quiet: bool,
152
+ ) -> None:
153
+ """
154
+ Warn or raise when --format is used alongside file resources.
155
+
156
+ Parameters
157
+ ----------
158
+ io_context : Literal['source', 'target']
159
+ Whether this is a source or target resource.
160
+ resource_type : str
161
+ The type of resource being processed.
162
+ format_explicit : bool
163
+ Whether the --format option was explicitly provided.
164
+ strict : bool
165
+ Whether to enforce strict format behavior.
166
+ quiet : bool
167
+ Whether to suppress warnings.
168
+ """
169
+ if resource_type != 'file' or not format_explicit:
170
+ return
171
+ message = (
172
+ f'--format is ignored for file {io_context}s; '
173
+ 'inferred from filename extension.'
174
+ )
175
+ behavior = _format_behavior(strict)
176
+ _emit_behavioral_notice(message, behavior, quiet=quiet)
177
+
178
+
179
+ def _infer_payload_format(
180
+ text: str,
181
+ ) -> str:
182
+ """
183
+ Infer JSON vs CSV from payload text.
184
+
185
+ Parameters
186
+ ----------
187
+ text : str
188
+ Incoming payload as plain text.
189
+
190
+ Returns
191
+ -------
192
+ str
193
+ ``'json'`` when the text starts with ``{``/``[``, else ``'csv'``.
194
+ """
195
+ stripped = text.lstrip()
196
+ if stripped.startswith('{') or stripped.startswith('['):
197
+ return 'json'
198
+ return 'csv'
199
+
200
+
201
+ def _list_sections(
202
+ cfg: PipelineConfig,
203
+ args: argparse.Namespace,
204
+ ) -> dict[str, Any]:
205
+ """
206
+ Build sectioned metadata output for the list command.
207
+
208
+ Parameters
209
+ ----------
210
+ cfg : PipelineConfig
211
+ The loaded pipeline configuration.
212
+ args : argparse.Namespace
213
+ Parsed command-line arguments.
214
+
215
+ Returns
216
+ -------
217
+ dict[str, Any]
218
+ Metadata output for the list command.
219
+ """
220
+ sections: dict[str, Any] = {}
221
+ if getattr(args, 'pipelines', False):
222
+ sections['pipelines'] = [cfg.name]
223
+ if getattr(args, 'sources', False):
224
+ sections['sources'] = [src.name for src in cfg.sources]
225
+ if getattr(args, 'targets', False):
226
+ sections['targets'] = [tgt.name for tgt in cfg.targets]
227
+ if getattr(args, 'transforms', False):
228
+ sections['transforms'] = [
229
+ getattr(trf, 'name', None) for trf in cfg.transforms
230
+ ]
231
+ if not sections:
232
+ sections['jobs'] = _pipeline_summary(cfg)['jobs']
233
+ return sections
234
+
235
+
236
+ def _materialize_csv_payload(
237
+ source: object,
238
+ ) -> JSONData | str:
239
+ """
240
+ Return parsed CSV rows when ``source`` points at a CSV file.
241
+
242
+ Parameters
243
+ ----------
244
+ source : object
245
+ The source of data.
246
+
247
+ Returns
248
+ -------
249
+ JSONData | str
250
+ Parsed CSV rows or the original source if not a CSV file.
251
+ """
252
+ if not isinstance(source, str):
253
+ return cast(JSONData, source)
254
+ path = Path(source)
255
+ if path.suffix.lower() != '.csv' or not path.is_file():
256
+ return source
257
+ return _read_csv_rows(path)
258
+
259
+
260
+ def _parse_text_payload(
261
+ text: str,
262
+ fmt: str | None,
263
+ ) -> JSONData | str:
264
+ """
265
+ Parse JSON/CSV text into a Python payload.
266
+
267
+ Parameters
268
+ ----------
269
+ text : str
270
+ The input text payload.
271
+ fmt : str | None
272
+ Explicit format hint: 'json', 'csv', or None to infer.
273
+
274
+ Returns
275
+ -------
276
+ JSONData | str
277
+ The parsed payload as JSON data or raw text.
278
+ """
279
+
280
+ effective = (fmt or '').strip().lower() or _infer_payload_format(text)
281
+ if effective == 'json':
282
+ return cast(JSONData, json_type(text))
283
+ if effective == 'csv':
284
+ reader = csv.DictReader(io.StringIO(text))
285
+ return [dict(row) for row in reader]
286
+ return text
287
+
288
+
289
+ def _pipeline_summary(
290
+ cfg: PipelineConfig,
291
+ ) -> dict[str, Any]:
292
+ """
293
+ Return a human-friendly snapshot of a pipeline config.
294
+
295
+ Parameters
296
+ ----------
297
+ cfg : PipelineConfig
298
+ The loaded pipeline configuration.
299
+
300
+ Returns
301
+ -------
302
+ dict[str, Any]
303
+ A human-friendly snapshot of a pipeline config.
304
+ """
305
+ sources = [src.name for src in cfg.sources]
306
+ targets = [tgt.name for tgt in cfg.targets]
307
+ jobs = [job.name for job in cfg.jobs]
308
+ return {
309
+ 'name': cfg.name,
310
+ 'version': cfg.version,
311
+ 'sources': sources,
312
+ 'targets': targets,
313
+ 'jobs': jobs,
314
+ }
315
+
316
+
317
+ def _presentation_flags(
318
+ args: argparse.Namespace,
319
+ ) -> tuple[bool, bool]:
320
+ """Return presentation toggles from the parsed namespace.
321
+
322
+ Parameters
323
+ ----------
324
+ args : argparse.Namespace
325
+ Namespace produced by the CLI parser.
326
+
327
+ Returns
328
+ -------
329
+ tuple[bool, bool]
330
+ Pair of ``(pretty, quiet)`` flags with safe defaults.
331
+ """
332
+ return getattr(args, 'pretty', True), getattr(args, 'quiet', False)
333
+
334
+
335
+ def _read_csv_rows(
336
+ path: Path,
337
+ ) -> list[dict[str, str]]:
338
+ """
339
+ Read CSV rows into dictionaries.
340
+
341
+ Parameters
342
+ ----------
343
+ path : Path
344
+ Path to a CSV file.
345
+
346
+ Returns
347
+ -------
348
+ list[dict[str, str]]
349
+ List of dictionaries, each representing a row in the CSV file.
350
+ """
351
+ with path.open(newline='', encoding='utf-8') as handle:
352
+ reader = csv.DictReader(handle)
353
+ return [dict(row) for row in reader]
354
+
355
+
356
+ def _read_stdin_text() -> str:
357
+ """
358
+ Return every character from ``stdin`` as a single string.
359
+
360
+ Returns
361
+ -------
362
+ str
363
+ Entire ``stdin`` contents.
364
+ """
365
+ return sys.stdin.read()
366
+
367
+
368
+ def _write_json_output(
369
+ data: Any,
370
+ output_path: str | None,
371
+ *,
372
+ success_message: str,
373
+ ) -> bool:
374
+ """
375
+ Optionally persist JSON data to disk.
376
+
377
+ Parameters
378
+ ----------
379
+ data : Any
380
+ Data to write.
381
+ output_path : str | None
382
+ Path to write the output to. None to print to stdout.
383
+ success_message : str
384
+ Message to print upon successful write.
385
+
386
+ Returns
387
+ -------
388
+ bool
389
+ True if output was written to a file, False if printed to stdout.
390
+ """
391
+ if not output_path or output_path == '-':
392
+ return False
393
+ File(Path(output_path), FileFormat.JSON).write_json(data)
394
+ print(f'{success_message} {output_path}')
395
+ return True
396
+
397
+
398
+ # SECTION: FUNCTIONS ======================================================== #
399
+
400
+
401
def cmd_extract(
    args: argparse.Namespace,
) -> int:
    """
    Extract data from a source and write or print it as JSON.

    A source of ``'-'`` is read from ``stdin`` and parsed using the
    ``format`` attribute as a hint. Any other source goes through
    :func:`extract`; the ``format`` attribute is forwarded as
    ``file_format`` only for non-file source types.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed command-line arguments.

    Returns
    -------
    int
        Zero on success.
    """
    pretty, quiet = _presentation_flags(args)

    _handle_format_guard(
        io_context='source',
        resource_type=args.source_type,
        format_explicit=getattr(args, '_format_explicit', False),
        strict=getattr(args, 'strict_format', False),
        quiet=quiet,
    )

    if args.source == '-':
        extracted = _parse_text_payload(
            _read_stdin_text(),
            getattr(args, 'format', None),
        )
    elif args.source_type == 'file':
        extracted = extract(args.source_type, args.source)
    else:
        extracted = extract(
            args.source_type,
            args.source,
            file_format=getattr(args, 'format', None),
        )

    # Both input paths share the same tail: save to a file if one was
    # requested, otherwise print to stdout.
    saved = _write_json_output(
        extracted,
        getattr(args, 'output', None),
        success_message='Data extracted and saved to',
    )
    if not saved:
        _emit_json(extracted, pretty=pretty)

    return 0
455
+
456
+
457
def cmd_validate(
    args: argparse.Namespace,
) -> int:
    """
    Validate data from a source.

    A source of ``'-'`` is read from ``stdin`` and parsed using the
    ``input_format`` attribute as a hint; any other source is passed
    through :func:`_materialize_csv_payload` first.

    When no output path is given, or the output path is ``'-'``, the full
    validation result is emitted to stdout. With a real output path, only
    the ``'data'`` entry of the result is saved; if that entry is missing
    or ``None``, a message is printed to stderr and nothing is written.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed command-line arguments.

    Returns
    -------
    int
        Always zero; this handler does not turn a failed validation into a
        non-zero exit status.
    """
    pretty, _ = _presentation_flags(args)

    if args.source == '-':
        text = _read_stdin_text()
        payload = _parse_text_payload(
            text,
            getattr(args, 'input_format', None),
        )
    else:
        payload = _materialize_csv_payload(args.source)
    result = validate(payload, args.rules)

    output_path = getattr(args, 'output', None)

    # ``'-'`` means stdout. ``_write_json_output`` declines to write it, so
    # it must be handled here or the command would produce no output at all.
    if not output_path or output_path == '-':
        _emit_json(result, pretty=pretty)
        return 0

    validated_data = result.get('data')
    if validated_data is None:
        print(
            f'Validation failed, no data to save for {output_path}',
            file=sys.stderr,
        )
        return 0

    _write_json_output(
        validated_data,
        output_path,
        success_message='Validation result saved to',
    )
    return 0
503
+
504
+
505
def cmd_transform(
    args: argparse.Namespace,
) -> int:
    """
    Apply transform operations to data from a source.

    A source of ``'-'`` is read from ``stdin`` and parsed using the
    ``input_format`` attribute as a hint; any other source is passed
    through :func:`_materialize_csv_payload` first.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed command-line arguments.

    Returns
    -------
    int
        Zero on success.
    """
    pretty, _ = _presentation_flags(args)

    reads_stdin = args.source == '-'
    if reads_stdin:
        payload = _parse_text_payload(
            _read_stdin_text(),
            getattr(args, 'input_format', None),
        )
    else:
        payload = _materialize_csv_payload(args.source)

    transformed = transform(payload, args.operations)

    saved = _write_json_output(
        transformed,
        getattr(args, 'output', None),
        success_message='Data transformed and saved to',
    )
    if not saved:
        _emit_json(transformed, pretty=pretty)

    return 0
542
+
543
+
544
def cmd_load(
    args: argparse.Namespace,
) -> int:
    """
    Load data into a target.

    A source of ``'-'`` is read from ``stdin`` and parsed using the
    ``input_format`` attribute as a hint. A file target of ``'-'`` prints
    the (CSV-materialized) source as JSON instead of calling :func:`load`.
    The ``format`` attribute is forwarded as ``file_format`` only for
    non-file target types.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed command-line arguments.

    Returns
    -------
    int
        Zero on success.
    """
    pretty, quiet = _presentation_flags(args)

    _handle_format_guard(
        io_context='target',
        resource_type=args.target_type,
        format_explicit=getattr(args, '_format_explicit', False),
        strict=getattr(args, 'strict_format', False),
        quiet=quiet,
    )

    # Allow piping into load.
    source_value: (
        str | Path | os.PathLike[str] | dict[str, Any] | list[dict[str, Any]]
    )
    if args.source != '-':
        source_value = args.source
    else:
        parsed = _parse_text_payload(
            _read_stdin_text(),
            getattr(args, 'input_format', None),
        )
        source_value = cast(
            str | dict[str, Any] | list[dict[str, Any]],
            parsed,
        )

    targets_file = args.target_type == 'file'

    # Allow piping out of load for file targets.
    if targets_file and args.target == '-':
        _emit_json(_materialize_csv_payload(source_value), pretty=pretty)
        return 0

    if targets_file:
        outcome = load(source_value, args.target_type, args.target)
    else:
        outcome = load(
            source_value,
            args.target_type,
            args.target,
            file_format=getattr(args, 'format', None),
        )

    saved = _write_json_output(
        outcome,
        getattr(args, 'output', None),
        success_message='Data loaded and saved to',
    )
    if not saved:
        _emit_json(outcome, pretty=pretty)

    return 0
610
+
611
+
612
def cmd_pipeline(
    args: argparse.Namespace,
) -> int:
    """
    Inspect or run a pipeline YAML configuration.

    The configuration is always loaded first. A truthy ``run`` attribute
    takes precedence and executes that job; otherwise a truthy ``list``
    attribute prints the job names; otherwise the full summary is printed.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed command-line arguments.

    Returns
    -------
    int
        Zero on success.
    """
    cfg = load_pipeline_config(args.config, substitute=True)

    job_to_run = getattr(args, 'run', None)
    if job_to_run:
        outcome = run(job=job_to_run, config_path=args.config)
        print_json({'status': 'ok', 'result': outcome})
        return 0

    summary = _pipeline_summary(cfg)
    if getattr(args, 'list', False):
        print_json({'jobs': summary['jobs']})
    else:
        print_json(summary)
    return 0
642
+
643
+
644
def cmd_list(args: argparse.Namespace) -> int:
    """
    Print the requested sections of a pipeline YAML configuration.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed command-line arguments.

    Returns
    -------
    int
        Zero on success.
    """
    config = load_pipeline_config(args.config, substitute=True)
    sections = _list_sections(config, args)
    print_json(sections)
    return 0
661
+
662
+
663
def cmd_run(args: argparse.Namespace) -> int:
    """
    Run one job from a pipeline YAML configuration, or summarize it.

    The job name comes from the ``job`` attribute, falling back to
    ``pipeline``. When neither is set, the pipeline summary is printed
    instead of running anything.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed command-line arguments.

    Returns
    -------
    int
        Zero on success.
    """
    # Loaded up front even when a job is run, so an unreadable config
    # fails before any job starts.
    cfg = load_pipeline_config(args.config, substitute=True)

    selected = getattr(args, 'job', None) or getattr(args, 'pipeline', None)
    if not selected:
        print_json(_pipeline_summary(cfg))
        return 0

    outcome = run(job=selected, config_path=args.config)
    print_json({'status': 'ok', 'result': outcome})
    return 0