etlplus 0.10.4__py3-none-any.whl → 0.12.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. etlplus/README.md +37 -0
  2. etlplus/api/README.md +20 -3
  3. etlplus/cli/README.md +40 -0
  4. etlplus/cli/commands.py +1 -1
  5. etlplus/cli/constants.py +1 -1
  6. etlplus/cli/handlers.py +1 -1
  7. etlplus/cli/io.py +2 -2
  8. etlplus/config/README.md +52 -0
  9. etlplus/config/pipeline.py +2 -2
  10. etlplus/database/README.md +48 -0
  11. etlplus/database/ddl.py +1 -1
  12. etlplus/database/engine.py +1 -1
  13. etlplus/database/schema.py +1 -1
  14. etlplus/enums.py +2 -270
  15. etlplus/extract.py +5 -7
  16. etlplus/file/README.md +105 -0
  17. etlplus/file/__init__.py +25 -0
  18. etlplus/file/avro.py +198 -0
  19. etlplus/file/core.py +287 -0
  20. etlplus/file/csv.py +91 -0
  21. etlplus/file/enums.py +238 -0
  22. etlplus/file/feather.py +144 -0
  23. etlplus/file/gz.py +123 -0
  24. etlplus/file/json.py +98 -0
  25. etlplus/file/ndjson.py +109 -0
  26. etlplus/file/orc.py +142 -0
  27. etlplus/file/parquet.py +146 -0
  28. etlplus/file/tsv.py +91 -0
  29. etlplus/file/txt.py +99 -0
  30. etlplus/file/xls.py +132 -0
  31. etlplus/file/xlsx.py +142 -0
  32. etlplus/file/xml.py +174 -0
  33. etlplus/file/yaml.py +136 -0
  34. etlplus/file/zip.py +175 -0
  35. etlplus/load.py +9 -12
  36. etlplus/run.py +6 -9
  37. etlplus/templates/README.md +46 -0
  38. etlplus/validation/README.md +50 -0
  39. {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/METADATA +58 -14
  40. {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/RECORD +44 -20
  41. etlplus/file.py +0 -652
  42. {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/WHEEL +0 -0
  43. {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/entry_points.txt +0 -0
  44. {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/licenses/LICENSE +0 -0
  45. {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/top_level.txt +0 -0
etlplus/file.py DELETED
@@ -1,652 +0,0 @@
1
- """
2
- :mod:`etlplus.file` module.
3
-
4
- Shared helpers for reading and writing structured and semi-structured data
5
- files.
6
- """
7
-
8
- from __future__ import annotations
9
-
10
- import csv
11
- import json
12
- import xml.etree.ElementTree as ET
13
- from dataclasses import dataclass
14
- from pathlib import Path
15
- from typing import Any
16
- from typing import cast
17
-
18
- from .enums import FileFormat
19
- from .enums import infer_file_format_and_compression
20
- from .types import JSONData
21
- from .types import JSONDict
22
- from .types import JSONList
23
- from .types import StrPath
24
- from .utils import count_records
25
-
26
- # SECTION: EXPORTS ========================================================== #
27
-
28
-
29
- __all__ = ['File']
30
-
31
-
32
- # SECTION: INTERNAL CONSTANTS ============================================== #
33
-
34
-
35
- _DEFAULT_XML_ROOT = 'root'
36
-
37
- # Optional YAML support (lazy-loaded to avoid hard dependency)
38
- # Cached access function to avoid global statements.
39
- _YAML_CACHE: dict[str, Any] = {}
40
-
41
-
42
- # SECTION: INTERNAL FUNCTIONS ============================================== #
43
-
44
-
45
- def _dict_to_element(
46
- name: str,
47
- payload: Any,
48
- ) -> ET.Element:
49
- """
50
- Convert a dictionary-like payload into an XML element.
51
-
52
- Parameters
53
- ----------
54
- name : str
55
- Name of the XML element.
56
- payload : Any
57
- The data to include in the XML element.
58
-
59
- Returns
60
- -------
61
- ET.Element
62
- The constructed XML element.
63
- """
64
- element = ET.Element(name)
65
-
66
- if isinstance(payload, dict):
67
- text = payload.get('text')
68
- if text is not None:
69
- element.text = str(text)
70
-
71
- for key, value in payload.items():
72
- if key == 'text':
73
- continue
74
- if key.startswith('@'):
75
- element.set(key[1:], str(value))
76
- continue
77
- if isinstance(value, list):
78
- for item in value:
79
- element.append(_dict_to_element(key, item))
80
- else:
81
- element.append(_dict_to_element(key, value))
82
- elif isinstance(payload, list):
83
- for item in payload:
84
- element.append(_dict_to_element('item', item))
85
- elif payload is not None:
86
- element.text = str(payload)
87
-
88
- return element
89
-
90
-
91
- def _element_to_dict(
92
- element: ET.Element,
93
- ) -> JSONDict:
94
- """
95
- Convert an XML element into a nested dictionary.
96
-
97
- Parameters
98
- ----------
99
- element : ET.Element
100
- XML element to convert.
101
-
102
- Returns
103
- -------
104
- JSONDict
105
- Nested dictionary representation of the XML element.
106
- """
107
- result: JSONDict = {}
108
- text = (element.text or '').strip()
109
- if text:
110
- result['text'] = text
111
-
112
- for child in element:
113
- child_data = _element_to_dict(child)
114
- tag = child.tag
115
- if tag in result:
116
- existing = result[tag]
117
- if isinstance(existing, list):
118
- existing.append(child_data)
119
- else:
120
- result[tag] = [existing, child_data]
121
- else:
122
- result[tag] = child_data
123
-
124
- for key, value in element.attrib.items():
125
- if key in result:
126
- result[f'@{key}'] = value
127
- else:
128
- result[key] = value
129
- return result
130
-
131
-
132
def _get_yaml() -> Any:
    """
    Return the PyYAML module, importing it on first use.

    The imported module is memoized in ``_YAML_CACHE`` under the ``'mod'``
    key so later calls skip the import.

    Raises
    ------
    ImportError
        If the optional dependency is missing.
    """
    cached = _YAML_CACHE.get('mod')
    if cached is None:
        try:
            cached = __import__('yaml')  # type: ignore[assignment]
        except ImportError as e:  # pragma: no cover
            raise ImportError(
                'YAML support requires optional dependency "PyYAML".\n'
                'Install with: pip install PyYAML',
            ) from e
        _YAML_CACHE['mod'] = cached

    return cached
151
-
152
-
153
- # SECTION: CLASS ============================================================ #
154
-
155
-
156
- @dataclass(slots=True)
157
- class File:
158
- """
159
- Convenience wrapper around structured file IO.
160
-
161
- This class encapsulates the one-off helpers in this module as convenient
162
- instance methods while retaining the original function API for
163
- backward compatibility (those functions delegate to this class).
164
-
165
- Attributes
166
- ----------
167
- path : Path
168
- Path to the file on disk.
169
- file_format : FileFormat | None, optional
170
- Explicit format. If omitted, the format is inferred from the file
171
- extension (``.csv``, ``.json``, or ``.xml``).
172
- """
173
-
174
- # -- Attributes -- #
175
-
176
- path: Path
177
- file_format: FileFormat | None = None
178
-
179
- # -- Magic Methods (Object Lifecycle) -- #
180
-
181
- def __post_init__(self) -> None:
182
- """
183
- Auto-detect and set the file format on initialization.
184
-
185
- If no explicit ``file_format`` is provided, attempt to infer it from
186
- the file path's extension and update :attr:`file_format`. If the
187
- extension is unknown, the attribute is left as ``None`` and will be
188
- validated later by :meth:`_ensure_format`.
189
- """
190
- # Normalize incoming path (allow str in constructor) to Path.
191
- if isinstance(self.path, str):
192
- self.path = Path(self.path)
193
-
194
- if self.file_format is None:
195
- try:
196
- self.file_format = self._guess_format()
197
- except ValueError:
198
- # Leave as None; _ensure_format() will raise on use if needed.
199
- pass
200
-
201
- # -- Internal Instance Methods -- #
202
-
203
- def _assert_exists(self) -> None:
204
- """
205
- Raise FileNotFoundError if :attr:`path` does not exist.
206
-
207
- This centralizes existence checks across multiple read methods.
208
- """
209
- if not self.path.exists():
210
- raise FileNotFoundError(f'File not found: {self.path}')
211
-
212
- def _ensure_format(self) -> FileFormat:
213
- """
214
- Resolve the active format, guessing from extension if needed.
215
-
216
- Returns
217
- -------
218
- FileFormat
219
- The resolved file format.
220
- """
221
- return (
222
- self.file_format
223
- if self.file_format is not None
224
- else self._guess_format()
225
- )
226
-
227
- def _guess_format(self) -> FileFormat:
228
- """
229
- Infer the file format from the filename extension.
230
-
231
- Returns
232
- -------
233
- FileFormat
234
- The inferred file format based on the file extension.
235
-
236
- Raises
237
- ------
238
- ValueError
239
- If the extension is unknown or unsupported.
240
- """
241
- fmt, compression = infer_file_format_and_compression(self.path)
242
- if fmt is not None:
243
- return fmt
244
- if compression is not None:
245
- raise ValueError(
246
- 'Cannot infer file format from compressed file '
247
- f'{self.path!r} with compression {compression.value!r}',
248
- )
249
- raise ValueError(
250
- f'Cannot infer file format from extension {self.path.suffix!r}',
251
- )
252
-
253
- # -- Instance Methods (Generic API) -- #
254
-
255
- def read(self) -> JSONData:
256
- """
257
- Read structured data from :attr:`path` using :attr:`file_format`.
258
-
259
- Returns
260
- -------
261
- JSONData
262
- The structured data read from the file.
263
-
264
- Raises
265
- ------
266
- ValueError
267
- If the resolved file format is unsupported.
268
- """
269
- fmt = self._ensure_format()
270
- match fmt:
271
- case FileFormat.JSON:
272
- return self.read_json()
273
- case FileFormat.CSV:
274
- return self.read_csv()
275
- case FileFormat.XML:
276
- return self.read_xml()
277
- case FileFormat.YAML:
278
- return self.read_yaml()
279
- raise ValueError(f'Unsupported format: {fmt}')
280
-
281
- def write(
282
- self,
283
- data: JSONData,
284
- *,
285
- root_tag: str = _DEFAULT_XML_ROOT,
286
- ) -> int:
287
- """
288
- Write ``data`` to :attr:`path` using :attr:`file_format`.
289
-
290
- Parameters
291
- ----------
292
- data : JSONData
293
- Data to write to the file.
294
- root_tag : str, optional
295
- Root tag name to use when writing XML files. Defaults to
296
- ``'root'``.
297
-
298
- Returns
299
- -------
300
- int
301
- The number of records written.
302
-
303
- Raises
304
- ------
305
- ValueError
306
- If the resolved file format is unsupported.
307
- """
308
- fmt = self._ensure_format()
309
- match fmt:
310
- case FileFormat.JSON:
311
- return self.write_json(data)
312
- case FileFormat.CSV:
313
- return self.write_csv(data)
314
- case FileFormat.XML:
315
- return self.write_xml(data, root_tag=root_tag)
316
- case FileFormat.YAML:
317
- return self.write_yaml(data)
318
- raise ValueError(f'Unsupported format: {fmt}')
319
-
320
- # -- Instance Methods (CSV) -- #
321
-
322
- def read_csv(self) -> JSONList:
323
- """
324
- Load CSV content as a list of dictionaries from :attr:`path`.
325
-
326
- Returns
327
- -------
328
- JSONList
329
- The list of dictionaries read from the CSV file.
330
- """
331
- self._assert_exists()
332
-
333
- with self.path.open('r', encoding='utf-8', newline='') as handle:
334
- reader: csv.DictReader[str] = csv.DictReader(handle)
335
- rows: JSONList = []
336
- for row in reader:
337
- if not any(row.values()):
338
- continue
339
- rows.append(cast(JSONDict, dict(row)))
340
- return rows
341
-
342
- def write_csv(
343
- self,
344
- data: JSONData,
345
- ) -> int:
346
- """
347
- Write CSV rows to :attr:`path` and return the number of rows.
348
-
349
- Parameters
350
- ----------
351
- data : JSONData
352
- Data to write as CSV. Should be a list of dictionaries or a
353
- single dictionary.
354
-
355
- Returns
356
- -------
357
- int
358
- The number of rows written to the CSV file.
359
- """
360
- rows: list[JSONDict]
361
- if isinstance(data, list):
362
- rows = [row for row in data if isinstance(row, dict)]
363
- else:
364
- rows = [data]
365
-
366
- if not rows:
367
- return 0
368
-
369
- fieldnames = sorted({key for row in rows for key in row})
370
- with self.path.open('w', encoding='utf-8', newline='') as handle:
371
- writer = csv.DictWriter(handle, fieldnames=fieldnames)
372
- writer.writeheader()
373
- for row in rows:
374
- writer.writerow(
375
- {field: row.get(field) for field in fieldnames},
376
- )
377
-
378
- return len(rows)
379
-
380
- # -- Instance Methods (JSON) -- #
381
-
382
- def read_json(self) -> JSONData:
383
- """
384
- Load and validate JSON payloads from :attr:`path`.
385
-
386
- Returns
387
- -------
388
- JSONData
389
- The structured data read from the JSON file.
390
-
391
- Raises
392
- ------
393
- TypeError
394
- If the JSON root is not an object or an array of objects.
395
- """
396
- self._assert_exists()
397
-
398
- with self.path.open('r', encoding='utf-8') as handle:
399
- loaded = json.load(handle)
400
-
401
- if isinstance(loaded, dict):
402
- return cast(JSONDict, loaded)
403
- if isinstance(loaded, list):
404
- if all(isinstance(item, dict) for item in loaded):
405
- return cast(JSONList, loaded)
406
- raise TypeError(
407
- 'JSON array must contain only objects (dicts) '
408
- 'when loading file',
409
- )
410
- raise TypeError(
411
- 'JSON root must be an object or an array of objects '
412
- 'when loading file',
413
- )
414
-
415
- def write_json(
416
- self,
417
- data: JSONData,
418
- ) -> int:
419
- """
420
- Write ``data`` as formatted JSON to :attr:`path`.
421
-
422
- Parameters
423
- ----------
424
- data : JSONData
425
- Data to serialize as JSON.
426
-
427
- Returns
428
- -------
429
- int
430
- The number of records written to the JSON file.
431
- """
432
- self.path.parent.mkdir(parents=True, exist_ok=True)
433
- with self.path.open('w', encoding='utf-8') as handle:
434
- json.dump(
435
- data,
436
- handle,
437
- indent=2,
438
- ensure_ascii=False,
439
- )
440
- handle.write('\n')
441
-
442
- return count_records(data)
443
-
444
- # -- Instance Methods (XML) -- #
445
-
446
- def read_xml(self) -> JSONDict:
447
- """
448
- Parse XML document at :attr:`path` into a nested dictionary.
449
-
450
- Returns
451
- -------
452
- JSONDict
453
- Nested dictionary representation of the XML file.
454
- """
455
- self._assert_exists()
456
-
457
- tree = ET.parse(self.path)
458
- root = tree.getroot()
459
-
460
- return {root.tag: _element_to_dict(root)}
461
-
462
- # -- Instance Methods (YAML) -- #
463
-
464
- def _require_yaml(self) -> None:
465
- """Ensure PyYAML is available or raise an informative error."""
466
- _get_yaml()
467
-
468
- def read_yaml(self) -> JSONData:
469
- """
470
- Load and validate YAML payloads from :attr:`path`.
471
-
472
- Returns
473
- -------
474
- JSONData
475
- The structured data read from the YAML file.
476
-
477
- Raises
478
- ------
479
- TypeError
480
- If the YAML root is not an object or an array of objects.
481
- """
482
- self._require_yaml()
483
- self._assert_exists()
484
-
485
- with self.path.open('r', encoding='utf-8') as handle:
486
- loaded = _get_yaml().safe_load(handle)
487
-
488
- if isinstance(loaded, dict):
489
- return cast(JSONDict, loaded)
490
- if isinstance(loaded, list):
491
- if all(isinstance(item, dict) for item in loaded):
492
- return cast(JSONList, loaded)
493
- raise TypeError(
494
- 'YAML array must contain only objects (dicts) when loading',
495
- )
496
- raise TypeError(
497
- 'YAML root must be an object or an array of objects when loading',
498
- )
499
-
500
- def write_xml(
501
- self,
502
- data: JSONData,
503
- *,
504
- root_tag: str = _DEFAULT_XML_ROOT,
505
- ) -> int:
506
- """
507
- Write ``data`` as XML to :attr:`path` and return record count.
508
-
509
- Parameters
510
- ----------
511
- data : JSONData
512
- Data to write as XML.
513
- root_tag : str, optional
514
- Root tag name to use when writing XML files. Defaults to
515
- ``'root'``.
516
-
517
- Returns
518
- -------
519
- int
520
- The number of records written to the XML file.
521
- """
522
- if isinstance(data, dict) and len(data) == 1:
523
- root_name, payload = next(iter(data.items()))
524
- root_element = _dict_to_element(str(root_name), payload)
525
- else:
526
- root_element = _dict_to_element(root_tag, data)
527
-
528
- tree = ET.ElementTree(root_element)
529
- tree.write(self.path, encoding='utf-8', xml_declaration=True)
530
-
531
- return count_records(data)
532
-
533
- def write_yaml(
534
- self,
535
- data: JSONData,
536
- ) -> int:
537
- """
538
- Write ``data`` as YAML to :attr:`path` and return record count.
539
-
540
- Parameters
541
- ----------
542
- data : JSONData
543
- Data to write as YAML.
544
-
545
- Returns
546
- -------
547
- int
548
- The number of records written.
549
- """
550
- self._require_yaml()
551
- with self.path.open('w', encoding='utf-8') as handle:
552
- _get_yaml().safe_dump(
553
- data,
554
- handle,
555
- sort_keys=False,
556
- allow_unicode=True,
557
- default_flow_style=False,
558
- )
559
- return count_records(data)
560
-
561
- # -- Class Methods -- #
562
-
563
- @classmethod
564
- def from_path(
565
- cls,
566
- path: StrPath,
567
- *,
568
- file_format: FileFormat | str | None = None,
569
- ) -> File:
570
- """
571
- Create a :class:`File` from any path-like and optional format.
572
-
573
- Parameters
574
- ----------
575
- path : StrPath
576
- Path to the file on disk.
577
- file_format : FileFormat | str | None, optional
578
- Explicit format. If omitted, the format is inferred from the file
579
- extension (``.csv``, ``.json``, or ``.xml``).
580
-
581
- Returns
582
- -------
583
- File
584
- The constructed :class:`File` instance.
585
- """
586
- resolved = Path(path)
587
- ff: FileFormat | None
588
- if isinstance(file_format, str):
589
- ff = FileFormat.coerce(file_format)
590
- else:
591
- ff = file_format
592
-
593
- return cls(resolved, ff)
594
-
595
- @classmethod
596
- def read_file(
597
- cls,
598
- path: StrPath,
599
- file_format: FileFormat | str | None = None,
600
- ) -> JSONData:
601
- """
602
- Read structured data.
603
-
604
- Parameters
605
- ----------
606
- path : StrPath
607
- Path to the file on disk.
608
- file_format : FileFormat | str | None, optional
609
- Explicit format. If omitted, the format is inferred from the file
610
- extension (``.csv``, ``.json``, or ``.xml``).
611
-
612
- Returns
613
- -------
614
- JSONData
615
- The structured data read from the file.
616
- """
617
- return cls.from_path(path, file_format=file_format).read()
618
-
619
- @classmethod
620
- def write_file(
621
- cls,
622
- path: StrPath,
623
- data: JSONData,
624
- file_format: FileFormat | str | None = None,
625
- *,
626
- root_tag: str = _DEFAULT_XML_ROOT,
627
- ) -> int:
628
- """
629
- Write structured data and count written records.
630
-
631
- Parameters
632
- ----------
633
- path : StrPath
634
- Path to the file on disk.
635
- data : JSONData
636
- Data to write to the file.
637
- file_format : FileFormat | str | None, optional
638
- Explicit format. If omitted, the format is inferred from the file
639
- extension (``.csv``, ``.json``, or ``.xml``).
640
- root_tag : str, optional
641
- Root tag name to use when writing XML files. Defaults to
642
- ``'root'``.
643
-
644
- Returns
645
- -------
646
- int
647
- The number of records written to the file.
648
- """
649
- return cls.from_path(path, file_format=file_format).write(
650
- data,
651
- root_tag=root_tag,
652
- )