etlplus 0.11.11__py3-none-any.whl → 0.12.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- etlplus/README.md +37 -0
- etlplus/api/README.md +20 -3
- etlplus/cli/README.md +40 -0
- etlplus/config/README.md +52 -0
- etlplus/database/README.md +48 -0
- etlplus/file/README.md +105 -0
- etlplus/file/avro.py +157 -18
- etlplus/file/core.py +3 -3
- etlplus/file/csv.py +12 -3
- etlplus/file/feather.py +100 -15
- etlplus/file/gz.py +80 -16
- etlplus/file/json.py +13 -2
- etlplus/file/ndjson.py +61 -11
- etlplus/file/orc.py +95 -12
- etlplus/file/parquet.py +100 -13
- etlplus/file/tsv.py +52 -20
- etlplus/file/txt.py +56 -16
- etlplus/file/xls.py +85 -12
- etlplus/file/xlsx.py +95 -12
- etlplus/file/xml.py +12 -3
- etlplus/file/yaml.py +13 -2
- etlplus/file/zip.py +133 -7
- etlplus/templates/README.md +46 -0
- etlplus/validation/README.md +50 -0
- {etlplus-0.11.11.dist-info → etlplus-0.12.2.dist-info}/METADATA +58 -14
- {etlplus-0.11.11.dist-info → etlplus-0.12.2.dist-info}/RECORD +30 -23
- {etlplus-0.11.11.dist-info → etlplus-0.12.2.dist-info}/WHEEL +0 -0
- {etlplus-0.11.11.dist-info → etlplus-0.12.2.dist-info}/entry_points.txt +0 -0
- {etlplus-0.11.11.dist-info → etlplus-0.12.2.dist-info}/licenses/LICENSE +0 -0
- {etlplus-0.11.11.dist-info → etlplus-0.12.2.dist-info}/top_level.txt +0 -0
etlplus/file/parquet.py
CHANGED
|
@@ -1,21 +1,81 @@
|
|
|
1
1
|
"""
|
|
2
2
|
:mod:`etlplus.file.parquet` module.
|
|
3
3
|
|
|
4
|
-
|
|
4
|
+
Helpers for reading/writing Parquet files.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
9
|
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
from typing import cast
|
|
10
12
|
|
|
11
13
|
from ..types import JSONData
|
|
14
|
+
from ..types import JSONDict
|
|
15
|
+
from ..types import JSONList
|
|
12
16
|
|
|
13
17
|
# SECTION: EXPORTS ========================================================== #
|
|
14
18
|
|
|
15
19
|
|
|
16
|
-
|
|
20
|
+
__all__ = [
|
|
21
|
+
'read',
|
|
22
|
+
'write',
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# SECTION: INTERNAL CONSTANTS =============================================== #
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
_PANDAS_CACHE: dict[str, Any] = {}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# SECTION: INTERNAL FUNCTIONS =============================================== #
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _get_pandas() -> Any:
    """
    Return the pandas module, importing it lazily on the first call.

    The imported module is stored in ``_PANDAS_CACHE`` under the ``'mod'``
    key so that subsequent calls return the cached module directly.

    Returns
    -------
    Any
        The imported ``pandas`` module.

    Raises
    ------
    ImportError
        If the optional dependency ``pandas`` cannot be imported.
    """
    cached = _PANDAS_CACHE.get('mod')
    if cached is None:
        try:
            cached = __import__('pandas')
        except ImportError as e:  # pragma: no cover
            raise ImportError(
                'Parquet support requires optional dependency "pandas".\n'
                'Install with: pip install pandas',
            ) from e
        # Remember the module so the import machinery is only hit once.
        _PANDAS_CACHE['mod'] = cached
    return cached
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _normalize_records(data: JSONData) -> JSONList:
|
|
17
57
|
"""
|
|
18
|
-
|
|
58
|
+
Normalize JSON payloads into a list of dictionaries.
|
|
59
|
+
|
|
60
|
+
Raises TypeError when payloads contain non-dict items.
|
|
61
|
+
"""
|
|
62
|
+
if isinstance(data, list):
|
|
63
|
+
if not all(isinstance(item, dict) for item in data):
|
|
64
|
+
raise TypeError(
|
|
65
|
+
'Parquet payloads must contain only objects (dicts)',
|
|
66
|
+
)
|
|
67
|
+
return cast(JSONList, data)
|
|
68
|
+
return [cast(JSONDict, data)]
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
# SECTION: FUNCTIONS ======================================================== #
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def read(
|
|
75
|
+
path: Path,
|
|
76
|
+
) -> JSONList:
|
|
77
|
+
"""
|
|
78
|
+
Read Parquet content from ``path``.
|
|
19
79
|
|
|
20
80
|
Parameters
|
|
21
81
|
----------
|
|
@@ -24,20 +84,32 @@ def read(path: Path) -> JSONData:
|
|
|
24
84
|
|
|
25
85
|
Returns
|
|
26
86
|
-------
|
|
27
|
-
|
|
28
|
-
|
|
87
|
+
JSONList
|
|
88
|
+
The list of dictionaries read from the Parquet file.
|
|
29
89
|
|
|
30
90
|
Raises
|
|
31
91
|
------
|
|
32
|
-
|
|
33
|
-
|
|
92
|
+
ImportError
|
|
93
|
+
If optional dependencies for Parquet support are missing.
|
|
34
94
|
"""
|
|
35
|
-
|
|
95
|
+
pandas = _get_pandas()
|
|
96
|
+
try:
|
|
97
|
+
frame = pandas.read_parquet(path)
|
|
98
|
+
except ImportError as e: # pragma: no cover
|
|
99
|
+
raise ImportError(
|
|
100
|
+
'Parquet support requires optional dependency '
|
|
101
|
+
'"pyarrow" or "fastparquet".\n'
|
|
102
|
+
'Install with: pip install pyarrow',
|
|
103
|
+
) from e
|
|
104
|
+
return cast(JSONList, frame.to_dict(orient='records'))
|
|
36
105
|
|
|
37
106
|
|
|
38
|
-
def write(
|
|
107
|
+
def write(
|
|
108
|
+
path: Path,
|
|
109
|
+
data: JSONData,
|
|
110
|
+
) -> int:
|
|
39
111
|
"""
|
|
40
|
-
Write ``data`` to
|
|
112
|
+
Write ``data`` to Parquet at ``path`` and return record count.
|
|
41
113
|
|
|
42
114
|
Parameters
|
|
43
115
|
----------
|
|
@@ -53,7 +125,22 @@ def write(path: Path, data: JSONData) -> int:
|
|
|
53
125
|
|
|
54
126
|
Raises
|
|
55
127
|
------
|
|
56
|
-
|
|
57
|
-
|
|
128
|
+
ImportError
|
|
129
|
+
If optional dependencies for Parquet support are missing.
|
|
58
130
|
"""
|
|
59
|
-
|
|
131
|
+
records = _normalize_records(data)
|
|
132
|
+
if not records:
|
|
133
|
+
return 0
|
|
134
|
+
|
|
135
|
+
pandas = _get_pandas()
|
|
136
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
137
|
+
frame = pandas.DataFrame.from_records(records)
|
|
138
|
+
try:
|
|
139
|
+
frame.to_parquet(path, index=False)
|
|
140
|
+
except ImportError as e: # pragma: no cover
|
|
141
|
+
raise ImportError(
|
|
142
|
+
'Parquet support requires optional dependency '
|
|
143
|
+
'"pyarrow" or "fastparquet".\n'
|
|
144
|
+
'Install with: pip install pyarrow',
|
|
145
|
+
) from e
|
|
146
|
+
return len(records)
|
etlplus/file/tsv.py
CHANGED
|
@@ -1,19 +1,34 @@
|
|
|
1
1
|
"""
|
|
2
2
|
:mod:`etlplus.file.tsv` module.
|
|
3
3
|
|
|
4
|
-
|
|
4
|
+
Helpers for reading/writing TSV files.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
|
+
import csv
|
|
9
10
|
from pathlib import Path
|
|
11
|
+
from typing import cast
|
|
10
12
|
|
|
11
13
|
from ..types import JSONData
|
|
14
|
+
from ..types import JSONDict
|
|
15
|
+
from ..types import JSONList
|
|
12
16
|
|
|
13
17
|
# SECTION: EXPORTS ========================================================== #
|
|
14
18
|
|
|
15
19
|
|
|
16
|
-
|
|
20
|
+
__all__ = [
|
|
21
|
+
'read',
|
|
22
|
+
'write',
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# SECTION: FUNCTIONS ======================================================== #
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def read(
|
|
30
|
+
path: Path,
|
|
31
|
+
) -> JSONList:
|
|
17
32
|
"""
|
|
18
33
|
Read TSV content from ``path``.
|
|
19
34
|
|
|
@@ -24,36 +39,53 @@ def read(path: Path) -> JSONData:
|
|
|
24
39
|
|
|
25
40
|
Returns
|
|
26
41
|
-------
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
Raises
|
|
31
|
-
------
|
|
32
|
-
NotImplementedError
|
|
33
|
-
TSV :func:`read` is not implemented yet.
|
|
42
|
+
JSONList
|
|
43
|
+
The list of dictionaries read from the TSV file.
|
|
34
44
|
"""
|
|
35
|
-
|
|
45
|
+
with path.open('r', encoding='utf-8', newline='') as handle:
|
|
46
|
+
reader: csv.DictReader[str] = csv.DictReader(handle, delimiter='\t')
|
|
47
|
+
rows: JSONList = []
|
|
48
|
+
for row in reader:
|
|
49
|
+
if not any(row.values()):
|
|
50
|
+
continue
|
|
51
|
+
rows.append(cast(JSONDict, dict(row)))
|
|
52
|
+
return rows
|
|
36
53
|
|
|
37
54
|
|
|
38
|
-
def write(
|
|
55
|
+
def write(
    path: Path,
    data: JSONData,
) -> int:
    """
    Write ``data`` to TSV at ``path`` and return record count.

    Parameters
    ----------
    path : Path
        Path to the TSV file on disk. Missing parent directories are
        created before writing.
    data : JSONData
        Data to write as TSV. Should be a list of dictionaries or a
        single dictionary. Non-dictionary items in a list are skipped.

    Returns
    -------
    int
        The number of rows written to the TSV file. ``0`` is returned, and
        no file is created, when there is nothing to write.
    """
    rows: list[JSONDict]
    if isinstance(data, list):
        rows = [row for row in data if isinstance(row, dict)]
    else:
        rows = [data]

    if not rows:
        return 0

    # The header is the sorted union of all keys, so column order is
    # deterministic regardless of per-row key order.
    fieldnames = sorted({key for row in rows for key in row})

    # Create the target directory first so writing into a new folder
    # does not fail with FileNotFoundError.
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open('w', encoding='utf-8', newline='') as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames, delimiter='\t')
        writer.writeheader()
        # Keys missing from a row are emitted as empty cells (DictWriter's
        # default ``restval``), the same output as writing ``None``.
        writer.writerows(rows)

    return len(rows)
|
etlplus/file/txt.py
CHANGED
|
@@ -1,19 +1,34 @@
|
|
|
1
1
|
"""
|
|
2
2
|
:mod:`etlplus.file.txt` module.
|
|
3
3
|
|
|
4
|
-
|
|
4
|
+
Helpers for reading/writing text files.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
9
|
from pathlib import Path
|
|
10
|
+
from typing import cast
|
|
10
11
|
|
|
11
12
|
from ..types import JSONData
|
|
13
|
+
from ..types import JSONDict
|
|
14
|
+
from ..types import JSONList
|
|
15
|
+
from ..utils import count_records
|
|
12
16
|
|
|
13
17
|
# SECTION: EXPORTS ========================================================== #
|
|
14
18
|
|
|
15
19
|
|
|
16
|
-
|
|
20
|
+
__all__ = [
|
|
21
|
+
'read',
|
|
22
|
+
'write',
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# SECTION: FUNCTIONS ======================================================== #
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def read(
|
|
30
|
+
path: Path,
|
|
31
|
+
) -> JSONList:
|
|
17
32
|
"""
|
|
18
33
|
Read TXT content from ``path``.
|
|
19
34
|
|
|
@@ -24,27 +39,32 @@ def read(path: Path) -> JSONData:
|
|
|
24
39
|
|
|
25
40
|
Returns
|
|
26
41
|
-------
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
Raises
|
|
31
|
-
------
|
|
32
|
-
NotImplementedError
|
|
33
|
-
TXT :func:`read` is not implemented yet.
|
|
42
|
+
JSONList
|
|
43
|
+
The list of dictionaries read from the TXT file.
|
|
34
44
|
"""
|
|
35
|
-
|
|
45
|
+
rows: JSONList = []
|
|
46
|
+
with path.open('r', encoding='utf-8') as handle:
|
|
47
|
+
for line in handle:
|
|
48
|
+
text = line.rstrip('\n')
|
|
49
|
+
if text == '':
|
|
50
|
+
continue
|
|
51
|
+
rows.append({'text': text})
|
|
52
|
+
return rows
|
|
36
53
|
|
|
37
54
|
|
|
38
|
-
def write(
|
|
55
|
+
def write(
|
|
56
|
+
path: Path,
|
|
57
|
+
data: JSONData,
|
|
58
|
+
) -> int:
|
|
39
59
|
"""
|
|
40
|
-
Write ``data`` to TXT at ``path
|
|
60
|
+
Write ``data`` to TXT at ``path`` and return record count.
|
|
41
61
|
|
|
42
62
|
Parameters
|
|
43
63
|
----------
|
|
44
64
|
path : Path
|
|
45
65
|
Path to the TXT file on disk.
|
|
46
66
|
data : JSONData
|
|
47
|
-
Data to write.
|
|
67
|
+
Data to write. Expects ``{'text': '...'}`` or a list of those.
|
|
48
68
|
|
|
49
69
|
Returns
|
|
50
70
|
-------
|
|
@@ -53,7 +73,27 @@ def write(path: Path, data: JSONData) -> int:
|
|
|
53
73
|
|
|
54
74
|
Raises
|
|
55
75
|
------
|
|
56
|
-
|
|
57
|
-
|
|
76
|
+
TypeError
|
|
77
|
+
If any item in ``data`` is not a dictionary or if any dictionary
|
|
78
|
+
does not contain a ``'text'`` key.
|
|
58
79
|
"""
|
|
59
|
-
|
|
80
|
+
rows: JSONList
|
|
81
|
+
if isinstance(data, list):
|
|
82
|
+
if not all(isinstance(item, dict) for item in data):
|
|
83
|
+
raise TypeError('TXT payloads must contain only objects (dicts)')
|
|
84
|
+
rows = cast(JSONList, data)
|
|
85
|
+
else:
|
|
86
|
+
rows = [cast(JSONDict, data)]
|
|
87
|
+
|
|
88
|
+
if not rows:
|
|
89
|
+
return 0
|
|
90
|
+
|
|
91
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
92
|
+
with path.open('w', encoding='utf-8') as handle:
|
|
93
|
+
for row in rows:
|
|
94
|
+
if 'text' not in row:
|
|
95
|
+
raise TypeError('TXT payloads must include a "text" key')
|
|
96
|
+
handle.write(str(row['text']))
|
|
97
|
+
handle.write('\n')
|
|
98
|
+
|
|
99
|
+
return count_records(rows)
|
etlplus/file/xls.py
CHANGED
|
@@ -1,19 +1,77 @@
|
|
|
1
1
|
"""
|
|
2
2
|
:mod:`etlplus.file.xls` module.
|
|
3
3
|
|
|
4
|
-
|
|
4
|
+
Helpers for reading/writing Excel XLS files.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
9
|
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
from typing import cast
|
|
10
12
|
|
|
11
13
|
from ..types import JSONData
|
|
14
|
+
from ..types import JSONDict
|
|
15
|
+
from ..types import JSONList
|
|
12
16
|
|
|
13
17
|
# SECTION: EXPORTS ========================================================== #
|
|
14
18
|
|
|
15
19
|
|
|
16
|
-
|
|
20
|
+
__all__ = [
|
|
21
|
+
'read',
|
|
22
|
+
'write',
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# SECTION: INTERNAL CONSTANTS =============================================== #
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
_PANDAS_CACHE: dict[str, Any] = {}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# SECTION: INTERNAL FUNCTIONS =============================================== #
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _get_pandas() -> Any:
    """
    Return the pandas module, importing it lazily on the first call.

    The imported module is stored in ``_PANDAS_CACHE`` under the ``'mod'``
    key so that subsequent calls return the cached module directly.

    Returns
    -------
    Any
        The imported ``pandas`` module.

    Raises
    ------
    ImportError
        If the optional dependency ``pandas`` cannot be imported.
    """
    cached = _PANDAS_CACHE.get('mod')
    if cached is None:
        try:
            cached = __import__('pandas')
        except ImportError as e:  # pragma: no cover
            raise ImportError(
                'XLS support requires optional dependency "pandas".\n'
                'Install with: pip install pandas',
            ) from e
        # Remember the module so the import machinery is only hit once.
        _PANDAS_CACHE['mod'] = cached
    return cached
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _normalize_records(data: JSONData) -> JSONList:
|
|
57
|
+
"""
|
|
58
|
+
Normalize JSON payloads into a list of dictionaries.
|
|
59
|
+
|
|
60
|
+
Raises TypeError when payloads contain non-dict items.
|
|
61
|
+
"""
|
|
62
|
+
if isinstance(data, list):
|
|
63
|
+
if not all(isinstance(item, dict) for item in data):
|
|
64
|
+
raise TypeError('XLS payloads must contain only objects (dicts)')
|
|
65
|
+
return cast(JSONList, data)
|
|
66
|
+
return [cast(JSONDict, data)]
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# SECTION: FUNCTIONS ======================================================== #
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def read(
|
|
73
|
+
path: Path,
|
|
74
|
+
) -> JSONList:
|
|
17
75
|
"""
|
|
18
76
|
Read XLS content from ``path``.
|
|
19
77
|
|
|
@@ -24,20 +82,35 @@ def read(path: Path) -> JSONData:
|
|
|
24
82
|
|
|
25
83
|
Returns
|
|
26
84
|
-------
|
|
27
|
-
|
|
28
|
-
|
|
85
|
+
JSONList
|
|
86
|
+
The list of dictionaries read from the XLS file.
|
|
29
87
|
|
|
30
88
|
Raises
|
|
31
89
|
------
|
|
32
|
-
|
|
33
|
-
|
|
90
|
+
ImportError
|
|
91
|
+
If the optional dependency "xlrd" is not installed.
|
|
34
92
|
"""
|
|
35
|
-
|
|
93
|
+
pandas = _get_pandas()
|
|
94
|
+
try:
|
|
95
|
+
frame = pandas.read_excel(path, engine='xlrd')
|
|
96
|
+
except ImportError as e: # pragma: no cover
|
|
97
|
+
raise ImportError(
|
|
98
|
+
'XLS support requires optional dependency "xlrd".\n'
|
|
99
|
+
'Install with: pip install xlrd',
|
|
100
|
+
) from e
|
|
101
|
+
return cast(JSONList, frame.to_dict(orient='records'))
|
|
36
102
|
|
|
37
103
|
|
|
38
|
-
def write(
|
|
104
|
+
def write(
|
|
105
|
+
path: Path,
|
|
106
|
+
data: JSONData,
|
|
107
|
+
) -> int:
|
|
39
108
|
"""
|
|
40
|
-
Write ``data`` to XLS at ``path
|
|
109
|
+
Write ``data`` to XLS at ``path`` and return record count.
|
|
110
|
+
|
|
111
|
+
Notes
|
|
112
|
+
-----
|
|
113
|
+
XLS writing is not supported by pandas 2.x. Use XLSX for writes.
|
|
41
114
|
|
|
42
115
|
Parameters
|
|
43
116
|
----------
|
|
@@ -53,7 +126,7 @@ def write(path: Path, data: JSONData) -> int:
|
|
|
53
126
|
|
|
54
127
|
Raises
|
|
55
128
|
------
|
|
56
|
-
|
|
57
|
-
|
|
129
|
+
RuntimeError
|
|
130
|
+
Always raised, because XLS writing is not supported; use XLSX instead.
|
|
58
131
|
"""
|
|
59
|
-
raise
|
|
132
|
+
raise RuntimeError('XLS write is not supported; use XLSX instead')
|
etlplus/file/xlsx.py
CHANGED
|
@@ -1,19 +1,77 @@
|
|
|
1
1
|
"""
|
|
2
2
|
:mod:`etlplus.file.xlsx` module.
|
|
3
3
|
|
|
4
|
-
|
|
4
|
+
Helpers for reading/writing Excel XLSX files.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
9
|
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
from typing import cast
|
|
10
12
|
|
|
11
13
|
from ..types import JSONData
|
|
14
|
+
from ..types import JSONDict
|
|
15
|
+
from ..types import JSONList
|
|
12
16
|
|
|
13
17
|
# SECTION: EXPORTS ========================================================== #
|
|
14
18
|
|
|
15
19
|
|
|
16
|
-
|
|
20
|
+
__all__ = [
|
|
21
|
+
'read',
|
|
22
|
+
'write',
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# SECTION: INTERNAL CONSTANTS =============================================== #
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
_PANDAS_CACHE: dict[str, Any] = {}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# SECTION: INTERNAL FUNCTIONS =============================================== #
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _get_pandas() -> Any:
    """
    Return the pandas module, importing it lazily on the first call.

    The imported module is stored in ``_PANDAS_CACHE`` under the ``'mod'``
    key so that subsequent calls return the cached module directly.

    Returns
    -------
    Any
        The imported ``pandas`` module.

    Raises
    ------
    ImportError
        If the optional dependency ``pandas`` cannot be imported.
    """
    cached = _PANDAS_CACHE.get('mod')
    if cached is None:
        try:
            cached = __import__('pandas')
        except ImportError as e:  # pragma: no cover
            raise ImportError(
                'XLSX support requires optional dependency "pandas".\n'
                'Install with: pip install pandas',
            ) from e
        # Remember the module so the import machinery is only hit once.
        _PANDAS_CACHE['mod'] = cached
    return cached
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _normalize_records(data: JSONData) -> JSONList:
|
|
57
|
+
"""
|
|
58
|
+
Normalize JSON payloads into a list of dictionaries.
|
|
59
|
+
|
|
60
|
+
Raises TypeError when payloads contain non-dict items.
|
|
61
|
+
"""
|
|
62
|
+
if isinstance(data, list):
|
|
63
|
+
if not all(isinstance(item, dict) for item in data):
|
|
64
|
+
raise TypeError('XLSX payloads must contain only objects (dicts)')
|
|
65
|
+
return cast(JSONList, data)
|
|
66
|
+
return [cast(JSONDict, data)]
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# SECTION: FUNCTIONS ======================================================== #
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def read(
|
|
73
|
+
path: Path,
|
|
74
|
+
) -> JSONList:
|
|
17
75
|
"""
|
|
18
76
|
Read XLSX content from ``path``.
|
|
19
77
|
|
|
@@ -24,20 +82,31 @@ def read(path: Path) -> JSONData:
|
|
|
24
82
|
|
|
25
83
|
Returns
|
|
26
84
|
-------
|
|
27
|
-
|
|
28
|
-
|
|
85
|
+
JSONList
|
|
86
|
+
The list of dictionaries read from the XLSX file.
|
|
29
87
|
|
|
30
88
|
Raises
|
|
31
89
|
------
|
|
32
|
-
|
|
33
|
-
|
|
90
|
+
ImportError
|
|
91
|
+
If optional dependencies for XLSX support are missing.
|
|
34
92
|
"""
|
|
35
|
-
|
|
93
|
+
pandas = _get_pandas()
|
|
94
|
+
try:
|
|
95
|
+
frame = pandas.read_excel(path)
|
|
96
|
+
except ImportError as e: # pragma: no cover
|
|
97
|
+
raise ImportError(
|
|
98
|
+
'XLSX support requires optional dependency "openpyxl".\n'
|
|
99
|
+
'Install with: pip install openpyxl',
|
|
100
|
+
) from e
|
|
101
|
+
return cast(JSONList, frame.to_dict(orient='records'))
|
|
36
102
|
|
|
37
103
|
|
|
38
|
-
def write(
|
|
104
|
+
def write(
|
|
105
|
+
path: Path,
|
|
106
|
+
data: JSONData,
|
|
107
|
+
) -> int:
|
|
39
108
|
"""
|
|
40
|
-
Write ``data`` to XLSX at ``path
|
|
109
|
+
Write ``data`` to XLSX at ``path`` and return record count.
|
|
41
110
|
|
|
42
111
|
Parameters
|
|
43
112
|
----------
|
|
@@ -53,7 +122,21 @@ def write(path: Path, data: JSONData) -> int:
|
|
|
53
122
|
|
|
54
123
|
Raises
|
|
55
124
|
------
|
|
56
|
-
|
|
57
|
-
|
|
125
|
+
ImportError
|
|
126
|
+
If optional dependencies for XLSX support are missing.
|
|
58
127
|
"""
|
|
59
|
-
|
|
128
|
+
records = _normalize_records(data)
|
|
129
|
+
if not records:
|
|
130
|
+
return 0
|
|
131
|
+
|
|
132
|
+
pandas = _get_pandas()
|
|
133
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
134
|
+
frame = pandas.DataFrame.from_records(records)
|
|
135
|
+
try:
|
|
136
|
+
frame.to_excel(path, index=False)
|
|
137
|
+
except ImportError as e: # pragma: no cover
|
|
138
|
+
raise ImportError(
|
|
139
|
+
'XLSX support requires optional dependency "openpyxl".\n'
|
|
140
|
+
'Install with: pip install openpyxl',
|
|
141
|
+
) from e
|
|
142
|
+
return len(records)
|
etlplus/file/xml.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""
|
|
2
2
|
:mod:`etlplus.file.xml` module.
|
|
3
3
|
|
|
4
|
-
|
|
4
|
+
Helpers for reading/writing XML files.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
7
|
from __future__ import annotations
|
|
@@ -14,6 +14,15 @@ from ..types import JSONData
|
|
|
14
14
|
from ..types import JSONDict
|
|
15
15
|
from ..utils import count_records
|
|
16
16
|
|
|
17
|
+
# SECTION: EXPORTS ========================================================== #
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
__all__ = [
|
|
21
|
+
'read',
|
|
22
|
+
'write',
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
|
|
17
26
|
# SECTION: CONSTANTS ======================================================== #
|
|
18
27
|
|
|
19
28
|
|
|
@@ -117,7 +126,7 @@ def read(
|
|
|
117
126
|
path: Path,
|
|
118
127
|
) -> JSONDict:
|
|
119
128
|
"""
|
|
120
|
-
|
|
129
|
+
Read XML content from ``path``.
|
|
121
130
|
|
|
122
131
|
Parameters
|
|
123
132
|
----------
|
|
@@ -137,7 +146,7 @@ def read(
|
|
|
137
146
|
|
|
138
147
|
def write(path: Path, data: JSONData, *, root_tag: str) -> int:
|
|
139
148
|
"""
|
|
140
|
-
Write ``data``
|
|
149
|
+
Write ``data`` to XML at ``path`` and return record count.
|
|
141
150
|
|
|
142
151
|
Parameters
|
|
143
152
|
----------
|