etlplus 0.12.2__py3-none-any.whl → 0.12.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- etlplus/file/_io.py +120 -0
- etlplus/file/_pandas.py +58 -0
- etlplus/file/avro.py +30 -42
- etlplus/file/csv.py +4 -28
- etlplus/file/feather.py +5 -50
- etlplus/file/orc.py +5 -48
- etlplus/file/parquet.py +5 -50
- etlplus/file/tsv.py +4 -28
- etlplus/file/xls.py +4 -48
- etlplus/file/xlsx.py +5 -48
- {etlplus-0.12.2.dist-info → etlplus-0.12.4.dist-info}/METADATA +1 -1
- {etlplus-0.12.2.dist-info → etlplus-0.12.4.dist-info}/RECORD +16 -14
- {etlplus-0.12.2.dist-info → etlplus-0.12.4.dist-info}/WHEEL +0 -0
- {etlplus-0.12.2.dist-info → etlplus-0.12.4.dist-info}/entry_points.txt +0 -0
- {etlplus-0.12.2.dist-info → etlplus-0.12.4.dist-info}/licenses/LICENSE +0 -0
- {etlplus-0.12.2.dist-info → etlplus-0.12.4.dist-info}/top_level.txt +0 -0
etlplus/file/_io.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
"""
|
|
2
|
+
:mod:`etlplus.file._io` module.
|
|
3
|
+
|
|
4
|
+
Shared helpers for record normalization and delimited text formats.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import csv
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import cast
|
|
12
|
+
|
|
13
|
+
from ..types import JSONData
|
|
14
|
+
from ..types import JSONDict
|
|
15
|
+
from ..types import JSONList
|
|
16
|
+
|
|
17
|
+
# SECTION: FUNCTIONS ======================================================== #
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def normalize_records(
    data: JSONData,
    format_name: str,
) -> JSONList:
    """
    Normalize payloads into a list of dictionaries.

    Parameters
    ----------
    data : JSONData
        Input payload to normalize: a single dictionary or a list of
        dictionaries.
    format_name : str
        Human-readable format name for error messages.

    Returns
    -------
    JSONList
        Normalized list of dictionaries. A list payload is returned as-is
        (the same object, not a copy); a single dictionary is wrapped in a
        new one-element list.

    Raises
    ------
    TypeError
        If a list payload contains non-dict items, or if the payload is
        neither a list nor a dict.
    """
    if isinstance(data, list):
        if not all(isinstance(item, dict) for item in data):
            raise TypeError(
                f'{format_name} payloads must contain only objects (dicts)',
            )
        return cast('JSONList', data)

    # Reject scalars/None here instead of wrapping them: a wrapped non-dict
    # would only fail later, inside whichever writer consumes the records.
    if not isinstance(data, dict):
        raise TypeError(
            f'{format_name} payloads must be an object (dict) or a list of '
            'objects (dicts)',
        )
    return [cast('JSONDict', data)]
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def read_delimited(path: Path, *, delimiter: str) -> JSONList:
    """
    Read delimited content from ``path``.

    The first line is used as the header. Rows whose values are all empty
    are skipped.

    Parameters
    ----------
    path : Path
        Path to the delimited file on disk.
    delimiter : str
        Delimiter character for parsing.

    Returns
    -------
    JSONList
        The list of dictionaries read from the delimited file, one per
        non-empty row, keyed by header name.
    """
    # ``utf-8-sig`` decodes plain UTF-8 unchanged but strips a leading
    # byte-order mark, so BOM-prefixed files do not end up with a first
    # header named '\ufeff<name>'.
    with path.open('r', encoding='utf-8-sig', newline='') as handle:
        reader: csv.DictReader[str] = csv.DictReader(
            handle,
            delimiter=delimiter,
        )
        # Materialize inside the ``with`` block: the reader is lazy and
        # needs the handle to still be open.
        return [
            cast('JSONDict', dict(row))
            for row in reader
            if any(row.values())
        ]
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def write_delimited(path: Path, data: JSONData, *, delimiter: str) -> int:
    """
    Write ``data`` to a delimited file and return record count.

    The header is the sorted union of all keys across the rows; a row that
    lacks a column gets an empty cell for it.

    Parameters
    ----------
    path : Path
        Path to the delimited file on disk. Missing parent directories are
        created.
    data : JSONData
        Data to write as delimited rows. Non-dict items in a list payload
        are skipped; a non-list payload is written as a single row.
    delimiter : str
        Delimiter character for writing.

    Returns
    -------
    int
        The number of rows written. ``0`` means there was nothing to write
        and no file was created.
    """
    rows: list[JSONDict]
    if isinstance(data, list):
        rows = [row for row in data if isinstance(row, dict)]
    else:
        rows = [data]

    if not rows:
        return 0

    fieldnames = sorted({key for row in rows for key in row})
    # Create the target directory first, as the other file writers in this
    # package do, so writing into a fresh output folder does not fail.
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open('w', encoding='utf-8', newline='') as handle:
        writer = csv.DictWriter(
            handle,
            fieldnames=fieldnames,
            delimiter=delimiter,
        )
        writer.writeheader()
        # Every key is in ``fieldnames``; DictWriter fills missing columns
        # with its default ``restval`` (''), which is also how it renders
        # ``None``, so rows can be passed through unchanged.
        writer.writerows(rows)

    return len(rows)
|
etlplus/file/_pandas.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""
|
|
2
|
+
:mod:`etlplus.file._pandas` module.
|
|
3
|
+
|
|
4
|
+
Shared helpers for optional pandas usage.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
# SECTION: EXPORTS ========================================================== #
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
'get_pandas',
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
# SECTION: INTERNAL CONSTANTS =============================================== #
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
_PANDAS_CACHE: dict[str, Any] = {}
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# SECTION: FUNCTIONS ======================================================== #
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def get_pandas(format_name: str) -> Any:
    """
    Return the pandas module, importing it lazily and caching the result.

    Parameters
    ----------
    format_name : str
        Human-readable format name for error messages.

    Returns
    -------
    Any
        The pandas module.

    Raises
    ------
    ImportError
        If the optional dependency is missing.
    """
    pandas_mod = _PANDAS_CACHE.get('mod')
    if pandas_mod is None:
        # First call (or nothing cached yet): import now and remember the
        # module so later calls skip the import machinery.
        try:
            pandas_mod = __import__('pandas')  # type: ignore[assignment]
        except ImportError as err:  # pragma: no cover
            message = (
                f'{format_name} support requires optional dependency "pandas".\n'
                'Install with: pip install pandas'
            )
            raise ImportError(message) from err
        _PANDAS_CACHE['mod'] = pandas_mod

    return pandas_mod
|
etlplus/file/avro.py
CHANGED
|
@@ -13,6 +13,7 @@ from typing import cast
|
|
|
13
13
|
from ..types import JSONData
|
|
14
14
|
from ..types import JSONDict
|
|
15
15
|
from ..types import JSONList
|
|
16
|
+
from ._io import normalize_records
|
|
16
17
|
|
|
17
18
|
# SECTION: EXPORTS ========================================================== #
|
|
18
19
|
|
|
@@ -63,17 +64,37 @@ def _get_fastavro() -> Any:
|
|
|
63
64
|
return _fastavro
|
|
64
65
|
|
|
65
66
|
|
|
66
|
-
def
|
|
67
|
+
def _infer_schema(records: JSONList) -> dict[str, Any]:
|
|
67
68
|
"""
|
|
68
|
-
|
|
69
|
+
Infer a basic Avro schema from record payloads.
|
|
69
70
|
|
|
70
|
-
|
|
71
|
+
Only primitive field values are supported; complex values raise TypeError.
|
|
71
72
|
"""
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
73
|
+
field_names = sorted({key for record in records for key in record})
|
|
74
|
+
fields: list[dict[str, Any]] = []
|
|
75
|
+
for name in field_names:
|
|
76
|
+
types: list[str] = []
|
|
77
|
+
for record in records:
|
|
78
|
+
value = record.get(name)
|
|
79
|
+
if value is None:
|
|
80
|
+
types.append('null')
|
|
81
|
+
continue
|
|
82
|
+
if isinstance(value, dict | list):
|
|
83
|
+
raise TypeError(
|
|
84
|
+
'AVRO payloads must contain only primitive values',
|
|
85
|
+
)
|
|
86
|
+
if not isinstance(value, _PRIMITIVE_TYPES):
|
|
87
|
+
raise TypeError(
|
|
88
|
+
'AVRO payloads must contain only primitive values',
|
|
89
|
+
)
|
|
90
|
+
types.append(cast(str, _infer_value_type(value)))
|
|
91
|
+
fields.append({'name': name, 'type': _merge_types(types)})
|
|
92
|
+
|
|
93
|
+
return {
|
|
94
|
+
'name': 'etlplus_record',
|
|
95
|
+
'type': 'record',
|
|
96
|
+
'fields': fields,
|
|
97
|
+
}
|
|
77
98
|
|
|
78
99
|
|
|
79
100
|
def _infer_value_type(value: object) -> str | list[str]:
|
|
@@ -106,39 +127,6 @@ def _merge_types(types: list[str]) -> str | list[str]:
|
|
|
106
127
|
return ordered
|
|
107
128
|
|
|
108
129
|
|
|
109
|
-
def _infer_schema(records: JSONList) -> dict[str, Any]:
|
|
110
|
-
"""
|
|
111
|
-
Infer a basic Avro schema from record payloads.
|
|
112
|
-
|
|
113
|
-
Only primitive field values are supported; complex values raise TypeError.
|
|
114
|
-
"""
|
|
115
|
-
field_names = sorted({key for record in records for key in record})
|
|
116
|
-
fields: list[dict[str, Any]] = []
|
|
117
|
-
for name in field_names:
|
|
118
|
-
types: list[str] = []
|
|
119
|
-
for record in records:
|
|
120
|
-
value = record.get(name)
|
|
121
|
-
if value is None:
|
|
122
|
-
types.append('null')
|
|
123
|
-
continue
|
|
124
|
-
if isinstance(value, dict | list):
|
|
125
|
-
raise TypeError(
|
|
126
|
-
'AVRO payloads must contain only primitive values',
|
|
127
|
-
)
|
|
128
|
-
if not isinstance(value, _PRIMITIVE_TYPES):
|
|
129
|
-
raise TypeError(
|
|
130
|
-
'AVRO payloads must contain only primitive values',
|
|
131
|
-
)
|
|
132
|
-
types.append(cast(str, _infer_value_type(value)))
|
|
133
|
-
fields.append({'name': name, 'type': _merge_types(types)})
|
|
134
|
-
|
|
135
|
-
return {
|
|
136
|
-
'name': 'etlplus_record',
|
|
137
|
-
'type': 'record',
|
|
138
|
-
'fields': fields,
|
|
139
|
-
}
|
|
140
|
-
|
|
141
|
-
|
|
142
130
|
# SECTION: FUNCTIONS ======================================================== #
|
|
143
131
|
|
|
144
132
|
|
|
@@ -183,7 +171,7 @@ def write(
|
|
|
183
171
|
int
|
|
184
172
|
Number of records written.
|
|
185
173
|
"""
|
|
186
|
-
records =
|
|
174
|
+
records = normalize_records(data, 'AVRO')
|
|
187
175
|
if not records:
|
|
188
176
|
return 0
|
|
189
177
|
|
etlplus/file/csv.py
CHANGED
|
@@ -6,13 +6,12 @@ Helpers for reading/writing CSV files.
|
|
|
6
6
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
|
-
import csv
|
|
10
9
|
from pathlib import Path
|
|
11
|
-
from typing import cast
|
|
12
10
|
|
|
13
11
|
from ..types import JSONData
|
|
14
|
-
from ..types import JSONDict
|
|
15
12
|
from ..types import JSONList
|
|
13
|
+
from ._io import read_delimited
|
|
14
|
+
from ._io import write_delimited
|
|
16
15
|
|
|
17
16
|
# SECTION: EXPORTS ========================================================== #
|
|
18
17
|
|
|
@@ -42,14 +41,7 @@ def read(
|
|
|
42
41
|
JSONList
|
|
43
42
|
The list of dictionaries read from the CSV file.
|
|
44
43
|
"""
|
|
45
|
-
|
|
46
|
-
reader: csv.DictReader[str] = csv.DictReader(handle)
|
|
47
|
-
rows: JSONList = []
|
|
48
|
-
for row in reader:
|
|
49
|
-
if not any(row.values()):
|
|
50
|
-
continue
|
|
51
|
-
rows.append(cast(JSONDict, dict(row)))
|
|
52
|
-
return rows
|
|
44
|
+
return read_delimited(path, delimiter=',')
|
|
53
45
|
|
|
54
46
|
|
|
55
47
|
def write(
|
|
@@ -72,20 +64,4 @@ def write(
|
|
|
72
64
|
int
|
|
73
65
|
The number of rows written to the CSV file.
|
|
74
66
|
"""
|
|
75
|
-
|
|
76
|
-
if isinstance(data, list):
|
|
77
|
-
rows = [row for row in data if isinstance(row, dict)]
|
|
78
|
-
else:
|
|
79
|
-
rows = [data]
|
|
80
|
-
|
|
81
|
-
if not rows:
|
|
82
|
-
return 0
|
|
83
|
-
|
|
84
|
-
fieldnames = sorted({key for row in rows for key in row})
|
|
85
|
-
with path.open('w', encoding='utf-8', newline='') as handle:
|
|
86
|
-
writer = csv.DictWriter(handle, fieldnames=fieldnames)
|
|
87
|
-
writer.writeheader()
|
|
88
|
-
for row in rows:
|
|
89
|
-
writer.writerow({field: row.get(field) for field in fieldnames})
|
|
90
|
-
|
|
91
|
-
return len(rows)
|
|
67
|
+
return write_delimited(path, data, delimiter=',')
|
etlplus/file/feather.py
CHANGED
|
@@ -7,12 +7,12 @@ Helpers for reading/writing Feather files.
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
9
|
from pathlib import Path
|
|
10
|
-
from typing import Any
|
|
11
10
|
from typing import cast
|
|
12
11
|
|
|
13
12
|
from ..types import JSONData
|
|
14
|
-
from ..types import JSONDict
|
|
15
13
|
from ..types import JSONList
|
|
14
|
+
from ._io import normalize_records
|
|
15
|
+
from ._pandas import get_pandas
|
|
16
16
|
|
|
17
17
|
# SECTION: EXPORTS ========================================================== #
|
|
18
18
|
|
|
@@ -23,51 +23,6 @@ __all__ = [
|
|
|
23
23
|
]
|
|
24
24
|
|
|
25
25
|
|
|
26
|
-
# SECTION: INTERNAL CONSTANTS =============================================== #
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
_PANDAS_CACHE: dict[str, Any] = {}
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
# SECTION: INTERNAL FUNCTIONS =============================================== #
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
def _get_pandas() -> Any:
|
|
36
|
-
"""
|
|
37
|
-
Return the pandas module, importing it on first use.
|
|
38
|
-
|
|
39
|
-
Raises an informative ImportError if the optional dependency is missing.
|
|
40
|
-
"""
|
|
41
|
-
mod = _PANDAS_CACHE.get('mod')
|
|
42
|
-
if mod is not None: # pragma: no cover - tiny branch
|
|
43
|
-
return mod
|
|
44
|
-
try:
|
|
45
|
-
_pd = __import__('pandas') # type: ignore[assignment]
|
|
46
|
-
except ImportError as e: # pragma: no cover
|
|
47
|
-
raise ImportError(
|
|
48
|
-
'Feather support requires optional dependency "pandas".\n'
|
|
49
|
-
'Install with: pip install pandas',
|
|
50
|
-
) from e
|
|
51
|
-
_PANDAS_CACHE['mod'] = _pd
|
|
52
|
-
|
|
53
|
-
return _pd
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
def _normalize_records(data: JSONData) -> JSONList:
|
|
57
|
-
"""
|
|
58
|
-
Normalize JSON payloads into a list of dictionaries.
|
|
59
|
-
|
|
60
|
-
Raises TypeError when payloads contain non-dict items.
|
|
61
|
-
"""
|
|
62
|
-
if isinstance(data, list):
|
|
63
|
-
if not all(isinstance(item, dict) for item in data):
|
|
64
|
-
raise TypeError(
|
|
65
|
-
'Feather payloads must contain only objects (dicts)',
|
|
66
|
-
)
|
|
67
|
-
return cast(JSONList, data)
|
|
68
|
-
return [cast(JSONDict, data)]
|
|
69
|
-
|
|
70
|
-
|
|
71
26
|
# SECTION: FUNCTIONS ======================================================== #
|
|
72
27
|
|
|
73
28
|
|
|
@@ -92,7 +47,7 @@ def read(
|
|
|
92
47
|
ImportError
|
|
93
48
|
When optional dependency "pyarrow" is missing.
|
|
94
49
|
"""
|
|
95
|
-
pandas =
|
|
50
|
+
pandas = get_pandas('Feather')
|
|
96
51
|
try:
|
|
97
52
|
frame = pandas.read_feather(path)
|
|
98
53
|
except ImportError as e: # pragma: no cover
|
|
@@ -127,11 +82,11 @@ def write(
|
|
|
127
82
|
ImportError
|
|
128
83
|
When optional dependency "pyarrow" is missing.
|
|
129
84
|
"""
|
|
130
|
-
records =
|
|
85
|
+
records = normalize_records(data, 'Feather')
|
|
131
86
|
if not records:
|
|
132
87
|
return 0
|
|
133
88
|
|
|
134
|
-
pandas =
|
|
89
|
+
pandas = get_pandas('Feather')
|
|
135
90
|
path.parent.mkdir(parents=True, exist_ok=True)
|
|
136
91
|
frame = pandas.DataFrame.from_records(records)
|
|
137
92
|
try:
|
etlplus/file/orc.py
CHANGED
|
@@ -7,12 +7,12 @@ Helpers for reading/writing ORC files.
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
9
|
from pathlib import Path
|
|
10
|
-
from typing import Any
|
|
11
10
|
from typing import cast
|
|
12
11
|
|
|
13
12
|
from ..types import JSONData
|
|
14
|
-
from ..types import JSONDict
|
|
15
13
|
from ..types import JSONList
|
|
14
|
+
from ._io import normalize_records
|
|
15
|
+
from ._pandas import get_pandas
|
|
16
16
|
|
|
17
17
|
# SECTION: EXPORTS ========================================================== #
|
|
18
18
|
|
|
@@ -23,49 +23,6 @@ __all__ = [
|
|
|
23
23
|
]
|
|
24
24
|
|
|
25
25
|
|
|
26
|
-
# SECTION: INTERNAL CONSTANTS =============================================== #
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
_PANDAS_CACHE: dict[str, Any] = {}
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
# SECTION: INTERNAL FUNCTIONS =============================================== #
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
def _get_pandas() -> Any:
|
|
36
|
-
"""
|
|
37
|
-
Return the pandas module, importing it on first use.
|
|
38
|
-
|
|
39
|
-
Raises an informative ImportError if the optional dependency is missing.
|
|
40
|
-
"""
|
|
41
|
-
mod = _PANDAS_CACHE.get('mod')
|
|
42
|
-
if mod is not None: # pragma: no cover - tiny branch
|
|
43
|
-
return mod
|
|
44
|
-
try:
|
|
45
|
-
_pd = __import__('pandas') # type: ignore[assignment]
|
|
46
|
-
except ImportError as e: # pragma: no cover
|
|
47
|
-
raise ImportError(
|
|
48
|
-
'ORC support requires optional dependency "pandas".\n'
|
|
49
|
-
'Install with: pip install pandas',
|
|
50
|
-
) from e
|
|
51
|
-
_PANDAS_CACHE['mod'] = _pd
|
|
52
|
-
|
|
53
|
-
return _pd
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
def _normalize_records(data: JSONData) -> JSONList:
|
|
57
|
-
"""
|
|
58
|
-
Normalize JSON payloads into a list of dictionaries.
|
|
59
|
-
|
|
60
|
-
Raises TypeError when payloads contain non-dict items.
|
|
61
|
-
"""
|
|
62
|
-
if isinstance(data, list):
|
|
63
|
-
if not all(isinstance(item, dict) for item in data):
|
|
64
|
-
raise TypeError('ORC payloads must contain only objects (dicts)')
|
|
65
|
-
return cast(JSONList, data)
|
|
66
|
-
return [cast(JSONDict, data)]
|
|
67
|
-
|
|
68
|
-
|
|
69
26
|
# SECTION: FUNCTIONS ======================================================== #
|
|
70
27
|
|
|
71
28
|
|
|
@@ -90,7 +47,7 @@ def read(
|
|
|
90
47
|
ImportError
|
|
91
48
|
When optional dependency "pyarrow" is missing.
|
|
92
49
|
"""
|
|
93
|
-
pandas =
|
|
50
|
+
pandas = get_pandas('ORC')
|
|
94
51
|
try:
|
|
95
52
|
frame = pandas.read_orc(path)
|
|
96
53
|
except ImportError as e: # pragma: no cover
|
|
@@ -125,11 +82,11 @@ def write(
|
|
|
125
82
|
ImportError
|
|
126
83
|
When optional dependency "pyarrow" is missing.
|
|
127
84
|
"""
|
|
128
|
-
records =
|
|
85
|
+
records = normalize_records(data, 'ORC')
|
|
129
86
|
if not records:
|
|
130
87
|
return 0
|
|
131
88
|
|
|
132
|
-
pandas =
|
|
89
|
+
pandas = get_pandas('ORC')
|
|
133
90
|
path.parent.mkdir(parents=True, exist_ok=True)
|
|
134
91
|
frame = pandas.DataFrame.from_records(records)
|
|
135
92
|
try:
|
etlplus/file/parquet.py
CHANGED
|
@@ -7,12 +7,12 @@ Helpers for reading/writing Parquet files.
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
9
|
from pathlib import Path
|
|
10
|
-
from typing import Any
|
|
11
10
|
from typing import cast
|
|
12
11
|
|
|
13
12
|
from ..types import JSONData
|
|
14
|
-
from ..types import JSONDict
|
|
15
13
|
from ..types import JSONList
|
|
14
|
+
from ._io import normalize_records
|
|
15
|
+
from ._pandas import get_pandas
|
|
16
16
|
|
|
17
17
|
# SECTION: EXPORTS ========================================================== #
|
|
18
18
|
|
|
@@ -23,51 +23,6 @@ __all__ = [
|
|
|
23
23
|
]
|
|
24
24
|
|
|
25
25
|
|
|
26
|
-
# SECTION: INTERNAL CONSTANTS =============================================== #
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
_PANDAS_CACHE: dict[str, Any] = {}
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
# SECTION: INTERNAL FUNCTIONS =============================================== #
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
def _get_pandas() -> Any:
|
|
36
|
-
"""
|
|
37
|
-
Return the pandas module, importing it on first use.
|
|
38
|
-
|
|
39
|
-
Raises an informative ImportError if the optional dependency is missing.
|
|
40
|
-
"""
|
|
41
|
-
mod = _PANDAS_CACHE.get('mod')
|
|
42
|
-
if mod is not None: # pragma: no cover - tiny branch
|
|
43
|
-
return mod
|
|
44
|
-
try:
|
|
45
|
-
_pd = __import__('pandas') # type: ignore[assignment]
|
|
46
|
-
except ImportError as e: # pragma: no cover
|
|
47
|
-
raise ImportError(
|
|
48
|
-
'Parquet support requires optional dependency "pandas".\n'
|
|
49
|
-
'Install with: pip install pandas',
|
|
50
|
-
) from e
|
|
51
|
-
_PANDAS_CACHE['mod'] = _pd
|
|
52
|
-
|
|
53
|
-
return _pd
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
def _normalize_records(data: JSONData) -> JSONList:
|
|
57
|
-
"""
|
|
58
|
-
Normalize JSON payloads into a list of dictionaries.
|
|
59
|
-
|
|
60
|
-
Raises TypeError when payloads contain non-dict items.
|
|
61
|
-
"""
|
|
62
|
-
if isinstance(data, list):
|
|
63
|
-
if not all(isinstance(item, dict) for item in data):
|
|
64
|
-
raise TypeError(
|
|
65
|
-
'Parquet payloads must contain only objects (dicts)',
|
|
66
|
-
)
|
|
67
|
-
return cast(JSONList, data)
|
|
68
|
-
return [cast(JSONDict, data)]
|
|
69
|
-
|
|
70
|
-
|
|
71
26
|
# SECTION: FUNCTIONS ======================================================== #
|
|
72
27
|
|
|
73
28
|
|
|
@@ -92,7 +47,7 @@ def read(
|
|
|
92
47
|
ImportError
|
|
93
48
|
If optional dependencies for Parquet support are missing.
|
|
94
49
|
"""
|
|
95
|
-
pandas =
|
|
50
|
+
pandas = get_pandas('Parquet')
|
|
96
51
|
try:
|
|
97
52
|
frame = pandas.read_parquet(path)
|
|
98
53
|
except ImportError as e: # pragma: no cover
|
|
@@ -128,11 +83,11 @@ def write(
|
|
|
128
83
|
ImportError
|
|
129
84
|
If optional dependencies for Parquet support are missing.
|
|
130
85
|
"""
|
|
131
|
-
records =
|
|
86
|
+
records = normalize_records(data, 'Parquet')
|
|
132
87
|
if not records:
|
|
133
88
|
return 0
|
|
134
89
|
|
|
135
|
-
pandas =
|
|
90
|
+
pandas = get_pandas('Parquet')
|
|
136
91
|
path.parent.mkdir(parents=True, exist_ok=True)
|
|
137
92
|
frame = pandas.DataFrame.from_records(records)
|
|
138
93
|
try:
|
etlplus/file/tsv.py
CHANGED
|
@@ -6,13 +6,12 @@ Helpers for reading/writing TSV files.
|
|
|
6
6
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
|
-
import csv
|
|
10
9
|
from pathlib import Path
|
|
11
|
-
from typing import cast
|
|
12
10
|
|
|
13
11
|
from ..types import JSONData
|
|
14
|
-
from ..types import JSONDict
|
|
15
12
|
from ..types import JSONList
|
|
13
|
+
from ._io import read_delimited
|
|
14
|
+
from ._io import write_delimited
|
|
16
15
|
|
|
17
16
|
# SECTION: EXPORTS ========================================================== #
|
|
18
17
|
|
|
@@ -42,14 +41,7 @@ def read(
|
|
|
42
41
|
JSONList
|
|
43
42
|
The list of dictionaries read from the TSV file.
|
|
44
43
|
"""
|
|
45
|
-
|
|
46
|
-
reader: csv.DictReader[str] = csv.DictReader(handle, delimiter='\t')
|
|
47
|
-
rows: JSONList = []
|
|
48
|
-
for row in reader:
|
|
49
|
-
if not any(row.values()):
|
|
50
|
-
continue
|
|
51
|
-
rows.append(cast(JSONDict, dict(row)))
|
|
52
|
-
return rows
|
|
44
|
+
return read_delimited(path, delimiter='\t')
|
|
53
45
|
|
|
54
46
|
|
|
55
47
|
def write(
|
|
@@ -72,20 +64,4 @@ def write(
|
|
|
72
64
|
int
|
|
73
65
|
The number of rows written to the TSV file.
|
|
74
66
|
"""
|
|
75
|
-
|
|
76
|
-
if isinstance(data, list):
|
|
77
|
-
rows = [row for row in data if isinstance(row, dict)]
|
|
78
|
-
else:
|
|
79
|
-
rows = [data]
|
|
80
|
-
|
|
81
|
-
if not rows:
|
|
82
|
-
return 0
|
|
83
|
-
|
|
84
|
-
fieldnames = sorted({key for row in rows for key in row})
|
|
85
|
-
with path.open('w', encoding='utf-8', newline='') as handle:
|
|
86
|
-
writer = csv.DictWriter(handle, fieldnames=fieldnames, delimiter='\t')
|
|
87
|
-
writer.writeheader()
|
|
88
|
-
for row in rows:
|
|
89
|
-
writer.writerow({field: row.get(field) for field in fieldnames})
|
|
90
|
-
|
|
91
|
-
return len(rows)
|
|
67
|
+
return write_delimited(path, data, delimiter='\t')
|
etlplus/file/xls.py
CHANGED
|
@@ -7,12 +7,11 @@ Helpers for reading/writing Excel XLS files.
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
9
|
from pathlib import Path
|
|
10
|
-
from typing import Any
|
|
11
10
|
from typing import cast
|
|
12
11
|
|
|
13
12
|
from ..types import JSONData
|
|
14
|
-
from ..types import JSONDict
|
|
15
13
|
from ..types import JSONList
|
|
14
|
+
from ._pandas import get_pandas
|
|
16
15
|
|
|
17
16
|
# SECTION: EXPORTS ========================================================== #
|
|
18
17
|
|
|
@@ -23,49 +22,6 @@ __all__ = [
|
|
|
23
22
|
]
|
|
24
23
|
|
|
25
24
|
|
|
26
|
-
# SECTION: INTERNAL CONSTANTS =============================================== #
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
_PANDAS_CACHE: dict[str, Any] = {}
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
# SECTION: INTERNAL FUNCTIONS =============================================== #
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
def _get_pandas() -> Any:
|
|
36
|
-
"""
|
|
37
|
-
Return the pandas module, importing it on first use.
|
|
38
|
-
|
|
39
|
-
Raises an informative ImportError if the optional dependency is missing.
|
|
40
|
-
"""
|
|
41
|
-
mod = _PANDAS_CACHE.get('mod')
|
|
42
|
-
if mod is not None: # pragma: no cover - tiny branch
|
|
43
|
-
return mod
|
|
44
|
-
try:
|
|
45
|
-
_pd = __import__('pandas') # type: ignore[assignment]
|
|
46
|
-
except ImportError as e: # pragma: no cover
|
|
47
|
-
raise ImportError(
|
|
48
|
-
'XLS support requires optional dependency "pandas".\n'
|
|
49
|
-
'Install with: pip install pandas',
|
|
50
|
-
) from e
|
|
51
|
-
_PANDAS_CACHE['mod'] = _pd
|
|
52
|
-
|
|
53
|
-
return _pd
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
def _normalize_records(data: JSONData) -> JSONList:
|
|
57
|
-
"""
|
|
58
|
-
Normalize JSON payloads into a list of dictionaries.
|
|
59
|
-
|
|
60
|
-
Raises TypeError when payloads contain non-dict items.
|
|
61
|
-
"""
|
|
62
|
-
if isinstance(data, list):
|
|
63
|
-
if not all(isinstance(item, dict) for item in data):
|
|
64
|
-
raise TypeError('XLS payloads must contain only objects (dicts)')
|
|
65
|
-
return cast(JSONList, data)
|
|
66
|
-
return [cast(JSONDict, data)]
|
|
67
|
-
|
|
68
|
-
|
|
69
25
|
# SECTION: FUNCTIONS ======================================================== #
|
|
70
26
|
|
|
71
27
|
|
|
@@ -90,7 +46,7 @@ def read(
|
|
|
90
46
|
ImportError
|
|
91
47
|
If the optional dependency "xlrd" is not installed.
|
|
92
48
|
"""
|
|
93
|
-
pandas =
|
|
49
|
+
pandas = get_pandas('XLS')
|
|
94
50
|
try:
|
|
95
51
|
frame = pandas.read_excel(path, engine='xlrd')
|
|
96
52
|
except ImportError as e: # pragma: no cover
|
|
@@ -126,7 +82,7 @@ def write(
|
|
|
126
82
|
|
|
127
83
|
Raises
|
|
128
84
|
------
|
|
129
|
-
|
|
130
|
-
If
|
|
85
|
+
RuntimeError
|
|
86
|
+
If XLS writing is attempted.
|
|
131
87
|
"""
|
|
132
88
|
raise RuntimeError('XLS write is not supported; use XLSX instead')
|
etlplus/file/xlsx.py
CHANGED
|
@@ -7,12 +7,12 @@ Helpers for reading/writing Excel XLSX files.
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
9
|
from pathlib import Path
|
|
10
|
-
from typing import Any
|
|
11
10
|
from typing import cast
|
|
12
11
|
|
|
13
12
|
from ..types import JSONData
|
|
14
|
-
from ..types import JSONDict
|
|
15
13
|
from ..types import JSONList
|
|
14
|
+
from ._io import normalize_records
|
|
15
|
+
from ._pandas import get_pandas
|
|
16
16
|
|
|
17
17
|
# SECTION: EXPORTS ========================================================== #
|
|
18
18
|
|
|
@@ -23,49 +23,6 @@ __all__ = [
|
|
|
23
23
|
]
|
|
24
24
|
|
|
25
25
|
|
|
26
|
-
# SECTION: INTERNAL CONSTANTS =============================================== #
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
_PANDAS_CACHE: dict[str, Any] = {}
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
# SECTION: INTERNAL FUNCTIONS =============================================== #
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
def _get_pandas() -> Any:
|
|
36
|
-
"""
|
|
37
|
-
Return the pandas module, importing it on first use.
|
|
38
|
-
|
|
39
|
-
Raises an informative ImportError if the optional dependency is missing.
|
|
40
|
-
"""
|
|
41
|
-
mod = _PANDAS_CACHE.get('mod')
|
|
42
|
-
if mod is not None: # pragma: no cover - tiny branch
|
|
43
|
-
return mod
|
|
44
|
-
try:
|
|
45
|
-
_pd = __import__('pandas') # type: ignore[assignment]
|
|
46
|
-
except ImportError as e: # pragma: no cover
|
|
47
|
-
raise ImportError(
|
|
48
|
-
'XLSX support requires optional dependency "pandas".\n'
|
|
49
|
-
'Install with: pip install pandas',
|
|
50
|
-
) from e
|
|
51
|
-
_PANDAS_CACHE['mod'] = _pd
|
|
52
|
-
|
|
53
|
-
return _pd
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
def _normalize_records(data: JSONData) -> JSONList:
|
|
57
|
-
"""
|
|
58
|
-
Normalize JSON payloads into a list of dictionaries.
|
|
59
|
-
|
|
60
|
-
Raises TypeError when payloads contain non-dict items.
|
|
61
|
-
"""
|
|
62
|
-
if isinstance(data, list):
|
|
63
|
-
if not all(isinstance(item, dict) for item in data):
|
|
64
|
-
raise TypeError('XLSX payloads must contain only objects (dicts)')
|
|
65
|
-
return cast(JSONList, data)
|
|
66
|
-
return [cast(JSONDict, data)]
|
|
67
|
-
|
|
68
|
-
|
|
69
26
|
# SECTION: FUNCTIONS ======================================================== #
|
|
70
27
|
|
|
71
28
|
|
|
@@ -90,7 +47,7 @@ def read(
|
|
|
90
47
|
ImportError
|
|
91
48
|
If optional dependencies for XLSX support are missing.
|
|
92
49
|
"""
|
|
93
|
-
pandas =
|
|
50
|
+
pandas = get_pandas('XLSX')
|
|
94
51
|
try:
|
|
95
52
|
frame = pandas.read_excel(path)
|
|
96
53
|
except ImportError as e: # pragma: no cover
|
|
@@ -125,11 +82,11 @@ def write(
|
|
|
125
82
|
ImportError
|
|
126
83
|
If optional dependencies for XLSX support are missing.
|
|
127
84
|
"""
|
|
128
|
-
records =
|
|
85
|
+
records = normalize_records(data, 'XLSX')
|
|
129
86
|
if not records:
|
|
130
87
|
return 0
|
|
131
88
|
|
|
132
|
-
pandas =
|
|
89
|
+
pandas = get_pandas('XLSX')
|
|
133
90
|
path.parent.mkdir(parents=True, exist_ok=True)
|
|
134
91
|
frame = pandas.DataFrame.from_records(records)
|
|
135
92
|
try:
|
|
@@ -57,20 +57,22 @@ etlplus/database/schema.py,sha256=813C0Dd3WE53KTYot4dgjAxctgKXLXx-8_Rk_4r2e28,70
|
|
|
57
57
|
etlplus/database/types.py,sha256=_pkQyC14TzAlgyeIqZG4F5LWYknZbHw3TW68Auk7Ya0,795
|
|
58
58
|
etlplus/file/README.md,sha256=avWnyeKfs3uP3qa-DVBJ6t05jS2oFUPeQ3xf1Ph0eC0,3626
|
|
59
59
|
etlplus/file/__init__.py,sha256=X03bosSM-uSd6dh3ur0un6_ozFRw2Tm4PE6kVUjtXK8,475
|
|
60
|
-
etlplus/file/
|
|
60
|
+
etlplus/file/_io.py,sha256=kSbe4Bc9J8br7g856IzBvmKIWSSlng8vo66XN9Z2aiw,2917
|
|
61
|
+
etlplus/file/_pandas.py,sha256=6ZqU7QzEMBq7OFl3mfEtotnKunpS3XV_GGRgz7SIHsI,1282
|
|
62
|
+
etlplus/file/avro.py,sha256=JHK95zrwuHHICRe8f20xfKmeWzv1wP0Br5pOnINdLSc,4621
|
|
61
63
|
etlplus/file/core.py,sha256=BkCliUez8SBEgpagxSeDbJixnX9QvD5XQp0dbYOOw0k,8692
|
|
62
|
-
etlplus/file/csv.py,sha256=
|
|
64
|
+
etlplus/file/csv.py,sha256=gtEUWJO54veEtgaLB_QnmR8yOpeToq78nrtAPVTTl44,1269
|
|
63
65
|
etlplus/file/enums.py,sha256=rwrbwj6PejG0c5v6jzcsmeNu9cSqDyWB1foIuM5UyJo,6648
|
|
64
|
-
etlplus/file/feather.py,sha256=
|
|
66
|
+
etlplus/file/feather.py,sha256=WYZBn2f_Z7KDZZJ1eX0RS-934MnYIMydD0p2Oo30do4,2182
|
|
65
67
|
etlplus/file/gz.py,sha256=NKsvIV7TIWn8USbvuZmRH9hr6OrXh4TzTfDykHD41Kk,2631
|
|
66
68
|
etlplus/file/json.py,sha256=_KAXb4rZ1C8xnaV10IkihuFh1lhbWvajFOlMrBCNVjQ,2099
|
|
67
69
|
etlplus/file/ndjson.py,sha256=gT-kgcqCUUSxtm2j-JMejoh65jk-njMvFwxKCquLZw0,2393
|
|
68
|
-
etlplus/file/orc.py,sha256=
|
|
69
|
-
etlplus/file/parquet.py,sha256=
|
|
70
|
-
etlplus/file/tsv.py,sha256=
|
|
70
|
+
etlplus/file/orc.py,sha256=GUrq9rgXCLBJ0i8Jd0Xsl4DzldDBg0FDxYhytb4OgxQ,2139
|
|
71
|
+
etlplus/file/parquet.py,sha256=Tp2bi_PAIUdkzc25nArJp7beuUaudw5NdciV6IFHsdQ,2281
|
|
72
|
+
etlplus/file/tsv.py,sha256=NiqF84Ck8e_DinaiO8yKRR6fVUTnUhpThzo4E1QUD8k,1271
|
|
71
73
|
etlplus/file/txt.py,sha256=BStC7crpkGT4qddEeAD1_1mi_2-vQSXLj2DI-ddPFQE,2206
|
|
72
|
-
etlplus/file/xls.py,sha256=
|
|
73
|
-
etlplus/file/xlsx.py,sha256=
|
|
74
|
+
etlplus/file/xls.py,sha256=83BbBJGxHAdbKH8Imz1l4mOgQT34uo-tyujp2WONRY4,1771
|
|
75
|
+
etlplus/file/xlsx.py,sha256=mBKc3dSci9tk4KjQX3CaODwG1ueGtFAfztNUOaWYQAE,2181
|
|
74
76
|
etlplus/file/xml.py,sha256=rYtCPvyLn9djClN2xKeqRCPsMXnvCH4R8zj94NJRdQc,4018
|
|
75
77
|
etlplus/file/yaml.py,sha256=pWJf0rWyiRpOVOBAwVOosPsdIzuywZ_Cv8_tXLZ6RFw,3183
|
|
76
78
|
etlplus/file/zip.py,sha256=nd26V3S0edklriKnKOGDTLlO8RBXTda_zLLEQrJgKL4,4185
|
|
@@ -81,9 +83,9 @@ etlplus/templates/view.sql.j2,sha256=Iy8DHfhq5yyvrUKDxqp_aHIEXY4Tm6j4wT7YDEFWAhk
|
|
|
81
83
|
etlplus/validation/README.md,sha256=qusyiyJu2DsaK80jlwfXVZ0iDgeuTPOX2EL3a_fcFiw,1401
|
|
82
84
|
etlplus/validation/__init__.py,sha256=Pe5Xg1_EA4uiNZGYu5WTF3j7odjmyxnAJ8rcioaplSQ,1254
|
|
83
85
|
etlplus/validation/utils.py,sha256=Mtqg449VIke0ziy_wd2r6yrwJzQkA1iulZC87FzXMjo,10201
|
|
84
|
-
etlplus-0.12.
|
|
85
|
-
etlplus-0.12.
|
|
86
|
-
etlplus-0.12.
|
|
87
|
-
etlplus-0.12.
|
|
88
|
-
etlplus-0.12.
|
|
89
|
-
etlplus-0.12.
|
|
86
|
+
etlplus-0.12.4.dist-info/licenses/LICENSE,sha256=MuNO63i6kWmgnV2pbP2SLqP54mk1BGmu7CmbtxMmT-U,1069
|
|
87
|
+
etlplus-0.12.4.dist-info/METADATA,sha256=FS-Se52lzyRJ2yAlzaIjAeyJ9GBJIN36nRr-wZRCLtM,22878
|
|
88
|
+
etlplus-0.12.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
89
|
+
etlplus-0.12.4.dist-info/entry_points.txt,sha256=6w-2-jzuPa55spzK34h-UKh2JTEShh38adFRONNP9QE,45
|
|
90
|
+
etlplus-0.12.4.dist-info/top_level.txt,sha256=aWWF-udn_sLGuHTM6W6MLh99ArS9ROkUWO8Mi8y1_2U,8
|
|
91
|
+
etlplus-0.12.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|