etlplus 0.12.10__py3-none-any.whl → 0.14.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- etlplus/README.md +1 -1
- etlplus/__init__.py +1 -26
- etlplus/api/__init__.py +10 -0
- etlplus/api/config.py +36 -20
- etlplus/api/endpoint_client.py +3 -3
- etlplus/api/enums.py +51 -0
- etlplus/api/pagination/client.py +1 -1
- etlplus/api/rate_limiting/config.py +13 -1
- etlplus/api/rate_limiting/rate_limiter.py +8 -11
- etlplus/api/request_manager.py +11 -6
- etlplus/api/transport.py +14 -2
- etlplus/api/types.py +7 -6
- etlplus/{run_helpers.py → api/utils.py} +205 -153
- etlplus/cli/handlers.py +17 -7
- etlplus/config/jobs.py +14 -4
- etlplus/dag.py +103 -0
- etlplus/enums.py +0 -32
- etlplus/file/cfg.py +2 -2
- etlplus/file/conf.py +2 -2
- etlplus/file/dta.py +77 -0
- etlplus/file/enums.py +10 -4
- etlplus/file/hbs.py +78 -0
- etlplus/file/hdf5.py +78 -0
- etlplus/file/jinja2.py +78 -0
- etlplus/file/mat.py +78 -0
- etlplus/file/mustache.py +78 -0
- etlplus/file/nc.py +78 -0
- etlplus/file/numbers.py +75 -0
- etlplus/file/ods.py +79 -0
- etlplus/file/properties.py +13 -13
- etlplus/file/rda.py +78 -0
- etlplus/file/rds.py +78 -0
- etlplus/file/sas7bdat.py +78 -0
- etlplus/file/sav.py +77 -0
- etlplus/file/sylk.py +77 -0
- etlplus/file/toml.py +1 -1
- etlplus/file/vm.py +78 -0
- etlplus/file/wks.py +77 -0
- etlplus/file/xlsm.py +79 -0
- etlplus/file/xpt.py +78 -0
- etlplus/file/zsav.py +77 -0
- etlplus/{validation → ops}/README.md +2 -2
- etlplus/ops/__init__.py +61 -0
- etlplus/{extract.py → ops/extract.py} +78 -94
- etlplus/{load.py → ops/load.py} +73 -93
- etlplus/{run.py → ops/run.py} +140 -110
- etlplus/{transform.py → ops/transform.py} +75 -68
- etlplus/{validation → ops}/utils.py +80 -15
- etlplus/{validate.py → ops/validate.py} +19 -9
- etlplus/types.py +2 -2
- {etlplus-0.12.10.dist-info → etlplus-0.14.3.dist-info}/METADATA +91 -60
- {etlplus-0.12.10.dist-info → etlplus-0.14.3.dist-info}/RECORD +56 -35
- etlplus/validation/__init__.py +0 -44
- {etlplus-0.12.10.dist-info → etlplus-0.14.3.dist-info}/WHEEL +0 -0
- {etlplus-0.12.10.dist-info → etlplus-0.14.3.dist-info}/entry_points.txt +0 -0
- {etlplus-0.12.10.dist-info → etlplus-0.14.3.dist-info}/licenses/LICENSE +0 -0
- {etlplus-0.12.10.dist-info → etlplus-0.14.3.dist-info}/top_level.txt +0 -0
etlplus/file/vm.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""
|
|
2
|
+
:mod:`etlplus.file.vm` module.
|
|
3
|
+
|
|
4
|
+
Helpers for reading/writing Apache Velocity (VM) template files.
|
|
5
|
+
|
|
6
|
+
Notes
|
|
7
|
+
-----
|
|
8
|
+
- A VM file is a text file used for generating HTML or other text formats
|
|
9
|
+
by combining templates with data.
|
|
10
|
+
- Common cases:
|
|
11
|
+
- HTML templates.
|
|
12
|
+
- Email templates.
|
|
13
|
+
- Configuration files.
|
|
14
|
+
- Rule of thumb:
|
|
15
|
+
- If you need to work with Apache Velocity template files, use this module
|
|
16
|
+
for reading and writing.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
|
|
23
|
+
from ..types import JSONData
|
|
24
|
+
from ..types import JSONList
|
|
25
|
+
from . import stub
|
|
26
|
+
|
|
27
|
+
# SECTION: EXPORTS ========================================================== #
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
__all__ = [
|
|
31
|
+
'read',
|
|
32
|
+
'write',
|
|
33
|
+
]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# SECTION: FUNCTIONS ======================================================== #
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def read(
|
|
40
|
+
path: Path,
|
|
41
|
+
) -> JSONList:
|
|
42
|
+
"""
|
|
43
|
+
Read VM content from ``path``.
|
|
44
|
+
|
|
45
|
+
Parameters
|
|
46
|
+
----------
|
|
47
|
+
path : Path
|
|
48
|
+
Path to the VM file on disk.
|
|
49
|
+
|
|
50
|
+
Returns
|
|
51
|
+
-------
|
|
52
|
+
JSONList
|
|
53
|
+
The list of dictionaries read from the VM file.
|
|
54
|
+
"""
|
|
55
|
+
return stub.read(path, format_name='VM')
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def write(
|
|
59
|
+
path: Path,
|
|
60
|
+
data: JSONData,
|
|
61
|
+
) -> int:
|
|
62
|
+
"""
|
|
63
|
+
Write ``data`` to VM file at ``path`` and return record count.
|
|
64
|
+
|
|
65
|
+
Parameters
|
|
66
|
+
----------
|
|
67
|
+
path : Path
|
|
68
|
+
Path to the VM file on disk.
|
|
69
|
+
data : JSONData
|
|
70
|
+
Data to write as VM file. Should be a list of dictionaries or a single
|
|
71
|
+
dictionary.
|
|
72
|
+
|
|
73
|
+
Returns
|
|
74
|
+
-------
|
|
75
|
+
int
|
|
76
|
+
The number of rows written to the VM file.
|
|
77
|
+
"""
|
|
78
|
+
return stub.write(path, data, format_name='VM')
|
etlplus/file/wks.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""
|
|
2
|
+
:mod:`etlplus.file.wks` module.
|
|
3
|
+
|
|
4
|
+
Helpers for reading/writing Lotus 1-2-3 (WKS) spreadsheet files.
|
|
5
|
+
|
|
6
|
+
Notes
|
|
7
|
+
-----
|
|
8
|
+
- A WKS file is a spreadsheet file created using the Lotus 1-2-3 format.
|
|
9
|
+
- Common cases:
|
|
10
|
+
- Reading data from legacy Lotus 1-2-3 spreadsheets.
|
|
11
|
+
- Writing data to Lotus 1-2-3 format for compatibility.
|
|
12
|
+
- Converting WKS files to more modern formats.
|
|
13
|
+
- Rule of thumb:
|
|
14
|
+
- If you need to work with Lotus 1-2-3 spreadsheet files, use this module
|
|
15
|
+
for reading and writing.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
from ..types import JSONData
|
|
23
|
+
from ..types import JSONList
|
|
24
|
+
from . import stub
|
|
25
|
+
|
|
26
|
+
# SECTION: EXPORTS ========================================================== #
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
__all__ = [
|
|
30
|
+
'read',
|
|
31
|
+
'write',
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# SECTION: FUNCTIONS ======================================================== #
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def read(
|
|
39
|
+
path: Path,
|
|
40
|
+
) -> JSONList:
|
|
41
|
+
"""
|
|
42
|
+
Read WKS content from ``path``.
|
|
43
|
+
|
|
44
|
+
Parameters
|
|
45
|
+
----------
|
|
46
|
+
path : Path
|
|
47
|
+
Path to the WKS file on disk.
|
|
48
|
+
|
|
49
|
+
Returns
|
|
50
|
+
-------
|
|
51
|
+
JSONList
|
|
52
|
+
The list of dictionaries read from the WKS file.
|
|
53
|
+
"""
|
|
54
|
+
return stub.read(path, format_name='WKS')
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def write(
|
|
58
|
+
path: Path,
|
|
59
|
+
data: JSONData,
|
|
60
|
+
) -> int:
|
|
61
|
+
"""
|
|
62
|
+
Write ``data`` to WKS file at ``path`` and return record count.
|
|
63
|
+
|
|
64
|
+
Parameters
|
|
65
|
+
----------
|
|
66
|
+
path : Path
|
|
67
|
+
Path to the WKS file on disk.
|
|
68
|
+
data : JSONData
|
|
69
|
+
Data to write as WKS file. Should be a list of dictionaries or a
|
|
70
|
+
single dictionary.
|
|
71
|
+
|
|
72
|
+
Returns
|
|
73
|
+
-------
|
|
74
|
+
int
|
|
75
|
+
The number of rows written to the WKS file.
|
|
76
|
+
"""
|
|
77
|
+
return stub.write(path, data, format_name='WKS')
|
etlplus/file/xlsm.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""
|
|
2
|
+
:mod:`etlplus.file.xlsm` module.
|
|
3
|
+
|
|
4
|
+
Helpers for reading/writing Microsoft Excel Macro-Enabled (XLSM) spreadsheet
|
|
5
|
+
files.
|
|
6
|
+
|
|
7
|
+
Notes
|
|
8
|
+
-----
|
|
9
|
+
- An XLSM file is a spreadsheet file created using the Microsoft Excel Macro-
|
|
10
|
+
Enabled (Open XML) format.
|
|
11
|
+
- Common cases:
|
|
12
|
+
- Reading data from Excel Macro-Enabled spreadsheets.
|
|
13
|
+
- Writing data to Excel Macro-Enabled format for compatibility.
|
|
14
|
+
- Converting XLSM files to other spreadsheet or tabular formats.
|
|
15
|
+
- Rule of thumb:
|
|
16
|
+
- If you need to work with Excel Macro-Enabled spreadsheet files, use this
|
|
17
|
+
module for reading and writing.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
|
|
24
|
+
from ..types import JSONData
|
|
25
|
+
from ..types import JSONList
|
|
26
|
+
from . import stub
|
|
27
|
+
|
|
28
|
+
# SECTION: EXPORTS ========================================================== #
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
__all__ = [
|
|
32
|
+
'read',
|
|
33
|
+
'write',
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# SECTION: FUNCTIONS ======================================================== #
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def read(
|
|
41
|
+
path: Path,
|
|
42
|
+
) -> JSONList:
|
|
43
|
+
"""
|
|
44
|
+
Read XLSM content from ``path``.
|
|
45
|
+
|
|
46
|
+
Parameters
|
|
47
|
+
----------
|
|
48
|
+
path : Path
|
|
49
|
+
Path to the XLSM file on disk.
|
|
50
|
+
|
|
51
|
+
Returns
|
|
52
|
+
-------
|
|
53
|
+
JSONList
|
|
54
|
+
The list of dictionaries read from the XLSM file.
|
|
55
|
+
"""
|
|
56
|
+
return stub.read(path, format_name='XLSM')
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def write(
|
|
60
|
+
path: Path,
|
|
61
|
+
data: JSONData,
|
|
62
|
+
) -> int:
|
|
63
|
+
"""
|
|
64
|
+
Write ``data`` to XLSM file at ``path`` and return record count.
|
|
65
|
+
|
|
66
|
+
Parameters
|
|
67
|
+
----------
|
|
68
|
+
path : Path
|
|
69
|
+
Path to the XLSM file on disk.
|
|
70
|
+
data : JSONData
|
|
71
|
+
Data to write as XLSM file. Should be a list of dictionaries or a
|
|
72
|
+
single dictionary.
|
|
73
|
+
|
|
74
|
+
Returns
|
|
75
|
+
-------
|
|
76
|
+
int
|
|
77
|
+
The number of rows written to the XLSM file.
|
|
78
|
+
"""
|
|
79
|
+
return stub.write(path, data, format_name='XLSM')
|
etlplus/file/xpt.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""
|
|
2
|
+
:mod:`etlplus.file.xpt` module.
|
|
3
|
+
|
|
4
|
+
Helpers for reading/writing SAS Transport (XPT) files.
|
|
5
|
+
|
|
6
|
+
Notes
|
|
7
|
+
-----
|
|
8
|
+
- A SAS Transport (XPT) file is a standardized file format used to transfer
|
|
9
|
+
SAS datasets between different systems.
|
|
10
|
+
- Common cases:
|
|
11
|
+
- Sharing datasets between different SAS installations.
|
|
12
|
+
- Archiving datasets in a platform-independent format.
|
|
13
|
+
- Importing/exporting data to/from statistical software that supports XPT.
|
|
14
|
+
- Rule of thumb:
|
|
15
|
+
- If you need to work with XPT files, use this module for reading
|
|
16
|
+
and writing.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
|
|
23
|
+
from ..types import JSONData
|
|
24
|
+
from ..types import JSONList
|
|
25
|
+
from . import stub
|
|
26
|
+
|
|
27
|
+
# SECTION: EXPORTS ========================================================== #
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
__all__ = [
|
|
31
|
+
'read',
|
|
32
|
+
'write',
|
|
33
|
+
]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# SECTION: FUNCTIONS ======================================================== #
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def read(
|
|
40
|
+
path: Path,
|
|
41
|
+
) -> JSONList:
|
|
42
|
+
"""
|
|
43
|
+
Read XPT content from ``path``.
|
|
44
|
+
|
|
45
|
+
Parameters
|
|
46
|
+
----------
|
|
47
|
+
path : Path
|
|
48
|
+
Path to the XPT file on disk.
|
|
49
|
+
|
|
50
|
+
Returns
|
|
51
|
+
-------
|
|
52
|
+
JSONList
|
|
53
|
+
The list of dictionaries read from the XPT file.
|
|
54
|
+
"""
|
|
55
|
+
return stub.read(path, format_name='XPT')
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def write(
|
|
59
|
+
path: Path,
|
|
60
|
+
data: JSONData,
|
|
61
|
+
) -> int:
|
|
62
|
+
"""
|
|
63
|
+
Write ``data`` to XPT file at ``path`` and return record count.
|
|
64
|
+
|
|
65
|
+
Parameters
|
|
66
|
+
----------
|
|
67
|
+
path : Path
|
|
68
|
+
Path to the XPT file on disk.
|
|
69
|
+
data : JSONData
|
|
70
|
+
Data to write as XPT file. Should be a list of dictionaries or a
|
|
71
|
+
single dictionary.
|
|
72
|
+
|
|
73
|
+
Returns
|
|
74
|
+
-------
|
|
75
|
+
int
|
|
76
|
+
The number of rows written to the XPT file.
|
|
77
|
+
"""
|
|
78
|
+
return stub.write(path, data, format_name='XPT')
|
etlplus/file/zsav.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""
|
|
2
|
+
:mod:`etlplus.file.zsav` module.
|
|
3
|
+
|
|
4
|
+
Helpers for reading/writing compressed SPSS (ZSAV) data files.
|
|
5
|
+
|
|
6
|
+
Notes
|
|
7
|
+
-----
|
|
8
|
+
- A ZSAV file is a compressed binary file format used by SPSS to store
|
|
9
|
+
datasets, including variables, labels, and data types.
|
|
10
|
+
- Common cases:
|
|
11
|
+
- Reading compressed data for analysis in Python.
|
|
12
|
+
- Writing processed data back to compressed SPSS format.
|
|
13
|
+
- Rule of thumb:
|
|
14
|
+
- If you need to work with compressed SPSS data files, use this module for
|
|
15
|
+
reading and writing.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
from ..types import JSONData
|
|
23
|
+
from ..types import JSONList
|
|
24
|
+
from . import stub
|
|
25
|
+
|
|
26
|
+
# SECTION: EXPORTS ========================================================== #
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
__all__ = [
|
|
30
|
+
'read',
|
|
31
|
+
'write',
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# SECTION: FUNCTIONS ======================================================== #
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def read(
|
|
39
|
+
path: Path,
|
|
40
|
+
) -> JSONList:
|
|
41
|
+
"""
|
|
42
|
+
Read ZSAV content from ``path``.
|
|
43
|
+
|
|
44
|
+
Parameters
|
|
45
|
+
----------
|
|
46
|
+
path : Path
|
|
47
|
+
Path to the ZSAV file on disk.
|
|
48
|
+
|
|
49
|
+
Returns
|
|
50
|
+
-------
|
|
51
|
+
JSONList
|
|
52
|
+
The list of dictionaries read from the ZSAV file.
|
|
53
|
+
"""
|
|
54
|
+
return stub.read(path, format_name='ZSAV')
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def write(
|
|
58
|
+
path: Path,
|
|
59
|
+
data: JSONData,
|
|
60
|
+
) -> int:
|
|
61
|
+
"""
|
|
62
|
+
Write ``data`` to ZSAV file at ``path`` and return record count.
|
|
63
|
+
|
|
64
|
+
Parameters
|
|
65
|
+
----------
|
|
66
|
+
path : Path
|
|
67
|
+
Path to the ZSAV file on disk.
|
|
68
|
+
data : JSONData
|
|
69
|
+
Data to write as ZSAV file. Should be a list of dictionaries or a
|
|
70
|
+
single dictionary.
|
|
71
|
+
|
|
72
|
+
Returns
|
|
73
|
+
-------
|
|
74
|
+
int
|
|
75
|
+
The number of rows written to the ZSAV file.
|
|
76
|
+
"""
|
|
77
|
+
return stub.write(path, data, format_name='ZSAV')
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# etlplus.validation subpackage
|
|
1
|
+
# etlplus.ops subpackage
|
|
2
2
|
|
|
3
3
|
Documentation for the `etlplus.validation` subpackage: data validation utilities and helpers.
|
|
4
4
|
|
|
@@ -8,7 +8,7 @@ Documentation for the `etlplus.validation` subpackage: data validation utilities
|
|
|
8
8
|
|
|
9
9
|
Back to project overview: see the top-level [README](../../README.md).
|
|
10
10
|
|
|
11
|
-
- [etlplus.validation subpackage](#etlplusvalidation-subpackage)
|
|
11
|
+
- [etlplus.ops subpackage](#etlplusops-subpackage)
|
|
12
12
|
- [Validation Features](#validation-features)
|
|
13
13
|
- [Defining Validation Rules](#defining-validation-rules)
|
|
14
14
|
- [Example: Validating Data](#example-validating-data)
|
etlplus/ops/__init__.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""
|
|
2
|
+
:mod:`etlplus.ops` package.
|
|
3
|
+
|
|
4
|
+
Data operations helpers.
|
|
5
|
+
|
|
6
|
+
Importing :mod:`etlplus.ops` exposes the coarse-grained helpers most users care
|
|
7
|
+
about: ``extract``, ``transform``, ``load``, ``validate``, ``run``, and
|
|
8
|
+
``run_pipeline``. Each helper delegates to the richer modules under
|
|
9
|
+
``etlplus.ops.*`` while presenting a compact public API surface. Conditional
|
|
10
|
+
validation orchestration is available via
|
|
11
|
+
:func:`etlplus.ops.utils.maybe_validate`. The legacy compatibility module
|
|
12
|
+
:mod:`etlplus.validation` is deprecated in favor of this package.
|
|
13
|
+
|
|
14
|
+
Examples
|
|
15
|
+
--------
|
|
16
|
+
>>> from etlplus.ops import extract, transform
|
|
17
|
+
>>> raw = extract('file', 'input.json')
|
|
18
|
+
>>> curated = transform(raw, {'select': ['id', 'name']})
|
|
19
|
+
|
|
20
|
+
>>> from etlplus.ops.utils import maybe_validate
|
|
21
|
+
>>> payload = {'name': 'Alice'}
|
|
22
|
+
>>> rules = {'required': ['name']}
|
|
23
|
+
>>> def validator(data, config):
|
|
24
|
+
... missing = [field for field in config['required'] if field not in data]
|
|
25
|
+
... return {'valid': not missing, 'errors': missing, 'data': data}
|
|
26
|
+
>>> maybe_validate(
|
|
27
|
+
... payload,
|
|
28
|
+
... when='both',
|
|
29
|
+
... enabled=True,
|
|
30
|
+
... rules=rules,
|
|
31
|
+
... phase='before_transform',
|
|
32
|
+
... severity='warn',
|
|
33
|
+
... validate_fn=validator,
|
|
34
|
+
... print_json_fn=lambda message: message,
|
|
35
|
+
... )
|
|
36
|
+
{'name': 'Alice'}
|
|
37
|
+
|
|
38
|
+
See Also
|
|
39
|
+
--------
|
|
40
|
+
:mod:`etlplus.ops.run`
|
|
41
|
+
:mod:`etlplus.ops.utils`
|
|
42
|
+
"""
|
|
43
|
+
|
|
44
|
+
from .extract import extract
|
|
45
|
+
from .load import load
|
|
46
|
+
from .run import run
|
|
47
|
+
from .run import run_pipeline
|
|
48
|
+
from .transform import transform
|
|
49
|
+
from .validate import validate
|
|
50
|
+
|
|
51
|
+
# SECTION: EXPORTS ========================================================== #
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
__all__ = [
|
|
55
|
+
'extract',
|
|
56
|
+
'load',
|
|
57
|
+
'run',
|
|
58
|
+
'run_pipeline',
|
|
59
|
+
'transform',
|
|
60
|
+
'validate',
|
|
61
|
+
]
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
:mod:`etlplus.extract` module.
|
|
2
|
+
:mod:`etlplus.ops.extract` module.
|
|
3
3
|
|
|
4
4
|
Helpers to extract data from files, databases, and REST APIs.
|
|
5
5
|
"""
|
|
@@ -10,56 +10,81 @@ from pathlib import Path
|
|
|
10
10
|
from typing import Any
|
|
11
11
|
from typing import cast
|
|
12
12
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
from
|
|
16
|
-
from
|
|
17
|
-
from
|
|
18
|
-
from
|
|
19
|
-
from
|
|
20
|
-
from
|
|
21
|
-
from
|
|
22
|
-
from .types import StrPath
|
|
13
|
+
from ..api import HttpMethod
|
|
14
|
+
from ..api.utils import resolve_request
|
|
15
|
+
from ..enums import DataConnectorType
|
|
16
|
+
from ..file import File
|
|
17
|
+
from ..file import FileFormat
|
|
18
|
+
from ..types import JSONData
|
|
19
|
+
from ..types import JSONDict
|
|
20
|
+
from ..types import JSONList
|
|
21
|
+
from ..types import StrPath
|
|
23
22
|
|
|
24
23
|
# SECTION: FUNCTIONS ======================================================== #
|
|
25
24
|
|
|
26
25
|
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
file_path: StrPath,
|
|
32
|
-
file_format: FileFormat | str | None = FileFormat.JSON,
|
|
26
|
+
def extract_from_api(
|
|
27
|
+
url: str,
|
|
28
|
+
method: HttpMethod | str = HttpMethod.GET,
|
|
29
|
+
**kwargs: Any,
|
|
33
30
|
) -> JSONData:
|
|
34
31
|
"""
|
|
35
|
-
Extract (semi-)structured data from a local file.
|
|
32
|
+
Extract data from a REST API.
|
|
36
33
|
|
|
37
34
|
Parameters
|
|
38
35
|
----------
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
36
|
+
url : str
|
|
37
|
+
API endpoint URL.
|
|
38
|
+
method : HttpMethod | str, optional
|
|
39
|
+
HTTP method to use. Defaults to ``GET``.
|
|
40
|
+
**kwargs : Any
|
|
41
|
+
Extra arguments forwarded to the underlying ``requests`` call
|
|
42
|
+
(for example, ``timeout``). To use a pre-configured
|
|
43
|
+
:class:`requests.Session`, provide it via ``session``.
|
|
44
|
+
When omitted, ``timeout`` defaults to 10 seconds.
|
|
45
45
|
|
|
46
46
|
Returns
|
|
47
47
|
-------
|
|
48
48
|
JSONData
|
|
49
|
-
Parsed
|
|
50
|
-
"""
|
|
51
|
-
path = Path(file_path)
|
|
52
|
-
|
|
53
|
-
# If no explicit format is provided, let File infer from extension.
|
|
54
|
-
if file_format is None:
|
|
55
|
-
return File(path, None).read()
|
|
56
|
-
fmt = FileFormat.coerce(file_format)
|
|
49
|
+
Parsed JSON payload, or a fallback object with raw text.
|
|
57
50
|
|
|
58
|
-
|
|
59
|
-
|
|
51
|
+
Raises
|
|
52
|
+
------
|
|
53
|
+
TypeError
|
|
54
|
+
If a provided ``session`` does not expose the required HTTP
|
|
55
|
+
method (for example, ``get``).
|
|
56
|
+
"""
|
|
57
|
+
timeout = kwargs.pop('timeout', None)
|
|
58
|
+
session = kwargs.pop('session', None)
|
|
59
|
+
request_callable, timeout, _ = resolve_request(
|
|
60
|
+
method,
|
|
61
|
+
session=session,
|
|
62
|
+
timeout=timeout,
|
|
63
|
+
)
|
|
64
|
+
response = request_callable(url, timeout=timeout, **kwargs)
|
|
65
|
+
response.raise_for_status()
|
|
60
66
|
|
|
67
|
+
content_type = response.headers.get('content-type', '').lower()
|
|
68
|
+
if 'application/json' in content_type:
|
|
69
|
+
try:
|
|
70
|
+
payload: Any = response.json()
|
|
71
|
+
except ValueError:
|
|
72
|
+
# Malformed JSON despite content-type; fall back to text
|
|
73
|
+
return {
|
|
74
|
+
'content': response.text,
|
|
75
|
+
'content_type': content_type,
|
|
76
|
+
}
|
|
77
|
+
if isinstance(payload, dict):
|
|
78
|
+
return cast(JSONDict, payload)
|
|
79
|
+
if isinstance(payload, list):
|
|
80
|
+
if all(isinstance(x, dict) for x in payload):
|
|
81
|
+
return cast(JSONList, payload)
|
|
82
|
+
# Coerce non-dict array items into objects for consistency
|
|
83
|
+
return [{'value': x} for x in payload]
|
|
84
|
+
# Fallback: wrap scalar JSON
|
|
85
|
+
return {'value': payload}
|
|
61
86
|
|
|
62
|
-
|
|
87
|
+
return {'content': response.text, 'content_type': content_type}
|
|
63
88
|
|
|
64
89
|
|
|
65
90
|
def extract_from_database(
|
|
@@ -94,77 +119,36 @@ def extract_from_database(
|
|
|
94
119
|
]
|
|
95
120
|
|
|
96
121
|
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
def extract_from_api(
|
|
101
|
-
url: str,
|
|
102
|
-
method: HttpMethod | str = HttpMethod.GET,
|
|
103
|
-
**kwargs: Any,
|
|
122
|
+
def extract_from_file(
|
|
123
|
+
file_path: StrPath,
|
|
124
|
+
file_format: FileFormat | str | None = FileFormat.JSON,
|
|
104
125
|
) -> JSONData:
|
|
105
126
|
"""
|
|
106
|
-
Extract data from a REST API.
|
|
127
|
+
Extract (semi-)structured data from a local file.
|
|
107
128
|
|
|
108
129
|
Parameters
|
|
109
130
|
----------
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
(for example, ``timeout``). To use a pre-configured
|
|
117
|
-
:class:`requests.Session`, provide it via ``session``.
|
|
131
|
+
file_path : StrPath
|
|
132
|
+
Source file path.
|
|
133
|
+
file_format : FileFormat | str | None, optional
|
|
134
|
+
File format to parse. If ``None``, infer from the filename
|
|
135
|
+
extension. Defaults to ``'json'`` when omitted, for backward
|
|
136
|
+
compatibility.
|
|
118
137
|
|
|
119
138
|
Returns
|
|
120
139
|
-------
|
|
121
140
|
JSONData
|
|
122
|
-
Parsed
|
|
123
|
-
|
|
124
|
-
Raises
|
|
125
|
-
------
|
|
126
|
-
TypeError
|
|
127
|
-
If a provided ``session`` does not expose the required HTTP
|
|
128
|
-
method (for example, ``get``).
|
|
141
|
+
Parsed data as a mapping or a list of mappings.
|
|
129
142
|
"""
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
# Apply a conservative timeout to guard against hanging requests.
|
|
133
|
-
timeout = kwargs.pop('timeout', 10.0)
|
|
134
|
-
session = kwargs.pop('session', None)
|
|
135
|
-
requester = session or requests
|
|
136
|
-
|
|
137
|
-
request_callable = getattr(requester, http_method.value, None)
|
|
138
|
-
if not callable(request_callable):
|
|
139
|
-
raise TypeError(
|
|
140
|
-
'Session object must supply a callable'
|
|
141
|
-
f'"{http_method.value}" method',
|
|
142
|
-
)
|
|
143
|
-
|
|
144
|
-
response = request_callable(url, timeout=timeout, **kwargs)
|
|
145
|
-
response.raise_for_status()
|
|
143
|
+
path = Path(file_path)
|
|
146
144
|
|
|
147
|
-
|
|
148
|
-
if
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
except ValueError:
|
|
152
|
-
# Malformed JSON despite content-type; fall back to text
|
|
153
|
-
return {
|
|
154
|
-
'content': response.text,
|
|
155
|
-
'content_type': content_type,
|
|
156
|
-
}
|
|
157
|
-
if isinstance(payload, dict):
|
|
158
|
-
return cast(JSONDict, payload)
|
|
159
|
-
if isinstance(payload, list):
|
|
160
|
-
if all(isinstance(x, dict) for x in payload):
|
|
161
|
-
return cast(JSONList, payload)
|
|
162
|
-
# Coerce non-dict array items into objects for consistency
|
|
163
|
-
return [{'value': x} for x in payload]
|
|
164
|
-
# Fallback: wrap scalar JSON
|
|
165
|
-
return {'value': payload}
|
|
145
|
+
# If no explicit format is provided, let File infer from extension.
|
|
146
|
+
if file_format is None:
|
|
147
|
+
return File(path, None).read()
|
|
148
|
+
fmt = FileFormat.coerce(file_format)
|
|
166
149
|
|
|
167
|
-
|
|
150
|
+
# Let file module perform existence and format validation.
|
|
151
|
+
return File(path, fmt).read()
|
|
168
152
|
|
|
169
153
|
|
|
170
154
|
# -- Orchestration -- #
|