pycharter 0.0.22-py3-none-any.whl → 0.0.24-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
- api/main.py +27 -1
- api/models/docs.py +68 -0
- api/models/evolution.py +117 -0
- api/models/tracking.py +111 -0
- api/models/validation.py +46 -6
- api/routes/v1/__init__.py +14 -1
- api/routes/v1/docs.py +187 -0
- api/routes/v1/evolution.py +337 -0
- api/routes/v1/templates.py +211 -27
- api/routes/v1/tracking.py +301 -0
- api/routes/v1/validation.py +68 -31
- pycharter/__init__.py +268 -58
- pycharter/data/templates/contract/template_coercion_rules.yaml +57 -0
- pycharter/data/templates/contract/template_contract.yaml +122 -0
- pycharter/data/templates/contract/template_metadata.yaml +68 -0
- pycharter/data/templates/contract/template_schema.yaml +100 -0
- pycharter/data/templates/contract/template_validation_rules.yaml +75 -0
- pycharter/data/templates/etl/README.md +224 -0
- pycharter/data/templates/etl/extract_cloud_azure.yaml +24 -0
- pycharter/data/templates/etl/extract_cloud_gcs.yaml +25 -0
- pycharter/data/templates/etl/extract_cloud_s3.yaml +30 -0
- pycharter/data/templates/etl/extract_database.yaml +34 -0
- pycharter/data/templates/etl/extract_database_ssh.yaml +40 -0
- pycharter/data/templates/etl/extract_file_csv.yaml +21 -0
- pycharter/data/templates/etl/extract_file_glob.yaml +25 -0
- pycharter/data/templates/etl/extract_file_json.yaml +24 -0
- pycharter/data/templates/etl/extract_file_parquet.yaml +20 -0
- pycharter/data/templates/etl/extract_http_paginated.yaml +79 -0
- pycharter/data/templates/etl/extract_http_path_params.yaml +38 -0
- pycharter/data/templates/etl/extract_http_simple.yaml +62 -0
- pycharter/data/templates/etl/load_cloud_azure.yaml +24 -0
- pycharter/data/templates/etl/load_cloud_gcs.yaml +22 -0
- pycharter/data/templates/etl/load_cloud_s3.yaml +27 -0
- pycharter/data/templates/etl/load_file.yaml +34 -0
- pycharter/data/templates/etl/load_insert.yaml +18 -0
- pycharter/data/templates/etl/load_postgresql.yaml +39 -0
- pycharter/data/templates/etl/load_sqlite.yaml +21 -0
- pycharter/data/templates/etl/load_truncate_and_load.yaml +20 -0
- pycharter/data/templates/etl/load_upsert.yaml +25 -0
- pycharter/data/templates/etl/load_with_dlq.yaml +34 -0
- pycharter/data/templates/etl/load_with_ssh_tunnel.yaml +35 -0
- pycharter/data/templates/etl/pipeline_http_to_db.yaml +75 -0
- pycharter/data/templates/etl/transform_combined.yaml +48 -0
- pycharter/data/templates/etl/transform_custom_function.yaml +58 -0
- pycharter/data/templates/etl/transform_jsonata.yaml +51 -0
- pycharter/data/templates/etl/transform_simple.yaml +59 -0
- pycharter/db/schemas/.ipynb_checkpoints/data_contract-checkpoint.py +160 -0
- pycharter/docs_generator/__init__.py +43 -0
- pycharter/docs_generator/generator.py +465 -0
- pycharter/docs_generator/renderers.py +247 -0
- pycharter/etl_generator/__init__.py +168 -80
- pycharter/etl_generator/builder.py +121 -0
- pycharter/etl_generator/config_loader.py +394 -0
- pycharter/etl_generator/config_validator.py +418 -0
- pycharter/etl_generator/context.py +132 -0
- pycharter/etl_generator/expression.py +499 -0
- pycharter/etl_generator/extractors/__init__.py +30 -0
- pycharter/etl_generator/extractors/base.py +70 -0
- pycharter/etl_generator/extractors/cloud_storage.py +530 -0
- pycharter/etl_generator/extractors/database.py +221 -0
- pycharter/etl_generator/extractors/factory.py +185 -0
- pycharter/etl_generator/extractors/file.py +475 -0
- pycharter/etl_generator/extractors/http.py +895 -0
- pycharter/etl_generator/extractors/streaming.py +57 -0
- pycharter/etl_generator/loaders/__init__.py +41 -0
- pycharter/etl_generator/loaders/base.py +35 -0
- pycharter/etl_generator/loaders/cloud.py +87 -0
- pycharter/etl_generator/loaders/cloud_storage_loader.py +275 -0
- pycharter/etl_generator/loaders/database.py +274 -0
- pycharter/etl_generator/loaders/factory.py +180 -0
- pycharter/etl_generator/loaders/file.py +72 -0
- pycharter/etl_generator/loaders/file_loader.py +130 -0
- pycharter/etl_generator/pipeline.py +743 -0
- pycharter/etl_generator/protocols.py +54 -0
- pycharter/etl_generator/result.py +63 -0
- pycharter/etl_generator/schemas/__init__.py +49 -0
- pycharter/etl_generator/transformers/__init__.py +49 -0
- pycharter/etl_generator/transformers/base.py +63 -0
- pycharter/etl_generator/transformers/config.py +45 -0
- pycharter/etl_generator/transformers/custom_function.py +101 -0
- pycharter/etl_generator/transformers/jsonata_transformer.py +56 -0
- pycharter/etl_generator/transformers/operations.py +218 -0
- pycharter/etl_generator/transformers/pipeline.py +54 -0
- pycharter/etl_generator/transformers/simple_operations.py +131 -0
- pycharter/quality/__init__.py +25 -0
- pycharter/quality/tracking/__init__.py +64 -0
- pycharter/quality/tracking/collector.py +318 -0
- pycharter/quality/tracking/exporters.py +238 -0
- pycharter/quality/tracking/models.py +194 -0
- pycharter/quality/tracking/store.py +385 -0
- pycharter/runtime_validator/__init__.py +20 -7
- pycharter/runtime_validator/builder.py +328 -0
- pycharter/runtime_validator/validator.py +311 -7
- pycharter/runtime_validator/validator_core.py +61 -0
- pycharter/schema_evolution/__init__.py +61 -0
- pycharter/schema_evolution/compatibility.py +270 -0
- pycharter/schema_evolution/diff.py +496 -0
- pycharter/schema_evolution/models.py +201 -0
- pycharter/shared/__init__.py +56 -0
- pycharter/shared/errors.py +296 -0
- pycharter/shared/protocols.py +234 -0
- {pycharter-0.0.22.dist-info → pycharter-0.0.24.dist-info}/METADATA +146 -26
- pycharter-0.0.24.dist-info/RECORD +543 -0
- {pycharter-0.0.22.dist-info → pycharter-0.0.24.dist-info}/WHEEL +1 -1
- ui/static/404/index.html +1 -1
- ui/static/404.html +1 -1
- ui/static/__next.__PAGE__.txt +1 -1
- ui/static/__next._full.txt +1 -1
- ui/static/__next._head.txt +1 -1
- ui/static/__next._index.txt +1 -1
- ui/static/__next._tree.txt +1 -1
- ui/static/_next/static/chunks/26dfc590f7714c03.js +1 -0
- ui/static/_next/static/chunks/34d289e6db2ef551.js +1 -0
- ui/static/_next/static/chunks/99508d9d5869cc27.js +1 -0
- ui/static/_next/static/chunks/b313c35a6ba76574.js +1 -0
- ui/static/_not-found/__next._full.txt +1 -1
- ui/static/_not-found/__next._head.txt +1 -1
- ui/static/_not-found/__next._index.txt +1 -1
- ui/static/_not-found/__next._not-found.__PAGE__.txt +1 -1
- ui/static/_not-found/__next._not-found.txt +1 -1
- ui/static/_not-found/__next._tree.txt +1 -1
- ui/static/_not-found/index.html +1 -1
- ui/static/_not-found/index.txt +1 -1
- ui/static/contracts/__next._full.txt +2 -2
- ui/static/contracts/__next._head.txt +1 -1
- ui/static/contracts/__next._index.txt +1 -1
- ui/static/contracts/__next._tree.txt +1 -1
- ui/static/contracts/__next.contracts.__PAGE__.txt +2 -2
- ui/static/contracts/__next.contracts.txt +1 -1
- ui/static/contracts/index.html +1 -1
- ui/static/contracts/index.txt +2 -2
- ui/static/documentation/__next._full.txt +1 -1
- ui/static/documentation/__next._head.txt +1 -1
- ui/static/documentation/__next._index.txt +1 -1
- ui/static/documentation/__next._tree.txt +1 -1
- ui/static/documentation/__next.documentation.__PAGE__.txt +1 -1
- ui/static/documentation/__next.documentation.txt +1 -1
- ui/static/documentation/index.html +2 -2
- ui/static/documentation/index.txt +1 -1
- ui/static/index.html +1 -1
- ui/static/index.txt +1 -1
- ui/static/metadata/__next._full.txt +1 -1
- ui/static/metadata/__next._head.txt +1 -1
- ui/static/metadata/__next._index.txt +1 -1
- ui/static/metadata/__next._tree.txt +1 -1
- ui/static/metadata/__next.metadata.__PAGE__.txt +1 -1
- ui/static/metadata/__next.metadata.txt +1 -1
- ui/static/metadata/index.html +1 -1
- ui/static/metadata/index.txt +1 -1
- ui/static/quality/__next._full.txt +2 -2
- ui/static/quality/__next._head.txt +1 -1
- ui/static/quality/__next._index.txt +1 -1
- ui/static/quality/__next._tree.txt +1 -1
- ui/static/quality/__next.quality.__PAGE__.txt +2 -2
- ui/static/quality/__next.quality.txt +1 -1
- ui/static/quality/index.html +2 -2
- ui/static/quality/index.txt +2 -2
- ui/static/rules/__next._full.txt +1 -1
- ui/static/rules/__next._head.txt +1 -1
- ui/static/rules/__next._index.txt +1 -1
- ui/static/rules/__next._tree.txt +1 -1
- ui/static/rules/__next.rules.__PAGE__.txt +1 -1
- ui/static/rules/__next.rules.txt +1 -1
- ui/static/rules/index.html +1 -1
- ui/static/rules/index.txt +1 -1
- ui/static/schemas/__next._full.txt +1 -1
- ui/static/schemas/__next._head.txt +1 -1
- ui/static/schemas/__next._index.txt +1 -1
- ui/static/schemas/__next._tree.txt +1 -1
- ui/static/schemas/__next.schemas.__PAGE__.txt +1 -1
- ui/static/schemas/__next.schemas.txt +1 -1
- ui/static/schemas/index.html +1 -1
- ui/static/schemas/index.txt +1 -1
- ui/static/settings/__next._full.txt +1 -1
- ui/static/settings/__next._head.txt +1 -1
- ui/static/settings/__next._index.txt +1 -1
- ui/static/settings/__next._tree.txt +1 -1
- ui/static/settings/__next.settings.__PAGE__.txt +1 -1
- ui/static/settings/__next.settings.txt +1 -1
- ui/static/settings/index.html +1 -1
- ui/static/settings/index.txt +1 -1
- ui/static/static/404/index.html +1 -1
- ui/static/static/404.html +1 -1
- ui/static/static/__next.__PAGE__.txt +1 -1
- ui/static/static/__next._full.txt +2 -2
- ui/static/static/__next._head.txt +1 -1
- ui/static/static/__next._index.txt +2 -2
- ui/static/static/__next._tree.txt +2 -2
- ui/static/static/_next/static/chunks/13d4a0fbd74c1ee4.js +1 -0
- ui/static/static/_next/static/chunks/2edb43b48432ac04.js +441 -0
- ui/static/static/_next/static/chunks/d2363397e1b2bcab.css +1 -0
- ui/static/static/_next/static/chunks/f7d1a90dd75d2572.js +1 -0
- ui/static/static/_not-found/__next._full.txt +2 -2
- ui/static/static/_not-found/__next._head.txt +1 -1
- ui/static/static/_not-found/__next._index.txt +2 -2
- ui/static/static/_not-found/__next._not-found.__PAGE__.txt +1 -1
- ui/static/static/_not-found/__next._not-found.txt +1 -1
- ui/static/static/_not-found/__next._tree.txt +2 -2
- ui/static/static/_not-found/index.html +1 -1
- ui/static/static/_not-found/index.txt +2 -2
- ui/static/static/contracts/__next._full.txt +3 -3
- ui/static/static/contracts/__next._head.txt +1 -1
- ui/static/static/contracts/__next._index.txt +2 -2
- ui/static/static/contracts/__next._tree.txt +2 -2
- ui/static/static/contracts/__next.contracts.__PAGE__.txt +2 -2
- ui/static/static/contracts/__next.contracts.txt +1 -1
- ui/static/static/contracts/index.html +1 -1
- ui/static/static/contracts/index.txt +3 -3
- ui/static/static/documentation/__next._full.txt +3 -3
- ui/static/static/documentation/__next._head.txt +1 -1
- ui/static/static/documentation/__next._index.txt +2 -2
- ui/static/static/documentation/__next._tree.txt +2 -2
- ui/static/static/documentation/__next.documentation.__PAGE__.txt +2 -2
- ui/static/static/documentation/__next.documentation.txt +1 -1
- ui/static/static/documentation/index.html +2 -2
- ui/static/static/documentation/index.txt +3 -3
- ui/static/static/index.html +1 -1
- ui/static/static/index.txt +2 -2
- ui/static/static/metadata/__next._full.txt +2 -2
- ui/static/static/metadata/__next._head.txt +1 -1
- ui/static/static/metadata/__next._index.txt +2 -2
- ui/static/static/metadata/__next._tree.txt +2 -2
- ui/static/static/metadata/__next.metadata.__PAGE__.txt +1 -1
- ui/static/static/metadata/__next.metadata.txt +1 -1
- ui/static/static/metadata/index.html +1 -1
- ui/static/static/metadata/index.txt +2 -2
- ui/static/static/quality/__next._full.txt +2 -2
- ui/static/static/quality/__next._head.txt +1 -1
- ui/static/static/quality/__next._index.txt +2 -2
- ui/static/static/quality/__next._tree.txt +2 -2
- ui/static/static/quality/__next.quality.__PAGE__.txt +1 -1
- ui/static/static/quality/__next.quality.txt +1 -1
- ui/static/static/quality/index.html +2 -2
- ui/static/static/quality/index.txt +2 -2
- ui/static/static/rules/__next._full.txt +2 -2
- ui/static/static/rules/__next._head.txt +1 -1
- ui/static/static/rules/__next._index.txt +2 -2
- ui/static/static/rules/__next._tree.txt +2 -2
- ui/static/static/rules/__next.rules.__PAGE__.txt +1 -1
- ui/static/static/rules/__next.rules.txt +1 -1
- ui/static/static/rules/index.html +1 -1
- ui/static/static/rules/index.txt +2 -2
- ui/static/static/schemas/__next._full.txt +2 -2
- ui/static/static/schemas/__next._head.txt +1 -1
- ui/static/static/schemas/__next._index.txt +2 -2
- ui/static/static/schemas/__next._tree.txt +2 -2
- ui/static/static/schemas/__next.schemas.__PAGE__.txt +1 -1
- ui/static/static/schemas/__next.schemas.txt +1 -1
- ui/static/static/schemas/index.html +1 -1
- ui/static/static/schemas/index.txt +2 -2
- ui/static/static/settings/__next._full.txt +2 -2
- ui/static/static/settings/__next._head.txt +1 -1
- ui/static/static/settings/__next._index.txt +2 -2
- ui/static/static/settings/__next._tree.txt +2 -2
- ui/static/static/settings/__next.settings.__PAGE__.txt +1 -1
- ui/static/static/settings/__next.settings.txt +1 -1
- ui/static/static/settings/index.html +1 -1
- ui/static/static/settings/index.txt +2 -2
- ui/static/static/static/.gitkeep +0 -0
- ui/static/static/static/404/index.html +1 -0
- ui/static/static/static/404.html +1 -0
- ui/static/static/static/__next.__PAGE__.txt +10 -0
- ui/static/static/static/__next._full.txt +30 -0
- ui/static/static/static/__next._head.txt +7 -0
- ui/static/static/static/__next._index.txt +9 -0
- ui/static/static/static/__next._tree.txt +2 -0
- ui/static/static/static/_next/static/chunks/222442f6da32302a.js +1 -0
- ui/static/static/static/_next/static/chunks/247eb132b7f7b574.js +1 -0
- ui/static/static/static/_next/static/chunks/297d55555b71baba.js +1 -0
- ui/static/static/static/_next/static/chunks/2ab439ce003cd691.js +1 -0
- ui/static/static/static/_next/static/chunks/414e77373f8ff61c.js +1 -0
- ui/static/static/static/_next/static/chunks/49ca65abd26ae49e.js +1 -0
- ui/static/static/static/_next/static/chunks/652ad0aa26265c47.js +2 -0
- ui/static/static/static/_next/static/chunks/9667e7a3d359eb39.js +1 -0
- ui/static/static/static/_next/static/chunks/9c23f44fff36548a.js +1 -0
- ui/static/static/static/_next/static/chunks/a6dad97d9634a72d.js +1 -0
- ui/static/static/static/_next/static/chunks/b32a0963684b9933.js +4 -0
- ui/static/static/static/_next/static/chunks/c69f6cba366bd988.js +1 -0
- ui/static/static/static/_next/static/chunks/db913959c675cea6.js +1 -0
- ui/static/static/static/_next/static/chunks/f061a4be97bfc3b3.js +1 -0
- ui/static/static/static/_next/static/chunks/f2e7afeab1178138.js +1 -0
- ui/static/static/static/_next/static/chunks/ff1a16fafef87110.js +1 -0
- ui/static/static/static/_next/static/chunks/turbopack-ffcb7ab6794027ef.js +3 -0
- ui/static/static/static/_next/static/tNTkVW6puVXC4bAm4WrHl/_buildManifest.js +11 -0
- ui/static/static/static/_next/static/tNTkVW6puVXC4bAm4WrHl/_ssgManifest.js +1 -0
- ui/static/static/static/_not-found/__next._full.txt +17 -0
- ui/static/static/static/_not-found/__next._head.txt +7 -0
- ui/static/static/static/_not-found/__next._index.txt +9 -0
- ui/static/static/static/_not-found/__next._not-found.__PAGE__.txt +5 -0
- ui/static/static/static/_not-found/__next._not-found.txt +4 -0
- ui/static/static/static/_not-found/__next._tree.txt +2 -0
- ui/static/static/static/_not-found/index.html +1 -0
- ui/static/static/static/_not-found/index.txt +17 -0
- ui/static/static/static/contracts/__next._full.txt +21 -0
- ui/static/static/static/contracts/__next._head.txt +7 -0
- ui/static/static/static/contracts/__next._index.txt +9 -0
- ui/static/static/static/contracts/__next._tree.txt +2 -0
- ui/static/static/static/contracts/__next.contracts.__PAGE__.txt +9 -0
- ui/static/static/static/contracts/__next.contracts.txt +4 -0
- ui/static/static/static/contracts/index.html +1 -0
- ui/static/static/static/contracts/index.txt +21 -0
- ui/static/static/static/documentation/__next._full.txt +21 -0
- ui/static/static/static/documentation/__next._head.txt +7 -0
- ui/static/static/static/documentation/__next._index.txt +9 -0
- ui/static/static/static/documentation/__next._tree.txt +2 -0
- ui/static/static/static/documentation/__next.documentation.__PAGE__.txt +9 -0
- ui/static/static/static/documentation/__next.documentation.txt +4 -0
- ui/static/static/static/documentation/index.html +93 -0
- ui/static/static/static/documentation/index.txt +21 -0
- ui/static/static/static/index.html +1 -0
- ui/static/static/static/index.txt +30 -0
- ui/static/static/static/metadata/__next._full.txt +21 -0
- ui/static/static/static/metadata/__next._head.txt +7 -0
- ui/static/static/static/metadata/__next._index.txt +9 -0
- ui/static/static/static/metadata/__next._tree.txt +2 -0
- ui/static/static/static/metadata/__next.metadata.__PAGE__.txt +9 -0
- ui/static/static/static/metadata/__next.metadata.txt +4 -0
- ui/static/static/static/metadata/index.html +1 -0
- ui/static/static/static/metadata/index.txt +21 -0
- ui/static/static/static/quality/__next._full.txt +21 -0
- ui/static/static/static/quality/__next._head.txt +7 -0
- ui/static/static/static/quality/__next._index.txt +9 -0
- ui/static/static/static/quality/__next._tree.txt +2 -0
- ui/static/static/static/quality/__next.quality.__PAGE__.txt +9 -0
- ui/static/static/static/quality/__next.quality.txt +4 -0
- ui/static/static/static/quality/index.html +2 -0
- ui/static/static/static/quality/index.txt +21 -0
- ui/static/static/static/rules/__next._full.txt +21 -0
- ui/static/static/static/rules/__next._head.txt +7 -0
- ui/static/static/static/rules/__next._index.txt +9 -0
- ui/static/static/static/rules/__next._tree.txt +2 -0
- ui/static/static/static/rules/__next.rules.__PAGE__.txt +9 -0
- ui/static/static/static/rules/__next.rules.txt +4 -0
- ui/static/static/static/rules/index.html +1 -0
- ui/static/static/static/rules/index.txt +21 -0
- ui/static/static/static/schemas/__next._full.txt +21 -0
- ui/static/static/static/schemas/__next._head.txt +7 -0
- ui/static/static/static/schemas/__next._index.txt +9 -0
- ui/static/static/static/schemas/__next._tree.txt +2 -0
- ui/static/static/static/schemas/__next.schemas.__PAGE__.txt +9 -0
- ui/static/static/static/schemas/__next.schemas.txt +4 -0
- ui/static/static/static/schemas/index.html +1 -0
- ui/static/static/static/schemas/index.txt +21 -0
- ui/static/static/static/settings/__next._full.txt +21 -0
- ui/static/static/static/settings/__next._head.txt +7 -0
- ui/static/static/static/settings/__next._index.txt +9 -0
- ui/static/static/static/settings/__next._tree.txt +2 -0
- ui/static/static/static/settings/__next.settings.__PAGE__.txt +9 -0
- ui/static/static/static/settings/__next.settings.txt +4 -0
- ui/static/static/static/settings/index.html +1 -0
- ui/static/static/static/settings/index.txt +21 -0
- ui/static/static/static/validation/__next._full.txt +21 -0
- ui/static/static/static/validation/__next._head.txt +7 -0
- ui/static/static/static/validation/__next._index.txt +9 -0
- ui/static/static/static/validation/__next._tree.txt +2 -0
- ui/static/static/static/validation/__next.validation.__PAGE__.txt +9 -0
- ui/static/static/static/validation/__next.validation.txt +4 -0
- ui/static/static/static/validation/index.html +1 -0
- ui/static/static/static/validation/index.txt +21 -0
- ui/static/static/validation/__next._full.txt +2 -2
- ui/static/static/validation/__next._head.txt +1 -1
- ui/static/static/validation/__next._index.txt +2 -2
- ui/static/static/validation/__next._tree.txt +2 -2
- ui/static/static/validation/__next.validation.__PAGE__.txt +1 -1
- ui/static/static/validation/__next.validation.txt +1 -1
- ui/static/static/validation/index.html +1 -1
- ui/static/static/validation/index.txt +2 -2
- ui/static/validation/__next._full.txt +2 -2
- ui/static/validation/__next._head.txt +1 -1
- ui/static/validation/__next._index.txt +1 -1
- ui/static/validation/__next._tree.txt +1 -1
- ui/static/validation/__next.validation.__PAGE__.txt +2 -2
- ui/static/validation/__next.validation.txt +1 -1
- ui/static/validation/index.html +1 -1
- ui/static/validation/index.txt +2 -2
- pycharter/data/templates/template_coercion_rules.yaml +0 -15
- pycharter/data/templates/template_contract.yaml +0 -587
- pycharter/data/templates/template_metadata.yaml +0 -38
- pycharter/data/templates/template_schema.yaml +0 -22
- pycharter/data/templates/template_transform_advanced.yaml +0 -50
- pycharter/data/templates/template_transform_simple.yaml +0 -59
- pycharter/data/templates/template_validation_rules.yaml +0 -29
- pycharter/etl_generator/extraction.py +0 -916
- pycharter/etl_generator/factory.py +0 -174
- pycharter/etl_generator/orchestrator.py +0 -1650
- pycharter/integrations/__init__.py +0 -19
- pycharter/integrations/kafka.py +0 -178
- pycharter/integrations/streaming.py +0 -100
- pycharter-0.0.22.dist-info/RECORD +0 -358
- {pycharter-0.0.22.dist-info → pycharter-0.0.24.dist-info}/entry_points.txt +0 -0
- {pycharter-0.0.22.dist-info → pycharter-0.0.24.dist-info}/licenses/LICENSE +0 -0
- {pycharter-0.0.22.dist-info → pycharter-0.0.24.dist-info}/top_level.txt +0 -0
- /ui/static/_next/static/{0rYA78L88aUyD2Uh38hhX → 2gKjNv6YvE6BcIdFthBLs}/_buildManifest.js +0 -0
- /ui/static/_next/static/{0rYA78L88aUyD2Uh38hhX → 2gKjNv6YvE6BcIdFthBLs}/_ssgManifest.js +0 -0
- /ui/static/static/_next/static/{tNTkVW6puVXC4bAm4WrHl → 0rYA78L88aUyD2Uh38hhX}/_buildManifest.js +0 -0
- /ui/static/static/_next/static/{tNTkVW6puVXC4bAm4WrHl → 0rYA78L88aUyD2Uh38hhX}/_ssgManifest.js +0 -0
- /ui/static/{_next → static/_next}/static/chunks/c4fa4f4114b7c352.js +0 -0
- /ui/static/static/{_next → static/_next}/static/chunks/4e310fe5005770a3.css +0 -0
- /ui/static/{_next → static/static/_next}/static/chunks/5e04d10c4a7b58a3.js +0 -0
- /ui/static/static/{_next → static/_next}/static/chunks/5fc14c00a2779dc5.js +0 -0
- /ui/static/{_next → static/static/_next}/static/chunks/75d88a058d8ffaa6.js +0 -0
- /ui/static/{_next → static/static/_next}/static/chunks/8c89634cf6bad76f.js +0 -0
- /ui/static/static/{_next → static/_next}/static/chunks/b584574fdc8ab13e.js +0 -0
- /ui/static/static/{_next → static/_next}/static/chunks/d5989c94d3614b3a.js +0 -0
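
The single largest removal in 0.0.24 is pycharter/etl_generator/orchestrator.py (-1650 lines), which matches the deletion hunk reproduced below. For orientation, the removed ETLOrchestrator documented its entry point in its class docstring; the following is a minimal usage sketch assembled from that docstring. It reflects the removed 0.0.22 API (the contract path is the docstring's illustrative example), not the replacement API in 0.0.24.

```python
# Minimal sketch of the 0.0.22 ETLOrchestrator usage removed in 0.0.24,
# taken from the class docstring of the deleted orchestrator.py.
# "data/examples/my_contract" is the docstring's illustrative path, not a
# directory shipped with the wheel.
import asyncio

from pycharter.etl_generator import ETLOrchestrator


async def main() -> None:
    # Loads the schema, coercion/validation rules, and extract/transform/load
    # configs from the contract directory, then runs Extract -> Transform -> Load
    # in streaming batches (per the module docstring).
    orchestrator = ETLOrchestrator(contract_dir="data/examples/my_contract")
    await orchestrator.run()


asyncio.run(main())
```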
|
@@ -1,1650 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
ETL Orchestrator - Streaming ETL pipeline with simple operations, JSONata, and custom functions.
|
|
3
|
-
|
|
4
|
-
Executes ETL pipelines: Extract → Transform (Simple Operations → JSONata → Custom Functions) → Load.
|
|
5
|
-
|
|
6
|
-
Transformation Pipeline:
|
|
7
|
-
1. Simple Operations: rename, convert, defaults, add, select, drop (declarative, easy to use)
|
|
8
|
-
2. JSONata: Powerful query language for complex transformations (full JSONata support)
|
|
9
|
-
3. Custom Functions: Import and run external Python modules/functions
|
|
10
|
-
"""
|
|
11
|
-
|
|
12
|
-
import asyncio
|
|
13
|
-
import gc
|
|
14
|
-
import importlib
|
|
15
|
-
import logging
|
|
16
|
-
import re
|
|
17
|
-
import uuid
|
|
18
|
-
import warnings
|
|
19
|
-
from collections import Counter, defaultdict
|
|
20
|
-
from datetime import datetime
|
|
21
|
-
from pathlib import Path
|
|
22
|
-
from typing import Any, AsyncIterator, Callable, Dict, List, Optional, Tuple
|
|
23
|
-
|
|
24
|
-
import jsonata
|
|
25
|
-
import yaml
|
|
26
|
-
|
|
27
|
-
from pycharter.contract_parser import ContractMetadata, parse_contract_file
|
|
28
|
-
from pycharter.etl_generator.checkpoint import CheckpointManager
|
|
29
|
-
from pycharter.etl_generator.database import get_database_connection, load_data
|
|
30
|
-
from pycharter.etl_generator.dlq import DeadLetterQueue, DLQReason
|
|
31
|
-
from pycharter.etl_generator.extraction import extract_with_pagination_streaming
|
|
32
|
-
from pycharter.etl_generator.progress import ETLProgress, ProgressTracker
|
|
33
|
-
from pycharter.utils.value_injector import resolve_values
|
|
34
|
-
|
|
35
|
-
logger = logging.getLogger(__name__)
|
|
36
|
-
|
|
37
|
-
# Optional memory monitoring
|
|
38
|
-
try:
|
|
39
|
-
import psutil
|
|
40
|
-
PSUTIL_AVAILABLE = True
|
|
41
|
-
except ImportError:
|
|
42
|
-
PSUTIL_AVAILABLE = False
|
|
43
|
-
|
|
44
|
-
DEFAULT_BATCH_SIZE = 1000
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
class ETLOrchestrator:
|
|
48
|
-
"""
|
|
49
|
-
Generic ETL Orchestrator that executes pipelines from contract artifacts and ETL configs.
|
|
50
|
-
|
|
51
|
-
Processes data in streaming mode: Extract-Batch → Transform-Batch → Load-Batch.
|
|
52
|
-
This ensures constant memory usage regardless of dataset size.
|
|
53
|
-
|
|
54
|
-
Example:
|
|
55
|
-
>>> from pycharter.etl_generator import ETLOrchestrator
|
|
56
|
-
>>> orchestrator = ETLOrchestrator(contract_dir="data/examples/my_contract")
|
|
57
|
-
>>> await orchestrator.run()
|
|
58
|
-
"""
|
|
59
|
-
|
|
60
|
-
def __init__(
|
|
61
|
-
self,
|
|
62
|
-
contract_dir: Optional[str] = None,
|
|
63
|
-
contract_file: Optional[str] = None,
|
|
64
|
-
contract_dict: Optional[Dict[str, Any]] = None,
|
|
65
|
-
contract_metadata: Optional[ContractMetadata] = None,
|
|
66
|
-
checkpoint_dir: Optional[str] = None,
|
|
67
|
-
progress_callback: Optional[Callable[[ETLProgress], None]] = None,
|
|
68
|
-
verbose: bool = True,
|
|
69
|
-
max_memory_mb: Optional[int] = None,
|
|
70
|
-
config_context: Optional[Dict[str, Any]] = None,
|
|
71
|
-
# ETL config options (alternative to loading from contract_dir)
|
|
72
|
-
extract_config: Optional[Dict[str, Any]] = None,
|
|
73
|
-
transform_config: Optional[Dict[str, Any]] = None,
|
|
74
|
-
load_config: Optional[Dict[str, Any]] = None,
|
|
75
|
-
extract_file: Optional[str] = None,
|
|
76
|
-
transform_file: Optional[str] = None,
|
|
77
|
-
load_file: Optional[str] = None,
|
|
78
|
-
):
|
|
79
|
-
"""
|
|
80
|
-
Initialize the ETL orchestrator with contract artifacts.
|
|
81
|
-
|
|
82
|
-
Args:
|
|
83
|
-
contract_dir: Directory containing contract files and ETL configs
|
|
84
|
-
contract_file: Path to complete contract file (YAML/JSON)
|
|
85
|
-
contract_dict: Contract as dictionary
|
|
86
|
-
contract_metadata: ContractMetadata object (from parse_contract)
|
|
87
|
-
checkpoint_dir: Directory for checkpoint files (None = disabled)
|
|
88
|
-
progress_callback: Optional callback for progress updates
|
|
89
|
-
verbose: If True, print progress to stdout
|
|
90
|
-
max_memory_mb: Maximum memory usage in MB (None = no limit)
|
|
91
|
-
config_context: Optional context dictionary for value injection.
|
|
92
|
-
Values in this dict have highest priority when resolving
|
|
93
|
-
variables in config files (e.g., ${VAR}).
|
|
94
|
-
Useful for injecting application-level settings.
|
|
95
|
-
extract_config: Optional extract configuration as dictionary.
|
|
96
|
-
If provided, overrides extract.yaml from contract_dir.
|
|
97
|
-
transform_config: Optional transform configuration as dictionary.
|
|
98
|
-
If provided, overrides transform.yaml from contract_dir.
|
|
99
|
-
load_config: Optional load configuration as dictionary.
|
|
100
|
-
If provided, overrides load.yaml from contract_dir.
|
|
101
|
-
extract_file: Optional path to extract.yaml file.
|
|
102
|
-
If provided, overrides extract.yaml from contract_dir.
|
|
103
|
-
transform_file: Optional path to transform.yaml file.
|
|
104
|
-
If provided, overrides transform.yaml from contract_dir.
|
|
105
|
-
load_file: Optional path to load.yaml file.
|
|
106
|
-
If provided, overrides load.yaml from contract_dir.
|
|
107
|
-
|
|
108
|
-
Note:
|
|
109
|
-
ETL config priority: direct dict > file path > contract_dir
|
|
110
|
-
If contract_dir is not provided, you must provide extract_config/transform_config/load_config
|
|
111
|
-
or extract_file/transform_file/load_file.
|
|
112
|
-
|
|
113
|
-
Note:
|
|
114
|
-
Tables must be created manually or via migrations (e.g., Alembic).
|
|
115
|
-
PyCharter no longer creates tables from schema.json.
|
|
116
|
-
"""
|
|
117
|
-
self.contract_dir: Optional[Path] = None
|
|
118
|
-
self.schema: Optional[Dict[str, Any]] = None
|
|
119
|
-
self.coercion_rules: Dict[str, Any] = {}
|
|
120
|
-
self.validation_rules: Dict[str, Any] = {}
|
|
121
|
-
self.input_params: Dict[str, Dict[str, Any]] = {}
|
|
122
|
-
|
|
123
|
-
# Configuration context for value injection
|
|
124
|
-
self.config_context = config_context or {}
|
|
125
|
-
|
|
126
|
-
# Store ETL config parameters for later loading
|
|
127
|
-
self._extract_config_param = extract_config
|
|
128
|
-
self._transform_config_param = transform_config
|
|
129
|
-
self._load_config_param = load_config
|
|
130
|
-
self._extract_file_param = extract_file
|
|
131
|
-
self._transform_file_param = transform_file
|
|
132
|
-
self._load_file_param = load_file
|
|
133
|
-
|
|
134
|
-
# Enhanced features
|
|
135
|
-
self.checkpoint_manager = CheckpointManager(checkpoint_dir)
|
|
136
|
-
self.progress_tracker = ProgressTracker(progress_callback, verbose)
|
|
137
|
-
self.max_memory_mb = max_memory_mb
|
|
138
|
-
self.process = None
|
|
139
|
-
if PSUTIL_AVAILABLE:
|
|
140
|
-
self.process = psutil.Process()
|
|
141
|
-
|
|
142
|
-
# Logging context
|
|
143
|
-
self.run_id: Optional[str] = None # Correlation ID for this run
|
|
144
|
-
self._current_stage: Optional[str] = None # Current pipeline stage
|
|
145
|
-
|
|
146
|
-
# Load contract artifacts
|
|
147
|
-
if contract_metadata:
|
|
148
|
-
self._load_from_metadata(contract_metadata)
|
|
149
|
-
elif contract_dict:
|
|
150
|
-
self._load_from_dict(contract_dict)
|
|
151
|
-
elif contract_file:
|
|
152
|
-
file_path = Path(contract_file)
|
|
153
|
-
self.contract_dir = file_path.parent
|
|
154
|
-
self._load_from_file(file_path)
|
|
155
|
-
elif contract_dir:
|
|
156
|
-
self.contract_dir = Path(contract_dir)
|
|
157
|
-
self._load_from_directory(self.contract_dir)
|
|
158
|
-
else:
|
|
159
|
-
# If no contract source provided, we still need contract_dir for ETL configs
|
|
160
|
-
# unless all ETL configs are provided directly
|
|
161
|
-
if not (extract_config or extract_file) and not contract_dir:
|
|
162
|
-
raise ValueError(
|
|
163
|
-
"Must provide one of: contract_dir, contract_file, contract_dict, "
|
|
164
|
-
"contract_metadata, or extract_config/extract_file"
|
|
165
|
-
)
|
|
166
|
-
# Set contract_dir to None if not provided (ETL configs will be loaded from params)
|
|
167
|
-
self.contract_dir = None
|
|
168
|
-
|
|
169
|
-
# Load ETL configurations (extract, transform, load)
|
|
170
|
-
# Priority: direct dict > file path > contract_dir
|
|
171
|
-
self._load_etl_configs()
|
|
172
|
-
|
|
173
|
-
# ============================================================================
|
|
174
|
-
# INITIALIZATION AND CONFIGURATION LOADING
|
|
175
|
-
# ============================================================================
|
|
176
|
-
|
|
177
|
-
def _load_from_metadata(self, metadata: ContractMetadata) -> None:
|
|
178
|
-
"""Load contract from ContractMetadata object."""
|
|
179
|
-
self.schema = metadata.schema
|
|
180
|
-
self.coercion_rules = metadata.coercion_rules or {}
|
|
181
|
-
self.validation_rules = metadata.validation_rules or {}
|
|
182
|
-
|
|
183
|
-
def _load_from_dict(self, contract: Dict[str, Any]) -> None:
|
|
184
|
-
"""Load contract from dictionary."""
|
|
185
|
-
self.schema = contract.get("schema")
|
|
186
|
-
if not self.schema:
|
|
187
|
-
raise ValueError("Contract dictionary must contain 'schema'")
|
|
188
|
-
|
|
189
|
-
self.coercion_rules = self._extract_rules(contract.get("coercion_rules", {}))
|
|
190
|
-
self.validation_rules = self._extract_rules(contract.get("validation_rules", {}))
|
|
191
|
-
|
|
192
|
-
@staticmethod
|
|
193
|
-
def _extract_rules(rules_data: Any) -> Dict[str, Any]:
|
|
194
|
-
"""Extract rules from various formats."""
|
|
195
|
-
if not isinstance(rules_data, dict):
|
|
196
|
-
return {}
|
|
197
|
-
|
|
198
|
-
if "rules" in rules_data:
|
|
199
|
-
return rules_data["rules"]
|
|
200
|
-
elif not any(k in rules_data for k in ["version", "description", "title"]):
|
|
201
|
-
return rules_data
|
|
202
|
-
else:
|
|
203
|
-
return {}
|
|
204
|
-
|
|
205
|
-
def _load_from_file(self, file_path: Path) -> None:
|
|
206
|
-
"""Load contract from file."""
|
|
207
|
-
contract_metadata = parse_contract_file(str(file_path))
|
|
208
|
-
self._load_from_metadata(contract_metadata)
|
|
209
|
-
|
|
210
|
-
def _load_from_directory(self, contract_dir: Path) -> None:
|
|
211
|
-
"""Load contract components from directory."""
|
|
212
|
-
if not contract_dir.exists():
|
|
213
|
-
raise ValueError(f"Contract directory not found: {contract_dir}")
|
|
214
|
-
|
|
215
|
-
# Load schema (required) - support both YAML and JSON
|
|
216
|
-
schema_path_yaml = contract_dir / "schema.yaml"
|
|
217
|
-
schema_path_json = contract_dir / "schema.json"
|
|
218
|
-
|
|
219
|
-
schema_path = None
|
|
220
|
-
if schema_path_yaml.exists():
|
|
221
|
-
schema_path = schema_path_yaml
|
|
222
|
-
elif schema_path_json.exists():
|
|
223
|
-
schema_path = schema_path_json
|
|
224
|
-
else:
|
|
225
|
-
# Try to find JSON schema files with dataset name pattern
|
|
226
|
-
dataset_name = contract_dir.name
|
|
227
|
-
possible_json_schemas = [
|
|
228
|
-
contract_dir / f"{dataset_name}_schema.json",
|
|
229
|
-
contract_dir / f"{dataset_name}.schema.json",
|
|
230
|
-
contract_dir / "schema.json",
|
|
231
|
-
]
|
|
232
|
-
for possible_path in possible_json_schemas:
|
|
233
|
-
if possible_path.exists():
|
|
234
|
-
schema_path = possible_path
|
|
235
|
-
break
|
|
236
|
-
|
|
237
|
-
if schema_path and schema_path.exists():
|
|
238
|
-
if schema_path.suffix == '.json':
|
|
239
|
-
import json
|
|
240
|
-
with open(schema_path, 'r', encoding='utf-8') as f:
|
|
241
|
-
self.schema = json.load(f)
|
|
242
|
-
else:
|
|
243
|
-
self.schema = self._load_yaml(schema_path)
|
|
244
|
-
else:
|
|
245
|
-
raise ValueError(
|
|
246
|
-
f"Schema file not found in {contract_dir}. "
|
|
247
|
-
f"Expected: schema.yaml, schema.json, or {contract_dir.name}_schema.json"
|
|
248
|
-
)
|
|
249
|
-
|
|
250
|
-
# Load coercion rules (optional)
|
|
251
|
-
coercion_path = contract_dir / "coercion_rules.yaml"
|
|
252
|
-
if coercion_path.exists():
|
|
253
|
-
coercion_data = self._load_yaml(coercion_path)
|
|
254
|
-
self.coercion_rules = self._extract_rules(coercion_data)
|
|
255
|
-
|
|
256
|
-
# Load validation rules (optional)
|
|
257
|
-
validation_path = contract_dir / "validation_rules.yaml"
|
|
258
|
-
if validation_path.exists():
|
|
259
|
-
validation_data = self._load_yaml(validation_path)
|
|
260
|
-
self.validation_rules = self._extract_rules(validation_data)
|
|
261
|
-
|
|
262
|
-
def _load_etl_configs(self) -> None:
|
|
263
|
-
"""
|
|
264
|
-
Load ETL configuration files (extract, transform, load).
|
|
265
|
-
|
|
266
|
-
Priority order:
|
|
267
|
-
1. Direct dictionary parameters (extract_config, transform_config, load_config)
|
|
268
|
-
2. File path parameters (extract_file, transform_file, load_file)
|
|
269
|
-
3. Files in contract_dir (extract.yaml, transform.yaml, load.yaml)
|
|
270
|
-
"""
|
|
271
|
-
# Load extract config (required)
|
|
272
|
-
self.extract_config = self._load_single_config(
|
|
273
|
-
config_param=self._extract_config_param,
|
|
274
|
-
file_param=self._extract_file_param,
|
|
275
|
-
default_filename="extract.yaml",
|
|
276
|
-
required=True,
|
|
277
|
-
config_name="Extract"
|
|
278
|
-
)
|
|
279
|
-
|
|
280
|
-
# Load transform config (optional)
|
|
281
|
-
self.transform_config = self._load_single_config(
|
|
282
|
-
config_param=self._transform_config_param,
|
|
283
|
-
file_param=self._transform_file_param,
|
|
284
|
-
default_filename="transform.yaml",
|
|
285
|
-
required=False,
|
|
286
|
-
config_name="Transform"
|
|
287
|
-
)
|
|
288
|
-
|
|
289
|
-
# Load load config (required)
|
|
290
|
-
self.load_config = self._load_single_config(
|
|
291
|
-
config_param=self._load_config_param,
|
|
292
|
-
file_param=self._load_file_param,
|
|
293
|
-
default_filename="load.yaml",
|
|
294
|
-
required=True,
|
|
295
|
-
config_name="Load"
|
|
296
|
-
)
|
|
297
|
-
|
|
298
|
-
# Parse input parameters from extract config
|
|
299
|
-
self._parse_input_params()
|
|
300
|
-
|
|
301
|
-
if not self.schema:
|
|
302
|
-
raise ValueError("Schema not loaded")
|
|
303
|
-
|
|
304
|
-
# Initialize Dead Letter Queue (will be configured with session in run() method)
|
|
305
|
-
self.dlq: Optional[DeadLetterQueue] = None
|
|
306
|
-
|
|
307
|
-
def _load_single_config(
|
|
308
|
-
self,
|
|
309
|
-
config_param: Optional[Dict[str, Any]],
|
|
310
|
-
file_param: Optional[str],
|
|
311
|
-
default_filename: str,
|
|
312
|
-
required: bool,
|
|
313
|
-
config_name: str,
|
|
314
|
-
) -> Dict[str, Any]:
|
|
315
|
-
"""
|
|
316
|
-
Load a single ETL config following priority order.
|
|
317
|
-
|
|
318
|
-
Args:
|
|
319
|
-
config_param: Direct dictionary config (highest priority)
|
|
320
|
-
file_param: File path to config (medium priority)
|
|
321
|
-
default_filename: Default filename in contract_dir (lowest priority)
|
|
322
|
-
required: Whether this config is required
|
|
323
|
-
config_name: Name for error messages
|
|
324
|
-
|
|
325
|
-
Returns:
|
|
326
|
-
Loaded config dictionary (empty dict if not required and not found)
|
|
327
|
-
"""
|
|
328
|
-
# Priority 1: Direct dictionary
|
|
329
|
-
if config_param is not None:
|
|
330
|
-
return config_param
|
|
331
|
-
|
|
332
|
-
# Priority 2: File path
|
|
333
|
-
if file_param:
|
|
334
|
-
config_path = Path(file_param)
|
|
335
|
-
if not config_path.exists():
|
|
336
|
-
raise ValueError(f"{config_name} config file not found: {config_path}")
|
|
337
|
-
config = self._load_yaml(config_path)
|
|
338
|
-
# Set contract_dir from file if not already set
|
|
339
|
-
if not self.contract_dir:
|
|
340
|
-
self.contract_dir = config_path.parent
|
|
341
|
-
return config
|
|
342
|
-
|
|
343
|
-
# Priority 3: From contract_dir
|
|
344
|
-
if self.contract_dir and self.contract_dir.exists():
|
|
345
|
-
config_path = self.contract_dir / default_filename
|
|
346
|
-
if config_path.exists():
|
|
347
|
-
return self._load_yaml(config_path)
|
|
348
|
-
|
|
349
|
-
# Handle missing config
|
|
350
|
-
if required:
|
|
351
|
-
raise ValueError(
|
|
352
|
-
f"{config_name} configuration not found. Provide one of: "
|
|
353
|
-
f"{config_name.lower()}_config (dict), {config_name.lower()}_file (path), "
|
|
354
|
-
f"or contract_dir with {default_filename}"
|
|
355
|
-
)
|
|
356
|
-
|
|
357
|
-
return {}
|
|
358
|
-
|
|
359
|
-
def _parse_input_params(self) -> None:
|
|
360
|
-
"""Parse input parameters from extract config."""
|
|
361
|
-
input_params_config = self.extract_config.get('input_params', [])
|
|
362
|
-
if isinstance(input_params_config, list):
|
|
363
|
-
self.input_params = {name: {} for name in input_params_config}
|
|
364
|
-
elif isinstance(input_params_config, dict):
|
|
365
|
-
self.input_params = input_params_config
|
|
366
|
-
else:
|
|
367
|
-
self.input_params = {}
|
|
368
|
-
|
|
369
|
-
def _load_yaml(self, file_path: Path) -> Dict[str, Any]:
|
|
370
|
-
"""Load YAML file, return empty dict if not found."""
|
|
371
|
-
if not file_path.exists():
|
|
372
|
-
return {}
|
|
373
|
-
with open(file_path, 'r', encoding='utf-8') as f:
|
|
374
|
-
return yaml.safe_load(f) or {}
|
|
375
|
-
|
|
376
|
-
def _prepare_params(self, **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]:
|
|
377
|
-
"""Prepare params and headers from config and kwargs."""
|
|
378
|
-
params = self.extract_config.get('params', {}).copy()
|
|
379
|
-
headers = self.extract_config.get('headers', {})
|
|
380
|
-
|
|
381
|
-
# Get parameter mapping from extract config (maps input param names to API param names)
|
|
382
|
-
param_mapping = self.extract_config.get('param_mapping', {})
|
|
383
|
-
|
|
384
|
-
# Merge input arguments
|
|
385
|
-
for param_name, param_value in kwargs.items():
|
|
386
|
-
if param_name in self.input_params:
|
|
387
|
-
# Check if there's a mapping for this parameter
|
|
388
|
-
api_param_name = param_mapping.get(param_name, param_name)
|
|
389
|
-
params[api_param_name] = param_value
|
|
390
|
-
else:
|
|
391
|
-
warnings.warn(
|
|
392
|
-
f"Unknown input parameter '{param_name}'. "
|
|
393
|
-
f"Available: {list(self.input_params.keys())}",
|
|
394
|
-
UserWarning
|
|
395
|
-
)
|
|
396
|
-
|
|
397
|
-
# Validate required input parameters and apply defaults for optional ones
|
|
398
|
-
for param_name, param_meta in self.input_params.items():
|
|
399
|
-
if param_meta.get('required', False):
|
|
400
|
-
# Check if input parameter was provided in kwargs
|
|
401
|
-
if param_name not in kwargs:
|
|
402
|
-
raise ValueError(
|
|
403
|
-
f"Required input parameter '{param_name}' not provided. "
|
|
404
|
-
f"Please provide: {param_name}=value"
|
|
405
|
-
)
|
|
406
|
-
else:
|
|
407
|
-
# Apply default value for optional parameters if not provided
|
|
408
|
-
if param_name not in kwargs and 'default' in param_meta:
|
|
409
|
-
default_value = param_meta.get('default')
|
|
410
|
-
# Only add if default is not None (None means truly optional)
|
|
411
|
-
if default_value is not None:
|
|
412
|
-
api_param_name = param_mapping.get(param_name, param_name)
|
|
413
|
-
params[api_param_name] = default_value
|
|
414
|
-
|
|
415
|
-
# Resolve values with config context
|
|
416
|
-
source_file = str(self.contract_dir / "extract.yaml") if self.contract_dir else None
|
|
417
|
-
params = resolve_values(params, context=self.config_context, source_file=source_file)
|
|
418
|
-
headers = resolve_values(headers, context=self.config_context, source_file=source_file)
|
|
419
|
-
|
|
420
|
-
return params, headers
|
|
421
|
-
|
|
422
|
-
# ============================================================================
|
|
423
|
-
# EXTRACTION
|
|
424
|
-
# ============================================================================
|
|
425
|
-
|
|
426
|
-
async def extract(
|
|
427
|
-
self,
|
|
428
|
-
batch_size: Optional[int] = None,
|
|
429
|
-
max_records: Optional[int] = None,
|
|
430
|
-
**kwargs
|
|
431
|
-
) -> AsyncIterator[List[Dict[str, Any]]]:
|
|
432
|
-
"""
|
|
433
|
-
Extract data in batches using async generator.
|
|
434
|
-
|
|
435
|
-
Yields batches of records for memory-efficient processing.
|
|
436
|
-
|
|
437
|
-
Args:
|
|
438
|
-
batch_size: Number of records per batch (defaults to extract.yaml config)
|
|
439
|
-
max_records: Maximum total records to extract (None = all)
|
|
440
|
-
**kwargs: Input parameters defined in extract.yaml's input_params section
|
|
441
|
-
|
|
442
|
-
Yields:
|
|
443
|
-
Batches of extracted records (lists of dictionaries)
|
|
444
|
-
|
|
445
|
-
Example:
|
|
446
|
-
>>> async for batch in orchestrator.extract(symbol="AAPL"):
|
|
447
|
-
... print(f"Extracted {len(batch)} records")
|
|
448
|
-
"""
|
|
449
|
-
self._current_stage = 'extract'
|
|
450
|
-
if batch_size is None:
|
|
451
|
-
batch_size = self.extract_config.get('batch_size', DEFAULT_BATCH_SIZE)
|
|
452
|
-
|
|
453
|
-
params, headers = self._prepare_params(**kwargs)
|
|
454
|
-
|
|
455
|
-
async for batch in extract_with_pagination_streaming(
|
|
456
|
-
self.extract_config, params, headers, self.contract_dir, batch_size, max_records, config_context=self.config_context
|
|
457
|
-
):
|
|
458
|
-
yield batch
|
|
459
|
-
|
|
460
|
-
# ============================================================================
|
|
461
|
-
# TRANSFORMATION (Simple Operations → JSONata → Custom Functions)
|
|
462
|
-
# ============================================================================
|
|
463
|
-
|
|
464
|
-
def transform(self, raw_data: List[Dict[str, Any]], **kwargs) -> List[Dict[str, Any]]:
|
|
465
|
-
"""
|
|
466
|
-
Transform data using simple operations, JSONata expressions, and/or custom Python functions.
|
|
467
|
-
|
|
468
|
-
Pipeline order (applied sequentially):
|
|
469
|
-
1. Simple operations (rename, select, drop, convert, defaults, add)
|
|
470
|
-
2. JSONata transformation (if configured)
|
|
471
|
-
3. Custom function execution (if configured)
|
|
472
|
-
|
|
473
|
-
Args:
|
|
474
|
-
raw_data: Raw data from extraction
|
|
475
|
-
**kwargs: Additional parameters (passed to custom functions)
|
|
476
|
-
|
|
477
|
-
Returns:
|
|
478
|
-
Transformed data
|
|
479
|
-
|
|
480
|
-
Example - Simple operations:
|
|
481
|
-
transform_config:
|
|
482
|
-
rename:
|
|
483
|
-
oldName: new_name
|
|
484
|
-
camelCase: snake_case
|
|
485
|
-
select:
|
|
486
|
-
- field1
|
|
487
|
-
- field2
|
|
488
|
-
convert:
|
|
489
|
-
price: float
|
|
490
|
-
quantity: integer
|
|
491
|
-
defaults:
|
|
492
|
-
status: "pending"
|
|
493
|
-
|
|
494
|
-
Example - JSONata (advanced):
|
|
495
|
-
transform_config:
|
|
496
|
-
jsonata:
|
|
497
|
-
expression: |
|
|
498
|
-
$.{
|
|
499
|
-
"ticker": symbol,
|
|
500
|
-
"avg_price": $average(prices)
|
|
501
|
-
}
|
|
502
|
-
|
|
503
|
-
Example - Custom function:
|
|
504
|
-
transform_config:
|
|
505
|
-
custom_function:
|
|
506
|
-
module: "myproject.transforms"
|
|
507
|
-
function: "optimize_data"
|
|
508
|
-
mode: "batch"
|
|
509
|
-
"""
|
|
510
|
-
if not self.transform_config:
|
|
511
|
-
return raw_data
|
|
512
|
-
|
|
513
|
-
data = raw_data
|
|
514
|
-
|
|
515
|
-
# Step 1: Apply simple operations (in order)
|
|
516
|
-
# Support both new 'transform' key and legacy top-level keys for backward compatibility
|
|
517
|
-
simple_ops = {}
|
|
518
|
-
|
|
519
|
-
# New format: transform: { rename: {...}, select: [...] }
|
|
520
|
-
if 'transform' in self.transform_config:
|
|
521
|
-
simple_ops = self.transform_config.get('transform', {})
|
|
522
|
-
|
|
523
|
-
# Legacy format: rename: {...} at top level (for backward compatibility)
|
|
524
|
-
if 'rename' in self.transform_config and 'transform' not in self.transform_config:
|
|
525
|
-
simple_ops['rename'] = self.transform_config.get('rename')
|
|
526
|
-
if 'select' in self.transform_config and 'transform' not in self.transform_config:
|
|
527
|
-
simple_ops['select'] = self.transform_config.get('select')
|
|
528
|
-
if 'drop' in self.transform_config and 'transform' not in self.transform_config:
|
|
529
|
-
simple_ops['drop'] = self.transform_config.get('drop')
|
|
530
|
-
if 'convert' in self.transform_config and 'transform' not in self.transform_config:
|
|
531
|
-
simple_ops['convert'] = self.transform_config.get('convert')
|
|
532
|
-
if 'defaults' in self.transform_config and 'transform' not in self.transform_config:
|
|
533
|
-
simple_ops['defaults'] = self.transform_config.get('defaults')
|
|
534
|
-
if 'add' in self.transform_config and 'transform' not in self.transform_config:
|
|
535
|
-
simple_ops['add'] = self.transform_config.get('add')
|
|
536
|
-
|
|
537
|
-
if simple_ops:
|
|
538
|
-
data = self._apply_simple_operations(data, simple_ops)
|
|
539
|
-
|
|
540
|
-
# Step 2: Apply JSONata transformation (if configured)
|
|
541
|
-
jsonata_config = self.transform_config.get('jsonata')
|
|
542
|
-
if jsonata_config:
|
|
543
|
-
data = self._apply_jsonata(data, jsonata_config)
|
|
544
|
-
|
|
545
|
-
# Step 3: Apply custom function (if configured)
|
|
546
|
-
custom_func_config = self.transform_config.get('custom_function')
|
|
547
|
-
if custom_func_config:
|
|
548
|
-
data = self._apply_custom_function(data, custom_func_config, **kwargs)
|
|
549
|
-
|
|
550
|
-
return data
|
|
551
|
-
|
|
552
|
-
def _apply_simple_operations(
|
|
553
|
-
self,
|
|
554
|
-
data: List[Dict[str, Any]],
|
|
555
|
-
config: Dict[str, Any]
|
|
556
|
-
) -> List[Dict[str, Any]]:
|
|
557
|
-
"""
|
|
558
|
-
Apply simple declarative transformation operations.
|
|
559
|
-
|
|
560
|
-
Operations are applied in this order:
|
|
561
|
-
1. rename - Rename fields (old_name: new_name)
|
|
562
|
-
2. convert - Convert field types (field: type)
|
|
563
|
-
3. defaults - Set default values for missing fields
|
|
564
|
-
4. add - Add computed fields with expressions
|
|
565
|
-
5. select - Keep only specified fields
|
|
566
|
-
6. drop - Remove specified fields
|
|
567
|
-
|
|
568
|
-
Args:
|
|
569
|
-
data: Input data (list of records)
|
|
570
|
-
config: Simple operations configuration
|
|
571
|
-
|
|
572
|
-
Returns:
|
|
573
|
-
Transformed data
|
|
574
|
-
|
|
575
|
-
Example config:
|
|
576
|
-
transform:
|
|
577
|
-
rename:
|
|
578
|
-
oldName: new_name
|
|
579
|
-
camelCase: snake_case
|
|
580
|
-
convert:
|
|
581
|
-
price: float
|
|
582
|
-
quantity: integer
|
|
583
|
-
active: boolean
|
|
584
|
-
defaults:
|
|
585
|
-
status: "pending"
|
|
586
|
-
priority: 0
|
|
587
|
-
add:
|
|
588
|
-
full_name: "${first_name} ${last_name}"
|
|
589
|
-
created_at: "now()"
|
|
590
|
-
record_id: "uuid()"
|
|
591
|
-
select:
|
|
592
|
-
- field1
|
|
593
|
-
- field2
|
|
594
|
-
drop:
|
|
595
|
-
- internal_id
|
|
596
|
-
- debug_info
|
|
597
|
-
"""
|
|
598
|
-
if not data:
|
|
599
|
-
return data
|
|
600
|
-
|
|
601
|
-
result = []
|
|
602
|
-
|
|
603
|
-
# Get available fields from first record for validation
|
|
604
|
-
available_fields = set(data[0].keys()) if data else set()
|
|
605
|
-
|
|
606
|
-
# Step 1: Rename fields
|
|
607
|
-
rename_map = config.get('rename', {})
|
|
608
|
-
if rename_map:
|
|
609
|
-
# Validate rename mappings
|
|
610
|
-
missing_fields = [old for old in rename_map.keys() if old not in available_fields]
|
|
611
|
-
if missing_fields:
|
|
612
|
-
logger.warning(
|
|
613
|
-
f"Rename operation: Fields not found in data: {missing_fields}. "
|
|
614
|
-
f"Available fields: {sorted(available_fields)}"
|
|
615
|
-
)
|
|
616
|
-
|
|
617
|
-
# Step 2: Convert types
|
|
618
|
-
convert_map = config.get('convert', {})
|
|
619
|
-
|
|
620
|
-
# Step 3: Defaults
|
|
621
|
-
defaults_map = config.get('defaults', {})
|
|
622
|
-
|
|
623
|
-
# Step 4: Add computed fields
|
|
624
|
-
add_map = config.get('add', {})
|
|
625
|
-
|
|
626
|
-
# Step 5: Select fields (keep only these)
|
|
627
|
-
select_fields = config.get('select')
|
|
628
|
-
|
|
629
|
-
# Step 6: Drop fields (remove these)
|
|
630
|
-
drop_fields = set(config.get('drop', []))
|
|
631
|
-
|
|
632
|
-
for record in data:
|
|
633
|
-
transformed = dict(record)
|
|
634
|
-
|
|
635
|
-
# 1. Rename
|
|
636
|
-
if rename_map:
|
|
637
|
-
for old_name, new_name in rename_map.items():
|
|
638
|
-
if old_name in transformed:
|
|
639
|
-
transformed[new_name] = transformed.pop(old_name)
|
|
640
|
-
|
|
641
|
-
# 2. Convert types
|
|
642
|
-
if convert_map:
|
|
643
|
-
for field_name, target_type in convert_map.items():
|
|
644
|
-
if field_name in transformed:
|
|
645
|
-
try:
|
|
646
|
-
transformed[field_name] = self._convert_type(
|
|
647
|
-
transformed[field_name], target_type
|
|
648
|
-
)
|
|
649
|
-
except (ValueError, TypeError) as e:
|
|
650
|
-
logger.warning(
|
|
651
|
-
f"Failed to convert field '{field_name}' to {target_type}: {e}. "
|
|
652
|
-
f"Keeping original value."
|
|
653
|
-
)
|
|
654
|
-
|
|
655
|
-
# 3. Apply defaults
|
|
656
|
-
if defaults_map:
|
|
657
|
-
for field_name, default_value in defaults_map.items():
|
|
658
|
-
if field_name not in transformed or transformed[field_name] is None:
|
|
659
|
-
transformed[field_name] = default_value
|
|
660
|
-
|
|
661
|
-
# 4. Add computed fields
|
|
662
|
-
if add_map:
|
|
663
|
-
for field_name, expression in add_map.items():
|
|
664
|
-
try:
|
|
665
|
-
transformed[field_name] = self._evaluate_expression(
|
|
666
|
-
expression, transformed
|
|
667
|
-
)
|
|
668
|
-
except Exception as e:
|
|
669
|
-
logger.warning(
|
|
670
|
-
f"Failed to compute field '{field_name}': {e}. "
|
|
671
|
-
f"Skipping this field."
|
|
672
|
-
)
|
|
673
|
-
|
|
674
|
-
# 5. Select (keep only specified fields)
|
|
675
|
-
if select_fields:
|
|
676
|
-
transformed = {
|
|
677
|
-
k: v for k, v in transformed.items()
|
|
678
|
-
if k in select_fields
|
|
679
|
-
}
|
|
680
|
-
|
|
681
|
-
# 6. Drop (remove specified fields)
|
|
682
|
-
if drop_fields:
|
|
683
|
-
transformed = {
|
|
684
|
-
k: v for k, v in transformed.items()
|
|
685
|
-
if k not in drop_fields
|
|
686
|
-
}
|
|
687
|
-
|
|
688
|
-
result.append(transformed)
|
|
689
|
-
|
|
690
|
-
return result
|
|
691
|
-
|
|
692
|
-
def _convert_type(self, value: Any, target_type: str) -> Any:
|
|
693
|
-
"""
|
|
694
|
-
Convert a value to the specified type.
|
|
695
|
-
|
|
696
|
-
Args:
|
|
697
|
-
value: Value to convert
|
|
698
|
-
target_type: Target type (string, integer, float, boolean, datetime, date)
|
|
699
|
-
|
|
700
|
-
Returns:
|
|
701
|
-
Converted value
|
|
702
|
-
"""
|
|
703
|
-
if value is None:
|
|
704
|
-
return None
|
|
705
|
-
|
|
706
|
-
target_type_lower = target_type.lower().strip()
|
|
707
|
-
|
|
708
|
-
if target_type_lower in ('str', 'string'):
|
|
709
|
-
return str(value)
|
|
710
|
-
elif target_type_lower in ('int', 'integer'):
|
|
711
|
-
if isinstance(value, str):
|
|
712
|
-
# Try to parse as float first (handles "1.0" -> 1)
|
|
713
|
-
try:
|
|
714
|
-
return int(float(value))
|
|
715
|
-
except ValueError:
|
|
716
|
-
return int(value)
|
|
717
|
-
return int(value)
|
|
718
|
-
elif target_type_lower in ('float', 'number', 'numeric'):
|
|
719
|
-
if isinstance(value, str):
|
|
720
|
-
return float(value)
|
|
721
|
-
return float(value)
|
|
722
|
-
elif target_type_lower in ('bool', 'boolean'):
|
|
723
|
-
if isinstance(value, str):
|
|
724
|
-
return value.lower() in ('true', '1', 'yes', 'on')
|
|
725
|
-
return bool(value)
|
|
726
|
-
elif target_type_lower == 'datetime':
|
|
727
|
-
from datetime import datetime
|
|
728
|
-
if isinstance(value, str):
|
|
729
|
-
# Try common datetime formats
|
|
730
|
-
for fmt in [
|
|
731
|
-
'%Y-%m-%dT%H:%M:%S',
|
|
732
|
-
'%Y-%m-%dT%H:%M:%S.%f',
|
|
733
|
-
'%Y-%m-%dT%H:%M:%SZ',
|
|
734
|
-
'%Y-%m-%dT%H:%M:%S.%fZ',
|
|
735
|
-
'%Y-%m-%d %H:%M:%S',
|
|
736
|
-
'%Y-%m-%d %H:%M:%S.%f',
|
|
737
|
-
]:
|
|
738
|
-
try:
|
|
739
|
-
return datetime.strptime(value, fmt)
|
|
740
|
-
except ValueError:
|
|
741
|
-
continue
|
|
742
|
-
raise ValueError(f"Cannot parse datetime: {value}")
|
|
743
|
-
return value
|
|
744
|
-
elif target_type_lower == 'date':
|
|
745
|
-
from datetime import date, datetime
|
|
746
|
-
if isinstance(value, str):
|
|
747
|
-
# Try common date formats
|
|
748
|
-
for fmt in ['%Y-%m-%d', '%Y/%m/%d', '%m/%d/%Y']:
|
|
749
|
-
try:
|
|
750
|
-
dt = datetime.strptime(value, fmt)
|
|
751
|
-
return dt.date()
|
|
752
|
-
except ValueError:
|
|
753
|
-
continue
|
|
754
|
-
raise ValueError(f"Cannot parse date: {value}")
|
|
755
|
-
elif isinstance(value, datetime):
|
|
756
|
-
return value.date()
|
|
757
|
-
return value
|
|
758
|
-
else:
|
|
759
|
-
raise ValueError(f"Unsupported target type: {target_type}")
|
|
760
|
-
|
|
-    def _evaluate_expression(self, expression: str, record: Dict[str, Any]) -> Any:
-        """
-        Evaluate a simple expression in the context of a record.
-
-        Supports:
-        - Field references: "${field_name}"
-        - String concatenation: "${field1} ${field2}"
-        - Simple functions: "now()", "uuid()"
-        - Literal values (if no placeholders)
-
-        Args:
-            expression: Expression string
-            record: Record dictionary for context
-
-        Returns:
-            Evaluated result
-
-        Examples:
-            "${first_name} ${last_name}" -> "John Doe"
-            "now()" -> "2024-01-01T12:00:00"
-            "uuid()" -> "123e4567-e89b-12d3-a456-426614174000"
-            "static_value" -> "static_value"
-        """
-        if not isinstance(expression, str):
-            return expression
-
-        expression = expression.strip()
-
-        # Handle special functions
-        if expression == 'now()':
-            return datetime.now().isoformat()
-        elif expression == 'uuid()':
-            return str(uuid.uuid4())
-
-        # Handle field references and string interpolation
-        try:
-            # Simple string interpolation: "${field1} ${field2}"
-            result = expression
-            placeholders_found = False
-
-            # Find all ${...} placeholders
-            placeholder_pattern = r'\$\{([^}]+)\}'
-            matches = re.findall(placeholder_pattern, expression)
-
-            if matches:
-                placeholders_found = True
-                for field_name in matches:
-                    if field_name in record:
-                        value = record[field_name]
-                        placeholder = f"${{{field_name}}}"
-                        result = result.replace(placeholder, str(value) if value is not None else '')
-                    else:
-                        logger.warning(
-                            f"Expression '{expression}': Field '{field_name}' not found in record. "
-                            f"Available fields: {sorted(record.keys())}"
-                        )
-                        # Replace with empty string if field not found
-                        placeholder = f"${{{field_name}}}"
-                        result = result.replace(placeholder, '')
-
-            # If no placeholders were found and it's not a function, return as literal
-            if not placeholders_found and not expression.endswith('()'):
-                return expression
-
-            return result
-        except Exception as e:
-            raise ValueError(f"Failed to evaluate expression '{expression}': {e}") from e
-
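For reference, the `${field}` interpolation and `now()`/`uuid()` semantics documented in `_evaluate_expression` above can be sketched as a standalone function. This is illustrative only; the helper name and the use of `re.sub` are assumptions, not package code:

```python
# Illustrative sketch of the expression semantics described above; not package code.
import re
import uuid
from datetime import datetime


def evaluate_expression(expression: str, record: dict):
    if expression == "now()":
        return datetime.now().isoformat()
    if expression == "uuid()":
        return str(uuid.uuid4())
    # Replace each ${field} with the record value, or '' when missing/None.
    return re.sub(
        r"\$\{([^}]+)\}",
        lambda m: "" if record.get(m.group(1)) is None else str(record.get(m.group(1))),
        expression,
    )


print(evaluate_expression("${first_name} ${last_name}", {"first_name": "John", "last_name": "Doe"}))
# -> John Doe
```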
-    def _apply_jsonata(
-        self,
-        data: List[Dict[str, Any]],
-        config: Dict[str, Any]
-    ) -> List[Dict[str, Any]]:
-        """
-        Apply JSONata expression to transform data.
-
-        Args:
-            data: Input data (list of records)
-            config: JSONata configuration with 'expression' and optional 'mode'
-
-        Returns:
-            Transformed data
-
-        Example config:
-            jsonata:
-              expression: |
-                $.{
-                  "ticker": symbol,
-                  "avg_price": $average(prices),
-                  "total_volume": $sum(volumes)
-                }
-              mode: "batch"  # or "record"
-        """
-        expression_str = config.get('expression')
-        if not expression_str:
-            return data
-
-        mode = config.get('mode', 'batch')
-
-        try:
-            expr = jsonata.Jsonata(expression_str)
-
-            if mode == 'batch':
-                # Apply expression to entire dataset
-                result = expr.evaluate(data)
-                if result is None:
-                    return []
-                return result if isinstance(result, list) else [result]
-            else:
-                # Apply expression to each record individually
-                return [expr.evaluate(record) for record in data if expr.evaluate(record) is not None]
-
-        except Exception as e:
-            logger.error(f"JSONata transformation failed: {e}")
-            raise ValueError(f"JSONata transformation error: {e}") from e
-
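A rough batch-mode usage sketch for the JSONata path above, assuming the `jsonata` package imported by this module is installed; the records and expression are illustrative and mirror the docstring's example:

```python
# Illustrative only: mirrors the docstring's batch-mode example with made-up records.
import jsonata

records = [
    {"symbol": "AAPL", "prices": [189.0, 191.0], "volumes": [100, 150]},
    {"symbol": "MSFT", "prices": [409.0, 411.0], "volumes": [80, 120]},
]
expr = jsonata.Jsonata('$.{ "ticker": symbol, "avg_price": $average(prices), "total_volume": $sum(volumes) }')
print(expr.evaluate(records))
```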
-    def _apply_custom_function(
-        self,
-        data: List[Dict[str, Any]],
-        config: Dict[str, Any],
-        **kwargs
-    ) -> List[Dict[str, Any]]:
-        """
-        Execute a custom Python function for transformation.
-
-        Args:
-            data: Input data
-            config: Custom function configuration
-            **kwargs: Additional parameters passed to the function
-
-        Returns:
-            Transformed data
-
-        Example config:
-            custom_function:
-              module: "pyoptima"
-              function: "optimize_from_etl_inputs"
-              mode: "batch"
-              kwargs:
-                method: "min_volatility"
-                solver: "ipopt"
-
-        Alternative config (using callable path):
-            custom_function:
-              callable: "myproject.transforms.optimize_portfolio"
-              mode: "batch"
-        """
-        # Get module and function
-        callable_path = config.get('callable')
-        module_path = config.get('module')
-        func_name = config.get('function')
-
-        if callable_path:
-            # Parse "module.submodule.function" format
-            parts = callable_path.rsplit('.', 1)
-            if len(parts) != 2:
-                raise ValueError(f"Invalid callable path: {callable_path}. Use 'module.function' format.")
-            module_path, func_name = parts
-
-        if not module_path or not func_name:
-            raise ValueError("custom_function requires either 'callable' or 'module' + 'function'")
-
-        # Dynamic import
-        try:
-            module = importlib.import_module(module_path)
-            func = getattr(module, func_name)
-        except ImportError as e:
-            raise ValueError(f"Cannot import module '{module_path}': {e}") from e
-        except AttributeError as e:
-            raise ValueError(f"Function '{func_name}' not found in module '{module_path}'") from e
-
-        # Handle class-based methods (e.g., pyoptima optimization methods)
-        if isinstance(func, type):
-            instance = func()
-            if hasattr(instance, 'optimize'):
-                func = instance.optimize
-            elif hasattr(instance, 'run'):
-                func = instance.run
-            elif hasattr(instance, '__call__'):
-                func = instance
-            else:
-                raise ValueError(f"Class '{func_name}' has no 'optimize', 'run', or '__call__' method")
-
-        # Get mode and kwargs
-        mode = config.get('mode', 'batch')
-        func_kwargs = config.get('kwargs', {})
-
-        # Merge with runtime kwargs
-        merged_kwargs = {**func_kwargs, **kwargs}
-
-        try:
-            if mode == 'batch':
-                result = func(data, **merged_kwargs)
-                if result is None:
-                    return []
-                return result if isinstance(result, list) else [result]
-            else:
-                # Record mode
-                results = []
-                for record in data:
-                    record_result = func(record, **merged_kwargs)
-                    if record_result is not None:
-                        if isinstance(record_result, list):
-                            results.extend(record_result)
-                        else:
-                            results.append(record_result)
-                return results
-
-        except Exception as e:
-            logger.error(f"Custom function '{func_name}' failed: {e}")
-            raise ValueError(f"Custom function error: {e}") from e
-
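To illustrate what a `custom_function` target can look like, here is a hypothetical batch-mode transform that the `callable` form could point to (e.g., `callable: "mypkg.transforms.add_total"`). All names and fields below are made up for this sketch and are not part of pycharter or pyoptima:

```python
# Hypothetical transform for a custom_function config; not part of pycharter or pyoptima.
from typing import Any, Dict, List


def add_total(data: List[Dict[str, Any]], *, price_field: str = "price", qty_field: str = "qty") -> List[Dict[str, Any]]:
    """Batch-mode transform: receives all records plus configured kwargs, returns new records."""
    out = []
    for record in data:
        enriched = dict(record)
        enriched["total"] = record.get(price_field, 0) * record.get(qty_field, 0)
        out.append(enriched)
    return out
```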
-    # ============================================================================
-    # LOADING
-    # ============================================================================
-
-    async def load(
-        self,
-        transformed_data: List[Dict[str, Any]],
-        session: Any = None,
-        **kwargs,
-    ) -> Dict[str, Any]:
-        """Load transformed data into the database."""
-        target_table = self.load_config.get('target_table')
-        schema_name = self.load_config.get('schema_name')
-        if not schema_name:
-            raise ValueError(
-                "schema_name must be specified in load.yaml. "
-                "Example: schema_name: public"
-            )
-        write_method = self.load_config.get('write_method', 'upsert')
-        primary_key = self.load_config.get('primary_key')
-        unique_constraints = self.load_config.get('unique_constraints', [])
-        # Keep primary_key as-is (can be string or list for composite keys)
-        # The load functions now handle both single and composite primary keys
-        batch_size = self.load_config.get('batch_size', 1000)
-
-        # If primary_key is 'id' and not in the data, use unique constraints for conflict detection
-        # This allows using UUID primary keys while upserting on natural keys
-        conflict_key = primary_key
-        if write_method == 'upsert' and transformed_data:
-            incoming_columns = set(transformed_data[0].keys())
-            # Check if primary_key is 'id' (string) or contains 'id' (list)
-            pk_is_id = (isinstance(primary_key, str) and primary_key == 'id') or \
-                       (isinstance(primary_key, list) and len(primary_key) == 1 and primary_key[0] == 'id')
-
-            if pk_is_id and 'id' not in incoming_columns:
-                # Use first unique constraint for conflict detection
-                if unique_constraints:
-                    # unique_constraints can be a list of lists or a list of strings
-                    if isinstance(unique_constraints[0], list):
-                        conflict_key = unique_constraints[0]  # First constraint (can be composite)
-                    else:
-                        conflict_key = unique_constraints[0] if isinstance(unique_constraints[0], str) else unique_constraints
-                else:
-                    # Fallback: if no unique constraints, can't do upsert
-                    raise ValueError(
-                        f"Cannot perform upsert: primary_key is 'id' (auto-generated) but no unique_constraints "
-                        f"specified in load.yaml for conflict detection. Please specify unique_constraints."
-                    )
-
-        if not target_table:
-            raise ValueError("target_table not specified in load configuration")
-
-        tunnel = None
-        if session is None:
-            try:
-                engine, db_session, db_type, tunnel = get_database_connection(
-                    self.load_config, self.contract_dir, config_context=self.config_context
-                )
-                try:
-                    result = load_data(
-                        transformed_data,
-                        db_session,
-                        schema_name,
-                        target_table,
-                        write_method,
-                        conflict_key,  # Use conflict_key (may be unique constraint instead of PK)
-                        batch_size,
-                        db_type,
-                    )
-                    return result
-                finally:
-                    db_session.close()
-                    if tunnel:
-                        tunnel.stop()
-            except Exception as e:
-                if tunnel:
-                    try:
-                        tunnel.stop()
-                    except Exception:
-                        pass
-                raise
-        else:
-            from pycharter.etl_generator.database import detect_database_type
-            from sqlalchemy.ext.asyncio import AsyncSession
-
-            # Detect database type
-            db_type = "postgresql"
-            if hasattr(session, 'bind') and hasattr(session.bind, 'url'):
-                db_url = str(session.bind.url)
-                db_type = detect_database_type(db_url)
-
-            # load_data is now async and expects AsyncSession
-            if not isinstance(session, AsyncSession):
-                raise ValueError(
-                    f"load_data requires an AsyncSession, but got {type(session)}. "
-                    "Please use an AsyncSession for database operations."
-                )
-
-            return await load_data(
-                transformed_data,
-                session,
-                schema_name,
-                target_table,
-                write_method,
-                conflict_key,  # Use conflict_key (may be unique constraint instead of PK)
-                batch_size,
-                db_type,
-            )
-
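The keys `load()` reads from the load configuration can be summarized as the dict a `load.yaml` would parse into. The values below are assumptions chosen to show the conflict-key fallback (an auto-generated `id` primary key with a natural unique constraint), not shipped defaults:

```python
# Illustrative load configuration; field values are assumptions, not shipped defaults.
load_config = {
    "schema_name": "public",
    "target_table": "stock_prices",
    "write_method": "upsert",
    "primary_key": "id",  # auto-generated; not present in the incoming records
    "unique_constraints": [["symbol", "price_date"]],  # becomes the conflict key per the logic above
    "batch_size": 1000,
}
```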
-    # ============================================================================
-    # MEMORY MANAGEMENT
-    # ============================================================================
-
-    def _check_memory(self) -> Optional[float]:
-        """Get current memory usage in MB, or None if psutil not available."""
-        if not PSUTIL_AVAILABLE or not self.process:
-            return None
-        return self.process.memory_info().rss / 1024 / 1024
-
-    def _enforce_memory_limit(self):
-        """Check and enforce memory limits."""
-        if self.max_memory_mb:
-            current = self._check_memory()
-            if current and current > self.max_memory_mb:
-                gc.collect()
-                current = self._check_memory()
-
-                if current and current > self.max_memory_mb:
-                    raise MemoryError(
-                        f"Memory limit exceeded: {current:.1f}MB > {self.max_memory_mb}MB. "
-                        f"Consider increasing batch_size."
-                    )
-
-    # ============================================================================
-    # PIPELINE EXECUTION
-    # ============================================================================
-
-    def _log_error(self, message: str, error: Exception, **context) -> None:
-        """Log error with context and traceback."""
-        extra = {
-            'run_id': self.run_id,
-            'pipeline': self.contract_dir.name if self.contract_dir else 'unknown',
-            'stage': self._current_stage,
-            'error_type': type(error).__name__,
-            **context
-        }
-        logger.error(message, extra=extra, exc_info=True)
-
-    def _log_warning(self, message: str, **context) -> None:
-        """Log warning with context."""
-        extra = {
-            'run_id': self.run_id,
-            'pipeline': self.contract_dir.name if self.contract_dir else 'unknown',
-            'stage': self._current_stage,
-            **context
-        }
-        logger.warning(message, extra=extra)
-
-    def _log_info(self, message: str, **context) -> None:
-        """Log info with context."""
-        extra = {
-            'run_id': self.run_id,
-            'pipeline': self.contract_dir.name if self.contract_dir else 'unknown',
-            'stage': self._current_stage,
-            **context
-        }
-        logger.info(message, extra=extra)
-
-    def _summarize_errors(self, failed_batches: List[Dict[str, Any]]) -> str:
-        """
-        Summarize errors from failed batches.
-
-        Groups errors by type and shows the most common errors first.
-
-        Args:
-            failed_batches: List of failed batch dictionaries with 'error', 'error_type', 'batch_num', 'records'
-
-        Returns:
-            Formatted error summary string
-        """
-        if not failed_batches:
-            return "No error details available."
-
-        # Group errors by type
-        error_type_counts = Counter(batch['error_type'] for batch in failed_batches)
-        errors_by_type = defaultdict(list)
-        for batch in failed_batches:
-            errors_by_type[batch['error_type']].append(batch)
-
-        # Build summary
-        lines = []
-
-        # Summary statistics
-        total_failed = len(failed_batches)
-        total_records_failed = sum(batch.get('records', 0) for batch in failed_batches)
-        lines.append(f"Total failed batches: {total_failed}")
-        lines.append(f"Total records in failed batches: {total_records_failed}")
-        lines.append("")
-
-        # Group by error type (most common first)
-        lines.append("Errors by type:")
-        for error_type, count in error_type_counts.most_common():
-            batches_of_type = errors_by_type[error_type]
-            lines.append(f"  {error_type}: {count} occurrence(s)")
-
-            # Show sample error messages (up to 3 unique ones)
-            unique_errors = {}
-            for batch in batches_of_type:
-                error_msg = batch.get('error', 'Unknown error')
-                # Truncate very long error messages
-                if len(error_msg) > 200:
-                    error_msg = error_msg[:200] + "..."
-                if error_msg not in unique_errors:
-                    unique_errors[error_msg] = batch.get('batch_num', '?')
-                if len(unique_errors) >= 3:
-                    break
-
-            for error_msg, batch_num in unique_errors.items():
-                lines.append(f"    - Batch {batch_num}: {error_msg}")
-
-            if len(unique_errors) < len(batches_of_type):
-                remaining = len(batches_of_type) - len(unique_errors)
-                lines.append(f"    ... and {remaining} more occurrence(s) of this error type")
-            lines.append("")
-
-        return "\n".join(lines)
-
-    async def run(
-        self,
-        dry_run: bool = False,
-        session: Any = None,
-        checkpoint_id: Optional[str] = None,
-        resume: bool = False,
-        batch_size: Optional[int] = None,
-        max_retries: int = 3,
-        error_threshold: float = 0.1,
-        **kwargs,
-    ) -> Dict[str, Any]:
-        """
-        Run the complete ETL pipeline in streaming mode.
-
-        Processes data incrementally: Extract-Batch → Transform-Batch → Load-Batch.
-        This ensures constant memory usage regardless of dataset size.
-
-        Args:
-            dry_run: If True, skip database operations
-            session: Optional database session
-            checkpoint_id: Optional checkpoint ID for resume capability
-            resume: If True, resume from checkpoint
-            batch_size: Batch size for processing (defaults to extract.yaml config)
-            max_retries: Maximum retries for failed batches
-            error_threshold: Error rate threshold (0.0-1.0) before aborting
-            **kwargs: Additional parameters passed to extract()
-
-        Returns:
-            Pipeline execution results dictionary
-        """
-        # Generate correlation ID for this run
-        self.run_id = str(uuid.uuid4())[:8]
-
-        if batch_size is None:
-            batch_size = self.extract_config.get('batch_size', DEFAULT_BATCH_SIZE)
-
-        # Note: Tables must be created manually or via migrations.
-        # PyCharter no longer creates tables from schema.json.
-
-        # Initialize Dead Letter Queue
-        dlq_config = self.load_config.get('dead_letter_queue', {})
-        dlq_enabled = dlq_config.get('enabled', True)
-        dlq_backend = dlq_config.get('backend', 'database')
-        dlq_storage_path = dlq_config.get('storage_path')
-        dlq_schema = dlq_config.get('schema_name')  # Optional schema name
-
-        # Get pipeline name for DLQ
-        pipeline_name = (
-            self.contract_dir.name if self.contract_dir else
-            self.extract_config.get('title', 'unknown_pipeline')
-        )
-
-        self.dlq = DeadLetterQueue(
-            db_session=session if not dry_run else None,
-            storage_backend=dlq_backend,
-            storage_path=dlq_storage_path,
-            enabled=dlq_enabled,
-            schema_name=dlq_schema,  # Pass schema name if provided
-        )
-
-        self._log_info(
-            "Starting ETL pipeline",
-            batch_size=batch_size,
-            dry_run=dry_run,
-            checkpoint_id=checkpoint_id,
-            resume=resume,
-            input_params=kwargs,
-            dlq_enabled=dlq_enabled,
-        )
-
-        results = {
-            'extraction': {'batches_processed': 0, 'total_records': 0},
-            'transformation': {'batches_processed': 0, 'total_records': 0},
-            'loading': {'batches_processed': 0, 'total_records': 0, 'inserted': 0, 'updated': 0},
-            'success': False,
-            'failed_batches': [],
-            'dlq_records': 0,
-        }
-
-        # Load checkpoint if resuming
-        start_batch = 0
-        if resume and checkpoint_id:
-            checkpoint_state = self.checkpoint_manager.load(checkpoint_id)
-            if checkpoint_state:
-                kwargs.update(checkpoint_state.last_processed_params)
-                start_batch = checkpoint_state.batch_num
-
-        self.progress_tracker.start()
-        batch_num = 0
-        total_records = 0
-        failed_batches = []
-
-        try:
-            async for batch in self.extract(batch_size=batch_size, **kwargs):
-                batch_num += 1
-
-                # Skip batches if resuming
-                if batch_num <= start_batch:
-                    continue
-
-                batch_start_time = datetime.now()
-
-                try:
-                    self._enforce_memory_limit()
-
-                    # Transform batch
-                    self._current_stage = 'transform'
-                    transformed_batch = self.transform(batch, **kwargs)
-
-                    # Load batch
-                    self._current_stage = 'load'
-                    if not dry_run:
-                        load_result = await self.load(transformed_batch, session=session, **kwargs)
-                        results['loading']['inserted'] += load_result.get('inserted', 0)
-                        results['loading']['updated'] += load_result.get('updated', 0)
-                        results['loading']['total_records'] += load_result.get('total', 0)
-
-                    # Update counters
-                    total_records += len(batch)
-                    results['extraction']['total_records'] += len(batch)
-                    results['extraction']['batches_processed'] = batch_num
-                    results['transformation']['total_records'] += len(transformed_batch)
-                    results['transformation']['batches_processed'] = batch_num
-                    results['loading']['batches_processed'] = batch_num
-
-                    # Report progress
-                    memory_usage = self._check_memory()
-                    batch_time = (datetime.now() - batch_start_time).total_seconds()
-                    self.progress_tracker.record_batch_time(batch_time)
-                    self.progress_tracker.report(
-                        'extract',
-                        batch_num,
-                        total_records,
-                        memory_usage_mb=memory_usage,
-                    )
-
-                    # Save checkpoint
-                    if checkpoint_id:
-                        self.checkpoint_manager.save(
-                            checkpoint_id,
-                            'extract',
-                            batch_num,
-                            total_records,
-                            kwargs,
-                        )
-
-                    # Cleanup
-                    del batch, transformed_batch
-                    gc.collect()
-
-                except Exception as e:
-                    batch_duration = (datetime.now() - batch_start_time).total_seconds()
-                    error_msg = str(e)
-                    error_type = type(e).__name__
-
-                    # Check if this is a connection-related error
-                    is_connection_error = (
-                        'connection' in error_msg.lower() or
-                        'closed' in error_msg.lower() or
-                        'ConnectionDoesNotExistError' in error_type or
-                        'ConnectionError' in error_type or
-                        'InterfaceError' in error_type or
-                        'DBAPIError' in error_type and ('connection' in error_msg.lower() or 'closed' in error_msg.lower())
-                    )
-
-                    self._log_error(
-                        "Batch processing failed",
-                        e,
-                        batch_num=batch_num,
-                        batch_size=len(batch) if batch else 0,
-                        total_records=total_records,
-                        is_connection_error=is_connection_error,
-                    )
-
-                    # For connection errors, retry before checking error rate
-                    # This prevents aborting on transient connection issues
-                    if is_connection_error and len(failed_batches) < max_retries:
-                        wait_time = min(2 ** len(failed_batches), 5.0)  # Exponential backoff, max 5s
-                        self._log_warning(
-                            f"Connection error in batch {batch_num}, retrying (attempt {len(failed_batches) + 1}/{max_retries})",
-                            batch_num=batch_num,
-                            retry_attempt=len(failed_batches) + 1,
-                            wait_seconds=wait_time,
-                        )
-                        await asyncio.sleep(wait_time)
-                        continue  # Retry the batch without adding to failed_batches
-
-                    # Not a connection error, or retries exhausted - add to failed batches
-                    failed_batches.append({
-                        'batch_num': batch_num,
-                        'error': error_msg,
-                        'error_type': error_type,
-                        'records': len(batch) if batch else 0,
-                    })
-
-                    # Add failed batch to Dead Letter Queue
-                    if batch and self.dlq:
-                        # Determine DLQ reason
-                        if is_connection_error:
-                            dlq_reason = DLQReason.CONNECTION_ERROR
-                        elif self._current_stage == 'extract':
-                            dlq_reason = DLQReason.EXTRACTION_ERROR
-                        elif self._current_stage == 'transform':
-                            dlq_reason = DLQReason.TRANSFORMATION_ERROR
-                        elif self._current_stage == 'load':
-                            dlq_reason = DLQReason.LOAD_ERROR
-                        else:
-                            dlq_reason = DLQReason.UNKNOWN
-
-                        # Add batch to DLQ
-                        dlq_records = await self.dlq.add_batch(
-                            pipeline_name=pipeline_name,
-                            batch=batch,
-                            reason=dlq_reason,
-                            error_message=error_msg,
-                            error_type=error_type,
-                            stage=self._current_stage or 'unknown',
-                            metadata={
-                                'batch_num': batch_num,
-                                'total_records': total_records,
-                                'run_id': self.run_id,
-                                'is_connection_error': is_connection_error,
-                            },
-                        )
-                        results['dlq_records'] += len(dlq_records)
-
-                    # Check error rate (only after connection retries are exhausted)
-                    # Also be more lenient for small batch counts (don't abort on first failure)
-                    min_batches_for_error_check = 3  # Need at least 3 batches before checking error rate
-                    if batch_num >= min_batches_for_error_check:
-                        error_rate = len(failed_batches) / batch_num if batch_num > 0 else 1.0
-                        if error_rate > error_threshold:
-                            # Summarize errors before raising
-                            error_summary = self._summarize_errors(failed_batches)
-
-                            self._log_error(
-                                "Error rate threshold exceeded",
-                                RuntimeError("Error rate threshold exceeded"),
-                                error_rate=error_rate,
-                                threshold=error_threshold,
-                                failed_batches=len(failed_batches),
-                                total_batches=batch_num,
-                                error_summary=error_summary,
-                            )
-
-                            error_msg = (
-                                f"Error rate too high: {error_rate:.1%} > {error_threshold:.1%}. "
-                                f"Aborting pipeline.\n\n"
-                                f"Error Summary ({len(failed_batches)} failed batches out of {batch_num} total):\n"
-                                f"{error_summary}"
-                            )
-                            raise RuntimeError(error_msg)
-
-                    # Retry logic for non-connection errors
-                    if len(failed_batches) <= max_retries:
-                        wait_time = 2 ** len(failed_batches)
-                        self._log_warning(
-                            f"Retrying batch {batch_num}",
-                            batch_num=batch_num,
-                            retry_attempt=len(failed_batches),
-                            wait_seconds=wait_time,
-                        )
-                        await asyncio.sleep(wait_time)
-                        continue
-                    else:
-                        self.progress_tracker.report(
-                            'extract',
-                            batch_num,
-                            total_records,
-                            error_count=len(failed_batches),
-                        )
-
-            results['failed_batches'] = failed_batches
-            results['success'] = len(failed_batches) < batch_num * error_threshold
-
-            # Add DLQ statistics to results
-            if self.dlq:
-                try:
-                    dlq_stats = self.dlq.get_statistics(pipeline_name=pipeline_name)
-                    results['dlq_statistics'] = dlq_stats
-                except Exception as e:
-                    logger.warning(f"Failed to get DLQ statistics: {e}")
-
-            self._log_info(
-                "ETL pipeline completed",
-                batches=batch_num,
-                records=total_records,
-                failed_batches=len(failed_batches),
-                inserted=results['loading'].get('inserted', 0),
-                updated=results['loading'].get('updated', 0),
-                dlq_records=results.get('dlq_records', 0),
-            )
-
-            # Delete checkpoint on success
-            if checkpoint_id and results['success']:
-                self.checkpoint_manager.delete(checkpoint_id)
-
-        except Exception as e:
-            self._log_error(
-                "ETL pipeline failed",
-                e,
-                batches_processed=batch_num,
-                records_processed=total_records,
-                failed_batches=len(failed_batches),
-            )
-
-            if checkpoint_id:
-                self.checkpoint_manager.save(
-                    checkpoint_id,
-                    'error',
-                    batch_num,
-                    total_records,
-                    kwargs,
-                    error=str(e),
-                )
-            results['error'] = str(e)
-            results['success'] = False
-            raise
-
-        return results
-
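A hedged end-to-end sketch of calling `run()`: it assumes an orchestrator built from a contract directory (see `create_orchestrator` at the end of this module) and an extract step that accepts a `symbol` parameter, as in the `run_multiple()` examples below; the path and parameter are illustrative:

```python
# Illustrative usage of run(); the directory name and extract parameter are assumptions.
import asyncio


async def main():
    orchestrator = create_orchestrator(contract_dir="contracts/stock_prices")
    results = await orchestrator.run(
        dry_run=True,        # skip database operations
        batch_size=500,
        error_threshold=0.1,
        symbol="AAPL",       # forwarded to extract(**kwargs)
    )
    print(results["success"], results["extraction"]["total_records"])


asyncio.run(main())
```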
-    async def run_multiple(
-        self,
-        param_name: Optional[str] = None,
-        param_values: Optional[List[Any]] = None,
-        param_sets: Optional[List[Dict[str, Any]]] = None,
-        batch_size: int = 5,
-        delay_between_runs: float = 1.0,
-        dry_run: bool = False,
-        session: Any = None,
-        **kwargs,
-    ) -> List[Dict[str, Any]]:
-        """
-        Run ETL pipeline multiple times with different parameter sets.
-
-        This method allows you to efficiently run the same ETL pipeline multiple times
-        with varying parameters. You can either:
-        1. Provide a single parameter name and list of values (simple case)
-        2. Provide a list of parameter dictionaries (complex case with multiple varying params)
-
-        Args:
-            param_name: Name of the parameter to vary (e.g., 'symbol', 'ticker', 'date')
-                        Required if using param_values.
-            param_values: List of values for the specified parameter.
-                          Each value will be passed as {param_name: value} to run().
-            param_sets: List of parameter dictionaries. Each dict will be unpacked
-                        and passed to run() as **params. Use this when multiple
-                        parameters vary between runs.
-            batch_size: Number of runs to process before a brief pause (for rate limiting)
-            delay_between_runs: Delay in seconds between individual runs (for rate limiting)
-            dry_run: If True, skip database operations
-            session: Optional database session
-            **kwargs: Additional parameters passed to each run() call (common to all runs)
-
-        Returns:
-            List of result dictionaries, each containing:
-            - 'params': The parameters used for this run
-            - 'success': Whether the run succeeded
-            - 'records': Number of records processed (if successful)
-            - 'result': Full result dictionary from run() (if successful)
-            - 'error': Error message (if failed)
-
-        Examples:
-            # Simple case: vary a single parameter
-            >>> results = await orchestrator.run_multiple(
-            ...     param_name='symbol',
-            ...     param_values=['AAPL', 'MSFT', 'GOOGL'],
-            ...     batch_size=5,
-            ...     delay_between_runs=1.0
-            ... )
-
-            # Complex case: vary multiple parameters
-            >>> results = await orchestrator.run_multiple(
-            ...     param_sets=[
-            ...         {'symbol': 'AAPL', 'date': '2024-01-01'},
-            ...         {'symbol': 'MSFT', 'date': '2024-01-02'},
-            ...     ],
-            ...     batch_size=3,
-            ...     delay_between_runs=0.5
-            ... )
-        """
-        # Validate inputs
-        if param_sets is not None:
-            if param_name is not None or param_values is not None:
-                raise ValueError(
-                    "Cannot use both param_sets and param_name/param_values. "
-                    "Use either param_sets OR param_name+param_values."
-                )
-            if not isinstance(param_sets, list) or len(param_sets) == 0:
-                raise ValueError("param_sets must be a non-empty list of dictionaries")
-            # Convert param_sets to list of dicts
-            runs = [dict(params) for params in param_sets]
-        elif param_name is not None and param_values is not None:
-            if not isinstance(param_values, list) or len(param_values) == 0:
-                raise ValueError("param_values must be a non-empty list")
-            # Convert param_name + param_values to list of dicts
-            runs = [{param_name: value} for value in param_values]
-        else:
-            raise ValueError(
-                "Must provide either (param_name + param_values) OR param_sets"
-            )
-
-        results = []
-
-        for i in range(0, len(runs), batch_size):
-            run_batch = runs[i:i + batch_size]
-
-            for run_params in run_batch:
-                try:
-                    # Merge run_params with common kwargs
-                    merged_params = {**kwargs, **run_params}
-                    result = await self.run(
-                        dry_run=dry_run,
-                        session=session,
-                        **merged_params
-                    )
-                    results.append({
-                        'params': run_params,
-                        'success': result['success'],
-                        'records': result.get('loading', {}).get('total_records', 0),
-                        'result': result,
-                    })
-                except Exception as e:
-                    results.append({
-                        'params': run_params,
-                        'success': False,
-                        'error': str(e),
-                    })
-
-                # Rate limiting
-                if i + batch_size < len(runs) or run_params != run_batch[-1]:
-                    await asyncio.sleep(delay_between_runs)
-
-        return results
-
-
-def create_orchestrator(
-    contract_dir: Optional[str] = None,
-    **kwargs,
-) -> ETLOrchestrator:
-    """
-    Create an ETL orchestrator instance.
-
-    Args:
-        contract_dir: Directory containing contract files and ETL configs
-        **kwargs: Additional arguments passed to ETLOrchestrator
-
-    Returns:
-        ETLOrchestrator instance
-    """
-    return ETLOrchestrator(contract_dir=contract_dir, **kwargs)
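Finally, a sketch of the checkpoint/resume flow described in `run()`: a failed run saves its state under `checkpoint_id` (see the `except` block in `run()` above), and a later call with `resume=True` skips the batches already processed. The checkpoint id, directory, and parameters below are illustrative:

```python
# Illustrative checkpoint/resume usage; the checkpoint id and parameters are assumptions.
import asyncio


async def resumable_backfill():
    orchestrator = create_orchestrator(contract_dir="contracts/stock_prices")
    try:
        await orchestrator.run(checkpoint_id="stock_prices_backfill", symbol="AAPL")
    except Exception:
        # On failure the checkpoint was saved; resume from the last recorded batch.
        await orchestrator.run(checkpoint_id="stock_prices_backfill", resume=True, symbol="AAPL")


asyncio.run(resumable_backfill())
```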