pycharter 0.0.22__py3-none-any.whl → 0.0.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- api/main.py +27 -1
- api/models/docs.py +68 -0
- api/models/evolution.py +117 -0
- api/models/tracking.py +111 -0
- api/models/validation.py +46 -6
- api/routes/v1/__init__.py +14 -1
- api/routes/v1/docs.py +187 -0
- api/routes/v1/evolution.py +337 -0
- api/routes/v1/templates.py +211 -27
- api/routes/v1/tracking.py +301 -0
- api/routes/v1/validation.py +68 -31
- pycharter/__init__.py +268 -58
- pycharter/data/templates/contract/template_coercion_rules.yaml +57 -0
- pycharter/data/templates/contract/template_contract.yaml +122 -0
- pycharter/data/templates/contract/template_metadata.yaml +68 -0
- pycharter/data/templates/contract/template_schema.yaml +100 -0
- pycharter/data/templates/contract/template_validation_rules.yaml +75 -0
- pycharter/data/templates/etl/README.md +224 -0
- pycharter/data/templates/etl/extract_cloud_azure.yaml +24 -0
- pycharter/data/templates/etl/extract_cloud_gcs.yaml +25 -0
- pycharter/data/templates/etl/extract_cloud_s3.yaml +30 -0
- pycharter/data/templates/etl/extract_database.yaml +34 -0
- pycharter/data/templates/etl/extract_database_ssh.yaml +40 -0
- pycharter/data/templates/etl/extract_file_csv.yaml +21 -0
- pycharter/data/templates/etl/extract_file_glob.yaml +25 -0
- pycharter/data/templates/etl/extract_file_json.yaml +24 -0
- pycharter/data/templates/etl/extract_file_parquet.yaml +20 -0
- pycharter/data/templates/etl/extract_http_paginated.yaml +79 -0
- pycharter/data/templates/etl/extract_http_path_params.yaml +38 -0
- pycharter/data/templates/etl/extract_http_simple.yaml +62 -0
- pycharter/data/templates/etl/load_cloud_azure.yaml +24 -0
- pycharter/data/templates/etl/load_cloud_gcs.yaml +22 -0
- pycharter/data/templates/etl/load_cloud_s3.yaml +27 -0
- pycharter/data/templates/etl/load_file.yaml +34 -0
- pycharter/data/templates/etl/load_insert.yaml +18 -0
- pycharter/data/templates/etl/load_postgresql.yaml +39 -0
- pycharter/data/templates/etl/load_sqlite.yaml +21 -0
- pycharter/data/templates/etl/load_truncate_and_load.yaml +20 -0
- pycharter/data/templates/etl/load_upsert.yaml +25 -0
- pycharter/data/templates/etl/load_with_dlq.yaml +34 -0
- pycharter/data/templates/etl/load_with_ssh_tunnel.yaml +35 -0
- pycharter/data/templates/etl/pipeline_http_to_db.yaml +75 -0
- pycharter/data/templates/etl/transform_combined.yaml +48 -0
- pycharter/data/templates/etl/transform_custom_function.yaml +58 -0
- pycharter/data/templates/etl/transform_jsonata.yaml +51 -0
- pycharter/data/templates/etl/transform_simple.yaml +59 -0
- pycharter/db/schemas/.ipynb_checkpoints/data_contract-checkpoint.py +160 -0
- pycharter/docs_generator/__init__.py +43 -0
- pycharter/docs_generator/generator.py +465 -0
- pycharter/docs_generator/renderers.py +247 -0
- pycharter/etl_generator/__init__.py +168 -80
- pycharter/etl_generator/builder.py +121 -0
- pycharter/etl_generator/config_loader.py +394 -0
- pycharter/etl_generator/config_validator.py +418 -0
- pycharter/etl_generator/context.py +132 -0
- pycharter/etl_generator/expression.py +499 -0
- pycharter/etl_generator/extractors/__init__.py +30 -0
- pycharter/etl_generator/extractors/base.py +70 -0
- pycharter/etl_generator/extractors/cloud_storage.py +530 -0
- pycharter/etl_generator/extractors/database.py +221 -0
- pycharter/etl_generator/extractors/factory.py +185 -0
- pycharter/etl_generator/extractors/file.py +475 -0
- pycharter/etl_generator/extractors/http.py +895 -0
- pycharter/etl_generator/extractors/streaming.py +57 -0
- pycharter/etl_generator/loaders/__init__.py +41 -0
- pycharter/etl_generator/loaders/base.py +35 -0
- pycharter/etl_generator/loaders/cloud.py +87 -0
- pycharter/etl_generator/loaders/cloud_storage_loader.py +275 -0
- pycharter/etl_generator/loaders/database.py +274 -0
- pycharter/etl_generator/loaders/factory.py +180 -0
- pycharter/etl_generator/loaders/file.py +72 -0
- pycharter/etl_generator/loaders/file_loader.py +130 -0
- pycharter/etl_generator/pipeline.py +743 -0
- pycharter/etl_generator/protocols.py +54 -0
- pycharter/etl_generator/result.py +63 -0
- pycharter/etl_generator/schemas/__init__.py +49 -0
- pycharter/etl_generator/transformers/__init__.py +49 -0
- pycharter/etl_generator/transformers/base.py +63 -0
- pycharter/etl_generator/transformers/config.py +45 -0
- pycharter/etl_generator/transformers/custom_function.py +101 -0
- pycharter/etl_generator/transformers/jsonata_transformer.py +56 -0
- pycharter/etl_generator/transformers/operations.py +218 -0
- pycharter/etl_generator/transformers/pipeline.py +54 -0
- pycharter/etl_generator/transformers/simple_operations.py +131 -0
- pycharter/quality/__init__.py +25 -0
- pycharter/quality/tracking/__init__.py +64 -0
- pycharter/quality/tracking/collector.py +318 -0
- pycharter/quality/tracking/exporters.py +238 -0
- pycharter/quality/tracking/models.py +194 -0
- pycharter/quality/tracking/store.py +385 -0
- pycharter/runtime_validator/__init__.py +20 -7
- pycharter/runtime_validator/builder.py +328 -0
- pycharter/runtime_validator/validator.py +311 -7
- pycharter/runtime_validator/validator_core.py +61 -0
- pycharter/schema_evolution/__init__.py +61 -0
- pycharter/schema_evolution/compatibility.py +270 -0
- pycharter/schema_evolution/diff.py +496 -0
- pycharter/schema_evolution/models.py +201 -0
- pycharter/shared/__init__.py +56 -0
- pycharter/shared/errors.py +296 -0
- pycharter/shared/protocols.py +234 -0
- {pycharter-0.0.22.dist-info → pycharter-0.0.24.dist-info}/METADATA +146 -26
- pycharter-0.0.24.dist-info/RECORD +543 -0
- {pycharter-0.0.22.dist-info → pycharter-0.0.24.dist-info}/WHEEL +1 -1
- ui/static/404/index.html +1 -1
- ui/static/404.html +1 -1
- ui/static/__next.__PAGE__.txt +1 -1
- ui/static/__next._full.txt +1 -1
- ui/static/__next._head.txt +1 -1
- ui/static/__next._index.txt +1 -1
- ui/static/__next._tree.txt +1 -1
- ui/static/_next/static/chunks/26dfc590f7714c03.js +1 -0
- ui/static/_next/static/chunks/34d289e6db2ef551.js +1 -0
- ui/static/_next/static/chunks/99508d9d5869cc27.js +1 -0
- ui/static/_next/static/chunks/b313c35a6ba76574.js +1 -0
- ui/static/_not-found/__next._full.txt +1 -1
- ui/static/_not-found/__next._head.txt +1 -1
- ui/static/_not-found/__next._index.txt +1 -1
- ui/static/_not-found/__next._not-found.__PAGE__.txt +1 -1
- ui/static/_not-found/__next._not-found.txt +1 -1
- ui/static/_not-found/__next._tree.txt +1 -1
- ui/static/_not-found/index.html +1 -1
- ui/static/_not-found/index.txt +1 -1
- ui/static/contracts/__next._full.txt +2 -2
- ui/static/contracts/__next._head.txt +1 -1
- ui/static/contracts/__next._index.txt +1 -1
- ui/static/contracts/__next._tree.txt +1 -1
- ui/static/contracts/__next.contracts.__PAGE__.txt +2 -2
- ui/static/contracts/__next.contracts.txt +1 -1
- ui/static/contracts/index.html +1 -1
- ui/static/contracts/index.txt +2 -2
- ui/static/documentation/__next._full.txt +1 -1
- ui/static/documentation/__next._head.txt +1 -1
- ui/static/documentation/__next._index.txt +1 -1
- ui/static/documentation/__next._tree.txt +1 -1
- ui/static/documentation/__next.documentation.__PAGE__.txt +1 -1
- ui/static/documentation/__next.documentation.txt +1 -1
- ui/static/documentation/index.html +2 -2
- ui/static/documentation/index.txt +1 -1
- ui/static/index.html +1 -1
- ui/static/index.txt +1 -1
- ui/static/metadata/__next._full.txt +1 -1
- ui/static/metadata/__next._head.txt +1 -1
- ui/static/metadata/__next._index.txt +1 -1
- ui/static/metadata/__next._tree.txt +1 -1
- ui/static/metadata/__next.metadata.__PAGE__.txt +1 -1
- ui/static/metadata/__next.metadata.txt +1 -1
- ui/static/metadata/index.html +1 -1
- ui/static/metadata/index.txt +1 -1
- ui/static/quality/__next._full.txt +2 -2
- ui/static/quality/__next._head.txt +1 -1
- ui/static/quality/__next._index.txt +1 -1
- ui/static/quality/__next._tree.txt +1 -1
- ui/static/quality/__next.quality.__PAGE__.txt +2 -2
- ui/static/quality/__next.quality.txt +1 -1
- ui/static/quality/index.html +2 -2
- ui/static/quality/index.txt +2 -2
- ui/static/rules/__next._full.txt +1 -1
- ui/static/rules/__next._head.txt +1 -1
- ui/static/rules/__next._index.txt +1 -1
- ui/static/rules/__next._tree.txt +1 -1
- ui/static/rules/__next.rules.__PAGE__.txt +1 -1
- ui/static/rules/__next.rules.txt +1 -1
- ui/static/rules/index.html +1 -1
- ui/static/rules/index.txt +1 -1
- ui/static/schemas/__next._full.txt +1 -1
- ui/static/schemas/__next._head.txt +1 -1
- ui/static/schemas/__next._index.txt +1 -1
- ui/static/schemas/__next._tree.txt +1 -1
- ui/static/schemas/__next.schemas.__PAGE__.txt +1 -1
- ui/static/schemas/__next.schemas.txt +1 -1
- ui/static/schemas/index.html +1 -1
- ui/static/schemas/index.txt +1 -1
- ui/static/settings/__next._full.txt +1 -1
- ui/static/settings/__next._head.txt +1 -1
- ui/static/settings/__next._index.txt +1 -1
- ui/static/settings/__next._tree.txt +1 -1
- ui/static/settings/__next.settings.__PAGE__.txt +1 -1
- ui/static/settings/__next.settings.txt +1 -1
- ui/static/settings/index.html +1 -1
- ui/static/settings/index.txt +1 -1
- ui/static/static/404/index.html +1 -1
- ui/static/static/404.html +1 -1
- ui/static/static/__next.__PAGE__.txt +1 -1
- ui/static/static/__next._full.txt +2 -2
- ui/static/static/__next._head.txt +1 -1
- ui/static/static/__next._index.txt +2 -2
- ui/static/static/__next._tree.txt +2 -2
- ui/static/static/_next/static/chunks/13d4a0fbd74c1ee4.js +1 -0
- ui/static/static/_next/static/chunks/2edb43b48432ac04.js +441 -0
- ui/static/static/_next/static/chunks/d2363397e1b2bcab.css +1 -0
- ui/static/static/_next/static/chunks/f7d1a90dd75d2572.js +1 -0
- ui/static/static/_not-found/__next._full.txt +2 -2
- ui/static/static/_not-found/__next._head.txt +1 -1
- ui/static/static/_not-found/__next._index.txt +2 -2
- ui/static/static/_not-found/__next._not-found.__PAGE__.txt +1 -1
- ui/static/static/_not-found/__next._not-found.txt +1 -1
- ui/static/static/_not-found/__next._tree.txt +2 -2
- ui/static/static/_not-found/index.html +1 -1
- ui/static/static/_not-found/index.txt +2 -2
- ui/static/static/contracts/__next._full.txt +3 -3
- ui/static/static/contracts/__next._head.txt +1 -1
- ui/static/static/contracts/__next._index.txt +2 -2
- ui/static/static/contracts/__next._tree.txt +2 -2
- ui/static/static/contracts/__next.contracts.__PAGE__.txt +2 -2
- ui/static/static/contracts/__next.contracts.txt +1 -1
- ui/static/static/contracts/index.html +1 -1
- ui/static/static/contracts/index.txt +3 -3
- ui/static/static/documentation/__next._full.txt +3 -3
- ui/static/static/documentation/__next._head.txt +1 -1
- ui/static/static/documentation/__next._index.txt +2 -2
- ui/static/static/documentation/__next._tree.txt +2 -2
- ui/static/static/documentation/__next.documentation.__PAGE__.txt +2 -2
- ui/static/static/documentation/__next.documentation.txt +1 -1
- ui/static/static/documentation/index.html +2 -2
- ui/static/static/documentation/index.txt +3 -3
- ui/static/static/index.html +1 -1
- ui/static/static/index.txt +2 -2
- ui/static/static/metadata/__next._full.txt +2 -2
- ui/static/static/metadata/__next._head.txt +1 -1
- ui/static/static/metadata/__next._index.txt +2 -2
- ui/static/static/metadata/__next._tree.txt +2 -2
- ui/static/static/metadata/__next.metadata.__PAGE__.txt +1 -1
- ui/static/static/metadata/__next.metadata.txt +1 -1
- ui/static/static/metadata/index.html +1 -1
- ui/static/static/metadata/index.txt +2 -2
- ui/static/static/quality/__next._full.txt +2 -2
- ui/static/static/quality/__next._head.txt +1 -1
- ui/static/static/quality/__next._index.txt +2 -2
- ui/static/static/quality/__next._tree.txt +2 -2
- ui/static/static/quality/__next.quality.__PAGE__.txt +1 -1
- ui/static/static/quality/__next.quality.txt +1 -1
- ui/static/static/quality/index.html +2 -2
- ui/static/static/quality/index.txt +2 -2
- ui/static/static/rules/__next._full.txt +2 -2
- ui/static/static/rules/__next._head.txt +1 -1
- ui/static/static/rules/__next._index.txt +2 -2
- ui/static/static/rules/__next._tree.txt +2 -2
- ui/static/static/rules/__next.rules.__PAGE__.txt +1 -1
- ui/static/static/rules/__next.rules.txt +1 -1
- ui/static/static/rules/index.html +1 -1
- ui/static/static/rules/index.txt +2 -2
- ui/static/static/schemas/__next._full.txt +2 -2
- ui/static/static/schemas/__next._head.txt +1 -1
- ui/static/static/schemas/__next._index.txt +2 -2
- ui/static/static/schemas/__next._tree.txt +2 -2
- ui/static/static/schemas/__next.schemas.__PAGE__.txt +1 -1
- ui/static/static/schemas/__next.schemas.txt +1 -1
- ui/static/static/schemas/index.html +1 -1
- ui/static/static/schemas/index.txt +2 -2
- ui/static/static/settings/__next._full.txt +2 -2
- ui/static/static/settings/__next._head.txt +1 -1
- ui/static/static/settings/__next._index.txt +2 -2
- ui/static/static/settings/__next._tree.txt +2 -2
- ui/static/static/settings/__next.settings.__PAGE__.txt +1 -1
- ui/static/static/settings/__next.settings.txt +1 -1
- ui/static/static/settings/index.html +1 -1
- ui/static/static/settings/index.txt +2 -2
- ui/static/static/static/.gitkeep +0 -0
- ui/static/static/static/404/index.html +1 -0
- ui/static/static/static/404.html +1 -0
- ui/static/static/static/__next.__PAGE__.txt +10 -0
- ui/static/static/static/__next._full.txt +30 -0
- ui/static/static/static/__next._head.txt +7 -0
- ui/static/static/static/__next._index.txt +9 -0
- ui/static/static/static/__next._tree.txt +2 -0
- ui/static/static/static/_next/static/chunks/222442f6da32302a.js +1 -0
- ui/static/static/static/_next/static/chunks/247eb132b7f7b574.js +1 -0
- ui/static/static/static/_next/static/chunks/297d55555b71baba.js +1 -0
- ui/static/static/static/_next/static/chunks/2ab439ce003cd691.js +1 -0
- ui/static/static/static/_next/static/chunks/414e77373f8ff61c.js +1 -0
- ui/static/static/static/_next/static/chunks/49ca65abd26ae49e.js +1 -0
- ui/static/static/static/_next/static/chunks/652ad0aa26265c47.js +2 -0
- ui/static/static/static/_next/static/chunks/9667e7a3d359eb39.js +1 -0
- ui/static/static/static/_next/static/chunks/9c23f44fff36548a.js +1 -0
- ui/static/static/static/_next/static/chunks/a6dad97d9634a72d.js +1 -0
- ui/static/static/static/_next/static/chunks/b32a0963684b9933.js +4 -0
- ui/static/static/static/_next/static/chunks/c69f6cba366bd988.js +1 -0
- ui/static/static/static/_next/static/chunks/db913959c675cea6.js +1 -0
- ui/static/static/static/_next/static/chunks/f061a4be97bfc3b3.js +1 -0
- ui/static/static/static/_next/static/chunks/f2e7afeab1178138.js +1 -0
- ui/static/static/static/_next/static/chunks/ff1a16fafef87110.js +1 -0
- ui/static/static/static/_next/static/chunks/turbopack-ffcb7ab6794027ef.js +3 -0
- ui/static/static/static/_next/static/tNTkVW6puVXC4bAm4WrHl/_buildManifest.js +11 -0
- ui/static/static/static/_next/static/tNTkVW6puVXC4bAm4WrHl/_ssgManifest.js +1 -0
- ui/static/static/static/_not-found/__next._full.txt +17 -0
- ui/static/static/static/_not-found/__next._head.txt +7 -0
- ui/static/static/static/_not-found/__next._index.txt +9 -0
- ui/static/static/static/_not-found/__next._not-found.__PAGE__.txt +5 -0
- ui/static/static/static/_not-found/__next._not-found.txt +4 -0
- ui/static/static/static/_not-found/__next._tree.txt +2 -0
- ui/static/static/static/_not-found/index.html +1 -0
- ui/static/static/static/_not-found/index.txt +17 -0
- ui/static/static/static/contracts/__next._full.txt +21 -0
- ui/static/static/static/contracts/__next._head.txt +7 -0
- ui/static/static/static/contracts/__next._index.txt +9 -0
- ui/static/static/static/contracts/__next._tree.txt +2 -0
- ui/static/static/static/contracts/__next.contracts.__PAGE__.txt +9 -0
- ui/static/static/static/contracts/__next.contracts.txt +4 -0
- ui/static/static/static/contracts/index.html +1 -0
- ui/static/static/static/contracts/index.txt +21 -0
- ui/static/static/static/documentation/__next._full.txt +21 -0
- ui/static/static/static/documentation/__next._head.txt +7 -0
- ui/static/static/static/documentation/__next._index.txt +9 -0
- ui/static/static/static/documentation/__next._tree.txt +2 -0
- ui/static/static/static/documentation/__next.documentation.__PAGE__.txt +9 -0
- ui/static/static/static/documentation/__next.documentation.txt +4 -0
- ui/static/static/static/documentation/index.html +93 -0
- ui/static/static/static/documentation/index.txt +21 -0
- ui/static/static/static/index.html +1 -0
- ui/static/static/static/index.txt +30 -0
- ui/static/static/static/metadata/__next._full.txt +21 -0
- ui/static/static/static/metadata/__next._head.txt +7 -0
- ui/static/static/static/metadata/__next._index.txt +9 -0
- ui/static/static/static/metadata/__next._tree.txt +2 -0
- ui/static/static/static/metadata/__next.metadata.__PAGE__.txt +9 -0
- ui/static/static/static/metadata/__next.metadata.txt +4 -0
- ui/static/static/static/metadata/index.html +1 -0
- ui/static/static/static/metadata/index.txt +21 -0
- ui/static/static/static/quality/__next._full.txt +21 -0
- ui/static/static/static/quality/__next._head.txt +7 -0
- ui/static/static/static/quality/__next._index.txt +9 -0
- ui/static/static/static/quality/__next._tree.txt +2 -0
- ui/static/static/static/quality/__next.quality.__PAGE__.txt +9 -0
- ui/static/static/static/quality/__next.quality.txt +4 -0
- ui/static/static/static/quality/index.html +2 -0
- ui/static/static/static/quality/index.txt +21 -0
- ui/static/static/static/rules/__next._full.txt +21 -0
- ui/static/static/static/rules/__next._head.txt +7 -0
- ui/static/static/static/rules/__next._index.txt +9 -0
- ui/static/static/static/rules/__next._tree.txt +2 -0
- ui/static/static/static/rules/__next.rules.__PAGE__.txt +9 -0
- ui/static/static/static/rules/__next.rules.txt +4 -0
- ui/static/static/static/rules/index.html +1 -0
- ui/static/static/static/rules/index.txt +21 -0
- ui/static/static/static/schemas/__next._full.txt +21 -0
- ui/static/static/static/schemas/__next._head.txt +7 -0
- ui/static/static/static/schemas/__next._index.txt +9 -0
- ui/static/static/static/schemas/__next._tree.txt +2 -0
- ui/static/static/static/schemas/__next.schemas.__PAGE__.txt +9 -0
- ui/static/static/static/schemas/__next.schemas.txt +4 -0
- ui/static/static/static/schemas/index.html +1 -0
- ui/static/static/static/schemas/index.txt +21 -0
- ui/static/static/static/settings/__next._full.txt +21 -0
- ui/static/static/static/settings/__next._head.txt +7 -0
- ui/static/static/static/settings/__next._index.txt +9 -0
- ui/static/static/static/settings/__next._tree.txt +2 -0
- ui/static/static/static/settings/__next.settings.__PAGE__.txt +9 -0
- ui/static/static/static/settings/__next.settings.txt +4 -0
- ui/static/static/static/settings/index.html +1 -0
- ui/static/static/static/settings/index.txt +21 -0
- ui/static/static/static/validation/__next._full.txt +21 -0
- ui/static/static/static/validation/__next._head.txt +7 -0
- ui/static/static/static/validation/__next._index.txt +9 -0
- ui/static/static/static/validation/__next._tree.txt +2 -0
- ui/static/static/static/validation/__next.validation.__PAGE__.txt +9 -0
- ui/static/static/static/validation/__next.validation.txt +4 -0
- ui/static/static/static/validation/index.html +1 -0
- ui/static/static/static/validation/index.txt +21 -0
- ui/static/static/validation/__next._full.txt +2 -2
- ui/static/static/validation/__next._head.txt +1 -1
- ui/static/static/validation/__next._index.txt +2 -2
- ui/static/static/validation/__next._tree.txt +2 -2
- ui/static/static/validation/__next.validation.__PAGE__.txt +1 -1
- ui/static/static/validation/__next.validation.txt +1 -1
- ui/static/static/validation/index.html +1 -1
- ui/static/static/validation/index.txt +2 -2
- ui/static/validation/__next._full.txt +2 -2
- ui/static/validation/__next._head.txt +1 -1
- ui/static/validation/__next._index.txt +1 -1
- ui/static/validation/__next._tree.txt +1 -1
- ui/static/validation/__next.validation.__PAGE__.txt +2 -2
- ui/static/validation/__next.validation.txt +1 -1
- ui/static/validation/index.html +1 -1
- ui/static/validation/index.txt +2 -2
- pycharter/data/templates/template_coercion_rules.yaml +0 -15
- pycharter/data/templates/template_contract.yaml +0 -587
- pycharter/data/templates/template_metadata.yaml +0 -38
- pycharter/data/templates/template_schema.yaml +0 -22
- pycharter/data/templates/template_transform_advanced.yaml +0 -50
- pycharter/data/templates/template_transform_simple.yaml +0 -59
- pycharter/data/templates/template_validation_rules.yaml +0 -29
- pycharter/etl_generator/extraction.py +0 -916
- pycharter/etl_generator/factory.py +0 -174
- pycharter/etl_generator/orchestrator.py +0 -1650
- pycharter/integrations/__init__.py +0 -19
- pycharter/integrations/kafka.py +0 -178
- pycharter/integrations/streaming.py +0 -100
- pycharter-0.0.22.dist-info/RECORD +0 -358
- {pycharter-0.0.22.dist-info → pycharter-0.0.24.dist-info}/entry_points.txt +0 -0
- {pycharter-0.0.22.dist-info → pycharter-0.0.24.dist-info}/licenses/LICENSE +0 -0
- {pycharter-0.0.22.dist-info → pycharter-0.0.24.dist-info}/top_level.txt +0 -0
- /ui/static/_next/static/{0rYA78L88aUyD2Uh38hhX → 2gKjNv6YvE6BcIdFthBLs}/_buildManifest.js +0 -0
- /ui/static/_next/static/{0rYA78L88aUyD2Uh38hhX → 2gKjNv6YvE6BcIdFthBLs}/_ssgManifest.js +0 -0
- /ui/static/static/_next/static/{tNTkVW6puVXC4bAm4WrHl → 0rYA78L88aUyD2Uh38hhX}/_buildManifest.js +0 -0
- /ui/static/static/_next/static/{tNTkVW6puVXC4bAm4WrHl → 0rYA78L88aUyD2Uh38hhX}/_ssgManifest.js +0 -0
- /ui/static/{_next → static/_next}/static/chunks/c4fa4f4114b7c352.js +0 -0
- /ui/static/static/{_next → static/_next}/static/chunks/4e310fe5005770a3.css +0 -0
- /ui/static/{_next → static/static/_next}/static/chunks/5e04d10c4a7b58a3.js +0 -0
- /ui/static/static/{_next → static/_next}/static/chunks/5fc14c00a2779dc5.js +0 -0
- /ui/static/{_next → static/static/_next}/static/chunks/75d88a058d8ffaa6.js +0 -0
- /ui/static/{_next → static/static/_next}/static/chunks/8c89634cf6bad76f.js +0 -0
- /ui/static/static/{_next → static/_next}/static/chunks/b584574fdc8ab13e.js +0 -0
- /ui/static/static/{_next → static/_next}/static/chunks/d5989c94d3614b3a.js +0 -0
|
@@ -0,0 +1,530 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Cloud storage extractor for ETL orchestrator.
|
|
3
|
+
|
|
4
|
+
Supports extracting data from cloud storage:
|
|
5
|
+
- AWS S3
|
|
6
|
+
- Google Cloud Storage (GCS)
|
|
7
|
+
- Azure Blob Storage
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
import os
|
|
12
|
+
import tempfile
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Any, AsyncIterator, Dict, List, Optional
|
|
15
|
+
|
|
16
|
+
from pycharter.etl_generator.extractors.base import BaseExtractor
|
|
17
|
+
from pycharter.etl_generator.extractors.file import FileExtractor
|
|
18
|
+
from pycharter.utils.value_injector import resolve_values
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
# Try to import cloud storage libraries
|
|
23
|
+
try:
|
|
24
|
+
import boto3
|
|
25
|
+
from botocore.exceptions import ClientError
|
|
26
|
+
S3_AVAILABLE = True
|
|
27
|
+
except ImportError:
|
|
28
|
+
S3_AVAILABLE = False
|
|
29
|
+
boto3 = None
|
|
30
|
+
ClientError = None
|
|
31
|
+
|
|
32
|
+
try:
|
|
33
|
+
from google.cloud import storage as gcs_storage
|
|
34
|
+
GCS_AVAILABLE = True
|
|
35
|
+
except ImportError:
|
|
36
|
+
GCS_AVAILABLE = False
|
|
37
|
+
gcs_storage = None
|
|
38
|
+
|
|
39
|
+
try:
|
|
40
|
+
from azure.storage.blob import BlobServiceClient
|
|
41
|
+
AZURE_AVAILABLE = True
|
|
42
|
+
except ImportError:
|
|
43
|
+
AZURE_AVAILABLE = False
|
|
44
|
+
BlobServiceClient = None
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class CloudStorageExtractor(BaseExtractor):
|
|
48
|
+
"""
|
|
49
|
+
Extractor for cloud storage data sources.
|
|
50
|
+
|
|
51
|
+
Supports two modes:
|
|
52
|
+
1. Programmatic API:
|
|
53
|
+
>>> extractor = CloudStorageExtractor(provider="s3", bucket="my-bucket", path="data/")
|
|
54
|
+
>>> async for batch in extractor.extract():
|
|
55
|
+
... process(batch)
|
|
56
|
+
|
|
57
|
+
2. Config-driven:
|
|
58
|
+
>>> extractor = CloudStorageExtractor()
|
|
59
|
+
>>> async for batch in extractor.extract_streaming(config, params, headers):
|
|
60
|
+
... process(batch)
|
|
61
|
+
"""
|
|
62
|
+
|
|
63
|
+
def __init__(
|
|
64
|
+
self,
|
|
65
|
+
provider: Optional[str] = None,
|
|
66
|
+
bucket: Optional[str] = None,
|
|
67
|
+
path: Optional[str] = None,
|
|
68
|
+
credentials: Optional[Dict[str, Any]] = None,
|
|
69
|
+
file_format: Optional[str] = None,
|
|
70
|
+
batch_size: int = 1000,
|
|
71
|
+
max_records: Optional[int] = None,
|
|
72
|
+
):
|
|
73
|
+
self.provider = provider
|
|
74
|
+
self.bucket = bucket
|
|
75
|
+
self.path = path
|
|
76
|
+
self.credentials = credentials
|
|
77
|
+
self.file_format = file_format
|
|
78
|
+
self.batch_size = batch_size
|
|
79
|
+
self.max_records = max_records
|
|
80
|
+
|
|
81
|
+
@classmethod
|
|
82
|
+
def from_config(cls, config: Dict[str, Any]) -> "CloudStorageExtractor":
|
|
83
|
+
"""Create extractor from configuration dict."""
|
|
84
|
+
storage_config = config.get("storage", {})
|
|
85
|
+
return cls(
|
|
86
|
+
provider=storage_config.get("provider") or config.get("provider"),
|
|
87
|
+
bucket=storage_config.get("bucket") or config.get("bucket"),
|
|
88
|
+
path=storage_config.get("path") or config.get("path"),
|
|
89
|
+
credentials=storage_config.get("credentials") or config.get("credentials"),
|
|
90
|
+
file_format=config.get("format"),
|
|
91
|
+
batch_size=config.get("batch_size", 1000),
|
|
92
|
+
max_records=config.get("max_records"),
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
async def extract(self, **params) -> AsyncIterator[List[Dict[str, Any]]]:
|
|
96
|
+
"""
|
|
97
|
+
Extract data from cloud storage.
|
|
98
|
+
|
|
99
|
+
Yields:
|
|
100
|
+
Batches of records
|
|
101
|
+
"""
|
|
102
|
+
if not self.provider:
|
|
103
|
+
raise ValueError("Provider is required (s3, gcs, azure)")
|
|
104
|
+
if not self.bucket:
|
|
105
|
+
raise ValueError("Bucket is required")
|
|
106
|
+
if not self.path:
|
|
107
|
+
raise ValueError("Path is required")
|
|
108
|
+
|
|
109
|
+
extract_config = {
|
|
110
|
+
"storage": {
|
|
111
|
+
"provider": self.provider,
|
|
112
|
+
"bucket": self.bucket,
|
|
113
|
+
"path": self.path,
|
|
114
|
+
"credentials": self.credentials,
|
|
115
|
+
},
|
|
116
|
+
"format": self.file_format,
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
async for batch in self.extract_streaming(
|
|
120
|
+
extract_config, {}, {},
|
|
121
|
+
batch_size=self.batch_size,
|
|
122
|
+
max_records=self.max_records,
|
|
123
|
+
):
|
|
124
|
+
yield batch
|
|
125
|
+
|
|
126
|
+
def validate_config(self, extract_config: Dict[str, Any]) -> None:
    """
    Validate cloud storage extractor configuration.

    Args:
        extract_config: Extraction configuration. ``source_type`` is
            optional, but when present it must be ``'cloud_storage'``.
            The ``storage`` section must name a supported provider and
            carry non-empty ``bucket`` and ``path`` values.

    Raises:
        ValueError: If any of the checks above fails.
    """
    if 'source_type' in extract_config and extract_config['source_type'] != 'cloud_storage':
        raise ValueError(
            "CloudStorageExtractor requires source_type='cloud_storage', "
            f"got '{extract_config.get('source_type')}'"
        )

    # A key that is present but null (``storage:`` / ``provider:`` with no
    # value) comes back as None; normalise it so the caller receives the
    # ValueError below rather than an AttributeError.
    storage_config = extract_config.get('storage') or {}
    provider = (storage_config.get('provider') or '').lower()

    if provider not in ('s3', 'gcs', 'azure'):
        raise ValueError(
            f"Cloud storage provider must be 's3', 'gcs', or 'azure', got '{provider}'"
        )

    if not storage_config.get('bucket'):
        raise ValueError("Cloud storage extractor requires 'storage.bucket' in extract_config")

    if not storage_config.get('path'):
        raise ValueError("Cloud storage extractor requires 'storage.path' in extract_config")
|
|
147
|
+
|
|
148
|
+
async def extract_streaming(
    self,
    extract_config: Dict[str, Any],
    params: Dict[str, Any],
    headers: Dict[str, Any],
    contract_dir: Optional[Any] = None,
    batch_size: int = 1000,
    max_records: Optional[int] = None,
    config_context: Optional[Dict[str, Any]] = None,
) -> AsyncIterator[List[Dict[str, Any]]]:
    """
    Extract data from cloud storage.

    Reads the ``storage`` section of ``extract_config``, resolves
    variables in ``bucket`` and ``path`` and hands off to the
    provider-specific extractor.

    Args:
        extract_config: Configuration with a ``storage`` section
            (``provider``, ``bucket``, ``path``, ``credentials``) and an
            optional top-level ``format``.
        params: Accepted but not used by this method.
        headers: Accepted but not used by this method.
        contract_dir: When given, ``contract_dir / "extract.yaml"`` is
            passed to ``resolve_values`` as the source file.
        batch_size: Batch size forwarded to the provider extractor.
        max_records: Record limit forwarded to the provider extractor.
        config_context: Context forwarded to ``resolve_values`` and to
            the provider extractor.

    Yields:
        Batches of records.

    Raises:
        ValueError: If the provider is not 's3', 'gcs' or 'azure'.
    """
    # A null ``storage`` / ``provider`` value must end in the ValueError
    # below, not in an AttributeError on ``.get`` / ``.lower``.
    storage_config = extract_config.get('storage') or {}
    provider = (storage_config.get('provider') or '').lower()

    # Resolve variables
    source_file = str(contract_dir / "extract.yaml") if contract_dir else None
    bucket = resolve_values(storage_config.get('bucket'), context=config_context, source_file=source_file)
    path = resolve_values(storage_config.get('path'), context=config_context, source_file=source_file)
    # NOTE(review): credentials are forwarded without resolve_values --
    # confirm that variables inside them are resolved upstream.
    credentials = storage_config.get('credentials')

    # Detect format from the path when none is configured.
    # ``_detect_format_from_path`` is defined outside this block.
    file_format = extract_config.get('format')
    if not file_format:
        file_format = self._detect_format_from_path(Path(path))

    logger.info(f"Extracting from {provider.upper()}: {bucket}/{path}")

    # All provider extractors share one signature, so dispatch by name.
    # Only the handler for the requested provider is looked up.
    handler_names = {
        's3': '_extract_from_s3',
        'gcs': '_extract_from_gcs',
        'azure': '_extract_from_azure',
    }
    if provider not in handler_names:
        raise ValueError(f"Unsupported cloud storage provider: {provider}")

    extract_from_provider = getattr(self, handler_names[provider])
    async for batch in extract_from_provider(
        bucket, path, credentials, file_format, batch_size, max_records, config_context, source_file
    ):
        yield batch
|
|
200
|
+
|
|
201
|
+
async def _extract_from_s3(
    self,
    bucket: str,
    path: str,
    credentials: Optional[Dict[str, Any]],
    file_format: Optional[str],
    batch_size: int,
    max_records: Optional[int],
    config_context: Optional[Dict[str, Any]],
    source_file: Optional[str],
) -> AsyncIterator[List[Dict[str, Any]]]:
    """
    Extract data from AWS S3.

    Each addressed object is downloaded to a temporary file, parsed with
    ``FileExtractor`` and the temporary file is removed afterwards.

    Args:
        bucket: S3 bucket name.
        path: A single object key, a prefix ending in ``/``, or a
            pattern containing ``*``.
        credentials: Optional dict with ``aws_access_key_id``,
            ``aws_secret_access_key`` and optionally ``region``
            (default ``us-east-1``). Without both keys, boto3's default
            credential lookup is used.
        file_format: Format forwarded to ``FileExtractor``.
        batch_size: Batch size forwarded to ``FileExtractor``.
        max_records: Limit on the total number of records across all
            objects; a falsy value means no limit.
        config_context: Context forwarded to ``FileExtractor``.
        source_file: Accepted for signature parity; not used here.

    Yields:
        Batches of records.

    Raises:
        ImportError: If boto3 is not installed.
    """
    if not S3_AVAILABLE:
        raise ImportError(
            "boto3 is required for S3 extraction. "
            "Install with: pip install boto3 or pip install pycharter[etl]"
        )

    # Build the client once; explicit keys are used only when both are set.
    client_kwargs: Dict[str, Any] = {}
    if isinstance(credentials, dict):
        access_key = credentials.get('aws_access_key_id')
        secret_key = credentials.get('aws_secret_access_key')
        if access_key and secret_key:
            client_kwargs = {
                'aws_access_key_id': access_key,
                'aws_secret_access_key': secret_key,
                'region_name': credentials.get('region', 'us-east-1'),
            }
    s3_client = boto3.client('s3', **client_kwargs)

    total_extracted = 0
    for key in self._iter_s3_keys(s3_client, bucket, path):
        if max_records and total_extracted >= max_records:
            # Returning (not breaking) also stops further list requests.
            return

        logger.info(f"Processing S3 object: {bucket}/{key}")

        # Give each file only the remaining budget so the total across
        # several objects cannot exceed max_records.
        remaining = max_records - total_extracted if max_records else max_records

        tmp_path = self._download_s3_object(s3_client, bucket, key)
        try:
            # Use FileExtractor to process the downloaded file
            file_config = {
                'source_type': 'file',
                'file_path': str(tmp_path),
                'format': file_format,
            }
            async for batch in FileExtractor().extract_streaming(
                file_config, {}, {}, None, batch_size, remaining, config_context
            ):
                total_extracted += len(batch)
                yield batch
                if max_records and total_extracted >= max_records:
                    return
        finally:
            # Cleanup temp file
            if tmp_path.exists():
                tmp_path.unlink()

@staticmethod
def _iter_s3_keys(s3_client: Any, bucket: str, path: str):
    """Yield the object keys addressed by ``path`` (single key, prefix, or ``*`` pattern)."""
    import re

    has_wildcard = '*' in path
    if not (has_wildcard or path.endswith('/')):
        # Plain key: nothing to list.
        yield path
        return

    # Listing prefix: trailing slashes dropped, cut at the first '*'.
    prefix = path.rstrip('/')
    key_filter = None
    if has_wildcard:
        prefix = prefix.split('*')[0]
        # Only '*' is a wildcard; every other character matches literally.
        # A pattern ending in '/' selects everything beneath it.
        pattern = path + '*' if path.endswith('/') else path
        key_filter = re.compile(
            '.*'.join(re.escape(part) for part in pattern.split('*')),
            re.DOTALL,
        )

    paginator = s3_client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get('Contents', []):
            key = obj['Key']
            if key.endswith('/'):
                # "Folder" placeholder object; there is no file to parse.
                continue
            if key_filter is not None and not key_filter.fullmatch(key):
                continue
            yield key

@staticmethod
def _download_s3_object(s3_client: Any, bucket: str, key: str) -> Path:
    """Download one object into a closed temporary file and return its path."""
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=Path(key).suffix)
    tmp_path = Path(tmp_file.name)
    try:
        # Leaving the ``with`` flushes and closes the handle, so the file
        # is complete on disk before anything else opens it.
        with tmp_file:
            s3_client.download_fileobj(bucket, key, tmp_file)
    except BaseException:
        # Do not leave a partial download behind; re-raise the real error.
        if tmp_path.exists():
            tmp_path.unlink()
        raise
    return tmp_path
|
|
307
|
+
|
|
308
|
+
async def _extract_from_gcs(
    self,
    bucket: str,
    path: str,
    credentials: Optional[Dict[str, Any]],
    file_format: Optional[str],
    batch_size: int,
    max_records: Optional[int],
    config_context: Optional[Dict[str, Any]],
    source_file: Optional[str],
) -> AsyncIterator[List[Dict[str, Any]]]:
    """Extract data from Google Cloud Storage.

    Each matching object is downloaded to a local temporary file, handed to
    ``FileExtractor.extract_streaming`` and the temporary file is removed
    afterwards, whether or not extraction succeeded.

    Args:
        bucket: Name of the GCS bucket.
        path: Object name, or a prefix. A path that ends with ``/`` or
            contains ``*`` is treated as a prefix; only the text before the
            first ``*`` is used for listing (the rest of the pattern is not
            applied as a filter).
        credentials: A service-account info mapping, or a string path to a
            service-account JSON key file. Any other (or empty) value falls
            back to ``gcs_storage.Client()`` with default credentials.
        file_format: Format forwarded to the file extractor as ``'format'``.
        batch_size: Batch size forwarded to the file extractor.
        max_records: Overall cap on extracted records; a falsy value means
            no cap.
        config_context: Forwarded unchanged to the file extractor.
        source_file: Not used by this method.

    Yields:
        Batches of records produced by the file extractor.

    Raises:
        ImportError: If google-cloud-storage is not available.
    """
    if not GCS_AVAILABLE:
        raise ImportError(
            "google-cloud-storage is required for GCS extraction. "
            "Install with: pip install google-cloud-storage"
        )

    # Initialize GCS client
    if credentials and isinstance(credentials, str):
        # Path to a service-account JSON key file.
        client = gcs_storage.Client.from_service_account_json(credentials)
    elif credentials and isinstance(credentials, dict):
        # Build the client straight from the mapping so the secret is never
        # written to a temporary file on disk.
        client = gcs_storage.Client.from_service_account_info(credentials)
    else:
        client = gcs_storage.Client()

    bucket_obj = client.bucket(bucket)

    # A trailing slash or a wildcard means "every object under a prefix".
    is_prefix = path.endswith('/') or '*' in path
    if is_prefix:
        prefix = path.rstrip('/').split('*')[0]
        blobs = bucket_obj.list_blobs(prefix=prefix)
    else:
        # Single file
        blobs = [bucket_obj.blob(path)]

    total_extracted = 0
    for blob in blobs:
        if max_records and total_extracted >= max_records:
            break

        if is_prefix and blob.name.endswith('/'):
            # Zero-byte "folder" placeholder objects carry no data.
            continue

        logger.info(f"Processing GCS blob: {bucket}/{blob.name}")

        # Reserve a temp file name and close the handle right away:
        # download_to_filename opens the path itself, and the path is
        # known before anything that can fail, so cleanup is always safe.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=Path(blob.name).suffix
        ) as tmp_file:
            tmp_path = Path(tmp_file.name)

        try:
            blob.download_to_filename(str(tmp_path))

            # Only ask for what is still missing so the overall cap holds
            # across several objects.
            # NOTE(review): assumes the sixth positional argument of
            # extract_streaming caps the record count - confirm.
            remaining = max_records - total_extracted if max_records else max_records

            file_config = {
                'source_type': 'file',
                'file_path': str(tmp_path),
                'format': file_format,
            }

            async for batch in FileExtractor().extract_streaming(
                file_config, {}, {}, None, batch_size, remaining, config_context
            ):
                total_extracted += len(batch)
                yield batch
                if max_records and total_extracted >= max_records:
                    break
        finally:
            # Cleanup temp file
            if tmp_path.exists():
                tmp_path.unlink()
|
|
410
|
+
|
|
411
|
+
async def _extract_from_azure(
    self,
    container: str,
    path: str,
    credentials: Optional[Dict[str, Any]],
    file_format: Optional[str],
    batch_size: int,
    max_records: Optional[int],
    config_context: Optional[Dict[str, Any]],
    source_file: Optional[str],
) -> AsyncIterator[List[Dict[str, Any]]]:
    """Extract data from Azure Blob Storage.

    Each matching blob is downloaded to a local temporary file, handed to
    ``FileExtractor.extract_streaming`` and the temporary file is removed
    afterwards, whether or not extraction succeeded.

    Args:
        container: Name of the blob container.
        path: Blob name, or a prefix. A path that ends with ``/`` or
            contains ``*`` is treated as a prefix; only the text before the
            first ``*`` is used for listing (the rest of the pattern is not
            applied as a filter).
        credentials: Mapping with either ``'connection_string'`` or both
            ``'account_name'`` and ``'account_key'``. When empty or
            ``None``, the ``AZURE_STORAGE_CONNECTION_STRING`` environment
            variable is used.
        file_format: Format forwarded to the file extractor as ``'format'``.
        batch_size: Batch size forwarded to the file extractor.
        max_records: Overall cap on extracted records; a falsy value means
            no cap.
        config_context: Forwarded unchanged to the file extractor.
        source_file: Not used by this method.

    Yields:
        Batches of records produced by the file extractor.

    Raises:
        ImportError: If azure-storage-blob is not available.
        ValueError: If no usable credentials can be determined.
    """
    if not AZURE_AVAILABLE:
        raise ImportError(
            "azure-storage-blob is required for Azure extraction. "
            "Install with: pip install azure-storage-blob"
        )

    # Initialize Azure client
    if credentials:
        connection_string = credentials.get('connection_string')
        account_name = credentials.get('account_name')
        account_key = credentials.get('account_key')

        if connection_string:
            blob_service_client = BlobServiceClient.from_connection_string(connection_string)
        elif account_name and account_key:
            account_url = f"https://{account_name}.blob.core.windows.net"
            blob_service_client = BlobServiceClient(account_url, credential=account_key)
        else:
            raise ValueError("Azure credentials must include 'connection_string' or ('account_name', 'account_key')")
    else:
        # Use default credentials (environment variables). Fail with a clear
        # message instead of handing an empty connection string to the SDK.
        env_connection_string = os.environ.get('AZURE_STORAGE_CONNECTION_STRING')
        if not env_connection_string:
            raise ValueError(
                "No Azure credentials provided and the "
                "AZURE_STORAGE_CONNECTION_STRING environment variable is not set"
            )
        blob_service_client = BlobServiceClient.from_connection_string(env_connection_string)

    container_client = blob_service_client.get_container_client(container)

    # A trailing slash or a wildcard means "every blob under a prefix".
    if path.endswith('/') or '*' in path:
        prefix = path.rstrip('/').split('*')[0]
        # Generator keeps the listing lazy.
        blob_names = (
            listed.name
            for listed in container_client.list_blobs(name_starts_with=prefix)
        )
    else:
        # Single file
        blob_names = [path]

    total_extracted = 0
    for blob_name in blob_names:
        if max_records and total_extracted >= max_records:
            break

        logger.info(f"Processing Azure blob: {container}/{blob_name}")

        blob_client = container_client.get_blob_client(blob_name)

        # The path is known before anything that can fail, so the cleanup
        # in ``finally`` never sees an unbound or stale name.
        tmp_file = tempfile.NamedTemporaryFile(
            delete=False, suffix=Path(blob_name).suffix
        )
        tmp_path = Path(tmp_file.name)
        try:
            # Close the handle before the file is re-opened by name;
            # otherwise buffered bytes may not have reached the disk yet
            # and the extractor could read a truncated file.
            with tmp_file:
                blob_client.download_blob().readinto(tmp_file)

            # Only ask for what is still missing so the overall cap holds
            # across several blobs.
            # NOTE(review): assumes the sixth positional argument of
            # extract_streaming caps the record count - confirm.
            remaining = max_records - total_extracted if max_records else max_records

            file_config = {
                'source_type': 'file',
                'file_path': str(tmp_path),
                'format': file_format,
            }

            async for batch in FileExtractor().extract_streaming(
                file_config, {}, {}, None, batch_size, remaining, config_context
            ):
                total_extracted += len(batch)
                yield batch
                if max_records and total_extracted >= max_records:
                    break
        finally:
            # Make sure the handle is released even if the download failed
            # before the ``with`` block was entered, then remove the file.
            tmp_file.close()
            if tmp_path.exists():
                tmp_path.unlink()
|
|
515
|
+
|
|
516
|
+
def _detect_format_from_path(self, path: Path) -> Optional[str]:
|
|
517
|
+
"""Detect file format from path extension."""
|
|
518
|
+
suffix = path.suffix.lower()
|
|
519
|
+
format_map = {
|
|
520
|
+
'.csv': 'csv',
|
|
521
|
+
'.tsv': 'tsv',
|
|
522
|
+
'.json': 'json',
|
|
523
|
+
'.jsonl': 'jsonl',
|
|
524
|
+
'.ndjson': 'jsonl',
|
|
525
|
+
'.parquet': 'parquet',
|
|
526
|
+
'.xlsx': 'excel',
|
|
527
|
+
'.xls': 'excel',
|
|
528
|
+
'.xml': 'xml',
|
|
529
|
+
}
|
|
530
|
+
return format_map.get(suffix)
|