pycharter 0.0.22__py3-none-any.whl → 0.0.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- api/routes/v1/templates.py +43 -24
- pycharter/data/templates/etl/README.md +91 -0
- pycharter/data/templates/etl/extract_cloud_azure.yaml +23 -0
- pycharter/data/templates/etl/extract_cloud_gcs.yaml +22 -0
- pycharter/data/templates/etl/extract_cloud_s3.yaml +24 -0
- pycharter/data/templates/etl/extract_database.yaml +28 -0
- pycharter/data/templates/etl/extract_database_ssh.yaml +27 -0
- pycharter/data/templates/etl/extract_file_csv.yaml +17 -0
- pycharter/data/templates/etl/extract_file_glob.yaml +17 -0
- pycharter/data/templates/etl/extract_file_json.yaml +14 -0
- pycharter/data/templates/etl/extract_file_parquet.yaml +13 -0
- pycharter/data/templates/etl/extract_http_paginated.yaml +75 -0
- pycharter/data/templates/etl/extract_http_path_params.yaml +45 -0
- pycharter/data/templates/etl/extract_http_simple.yaml +52 -0
- pycharter/data/templates/etl/load_insert.yaml +17 -0
- pycharter/data/templates/etl/load_postgresql.yaml +17 -0
- pycharter/data/templates/etl/load_sqlite.yaml +16 -0
- pycharter/data/templates/etl/load_truncate_and_load.yaml +18 -0
- pycharter/data/templates/etl/load_upsert.yaml +28 -0
- pycharter/data/templates/etl/load_with_dlq.yaml +24 -0
- pycharter/data/templates/etl/load_with_ssh_tunnel.yaml +28 -0
- pycharter/data/templates/etl/pipeline_http_to_db.yaml +38 -0
- pycharter/data/templates/etl/transform_combined.yaml +38 -0
- pycharter/data/templates/etl/transform_custom_function.yaml +18 -0
- pycharter/data/templates/etl/transform_jsonata.yaml +20 -0
- pycharter/data/templates/etl/transform_simple.yaml +41 -0
- pycharter/db/schemas/.ipynb_checkpoints/data_contract-checkpoint.py +160 -0
- pycharter/etl_generator/extraction.py +47 -262
- pycharter/etl_generator/extractors/__init__.py +26 -0
- pycharter/etl_generator/extractors/base.py +70 -0
- pycharter/etl_generator/extractors/cloud_storage.py +454 -0
- pycharter/etl_generator/extractors/database.py +151 -0
- pycharter/etl_generator/extractors/factory.py +141 -0
- pycharter/etl_generator/extractors/file.py +418 -0
- pycharter/etl_generator/extractors/http.py +816 -0
- {pycharter-0.0.22.dist-info → pycharter-0.0.23.dist-info}/METADATA +6 -1
- pycharter-0.0.23.dist-info/RECORD +498 -0
- {pycharter-0.0.22.dist-info → pycharter-0.0.23.dist-info}/WHEEL +1 -1
- ui/static/404/index.html +1 -1
- ui/static/404.html +1 -1
- ui/static/__next.__PAGE__.txt +1 -1
- ui/static/__next._full.txt +1 -1
- ui/static/__next._head.txt +1 -1
- ui/static/__next._index.txt +1 -1
- ui/static/__next._tree.txt +1 -1
- ui/static/_next/static/chunks/26dfc590f7714c03.js +1 -0
- ui/static/_next/static/chunks/34d289e6db2ef551.js +1 -0
- ui/static/_next/static/chunks/99508d9d5869cc27.js +1 -0
- ui/static/_next/static/chunks/b313c35a6ba76574.js +1 -0
- ui/static/_not-found/__next._full.txt +1 -1
- ui/static/_not-found/__next._head.txt +1 -1
- ui/static/_not-found/__next._index.txt +1 -1
- ui/static/_not-found/__next._not-found.__PAGE__.txt +1 -1
- ui/static/_not-found/__next._not-found.txt +1 -1
- ui/static/_not-found/__next._tree.txt +1 -1
- ui/static/_not-found/index.html +1 -1
- ui/static/_not-found/index.txt +1 -1
- ui/static/contracts/__next._full.txt +2 -2
- ui/static/contracts/__next._head.txt +1 -1
- ui/static/contracts/__next._index.txt +1 -1
- ui/static/contracts/__next._tree.txt +1 -1
- ui/static/contracts/__next.contracts.__PAGE__.txt +2 -2
- ui/static/contracts/__next.contracts.txt +1 -1
- ui/static/contracts/index.html +1 -1
- ui/static/contracts/index.txt +2 -2
- ui/static/documentation/__next._full.txt +1 -1
- ui/static/documentation/__next._head.txt +1 -1
- ui/static/documentation/__next._index.txt +1 -1
- ui/static/documentation/__next._tree.txt +1 -1
- ui/static/documentation/__next.documentation.__PAGE__.txt +1 -1
- ui/static/documentation/__next.documentation.txt +1 -1
- ui/static/documentation/index.html +2 -2
- ui/static/documentation/index.txt +1 -1
- ui/static/index.html +1 -1
- ui/static/index.txt +1 -1
- ui/static/metadata/__next._full.txt +1 -1
- ui/static/metadata/__next._head.txt +1 -1
- ui/static/metadata/__next._index.txt +1 -1
- ui/static/metadata/__next._tree.txt +1 -1
- ui/static/metadata/__next.metadata.__PAGE__.txt +1 -1
- ui/static/metadata/__next.metadata.txt +1 -1
- ui/static/metadata/index.html +1 -1
- ui/static/metadata/index.txt +1 -1
- ui/static/quality/__next._full.txt +2 -2
- ui/static/quality/__next._head.txt +1 -1
- ui/static/quality/__next._index.txt +1 -1
- ui/static/quality/__next._tree.txt +1 -1
- ui/static/quality/__next.quality.__PAGE__.txt +2 -2
- ui/static/quality/__next.quality.txt +1 -1
- ui/static/quality/index.html +2 -2
- ui/static/quality/index.txt +2 -2
- ui/static/rules/__next._full.txt +1 -1
- ui/static/rules/__next._head.txt +1 -1
- ui/static/rules/__next._index.txt +1 -1
- ui/static/rules/__next._tree.txt +1 -1
- ui/static/rules/__next.rules.__PAGE__.txt +1 -1
- ui/static/rules/__next.rules.txt +1 -1
- ui/static/rules/index.html +1 -1
- ui/static/rules/index.txt +1 -1
- ui/static/schemas/__next._full.txt +1 -1
- ui/static/schemas/__next._head.txt +1 -1
- ui/static/schemas/__next._index.txt +1 -1
- ui/static/schemas/__next._tree.txt +1 -1
- ui/static/schemas/__next.schemas.__PAGE__.txt +1 -1
- ui/static/schemas/__next.schemas.txt +1 -1
- ui/static/schemas/index.html +1 -1
- ui/static/schemas/index.txt +1 -1
- ui/static/settings/__next._full.txt +1 -1
- ui/static/settings/__next._head.txt +1 -1
- ui/static/settings/__next._index.txt +1 -1
- ui/static/settings/__next._tree.txt +1 -1
- ui/static/settings/__next.settings.__PAGE__.txt +1 -1
- ui/static/settings/__next.settings.txt +1 -1
- ui/static/settings/index.html +1 -1
- ui/static/settings/index.txt +1 -1
- ui/static/static/404/index.html +1 -1
- ui/static/static/404.html +1 -1
- ui/static/static/__next.__PAGE__.txt +1 -1
- ui/static/static/__next._full.txt +2 -2
- ui/static/static/__next._head.txt +1 -1
- ui/static/static/__next._index.txt +2 -2
- ui/static/static/__next._tree.txt +2 -2
- ui/static/static/_next/static/chunks/13d4a0fbd74c1ee4.js +1 -0
- ui/static/static/_next/static/chunks/2edb43b48432ac04.js +441 -0
- ui/static/static/_next/static/chunks/d2363397e1b2bcab.css +1 -0
- ui/static/static/_next/static/chunks/f7d1a90dd75d2572.js +1 -0
- ui/static/static/_not-found/__next._full.txt +2 -2
- ui/static/static/_not-found/__next._head.txt +1 -1
- ui/static/static/_not-found/__next._index.txt +2 -2
- ui/static/static/_not-found/__next._not-found.__PAGE__.txt +1 -1
- ui/static/static/_not-found/__next._not-found.txt +1 -1
- ui/static/static/_not-found/__next._tree.txt +2 -2
- ui/static/static/_not-found/index.html +1 -1
- ui/static/static/_not-found/index.txt +2 -2
- ui/static/static/contracts/__next._full.txt +3 -3
- ui/static/static/contracts/__next._head.txt +1 -1
- ui/static/static/contracts/__next._index.txt +2 -2
- ui/static/static/contracts/__next._tree.txt +2 -2
- ui/static/static/contracts/__next.contracts.__PAGE__.txt +2 -2
- ui/static/static/contracts/__next.contracts.txt +1 -1
- ui/static/static/contracts/index.html +1 -1
- ui/static/static/contracts/index.txt +3 -3
- ui/static/static/documentation/__next._full.txt +3 -3
- ui/static/static/documentation/__next._head.txt +1 -1
- ui/static/static/documentation/__next._index.txt +2 -2
- ui/static/static/documentation/__next._tree.txt +2 -2
- ui/static/static/documentation/__next.documentation.__PAGE__.txt +2 -2
- ui/static/static/documentation/__next.documentation.txt +1 -1
- ui/static/static/documentation/index.html +2 -2
- ui/static/static/documentation/index.txt +3 -3
- ui/static/static/index.html +1 -1
- ui/static/static/index.txt +2 -2
- ui/static/static/metadata/__next._full.txt +2 -2
- ui/static/static/metadata/__next._head.txt +1 -1
- ui/static/static/metadata/__next._index.txt +2 -2
- ui/static/static/metadata/__next._tree.txt +2 -2
- ui/static/static/metadata/__next.metadata.__PAGE__.txt +1 -1
- ui/static/static/metadata/__next.metadata.txt +1 -1
- ui/static/static/metadata/index.html +1 -1
- ui/static/static/metadata/index.txt +2 -2
- ui/static/static/quality/__next._full.txt +2 -2
- ui/static/static/quality/__next._head.txt +1 -1
- ui/static/static/quality/__next._index.txt +2 -2
- ui/static/static/quality/__next._tree.txt +2 -2
- ui/static/static/quality/__next.quality.__PAGE__.txt +1 -1
- ui/static/static/quality/__next.quality.txt +1 -1
- ui/static/static/quality/index.html +2 -2
- ui/static/static/quality/index.txt +2 -2
- ui/static/static/rules/__next._full.txt +2 -2
- ui/static/static/rules/__next._head.txt +1 -1
- ui/static/static/rules/__next._index.txt +2 -2
- ui/static/static/rules/__next._tree.txt +2 -2
- ui/static/static/rules/__next.rules.__PAGE__.txt +1 -1
- ui/static/static/rules/__next.rules.txt +1 -1
- ui/static/static/rules/index.html +1 -1
- ui/static/static/rules/index.txt +2 -2
- ui/static/static/schemas/__next._full.txt +2 -2
- ui/static/static/schemas/__next._head.txt +1 -1
- ui/static/static/schemas/__next._index.txt +2 -2
- ui/static/static/schemas/__next._tree.txt +2 -2
- ui/static/static/schemas/__next.schemas.__PAGE__.txt +1 -1
- ui/static/static/schemas/__next.schemas.txt +1 -1
- ui/static/static/schemas/index.html +1 -1
- ui/static/static/schemas/index.txt +2 -2
- ui/static/static/settings/__next._full.txt +2 -2
- ui/static/static/settings/__next._head.txt +1 -1
- ui/static/static/settings/__next._index.txt +2 -2
- ui/static/static/settings/__next._tree.txt +2 -2
- ui/static/static/settings/__next.settings.__PAGE__.txt +1 -1
- ui/static/static/settings/__next.settings.txt +1 -1
- ui/static/static/settings/index.html +1 -1
- ui/static/static/settings/index.txt +2 -2
- ui/static/static/static/.gitkeep +0 -0
- ui/static/static/static/404/index.html +1 -0
- ui/static/static/static/404.html +1 -0
- ui/static/static/static/__next.__PAGE__.txt +10 -0
- ui/static/static/static/__next._full.txt +30 -0
- ui/static/static/static/__next._head.txt +7 -0
- ui/static/static/static/__next._index.txt +9 -0
- ui/static/static/static/__next._tree.txt +2 -0
- ui/static/static/static/_next/static/chunks/222442f6da32302a.js +1 -0
- ui/static/static/static/_next/static/chunks/247eb132b7f7b574.js +1 -0
- ui/static/static/static/_next/static/chunks/297d55555b71baba.js +1 -0
- ui/static/static/static/_next/static/chunks/2ab439ce003cd691.js +1 -0
- ui/static/static/static/_next/static/chunks/414e77373f8ff61c.js +1 -0
- ui/static/static/static/_next/static/chunks/49ca65abd26ae49e.js +1 -0
- ui/static/static/static/_next/static/chunks/652ad0aa26265c47.js +2 -0
- ui/static/static/static/_next/static/chunks/9667e7a3d359eb39.js +1 -0
- ui/static/static/static/_next/static/chunks/9c23f44fff36548a.js +1 -0
- ui/static/static/static/_next/static/chunks/a6dad97d9634a72d.js +1 -0
- ui/static/static/static/_next/static/chunks/b32a0963684b9933.js +4 -0
- ui/static/static/static/_next/static/chunks/c69f6cba366bd988.js +1 -0
- ui/static/static/static/_next/static/chunks/db913959c675cea6.js +1 -0
- ui/static/static/static/_next/static/chunks/f061a4be97bfc3b3.js +1 -0
- ui/static/static/static/_next/static/chunks/f2e7afeab1178138.js +1 -0
- ui/static/static/static/_next/static/chunks/ff1a16fafef87110.js +1 -0
- ui/static/static/static/_next/static/chunks/turbopack-ffcb7ab6794027ef.js +3 -0
- ui/static/static/static/_next/static/tNTkVW6puVXC4bAm4WrHl/_buildManifest.js +11 -0
- ui/static/static/static/_next/static/tNTkVW6puVXC4bAm4WrHl/_ssgManifest.js +1 -0
- ui/static/static/static/_not-found/__next._full.txt +17 -0
- ui/static/static/static/_not-found/__next._head.txt +7 -0
- ui/static/static/static/_not-found/__next._index.txt +9 -0
- ui/static/static/static/_not-found/__next._not-found.__PAGE__.txt +5 -0
- ui/static/static/static/_not-found/__next._not-found.txt +4 -0
- ui/static/static/static/_not-found/__next._tree.txt +2 -0
- ui/static/static/static/_not-found/index.html +1 -0
- ui/static/static/static/_not-found/index.txt +17 -0
- ui/static/static/static/contracts/__next._full.txt +21 -0
- ui/static/static/static/contracts/__next._head.txt +7 -0
- ui/static/static/static/contracts/__next._index.txt +9 -0
- ui/static/static/static/contracts/__next._tree.txt +2 -0
- ui/static/static/static/contracts/__next.contracts.__PAGE__.txt +9 -0
- ui/static/static/static/contracts/__next.contracts.txt +4 -0
- ui/static/static/static/contracts/index.html +1 -0
- ui/static/static/static/contracts/index.txt +21 -0
- ui/static/static/static/documentation/__next._full.txt +21 -0
- ui/static/static/static/documentation/__next._head.txt +7 -0
- ui/static/static/static/documentation/__next._index.txt +9 -0
- ui/static/static/static/documentation/__next._tree.txt +2 -0
- ui/static/static/static/documentation/__next.documentation.__PAGE__.txt +9 -0
- ui/static/static/static/documentation/__next.documentation.txt +4 -0
- ui/static/static/static/documentation/index.html +93 -0
- ui/static/static/static/documentation/index.txt +21 -0
- ui/static/static/static/index.html +1 -0
- ui/static/static/static/index.txt +30 -0
- ui/static/static/static/metadata/__next._full.txt +21 -0
- ui/static/static/static/metadata/__next._head.txt +7 -0
- ui/static/static/static/metadata/__next._index.txt +9 -0
- ui/static/static/static/metadata/__next._tree.txt +2 -0
- ui/static/static/static/metadata/__next.metadata.__PAGE__.txt +9 -0
- ui/static/static/static/metadata/__next.metadata.txt +4 -0
- ui/static/static/static/metadata/index.html +1 -0
- ui/static/static/static/metadata/index.txt +21 -0
- ui/static/static/static/quality/__next._full.txt +21 -0
- ui/static/static/static/quality/__next._head.txt +7 -0
- ui/static/static/static/quality/__next._index.txt +9 -0
- ui/static/static/static/quality/__next._tree.txt +2 -0
- ui/static/static/static/quality/__next.quality.__PAGE__.txt +9 -0
- ui/static/static/static/quality/__next.quality.txt +4 -0
- ui/static/static/static/quality/index.html +2 -0
- ui/static/static/static/quality/index.txt +21 -0
- ui/static/static/static/rules/__next._full.txt +21 -0
- ui/static/static/static/rules/__next._head.txt +7 -0
- ui/static/static/static/rules/__next._index.txt +9 -0
- ui/static/static/static/rules/__next._tree.txt +2 -0
- ui/static/static/static/rules/__next.rules.__PAGE__.txt +9 -0
- ui/static/static/static/rules/__next.rules.txt +4 -0
- ui/static/static/static/rules/index.html +1 -0
- ui/static/static/static/rules/index.txt +21 -0
- ui/static/static/static/schemas/__next._full.txt +21 -0
- ui/static/static/static/schemas/__next._head.txt +7 -0
- ui/static/static/static/schemas/__next._index.txt +9 -0
- ui/static/static/static/schemas/__next._tree.txt +2 -0
- ui/static/static/static/schemas/__next.schemas.__PAGE__.txt +9 -0
- ui/static/static/static/schemas/__next.schemas.txt +4 -0
- ui/static/static/static/schemas/index.html +1 -0
- ui/static/static/static/schemas/index.txt +21 -0
- ui/static/static/static/settings/__next._full.txt +21 -0
- ui/static/static/static/settings/__next._head.txt +7 -0
- ui/static/static/static/settings/__next._index.txt +9 -0
- ui/static/static/static/settings/__next._tree.txt +2 -0
- ui/static/static/static/settings/__next.settings.__PAGE__.txt +9 -0
- ui/static/static/static/settings/__next.settings.txt +4 -0
- ui/static/static/static/settings/index.html +1 -0
- ui/static/static/static/settings/index.txt +21 -0
- ui/static/static/static/validation/__next._full.txt +21 -0
- ui/static/static/static/validation/__next._head.txt +7 -0
- ui/static/static/static/validation/__next._index.txt +9 -0
- ui/static/static/static/validation/__next._tree.txt +2 -0
- ui/static/static/static/validation/__next.validation.__PAGE__.txt +9 -0
- ui/static/static/static/validation/__next.validation.txt +4 -0
- ui/static/static/static/validation/index.html +1 -0
- ui/static/static/static/validation/index.txt +21 -0
- ui/static/static/validation/__next._full.txt +2 -2
- ui/static/static/validation/__next._head.txt +1 -1
- ui/static/static/validation/__next._index.txt +2 -2
- ui/static/static/validation/__next._tree.txt +2 -2
- ui/static/static/validation/__next.validation.__PAGE__.txt +1 -1
- ui/static/static/validation/__next.validation.txt +1 -1
- ui/static/static/validation/index.html +1 -1
- ui/static/static/validation/index.txt +2 -2
- ui/static/validation/__next._full.txt +2 -2
- ui/static/validation/__next._head.txt +1 -1
- ui/static/validation/__next._index.txt +1 -1
- ui/static/validation/__next._tree.txt +1 -1
- ui/static/validation/__next.validation.__PAGE__.txt +2 -2
- ui/static/validation/__next.validation.txt +1 -1
- ui/static/validation/index.html +1 -1
- ui/static/validation/index.txt +2 -2
- pycharter/data/templates/template_transform_advanced.yaml +0 -50
- pycharter/data/templates/template_transform_simple.yaml +0 -59
- pycharter-0.0.22.dist-info/RECORD +0 -358
- /pycharter/data/templates/{template_coercion_rules.yaml → contract/template_coercion_rules.yaml} +0 -0
- /pycharter/data/templates/{template_contract.yaml → contract/template_contract.yaml} +0 -0
- /pycharter/data/templates/{template_metadata.yaml → contract/template_metadata.yaml} +0 -0
- /pycharter/data/templates/{template_schema.yaml → contract/template_schema.yaml} +0 -0
- /pycharter/data/templates/{template_validation_rules.yaml → contract/template_validation_rules.yaml} +0 -0
- {pycharter-0.0.22.dist-info → pycharter-0.0.23.dist-info}/entry_points.txt +0 -0
- {pycharter-0.0.22.dist-info → pycharter-0.0.23.dist-info}/licenses/LICENSE +0 -0
- {pycharter-0.0.22.dist-info → pycharter-0.0.23.dist-info}/top_level.txt +0 -0
- /ui/static/_next/static/{0rYA78L88aUyD2Uh38hhX → 2gKjNv6YvE6BcIdFthBLs}/_buildManifest.js +0 -0
- /ui/static/_next/static/{0rYA78L88aUyD2Uh38hhX → 2gKjNv6YvE6BcIdFthBLs}/_ssgManifest.js +0 -0
- /ui/static/static/_next/static/{tNTkVW6puVXC4bAm4WrHl → 0rYA78L88aUyD2Uh38hhX}/_buildManifest.js +0 -0
- /ui/static/static/_next/static/{tNTkVW6puVXC4bAm4WrHl → 0rYA78L88aUyD2Uh38hhX}/_ssgManifest.js +0 -0
- /ui/static/{_next → static/_next}/static/chunks/c4fa4f4114b7c352.js +0 -0
- /ui/static/static/{_next → static/_next}/static/chunks/4e310fe5005770a3.css +0 -0
- /ui/static/{_next → static/static/_next}/static/chunks/5e04d10c4a7b58a3.js +0 -0
- /ui/static/static/{_next → static/_next}/static/chunks/5fc14c00a2779dc5.js +0 -0
- /ui/static/{_next → static/static/_next}/static/chunks/75d88a058d8ffaa6.js +0 -0
- /ui/static/{_next → static/static/_next}/static/chunks/8c89634cf6bad76f.js +0 -0
- /ui/static/static/{_next → static/_next}/static/chunks/b584574fdc8ab13e.js +0 -0
- /ui/static/static/{_next → static/_next}/static/chunks/d5989c94d3614b3a.js +0 -0
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
"""
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
This module
|
|
5
|
-
- HTTP
|
|
6
|
-
-
|
|
7
|
-
-
|
|
8
|
-
-
|
|
9
|
-
|
|
10
|
-
|
|
2
|
+
Extraction utilities for ETL orchestrator.
|
|
3
|
+
|
|
4
|
+
This module provides the main entry point for data extraction from various sources:
|
|
5
|
+
- HTTP/API extraction
|
|
6
|
+
- File-based extraction (CSV, JSON, Parquet, Excel, TSV, XML)
|
|
7
|
+
- Database extraction (PostgreSQL, MySQL, SQLite, MSSQL, Oracle)
|
|
8
|
+
- Cloud storage extraction (S3, GCS, Azure Blob)
|
|
9
|
+
|
|
10
|
+
The module maintains backward compatibility with the original HTTP-only interface
|
|
11
|
+
while supporting the new modular extractor architecture.
|
|
11
12
|
"""
|
|
12
13
|
|
|
13
14
|
import asyncio
|
|
@@ -648,269 +649,53 @@ async def extract_with_pagination_streaming(
|
|
|
648
649
|
"""
|
|
649
650
|
Extract data with pagination support, yielding batches for memory-efficient processing.
|
|
650
651
|
|
|
652
|
+
This is the main entry point for data extraction. It supports multiple source types:
|
|
653
|
+
- HTTP/API (default for backward compatibility)
|
|
654
|
+
- File-based (CSV, JSON, Parquet, Excel, TSV, XML)
|
|
655
|
+
- Database (PostgreSQL, MySQL, SQLite, MSSQL, Oracle)
|
|
656
|
+
- Cloud storage (S3, GCS, Azure Blob)
|
|
657
|
+
|
|
658
|
+
The source type is auto-detected from extract_config or can be explicitly set
|
|
659
|
+
via 'source_type' field.
|
|
660
|
+
|
|
651
661
|
Yields batches as they are extracted, preventing memory exhaustion for large datasets.
|
|
652
662
|
|
|
653
663
|
Args:
|
|
654
664
|
extract_config: Extract configuration dictionary
|
|
655
|
-
params: Request parameters
|
|
656
|
-
headers: Request headers
|
|
665
|
+
params: Request/query parameters (source-specific)
|
|
666
|
+
headers: Request headers (source-specific, mainly for HTTP)
|
|
657
667
|
contract_dir: Contract directory (for variable resolution)
|
|
658
668
|
batch_size: Number of records to yield per batch
|
|
659
669
|
max_records: Maximum total records to extract (None = all)
|
|
670
|
+
config_context: Optional context dictionary for value injection
|
|
660
671
|
|
|
661
672
|
Yields:
|
|
662
673
|
Batches of extracted records (lists of dictionaries)
|
|
663
|
-
"""
|
|
664
|
-
pagination_config = extract_config.get('pagination', {})
|
|
665
|
-
|
|
666
|
-
# If pagination is not enabled, extract all and yield in batches
|
|
667
|
-
if not pagination_config.get('enabled', False):
|
|
668
|
-
logger.info("Pagination disabled, extracting all data in single request")
|
|
669
|
-
all_data = await extract_with_retry(extract_config, params, headers, contract_dir, config_context=config_context)
|
|
670
|
-
if max_records:
|
|
671
|
-
logger.info(f"Limiting to {max_records} records (extracted {len(all_data)})")
|
|
672
|
-
all_data = all_data[:max_records]
|
|
673
|
-
|
|
674
|
-
logger.info(f"Yielding {len(all_data)} records in batches of {batch_size}")
|
|
675
|
-
for i in range(0, len(all_data), batch_size):
|
|
676
|
-
batch = all_data[i:i + batch_size]
|
|
677
|
-
logger.debug(f"Yielding batch {i // batch_size + 1} with {len(batch)} records")
|
|
678
|
-
yield batch
|
|
679
|
-
return
|
|
680
|
-
|
|
681
|
-
# Pagination enabled - stream pages and yield in batches
|
|
682
|
-
strategy = pagination_config.get('strategy', 'page')
|
|
683
|
-
stop_conditions = pagination_config.get('stop_conditions', [])
|
|
684
|
-
page_delay = float(pagination_config.get('page_delay', 0.1))
|
|
685
|
-
max_pages = 1000
|
|
686
|
-
max_records_from_config = None
|
|
687
|
-
|
|
688
|
-
# Get max_pages and max_records from stop conditions
|
|
689
|
-
for condition in stop_conditions:
|
|
690
|
-
if condition.get('type') == 'max_pages':
|
|
691
|
-
max_pages = condition.get('value', 1000)
|
|
692
|
-
elif condition.get('type') == 'max_records':
|
|
693
|
-
max_records_from_config = condition.get('value')
|
|
694
|
-
|
|
695
|
-
# Use config max_records if not provided as parameter
|
|
696
|
-
if max_records is None:
|
|
697
|
-
max_records = max_records_from_config
|
|
698
|
-
|
|
699
|
-
current_batch = []
|
|
700
|
-
total_extracted = 0
|
|
701
|
-
page_count = 0
|
|
702
|
-
current_url = None
|
|
703
|
-
current_cursor = None
|
|
704
|
-
|
|
705
|
-
# Initialize pagination state
|
|
706
|
-
if strategy == 'page':
|
|
707
|
-
page_config = pagination_config.get('page', {})
|
|
708
|
-
current_page = page_config.get('start', 0)
|
|
709
|
-
page_increment = page_config.get('increment', 1)
|
|
710
|
-
page_param_name = page_config.get('param_name', 'page')
|
|
711
|
-
elif strategy == 'offset':
|
|
712
|
-
offset_config = pagination_config.get('offset', {})
|
|
713
|
-
current_offset = offset_config.get('start', 0)
|
|
714
|
-
offset_param_name = offset_config.get('param_name', 'offset')
|
|
715
|
-
increment_by = offset_config.get('increment_by', 'limit')
|
|
716
|
-
elif strategy == 'cursor':
|
|
717
|
-
cursor_config = pagination_config.get('cursor', {})
|
|
718
|
-
cursor_param_name = cursor_config.get('param_name', 'cursor')
|
|
719
|
-
cursor_response_path = cursor_config.get('response_path', 'next_cursor')
|
|
720
|
-
elif strategy == 'next_url':
|
|
721
|
-
next_url_config = pagination_config.get('next_url', {})
|
|
722
|
-
next_url_response_path = next_url_config.get('response_path', 'next_url')
|
|
723
|
-
elif strategy == 'link_header':
|
|
724
|
-
pass
|
|
725
|
-
else:
|
|
726
|
-
raise ValueError(f"Unsupported pagination strategy: {strategy}")
|
|
727
|
-
|
|
728
|
-
extract_config_copy = extract_config.copy()
|
|
729
|
-
original_endpoint = extract_config_copy.get('api_endpoint')
|
|
730
|
-
original_base_url = extract_config_copy.get('base_url', '')
|
|
731
674
|
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
if max_records and total_extracted >= max_records:
|
|
741
|
-
logger.info(
|
|
742
|
-
f"Reached max_records limit ({max_records}), stopping pagination "
|
|
743
|
-
f"(extracted {total_extracted} records from {page_count} pages)"
|
|
744
|
-
)
|
|
745
|
-
if current_batch:
|
|
746
|
-
yield current_batch
|
|
747
|
-
return
|
|
748
|
-
|
|
749
|
-
# Update params/URL based on strategy
|
|
750
|
-
if strategy == 'page':
|
|
751
|
-
params[page_param_name] = current_page
|
|
752
|
-
logger.debug(f"Fetching page {current_page} (page_count: {page_count + 1}/{max_pages})")
|
|
753
|
-
elif strategy == 'offset':
|
|
754
|
-
params[offset_param_name] = current_offset
|
|
755
|
-
elif strategy == 'cursor' and current_cursor:
|
|
756
|
-
params[cursor_param_name] = current_cursor
|
|
757
|
-
elif strategy == 'next_url' and current_url:
|
|
758
|
-
extract_config_copy['api_endpoint'] = current_url
|
|
759
|
-
extract_config_copy['base_url'] = ''
|
|
760
|
-
|
|
761
|
-
# Make request
|
|
762
|
-
need_full_response = strategy in ['cursor', 'next_url', 'link_header']
|
|
763
|
-
try:
|
|
764
|
-
logger.debug(f"Extracting page {page_count + 1} (total extracted so far: {total_extracted})")
|
|
765
|
-
page_data, full_response_data, response_obj = await _extract_single_page(
|
|
766
|
-
extract_config_copy, params, headers, contract_dir, return_full_response=need_full_response, config_context=config_context
|
|
767
|
-
)
|
|
768
|
-
logger.info(f"Page {page_count + 1} extracted: {len(page_data)} records")
|
|
769
|
-
except Exception as e:
|
|
770
|
-
logger.error(
|
|
771
|
-
f"Error extracting page {page_count + 1}",
|
|
772
|
-
extra={
|
|
773
|
-
'page': page_count + 1,
|
|
774
|
-
'extracted': total_extracted,
|
|
775
|
-
},
|
|
776
|
-
exc_info=True
|
|
777
|
-
)
|
|
778
|
-
# Yield what we have so far before raising
|
|
779
|
-
if current_batch:
|
|
780
|
-
yield current_batch
|
|
781
|
-
raise
|
|
782
|
-
|
|
783
|
-
# Restore original endpoint if modified
|
|
784
|
-
if strategy == 'next_url' and current_url:
|
|
785
|
-
extract_config_copy['api_endpoint'] = original_endpoint
|
|
786
|
-
extract_config_copy['base_url'] = original_base_url
|
|
787
|
-
|
|
788
|
-
# Check for empty page first (before adding to batch)
|
|
789
|
-
if not page_data:
|
|
790
|
-
logger.info(f"Empty page {page_count + 1} received, stopping pagination")
|
|
791
|
-
if current_batch:
|
|
792
|
-
yield current_batch
|
|
793
|
-
break
|
|
794
|
-
|
|
795
|
-
# Check stop conditions BEFORE adding records to batch
|
|
796
|
-
# This prevents unnecessary API calls when we know we should stop
|
|
797
|
-
page_count += 1
|
|
798
|
-
limit_value = params.get('limit', 100)
|
|
799
|
-
record_count = len(page_data)
|
|
800
|
-
logger.info(
|
|
801
|
-
f"Evaluating stop conditions for page {page_count}: "
|
|
802
|
-
f"{record_count} records returned, limit={limit_value}"
|
|
803
|
-
)
|
|
804
|
-
should_stop = _check_stop_conditions(page_data, stop_conditions, params, full_response_data)
|
|
805
|
-
if should_stop:
|
|
806
|
-
logger.info(
|
|
807
|
-
f"✅ Stop condition met at page {page_count} "
|
|
808
|
-
f"(page returned {record_count} records, limit: {limit_value})"
|
|
809
|
-
)
|
|
810
|
-
# Add the final page's records to current batch before yielding
|
|
811
|
-
for record in page_data:
|
|
812
|
-
current_batch.append(record)
|
|
813
|
-
total_extracted += 1
|
|
814
|
-
# Yield batch if it reaches batch_size during this final page
|
|
815
|
-
if len(current_batch) >= batch_size:
|
|
816
|
-
yield current_batch
|
|
817
|
-
current_batch = []
|
|
818
|
-
# Yield any remaining records
|
|
819
|
-
if current_batch:
|
|
820
|
-
yield current_batch
|
|
821
|
-
break
|
|
822
|
-
|
|
823
|
-
# Add page data to current batch (only if we're not stopping)
|
|
824
|
-
for record in page_data:
|
|
825
|
-
current_batch.append(record)
|
|
826
|
-
total_extracted += 1
|
|
827
|
-
|
|
828
|
-
# Yield batch when full
|
|
829
|
-
if len(current_batch) >= batch_size:
|
|
830
|
-
yield current_batch
|
|
831
|
-
current_batch = []
|
|
832
|
-
|
|
833
|
-
# Check max_records limit
|
|
834
|
-
if max_records and total_extracted >= max_records:
|
|
835
|
-
if current_batch:
|
|
836
|
-
yield current_batch
|
|
837
|
-
return
|
|
675
|
+
Example:
|
|
676
|
+
>>> # HTTP extraction (backward compatible)
|
|
677
|
+
>>> extract_config = {
|
|
678
|
+
... 'base_url': 'https://api.example.com',
|
|
679
|
+
... 'api_endpoint': '/v1/data'
|
|
680
|
+
... }
|
|
681
|
+
>>> async for batch in extract_with_pagination_streaming(extract_config, {}, {}):
|
|
682
|
+
... print(f"Extracted {len(batch)} records")
|
|
838
683
|
|
|
839
|
-
#
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
if current and isinstance(current, str):
|
|
853
|
-
current_cursor = current
|
|
854
|
-
elif current:
|
|
855
|
-
current_cursor = str(current)
|
|
856
|
-
else:
|
|
857
|
-
if current_batch:
|
|
858
|
-
yield current_batch
|
|
859
|
-
break
|
|
860
|
-
except (KeyError, IndexError, TypeError, ValueError):
|
|
861
|
-
if current_batch:
|
|
862
|
-
yield current_batch
|
|
863
|
-
break
|
|
864
|
-
|
|
865
|
-
elif strategy == 'next_url' and full_response_data:
|
|
866
|
-
try:
|
|
867
|
-
current = full_response_data
|
|
868
|
-
for part in next_url_response_path.split('.'):
|
|
869
|
-
if isinstance(current, dict):
|
|
870
|
-
current = current.get(part)
|
|
871
|
-
elif isinstance(current, list) and part.isdigit():
|
|
872
|
-
current = current[int(part)]
|
|
873
|
-
else:
|
|
874
|
-
current = None
|
|
875
|
-
break
|
|
876
|
-
|
|
877
|
-
if current and isinstance(current, str):
|
|
878
|
-
current_url = current
|
|
879
|
-
else:
|
|
880
|
-
current_url = None
|
|
881
|
-
|
|
882
|
-
if not current_url:
|
|
883
|
-
if current_batch:
|
|
884
|
-
yield current_batch
|
|
885
|
-
break
|
|
886
|
-
except (KeyError, IndexError, TypeError, ValueError):
|
|
887
|
-
if current_batch:
|
|
888
|
-
yield current_batch
|
|
889
|
-
break
|
|
890
|
-
|
|
891
|
-
elif strategy == 'link_header' and response_obj:
|
|
892
|
-
current_url = _extract_link_header_url(response_obj)
|
|
893
|
-
if not current_url:
|
|
894
|
-
if current_batch:
|
|
895
|
-
yield current_batch
|
|
896
|
-
break
|
|
897
|
-
extract_config_copy['api_endpoint'] = current_url
|
|
898
|
-
extract_config_copy['base_url'] = ''
|
|
899
|
-
|
|
900
|
-
# Update pagination state
|
|
901
|
-
if strategy == 'page':
|
|
902
|
-
current_page += page_increment
|
|
903
|
-
elif strategy == 'offset':
|
|
904
|
-
limit = params.get('limit', 100)
|
|
905
|
-
if increment_by == 'limit':
|
|
906
|
-
current_offset += limit
|
|
907
|
-
else:
|
|
908
|
-
current_offset += int(increment_by)
|
|
909
|
-
|
|
910
|
-
# Delay between pages
|
|
911
|
-
if page_delay > 0:
|
|
912
|
-
await asyncio.sleep(page_delay)
|
|
684
|
+
>>> # File extraction
|
|
685
|
+
>>> extract_config = {
|
|
686
|
+
... 'source_type': 'file',
|
|
687
|
+
... 'file_path': '/path/to/data.csv'
|
|
688
|
+
... }
|
|
689
|
+
>>> async for batch in extract_with_pagination_streaming(extract_config, {}, {}):
|
|
690
|
+
... print(f"Extracted {len(batch)} records")
|
|
691
|
+
"""
|
|
692
|
+
# Use factory to get appropriate extractor
|
|
693
|
+
from pycharter.etl_generator.extractors.factory import ExtractorFactory
|
|
694
|
+
|
|
695
|
+
extractor = ExtractorFactory.get_extractor(extract_config)
|
|
913
696
|
|
|
914
|
-
#
|
|
915
|
-
|
|
916
|
-
|
|
697
|
+
# Delegate to extractor
|
|
698
|
+
async for batch in extractor.extract_streaming(
|
|
699
|
+
extract_config, params, headers, contract_dir, batch_size, max_records, config_context
|
|
700
|
+
):
|
|
701
|
+
yield batch
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Extractors module for ETL orchestrator.
|
|
3
|
+
|
|
4
|
+
This module provides a modular architecture for data extraction from various sources:
|
|
5
|
+
- HTTP/API extraction
|
|
6
|
+
- File-based extraction (CSV, JSON, Parquet, Excel, TSV, XML)
|
|
7
|
+
- Database extraction (PostgreSQL, MySQL, SQLite, MSSQL, Oracle)
|
|
8
|
+
- Cloud storage extraction (S3, GCS, Azure Blob)
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from pycharter.etl_generator.extractors.base import BaseExtractor
|
|
12
|
+
from pycharter.etl_generator.extractors.cloud_storage import CloudStorageExtractor
|
|
13
|
+
from pycharter.etl_generator.extractors.database import DatabaseExtractor
|
|
14
|
+
from pycharter.etl_generator.extractors.file import FileExtractor
|
|
15
|
+
from pycharter.etl_generator.extractors.factory import ExtractorFactory, get_extractor
|
|
16
|
+
from pycharter.etl_generator.extractors.http import HTTPExtractor
|
|
17
|
+
|
|
18
|
+
# Names re-exported as the public API of the extractors package.
__all__ = [
    # Shared interface and factory entry points
    "BaseExtractor",
    "ExtractorFactory",
    "get_extractor",
    # Concrete extractor implementations
    "HTTPExtractor",
    "FileExtractor",
    "DatabaseExtractor",
    "CloudStorageExtractor",
]
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Base extractor interface for ETL orchestrator.
|
|
3
|
+
|
|
4
|
+
All extractors must implement this interface to ensure consistent behavior
|
|
5
|
+
across different data sources.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from abc import ABC, abstractmethod
|
|
9
|
+
from typing import Any, AsyncIterator, Dict, List, Optional
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class BaseExtractor(ABC):
    """
    Base class for all data extractors.

    All extractors must implement the extract_streaming method which yields
    batches of records as dictionaries. Extractors are schema-agnostic and
    focus purely on data retrieval from their respective sources.
    """

    @abstractmethod
    async def extract_streaming(
        self,
        extract_config: Dict[str, Any],
        params: Dict[str, Any],
        headers: Dict[str, Any],
        contract_dir: Optional[Any] = None,
        batch_size: int = 1000,
        max_records: Optional[int] = None,
        config_context: Optional[Dict[str, Any]] = None,
    ) -> AsyncIterator[List[Dict[str, Any]]]:
        """
        Extract data in batches using async generator.

        This is the main interface that all extractors must implement.
        It yields batches of records as lists of dictionaries, allowing
        for memory-efficient processing of large datasets.

        The base declaration is itself an (empty) async generator, so
        ``async for`` over ``super().extract_streaming(...)`` is valid and
        simply produces no batches.

        Args:
            extract_config: Extract configuration dictionary (source-specific)
            params: Request/query parameters (may be source-specific)
            headers: Request headers (may be source-specific)
            contract_dir: Contract directory path (for variable resolution)
            batch_size: Number of records to yield per batch
            max_records: Maximum total records to extract (None = all)
            config_context: Optional context dictionary for value injection

        Yields:
            Batches of extracted records (lists of dictionaries)

        Raises:
            RuntimeError: If extraction fails
            ValueError: If configuration is invalid
        """
        # The unreachable ``yield`` makes this an async generator function.
        # With only ``pass`` in the body, ``async def`` defines a coroutine
        # function: calling it returns a coroutine, which cannot be consumed
        # with ``async for`` and contradicts the AsyncIterator annotation.
        if False:  # pragma: no cover
            yield []

    def validate_config(self, extract_config: Dict[str, Any]) -> None:
        """
        Validate extractor-specific configuration.

        Override this method in subclasses to validate source-specific
        configuration requirements. The base implementation accepts any
        configuration and does nothing.

        Args:
            extract_config: Extract configuration dictionary

        Raises:
            ValueError: If configuration is invalid
        """
        return None
|