pycharter 0.0.22-py3-none-any.whl → 0.0.23-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- api/routes/v1/templates.py +43 -24
- pycharter/data/templates/etl/README.md +91 -0
- pycharter/data/templates/etl/extract_cloud_azure.yaml +23 -0
- pycharter/data/templates/etl/extract_cloud_gcs.yaml +22 -0
- pycharter/data/templates/etl/extract_cloud_s3.yaml +24 -0
- pycharter/data/templates/etl/extract_database.yaml +28 -0
- pycharter/data/templates/etl/extract_database_ssh.yaml +27 -0
- pycharter/data/templates/etl/extract_file_csv.yaml +17 -0
- pycharter/data/templates/etl/extract_file_glob.yaml +17 -0
- pycharter/data/templates/etl/extract_file_json.yaml +14 -0
- pycharter/data/templates/etl/extract_file_parquet.yaml +13 -0
- pycharter/data/templates/etl/extract_http_paginated.yaml +75 -0
- pycharter/data/templates/etl/extract_http_path_params.yaml +45 -0
- pycharter/data/templates/etl/extract_http_simple.yaml +52 -0
- pycharter/data/templates/etl/load_insert.yaml +17 -0
- pycharter/data/templates/etl/load_postgresql.yaml +17 -0
- pycharter/data/templates/etl/load_sqlite.yaml +16 -0
- pycharter/data/templates/etl/load_truncate_and_load.yaml +18 -0
- pycharter/data/templates/etl/load_upsert.yaml +28 -0
- pycharter/data/templates/etl/load_with_dlq.yaml +24 -0
- pycharter/data/templates/etl/load_with_ssh_tunnel.yaml +28 -0
- pycharter/data/templates/etl/pipeline_http_to_db.yaml +38 -0
- pycharter/data/templates/etl/transform_combined.yaml +38 -0
- pycharter/data/templates/etl/transform_custom_function.yaml +18 -0
- pycharter/data/templates/etl/transform_jsonata.yaml +20 -0
- pycharter/data/templates/etl/transform_simple.yaml +41 -0
- pycharter/db/schemas/.ipynb_checkpoints/data_contract-checkpoint.py +160 -0
- pycharter/etl_generator/extraction.py +47 -262
- pycharter/etl_generator/extractors/__init__.py +26 -0
- pycharter/etl_generator/extractors/base.py +70 -0
- pycharter/etl_generator/extractors/cloud_storage.py +454 -0
- pycharter/etl_generator/extractors/database.py +151 -0
- pycharter/etl_generator/extractors/factory.py +141 -0
- pycharter/etl_generator/extractors/file.py +418 -0
- pycharter/etl_generator/extractors/http.py +816 -0
- {pycharter-0.0.22.dist-info → pycharter-0.0.23.dist-info}/METADATA +6 -1
- pycharter-0.0.23.dist-info/RECORD +498 -0
- {pycharter-0.0.22.dist-info → pycharter-0.0.23.dist-info}/WHEEL +1 -1
- ui/static/404/index.html +1 -1
- ui/static/404.html +1 -1
- ui/static/__next.__PAGE__.txt +1 -1
- ui/static/__next._full.txt +1 -1
- ui/static/__next._head.txt +1 -1
- ui/static/__next._index.txt +1 -1
- ui/static/__next._tree.txt +1 -1
- ui/static/_next/static/chunks/26dfc590f7714c03.js +1 -0
- ui/static/_next/static/chunks/34d289e6db2ef551.js +1 -0
- ui/static/_next/static/chunks/99508d9d5869cc27.js +1 -0
- ui/static/_next/static/chunks/b313c35a6ba76574.js +1 -0
- ui/static/_not-found/__next._full.txt +1 -1
- ui/static/_not-found/__next._head.txt +1 -1
- ui/static/_not-found/__next._index.txt +1 -1
- ui/static/_not-found/__next._not-found.__PAGE__.txt +1 -1
- ui/static/_not-found/__next._not-found.txt +1 -1
- ui/static/_not-found/__next._tree.txt +1 -1
- ui/static/_not-found/index.html +1 -1
- ui/static/_not-found/index.txt +1 -1
- ui/static/contracts/__next._full.txt +2 -2
- ui/static/contracts/__next._head.txt +1 -1
- ui/static/contracts/__next._index.txt +1 -1
- ui/static/contracts/__next._tree.txt +1 -1
- ui/static/contracts/__next.contracts.__PAGE__.txt +2 -2
- ui/static/contracts/__next.contracts.txt +1 -1
- ui/static/contracts/index.html +1 -1
- ui/static/contracts/index.txt +2 -2
- ui/static/documentation/__next._full.txt +1 -1
- ui/static/documentation/__next._head.txt +1 -1
- ui/static/documentation/__next._index.txt +1 -1
- ui/static/documentation/__next._tree.txt +1 -1
- ui/static/documentation/__next.documentation.__PAGE__.txt +1 -1
- ui/static/documentation/__next.documentation.txt +1 -1
- ui/static/documentation/index.html +2 -2
- ui/static/documentation/index.txt +1 -1
- ui/static/index.html +1 -1
- ui/static/index.txt +1 -1
- ui/static/metadata/__next._full.txt +1 -1
- ui/static/metadata/__next._head.txt +1 -1
- ui/static/metadata/__next._index.txt +1 -1
- ui/static/metadata/__next._tree.txt +1 -1
- ui/static/metadata/__next.metadata.__PAGE__.txt +1 -1
- ui/static/metadata/__next.metadata.txt +1 -1
- ui/static/metadata/index.html +1 -1
- ui/static/metadata/index.txt +1 -1
- ui/static/quality/__next._full.txt +2 -2
- ui/static/quality/__next._head.txt +1 -1
- ui/static/quality/__next._index.txt +1 -1
- ui/static/quality/__next._tree.txt +1 -1
- ui/static/quality/__next.quality.__PAGE__.txt +2 -2
- ui/static/quality/__next.quality.txt +1 -1
- ui/static/quality/index.html +2 -2
- ui/static/quality/index.txt +2 -2
- ui/static/rules/__next._full.txt +1 -1
- ui/static/rules/__next._head.txt +1 -1
- ui/static/rules/__next._index.txt +1 -1
- ui/static/rules/__next._tree.txt +1 -1
- ui/static/rules/__next.rules.__PAGE__.txt +1 -1
- ui/static/rules/__next.rules.txt +1 -1
- ui/static/rules/index.html +1 -1
- ui/static/rules/index.txt +1 -1
- ui/static/schemas/__next._full.txt +1 -1
- ui/static/schemas/__next._head.txt +1 -1
- ui/static/schemas/__next._index.txt +1 -1
- ui/static/schemas/__next._tree.txt +1 -1
- ui/static/schemas/__next.schemas.__PAGE__.txt +1 -1
- ui/static/schemas/__next.schemas.txt +1 -1
- ui/static/schemas/index.html +1 -1
- ui/static/schemas/index.txt +1 -1
- ui/static/settings/__next._full.txt +1 -1
- ui/static/settings/__next._head.txt +1 -1
- ui/static/settings/__next._index.txt +1 -1
- ui/static/settings/__next._tree.txt +1 -1
- ui/static/settings/__next.settings.__PAGE__.txt +1 -1
- ui/static/settings/__next.settings.txt +1 -1
- ui/static/settings/index.html +1 -1
- ui/static/settings/index.txt +1 -1
- ui/static/static/404/index.html +1 -1
- ui/static/static/404.html +1 -1
- ui/static/static/__next.__PAGE__.txt +1 -1
- ui/static/static/__next._full.txt +2 -2
- ui/static/static/__next._head.txt +1 -1
- ui/static/static/__next._index.txt +2 -2
- ui/static/static/__next._tree.txt +2 -2
- ui/static/static/_next/static/chunks/13d4a0fbd74c1ee4.js +1 -0
- ui/static/static/_next/static/chunks/2edb43b48432ac04.js +441 -0
- ui/static/static/_next/static/chunks/d2363397e1b2bcab.css +1 -0
- ui/static/static/_next/static/chunks/f7d1a90dd75d2572.js +1 -0
- ui/static/static/_not-found/__next._full.txt +2 -2
- ui/static/static/_not-found/__next._head.txt +1 -1
- ui/static/static/_not-found/__next._index.txt +2 -2
- ui/static/static/_not-found/__next._not-found.__PAGE__.txt +1 -1
- ui/static/static/_not-found/__next._not-found.txt +1 -1
- ui/static/static/_not-found/__next._tree.txt +2 -2
- ui/static/static/_not-found/index.html +1 -1
- ui/static/static/_not-found/index.txt +2 -2
- ui/static/static/contracts/__next._full.txt +3 -3
- ui/static/static/contracts/__next._head.txt +1 -1
- ui/static/static/contracts/__next._index.txt +2 -2
- ui/static/static/contracts/__next._tree.txt +2 -2
- ui/static/static/contracts/__next.contracts.__PAGE__.txt +2 -2
- ui/static/static/contracts/__next.contracts.txt +1 -1
- ui/static/static/contracts/index.html +1 -1
- ui/static/static/contracts/index.txt +3 -3
- ui/static/static/documentation/__next._full.txt +3 -3
- ui/static/static/documentation/__next._head.txt +1 -1
- ui/static/static/documentation/__next._index.txt +2 -2
- ui/static/static/documentation/__next._tree.txt +2 -2
- ui/static/static/documentation/__next.documentation.__PAGE__.txt +2 -2
- ui/static/static/documentation/__next.documentation.txt +1 -1
- ui/static/static/documentation/index.html +2 -2
- ui/static/static/documentation/index.txt +3 -3
- ui/static/static/index.html +1 -1
- ui/static/static/index.txt +2 -2
- ui/static/static/metadata/__next._full.txt +2 -2
- ui/static/static/metadata/__next._head.txt +1 -1
- ui/static/static/metadata/__next._index.txt +2 -2
- ui/static/static/metadata/__next._tree.txt +2 -2
- ui/static/static/metadata/__next.metadata.__PAGE__.txt +1 -1
- ui/static/static/metadata/__next.metadata.txt +1 -1
- ui/static/static/metadata/index.html +1 -1
- ui/static/static/metadata/index.txt +2 -2
- ui/static/static/quality/__next._full.txt +2 -2
- ui/static/static/quality/__next._head.txt +1 -1
- ui/static/static/quality/__next._index.txt +2 -2
- ui/static/static/quality/__next._tree.txt +2 -2
- ui/static/static/quality/__next.quality.__PAGE__.txt +1 -1
- ui/static/static/quality/__next.quality.txt +1 -1
- ui/static/static/quality/index.html +2 -2
- ui/static/static/quality/index.txt +2 -2
- ui/static/static/rules/__next._full.txt +2 -2
- ui/static/static/rules/__next._head.txt +1 -1
- ui/static/static/rules/__next._index.txt +2 -2
- ui/static/static/rules/__next._tree.txt +2 -2
- ui/static/static/rules/__next.rules.__PAGE__.txt +1 -1
- ui/static/static/rules/__next.rules.txt +1 -1
- ui/static/static/rules/index.html +1 -1
- ui/static/static/rules/index.txt +2 -2
- ui/static/static/schemas/__next._full.txt +2 -2
- ui/static/static/schemas/__next._head.txt +1 -1
- ui/static/static/schemas/__next._index.txt +2 -2
- ui/static/static/schemas/__next._tree.txt +2 -2
- ui/static/static/schemas/__next.schemas.__PAGE__.txt +1 -1
- ui/static/static/schemas/__next.schemas.txt +1 -1
- ui/static/static/schemas/index.html +1 -1
- ui/static/static/schemas/index.txt +2 -2
- ui/static/static/settings/__next._full.txt +2 -2
- ui/static/static/settings/__next._head.txt +1 -1
- ui/static/static/settings/__next._index.txt +2 -2
- ui/static/static/settings/__next._tree.txt +2 -2
- ui/static/static/settings/__next.settings.__PAGE__.txt +1 -1
- ui/static/static/settings/__next.settings.txt +1 -1
- ui/static/static/settings/index.html +1 -1
- ui/static/static/settings/index.txt +2 -2
- ui/static/static/static/.gitkeep +0 -0
- ui/static/static/static/404/index.html +1 -0
- ui/static/static/static/404.html +1 -0
- ui/static/static/static/__next.__PAGE__.txt +10 -0
- ui/static/static/static/__next._full.txt +30 -0
- ui/static/static/static/__next._head.txt +7 -0
- ui/static/static/static/__next._index.txt +9 -0
- ui/static/static/static/__next._tree.txt +2 -0
- ui/static/static/static/_next/static/chunks/222442f6da32302a.js +1 -0
- ui/static/static/static/_next/static/chunks/247eb132b7f7b574.js +1 -0
- ui/static/static/static/_next/static/chunks/297d55555b71baba.js +1 -0
- ui/static/static/static/_next/static/chunks/2ab439ce003cd691.js +1 -0
- ui/static/static/static/_next/static/chunks/414e77373f8ff61c.js +1 -0
- ui/static/static/static/_next/static/chunks/49ca65abd26ae49e.js +1 -0
- ui/static/static/static/_next/static/chunks/652ad0aa26265c47.js +2 -0
- ui/static/static/static/_next/static/chunks/9667e7a3d359eb39.js +1 -0
- ui/static/static/static/_next/static/chunks/9c23f44fff36548a.js +1 -0
- ui/static/static/static/_next/static/chunks/a6dad97d9634a72d.js +1 -0
- ui/static/static/static/_next/static/chunks/b32a0963684b9933.js +4 -0
- ui/static/static/static/_next/static/chunks/c69f6cba366bd988.js +1 -0
- ui/static/static/static/_next/static/chunks/db913959c675cea6.js +1 -0
- ui/static/static/static/_next/static/chunks/f061a4be97bfc3b3.js +1 -0
- ui/static/static/static/_next/static/chunks/f2e7afeab1178138.js +1 -0
- ui/static/static/static/_next/static/chunks/ff1a16fafef87110.js +1 -0
- ui/static/static/static/_next/static/chunks/turbopack-ffcb7ab6794027ef.js +3 -0
- ui/static/static/static/_next/static/tNTkVW6puVXC4bAm4WrHl/_buildManifest.js +11 -0
- ui/static/static/static/_next/static/tNTkVW6puVXC4bAm4WrHl/_ssgManifest.js +1 -0
- ui/static/static/static/_not-found/__next._full.txt +17 -0
- ui/static/static/static/_not-found/__next._head.txt +7 -0
- ui/static/static/static/_not-found/__next._index.txt +9 -0
- ui/static/static/static/_not-found/__next._not-found.__PAGE__.txt +5 -0
- ui/static/static/static/_not-found/__next._not-found.txt +4 -0
- ui/static/static/static/_not-found/__next._tree.txt +2 -0
- ui/static/static/static/_not-found/index.html +1 -0
- ui/static/static/static/_not-found/index.txt +17 -0
- ui/static/static/static/contracts/__next._full.txt +21 -0
- ui/static/static/static/contracts/__next._head.txt +7 -0
- ui/static/static/static/contracts/__next._index.txt +9 -0
- ui/static/static/static/contracts/__next._tree.txt +2 -0
- ui/static/static/static/contracts/__next.contracts.__PAGE__.txt +9 -0
- ui/static/static/static/contracts/__next.contracts.txt +4 -0
- ui/static/static/static/contracts/index.html +1 -0
- ui/static/static/static/contracts/index.txt +21 -0
- ui/static/static/static/documentation/__next._full.txt +21 -0
- ui/static/static/static/documentation/__next._head.txt +7 -0
- ui/static/static/static/documentation/__next._index.txt +9 -0
- ui/static/static/static/documentation/__next._tree.txt +2 -0
- ui/static/static/static/documentation/__next.documentation.__PAGE__.txt +9 -0
- ui/static/static/static/documentation/__next.documentation.txt +4 -0
- ui/static/static/static/documentation/index.html +93 -0
- ui/static/static/static/documentation/index.txt +21 -0
- ui/static/static/static/index.html +1 -0
- ui/static/static/static/index.txt +30 -0
- ui/static/static/static/metadata/__next._full.txt +21 -0
- ui/static/static/static/metadata/__next._head.txt +7 -0
- ui/static/static/static/metadata/__next._index.txt +9 -0
- ui/static/static/static/metadata/__next._tree.txt +2 -0
- ui/static/static/static/metadata/__next.metadata.__PAGE__.txt +9 -0
- ui/static/static/static/metadata/__next.metadata.txt +4 -0
- ui/static/static/static/metadata/index.html +1 -0
- ui/static/static/static/metadata/index.txt +21 -0
- ui/static/static/static/quality/__next._full.txt +21 -0
- ui/static/static/static/quality/__next._head.txt +7 -0
- ui/static/static/static/quality/__next._index.txt +9 -0
- ui/static/static/static/quality/__next._tree.txt +2 -0
- ui/static/static/static/quality/__next.quality.__PAGE__.txt +9 -0
- ui/static/static/static/quality/__next.quality.txt +4 -0
- ui/static/static/static/quality/index.html +2 -0
- ui/static/static/static/quality/index.txt +21 -0
- ui/static/static/static/rules/__next._full.txt +21 -0
- ui/static/static/static/rules/__next._head.txt +7 -0
- ui/static/static/static/rules/__next._index.txt +9 -0
- ui/static/static/static/rules/__next._tree.txt +2 -0
- ui/static/static/static/rules/__next.rules.__PAGE__.txt +9 -0
- ui/static/static/static/rules/__next.rules.txt +4 -0
- ui/static/static/static/rules/index.html +1 -0
- ui/static/static/static/rules/index.txt +21 -0
- ui/static/static/static/schemas/__next._full.txt +21 -0
- ui/static/static/static/schemas/__next._head.txt +7 -0
- ui/static/static/static/schemas/__next._index.txt +9 -0
- ui/static/static/static/schemas/__next._tree.txt +2 -0
- ui/static/static/static/schemas/__next.schemas.__PAGE__.txt +9 -0
- ui/static/static/static/schemas/__next.schemas.txt +4 -0
- ui/static/static/static/schemas/index.html +1 -0
- ui/static/static/static/schemas/index.txt +21 -0
- ui/static/static/static/settings/__next._full.txt +21 -0
- ui/static/static/static/settings/__next._head.txt +7 -0
- ui/static/static/static/settings/__next._index.txt +9 -0
- ui/static/static/static/settings/__next._tree.txt +2 -0
- ui/static/static/static/settings/__next.settings.__PAGE__.txt +9 -0
- ui/static/static/static/settings/__next.settings.txt +4 -0
- ui/static/static/static/settings/index.html +1 -0
- ui/static/static/static/settings/index.txt +21 -0
- ui/static/static/static/validation/__next._full.txt +21 -0
- ui/static/static/static/validation/__next._head.txt +7 -0
- ui/static/static/static/validation/__next._index.txt +9 -0
- ui/static/static/static/validation/__next._tree.txt +2 -0
- ui/static/static/static/validation/__next.validation.__PAGE__.txt +9 -0
- ui/static/static/static/validation/__next.validation.txt +4 -0
- ui/static/static/static/validation/index.html +1 -0
- ui/static/static/static/validation/index.txt +21 -0
- ui/static/static/validation/__next._full.txt +2 -2
- ui/static/static/validation/__next._head.txt +1 -1
- ui/static/static/validation/__next._index.txt +2 -2
- ui/static/static/validation/__next._tree.txt +2 -2
- ui/static/static/validation/__next.validation.__PAGE__.txt +1 -1
- ui/static/static/validation/__next.validation.txt +1 -1
- ui/static/static/validation/index.html +1 -1
- ui/static/static/validation/index.txt +2 -2
- ui/static/validation/__next._full.txt +2 -2
- ui/static/validation/__next._head.txt +1 -1
- ui/static/validation/__next._index.txt +1 -1
- ui/static/validation/__next._tree.txt +1 -1
- ui/static/validation/__next.validation.__PAGE__.txt +2 -2
- ui/static/validation/__next.validation.txt +1 -1
- ui/static/validation/index.html +1 -1
- ui/static/validation/index.txt +2 -2
- pycharter/data/templates/template_transform_advanced.yaml +0 -50
- pycharter/data/templates/template_transform_simple.yaml +0 -59
- pycharter-0.0.22.dist-info/RECORD +0 -358
- /pycharter/data/templates/{template_coercion_rules.yaml → contract/template_coercion_rules.yaml} +0 -0
- /pycharter/data/templates/{template_contract.yaml → contract/template_contract.yaml} +0 -0
- /pycharter/data/templates/{template_metadata.yaml → contract/template_metadata.yaml} +0 -0
- /pycharter/data/templates/{template_schema.yaml → contract/template_schema.yaml} +0 -0
- /pycharter/data/templates/{template_validation_rules.yaml → contract/template_validation_rules.yaml} +0 -0
- {pycharter-0.0.22.dist-info → pycharter-0.0.23.dist-info}/entry_points.txt +0 -0
- {pycharter-0.0.22.dist-info → pycharter-0.0.23.dist-info}/licenses/LICENSE +0 -0
- {pycharter-0.0.22.dist-info → pycharter-0.0.23.dist-info}/top_level.txt +0 -0
- /ui/static/_next/static/{0rYA78L88aUyD2Uh38hhX → 2gKjNv6YvE6BcIdFthBLs}/_buildManifest.js +0 -0
- /ui/static/_next/static/{0rYA78L88aUyD2Uh38hhX → 2gKjNv6YvE6BcIdFthBLs}/_ssgManifest.js +0 -0
- /ui/static/static/_next/static/{tNTkVW6puVXC4bAm4WrHl → 0rYA78L88aUyD2Uh38hhX}/_buildManifest.js +0 -0
- /ui/static/static/_next/static/{tNTkVW6puVXC4bAm4WrHl → 0rYA78L88aUyD2Uh38hhX}/_ssgManifest.js +0 -0
- /ui/static/{_next → static/_next}/static/chunks/c4fa4f4114b7c352.js +0 -0
- /ui/static/static/{_next → static/_next}/static/chunks/4e310fe5005770a3.css +0 -0
- /ui/static/{_next → static/static/_next}/static/chunks/5e04d10c4a7b58a3.js +0 -0
- /ui/static/static/{_next → static/_next}/static/chunks/5fc14c00a2779dc5.js +0 -0
- /ui/static/{_next → static/static/_next}/static/chunks/75d88a058d8ffaa6.js +0 -0
- /ui/static/{_next → static/static/_next}/static/chunks/8c89634cf6bad76f.js +0 -0
- /ui/static/static/{_next → static/_next}/static/chunks/b584574fdc8ab13e.js +0 -0
- /ui/static/static/{_next → static/_next}/static/chunks/d5989c94d3614b3a.js +0 -0
pycharter/etl_generator/extractors/http.py (new file, +816 lines):

```diff
@@ -0,0 +1,816 @@
+"""
+HTTP/API extractor for ETL orchestrator.
+
+Handles HTTP-based data extraction with support for:
+- GET and POST requests
+- Retry logic with exponential backoff
+- Rate limiting
+- Pagination (page, offset, cursor, next_url, link_header)
+- Response parsing (JSON, text)
+- Path parameter substitution
+"""
+
+import asyncio
+import logging
+import re
+import time
+from typing import Any, AsyncIterator, Dict, List, Optional
+
+import httpx
+
+from pycharter.etl_generator.extractors.base import BaseExtractor
+from pycharter.utils.value_injector import resolve_values
+
+logger = logging.getLogger(__name__)
+
+# Default configuration values
+DEFAULT_RATE_LIMIT_DELAY = 0.2
+DEFAULT_MAX_ATTEMPTS = 3
+DEFAULT_BACKOFF_FACTOR = 2.0
+DEFAULT_RETRY_STATUS_CODES = [429, 500, 502, 503, 504]
+DEFAULT_TIMEOUT_CONNECT = 10.0
+DEFAULT_TIMEOUT_READ = 30.0
+DEFAULT_TIMEOUT_WRITE = 10.0
+DEFAULT_TIMEOUT_POOL = 10.0
+
+# Common response data keys
+RESPONSE_DATA_KEYS = ['data', 'results', 'items', 'records', 'values']
+
+
+class HTTPExtractor(BaseExtractor):
+    """Extractor for HTTP/API data sources."""
+
+    def validate_config(self, extract_config: Dict[str, Any]) -> None:
+        """Validate HTTP extractor configuration."""
+        if 'source_type' in extract_config and extract_config['source_type'] != 'http':
+            raise ValueError(f"HTTPExtractor requires source_type='http', got '{extract_config.get('source_type')}'")
+
+        # Check for required HTTP config fields
+        if not extract_config.get('api_endpoint') and not extract_config.get('base_url'):
+            # Allow if api_endpoint is a full URL
+            api_endpoint = extract_config.get('api_endpoint', '')
+            if not api_endpoint.startswith(('http://', 'https://')):
+                raise ValueError(
+                    "HTTP extractor requires either 'api_endpoint' (with 'base_url') "
+                    "or 'api_endpoint' as full URL"
+                )
+
+    async def extract_streaming(
+        self,
+        extract_config: Dict[str, Any],
+        params: Dict[str, Any],
+        headers: Dict[str, Any],
+        contract_dir: Optional[Any] = None,
+        batch_size: int = 1000,
+        max_records: Optional[int] = None,
+        config_context: Optional[Dict[str, Any]] = None,
+    ) -> AsyncIterator[List[Dict[str, Any]]]:
+        """
+        Extract data from HTTP/API source with pagination support.
+
+        Yields batches as they are extracted, preventing memory exhaustion for large datasets.
+        """
+        pagination_config = extract_config.get('pagination', {})
+
+        # If pagination is not enabled, extract all and yield in batches
+        if not pagination_config.get('enabled', False):
+            logger.info("Pagination disabled, extracting all data in single request")
+            all_data = await self._extract_with_retry(
+                extract_config, params, headers, contract_dir, config_context=config_context
+            )
+            if max_records:
+                logger.info(f"Limiting to {max_records} records (extracted {len(all_data)})")
+                all_data = all_data[:max_records]
+
+            logger.info(f"Yielding {len(all_data)} records in batches of {batch_size}")
+            for i in range(0, len(all_data), batch_size):
+                batch = all_data[i:i + batch_size]
+                logger.debug(f"Yielding batch {i // batch_size + 1} with {len(batch)} records")
+                yield batch
+            return
+
+        # Pagination enabled - stream pages and yield in batches
+        async for batch in self._extract_with_pagination(
+            extract_config, params, headers, contract_dir, batch_size, max_records, config_context
+        ):
+            yield batch
+
+    async def _extract_with_retry(
+        self,
+        extract_config: Dict[str, Any],
+        params: Dict[str, Any],
+        headers: Dict[str, Any],
+        contract_dir: Optional[Any] = None,
+        config_context: Optional[Dict[str, Any]] = None,
+    ) -> List[Dict[str, Any]]:
+        """Extract data from API with retry logic."""
+        extracted_data, _, _ = await self._extract_single_page(
+            extract_config, params, headers, contract_dir, return_full_response=False, config_context=config_context
+        )
+        return extracted_data
+
+    async def _extract_single_page(
+        self,
+        extract_config: Dict[str, Any],
+        params: Dict[str, Any],
+        headers: Dict[str, Any],
+        contract_dir: Optional[Any] = None,
+        return_full_response: bool = False,
+        config_context: Optional[Dict[str, Any]] = None,
+    ) -> tuple[List[Dict[str, Any]], Optional[Any], Optional[httpx.Response]]:
+        """Extract data from a single API request with retry logic."""
+        # Get configuration
+        base_url = extract_config.get('base_url', '')
+        api_endpoint = extract_config.get('api_endpoint', '')
+        method = extract_config.get('method', 'GET').upper()
+        timeout_config = extract_config.get('timeout', {})
+        retry_config = extract_config.get('retry', {})
+        response_path = extract_config.get('response_path')
+        response_format = extract_config.get('response_format', 'json')
+        rate_limit_delay = extract_config.get('rate_limit_delay', DEFAULT_RATE_LIMIT_DELAY)
+        body = extract_config.get('body')
+
+        # Resolve variables and convert types
+        source_file = str(contract_dir / "extract.yaml") if contract_dir else None
+        resolved_params = resolve_values(params, context=config_context, source_file=source_file)
+        resolved_headers = resolve_values(headers, context=config_context, source_file=source_file)
+        resolved_timeout_config = resolve_values(timeout_config, context=config_context, source_file=source_file)
+        resolved_rate_limit_delay = self._resolve_rate_limit_delay(rate_limit_delay, contract_dir, config_context)
+
+        if body:
+            resolved_body = resolve_values(body, context=config_context, source_file=source_file)
+        else:
+            resolved_body = None
+
+        # Extract path parameters from api_endpoint
+        path_params = {}
+        if '{' in api_endpoint:
+            path_param_names = re.findall(r'\{(\w+)\}', api_endpoint)
+            for param_name in path_param_names:
+                if param_name in resolved_params:
+                    path_params[param_name] = resolved_params.pop(param_name)
+
+        # Build URL with path parameter substitution
+        url = self._build_request_url(base_url, api_endpoint, path_params)
+
+        # Configure timeout
+        timeout = self._configure_timeout(resolved_timeout_config)
+
+        # Configure retry
+        max_attempts = int(retry_config.get('max_attempts', DEFAULT_MAX_ATTEMPTS))
+        backoff_factor = float(retry_config.get('backoff_factor', DEFAULT_BACKOFF_FACTOR))
+        retry_on_status = retry_config.get('retry_on_status', DEFAULT_RETRY_STATUS_CODES)
+
+        # Make request with retry logic
+        last_exception = None
+        request_start_time = None
+
+        logger.info(
+            f"Starting HTTP extraction: {method} {url} "
+            f"(timeout: connect={timeout.connect}s, read={timeout.read}s, "
+            f"max_attempts={max_attempts})"
+        )
+        logger.debug(f"Request params: {resolved_params}")
+        logger.debug(f"Request headers: {dict(resolved_headers)}")
+
+        for attempt in range(max_attempts):
+            try:
+                request_start_time = time.time()
+                logger.debug(f"HTTP request attempt {attempt + 1}/{max_attempts} to {url}")
+
+                async with httpx.AsyncClient(timeout=timeout) as client:
+                    if attempt > 0:
+                        wait_time = backoff_factor ** (attempt - 1)
+                        logger.info(f"Retrying after {wait_time:.2f}s (attempt {attempt + 1}/{max_attempts})")
+                        await asyncio.sleep(wait_time)
+
+                    request_attempt_start = time.time()
+                    try:
+                        response = await self._make_http_request(
+                            client, method, url, resolved_params, resolved_headers, resolved_body
+                        )
+                        request_duration = time.time() - request_attempt_start
+                        logger.info(
+                            f"HTTP request completed: {response.status_code} "
+                            f"({request_duration:.2f}s, attempt {attempt + 1}/{max_attempts})"
+                        )
+                    except httpx.TimeoutException as timeout_error:
+                        request_duration = time.time() - request_attempt_start
+                        timeout_info = ""
+                        if hasattr(timeout_error, 'timeout') and isinstance(timeout_error.timeout, httpx.Timeout):
+                            timeout_info = (
+                                f" (connect={timeout_error.timeout.connect}s, "
+                                f"read={timeout_error.timeout.read}s)"
+                            )
+                        logger.error(
+                            f"HTTP request timeout after {request_duration:.2f}s{timeout_info}: "
+                            f"{type(timeout_error).__name__}: {timeout_error} "
+                            f"(attempt {attempt + 1}/{max_attempts})"
+                        )
+                        raise
+                    except httpx.RequestError as request_error:
+                        request_duration = time.time() - request_attempt_start
+                        logger.error(
+                            f"HTTP request error after {request_duration:.2f}s: "
+                            f"{type(request_error).__name__}: {request_error} "
+                            f"(attempt {attempt + 1}/{max_attempts})"
+                        )
+                        raise
+
+                    # Check if we should retry based on status code
+                    if response.status_code in retry_on_status and attempt < max_attempts - 1:
+                        wait_time = backoff_factor ** attempt
+                        logger.warning(
+                            f"HTTP {response.status_code} received, will retry after {wait_time:.2f}s "
+                            f"(attempt {attempt + 1}/{max_attempts})"
+                        )
+                        await asyncio.sleep(wait_time)
+                        continue
+
+                    # Raise for non-2xx status codes
+                    response.raise_for_status()
+
+                    # Parse response
+                    parse_start = time.time()
+                    if response_format == 'json':
+                        data = response.json()
+                    else:
+                        data = response.text
+                    parse_duration = time.time() - parse_start
+                    logger.debug(f"Response parsed in {parse_duration:.3f}s")
+
+                    # Extract data array
+                    extract_start = time.time()
+                    if response_path:
+                        extracted_data = self._extract_by_path(data, response_path)
+                    else:
+                        extracted_data = self._extract_data_array(data)
+                    extract_duration = time.time() - extract_start
+
+                    total_duration = time.time() - request_start_time
+                    logger.info(
+                        f"Extraction successful: {len(extracted_data)} records extracted "
+                        f"(total: {total_duration:.2f}s, parse: {parse_duration:.3f}s, "
+                        f"extract: {extract_duration:.3f}s)"
+                    )
+
+                    # Apply rate limiting delay
+                    if resolved_rate_limit_delay > 0:
+                        logger.debug(f"Applying rate limit delay: {resolved_rate_limit_delay}s")
+                        await asyncio.sleep(resolved_rate_limit_delay)
+
+                    if return_full_response:
+                        return extracted_data, data, response
+                    return extracted_data, None, None
+
+            except httpx.HTTPStatusError as e:
+                last_exception = e
+                request_duration = time.time() - request_start_time if request_start_time else 0
+
+                logger.error(
+                    f"HTTP status error {e.response.status_code}",
+                    extra={
+                        'status_code': e.response.status_code,
+                        'url': url,
+                        'attempt': attempt + 1,
+                        'duration': request_duration,
+                    },
+                    exc_info=True
+                )
+
+                if e.response.status_code in retry_on_status and attempt < max_attempts - 1:
+                    wait_time = backoff_factor ** attempt
+                    await asyncio.sleep(wait_time)
+                    continue
+                raise RuntimeError(
+                    f"HTTP error {e.response.status_code}: {e.response.text}"
+                ) from e
+            except httpx.TimeoutException as e:
+                last_exception = e
+                request_duration = time.time() - request_start_time if request_start_time else 0
+
+                logger.error(
+                    "HTTP timeout",
+                    extra={
+                        'url': url,
+                        'duration': request_duration,
+                        'attempt': attempt + 1,
+                    },
+                    exc_info=True
+                )
+
+                if attempt < max_attempts - 1:
+                    wait_time = backoff_factor ** attempt
+                    await asyncio.sleep(wait_time)
+                    continue
+                raise RuntimeError(f"Request timeout after {request_duration:.2f}s: {e}") from e
+            except httpx.RequestError as e:
+                last_exception = e
+                request_duration = time.time() - request_start_time if request_start_time else 0
+
+                logger.error(
+                    "HTTP request error",
+                    extra={
+                        'url': url,
+                        'duration': request_duration,
+                        'attempt': attempt + 1,
+                    },
+                    exc_info=True
+                )
+
+                if attempt < max_attempts - 1:
+                    wait_time = backoff_factor ** attempt
+                    await asyncio.sleep(wait_time)
+                    continue
+                raise RuntimeError(f"Request failed: {e}") from e
+            except Exception as e:
+                request_duration = time.time() - request_start_time if request_start_time else 0
+
+                logger.error(
+                    "Unexpected extraction error",
+                    extra={
+                        'url': url,
+                        'duration': request_duration,
+                        'attempt': attempt + 1,
+                    },
+                    exc_info=True
+                )
+                raise RuntimeError(f"Extraction failed: {e}") from e
+
+        # If we exhausted all retries
+        if last_exception:
+            raise RuntimeError(
+                f"Extraction failed after {max_attempts} attempts: {last_exception}"
+            ) from last_exception
+        raise RuntimeError("Extraction failed: unknown error")
+
+    async def _extract_with_pagination(
+        self,
+        extract_config: Dict[str, Any],
+        params: Dict[str, Any],
+        headers: Dict[str, Any],
+        contract_dir: Optional[Any] = None,
+        batch_size: int = 1000,
+        max_records: Optional[int] = None,
+        config_context: Optional[Dict[str, Any]] = None,
+    ) -> AsyncIterator[List[Dict[str, Any]]]:
+        """Extract data with pagination support."""
+        pagination_config = extract_config.get('pagination', {})
+        strategy = pagination_config.get('strategy', 'page')
+        stop_conditions = pagination_config.get('stop_conditions', [])
+        page_delay = float(pagination_config.get('page_delay', 0.1))
+        max_pages = 1000
+        max_records_from_config = None
+
+        # Get max_pages and max_records from stop conditions
+        for condition in stop_conditions:
+            if condition.get('type') == 'max_pages':
+                max_pages = condition.get('value', 1000)
+            elif condition.get('type') == 'max_records':
+                max_records_from_config = condition.get('value')
+
+        if max_records is None:
+            max_records = max_records_from_config
+
+        current_batch = []
+        total_extracted = 0
+        page_count = 0
+        current_url = None
+        current_cursor = None
+
+        # Initialize pagination state
+        if strategy == 'page':
+            page_config = pagination_config.get('page', {})
+            current_page = page_config.get('start', 0)
+            page_increment = page_config.get('increment', 1)
+            page_param_name = page_config.get('param_name', 'page')
+        elif strategy == 'offset':
+            offset_config = pagination_config.get('offset', {})
+            current_offset = offset_config.get('start', 0)
+            offset_param_name = offset_config.get('param_name', 'offset')
+            increment_by = offset_config.get('increment_by', 'limit')
+        elif strategy == 'cursor':
+            cursor_config = pagination_config.get('cursor', {})
+            cursor_param_name = cursor_config.get('param_name', 'cursor')
+            cursor_response_path = cursor_config.get('response_path', 'next_cursor')
+        elif strategy == 'next_url':
+            next_url_config = pagination_config.get('next_url', {})
+            next_url_response_path = next_url_config.get('response_path', 'next_url')
+        elif strategy == 'link_header':
+            pass
+        else:
+            raise ValueError(f"Unsupported pagination strategy: {strategy}")
+
+        extract_config_copy = extract_config.copy()
+        original_endpoint = extract_config_copy.get('api_endpoint')
+        original_base_url = extract_config_copy.get('base_url', '')
+
+        logger.info(
+            f"Starting paginated extraction (strategy: {strategy}, "
+            f"max_pages: {max_pages}, batch_size: {batch_size}, "
+            f"page_delay: {page_delay}s)"
+        )
+
+        while page_count < max_pages:
+            # Check max_records limit
+            if max_records and total_extracted >= max_records:
+                logger.info(
+                    f"Reached max_records limit ({max_records}), stopping pagination "
+                    f"(extracted {total_extracted} records from {page_count} pages)"
+                )
+                if current_batch:
+                    yield current_batch
+                return
+
+            # Update params/URL based on strategy
+            if strategy == 'page':
+                params[page_param_name] = current_page
+                logger.debug(f"Fetching page {current_page} (page_count: {page_count + 1}/{max_pages})")
+            elif strategy == 'offset':
+                params[offset_param_name] = current_offset
+            elif strategy == 'cursor' and current_cursor:
+                params[cursor_param_name] = current_cursor
+            elif strategy == 'next_url' and current_url:
+                extract_config_copy['api_endpoint'] = current_url
+                extract_config_copy['base_url'] = ''
+
+            # Make request
+            need_full_response = strategy in ['cursor', 'next_url', 'link_header']
+            try:
+                logger.debug(f"Extracting page {page_count + 1} (total extracted so far: {total_extracted})")
+                page_data, full_response_data, response_obj = await self._extract_single_page(
+                    extract_config_copy, params, headers, contract_dir, return_full_response=need_full_response, config_context=config_context
+                )
+                logger.info(f"Page {page_count + 1} extracted: {len(page_data)} records")
+            except Exception as e:
+                logger.error(
+                    f"Error extracting page {page_count + 1}",
+                    extra={
+                        'page': page_count + 1,
+                        'extracted': total_extracted,
+                    },
+                    exc_info=True
+                )
+                if current_batch:
+                    yield current_batch
+                raise
+
+            # Restore original endpoint if modified
+            if strategy == 'next_url' and current_url:
+                extract_config_copy['api_endpoint'] = original_endpoint
+                extract_config_copy['base_url'] = original_base_url
+
+            # Check for empty page first
+            if not page_data:
+                logger.info(f"Empty page {page_count + 1} received, stopping pagination")
+                if current_batch:
+                    yield current_batch
+                break
+
+            # Check stop conditions
+            page_count += 1
+            limit_value = params.get('limit', 100)
+            record_count = len(page_data)
+            logger.info(
+                f"Evaluating stop conditions for page {page_count}: "
+                f"{record_count} records returned, limit={limit_value}"
+            )
+            should_stop = self._check_stop_conditions(page_data, stop_conditions, params, full_response_data)
+            if should_stop:
+                logger.info(
+                    f"✅ Stop condition met at page {page_count} "
+                    f"(page returned {record_count} records, limit: {limit_value})"
+                )
+                for record in page_data:
+                    current_batch.append(record)
+                    total_extracted += 1
+                    if len(current_batch) >= batch_size:
+                        yield current_batch
+                        current_batch = []
+                if current_batch:
+                    yield current_batch
+                break
+
+            # Add page data to current batch
+            for record in page_data:
+                current_batch.append(record)
+                total_extracted += 1
+
+                if len(current_batch) >= batch_size:
+                    yield current_batch
+                    current_batch = []
+
+                if max_records and total_extracted >= max_records:
+                    if current_batch:
+                        yield current_batch
+                    return
+
+            # Extract pagination token/URL for next iteration
+            if strategy == 'cursor' and full_response_data:
+                try:
+                    current = full_response_data
+                    for part in cursor_response_path.split('.'):
+                        if isinstance(current, dict):
+                            current = current.get(part)
+                        elif isinstance(current, list) and part.isdigit():
+                            current = current[int(part)]
+                        else:
+                            current = None
+                            break
+
+                    if current and isinstance(current, str):
+                        current_cursor = current
+                    elif current:
+                        current_cursor = str(current)
+                    else:
+                        if current_batch:
+                            yield current_batch
+                        break
+                except (KeyError, IndexError, TypeError, ValueError):
+                    if current_batch:
+                        yield current_batch
+                    break
+
+            elif strategy == 'next_url' and full_response_data:
+                try:
+                    current = full_response_data
+                    for part in next_url_response_path.split('.'):
+                        if isinstance(current, dict):
+                            current = current.get(part)
+                        elif isinstance(current, list) and part.isdigit():
+                            current = current[int(part)]
+                        else:
+                            current = None
+                            break
+
+                    if current and isinstance(current, str):
+                        current_url = current
+                    else:
+                        current_url = None
+
+                    if not current_url:
+                        if current_batch:
+                            yield current_batch
+                        break
+                except (KeyError, IndexError, TypeError, ValueError):
+                    if current_batch:
+                        yield current_batch
+                    break
+
+            elif strategy == 'link_header' and response_obj:
+                current_url = self._extract_link_header_url(response_obj)
+                if not current_url:
+                    if current_batch:
+                        yield current_batch
+                    break
+                extract_config_copy['api_endpoint'] = current_url
+                extract_config_copy['base_url'] = ''
+
+            # Update pagination state
+            if strategy == 'page':
+                current_page += page_increment
+            elif strategy == 'offset':
+                limit = params.get('limit', 100)
+                if increment_by == 'limit':
+                    current_offset += limit
+                else:
+                    current_offset += int(increment_by)
+
+            # Delay between pages
+            if page_delay > 0:
+                await asyncio.sleep(page_delay)
+
+        # Yield remaining records
+        if current_batch:
+            yield current_batch
+
+    # Helper methods
+    def _resolve_rate_limit_delay(
+        self,
+        rate_limit_delay: Any,
+        contract_dir: Optional[Any] = None,
+        config_context: Optional[Dict[str, Any]] = None,
+    ) -> float:
+        """Resolve and convert rate_limit_delay to float."""
+        if isinstance(rate_limit_delay, str):
+            source_file = str(contract_dir / "extract.yaml") if contract_dir else None
+            resolved = resolve_values(rate_limit_delay, context=config_context, source_file=source_file)
+            return float(resolved)
+        return float(rate_limit_delay)
+
+    def _build_request_url(
+        self,
+        base_url: str,
+        api_endpoint: str,
+        path_params: Optional[Dict[str, Any]] = None,
+    ) -> str:
+        """Build full request URL from base URL and endpoint."""
+        if api_endpoint.startswith(('http://', 'https://')):
+            url = api_endpoint
+        elif base_url:
+            base_url = base_url.rstrip('/')
+            endpoint = api_endpoint.lstrip('/')
+            url = f"{base_url}/{endpoint}"
+        else:
+            raise ValueError(
+                "Either 'api_endpoint' must be a full URL (starting with http:// or https://) "
+                "or 'base_url' must be provided in extract.yaml"
+            )
+
+        # Substitute path parameters
+        if path_params and '{' in url:
+            try:
+                url = url.format(**path_params)
+            except KeyError as e:
+                raise ValueError(
+                    f"Missing required path parameter in URL: {e}. "
+                    f"URL: {url}, Available params: {list(path_params.keys())}"
+                ) from e
+
+        return url
+
+    def _configure_timeout(self, timeout_config: Dict[str, Any]) -> httpx.Timeout:
+        """Configure HTTP timeout from config dictionary."""
+        timeout = httpx.Timeout(
+            connect=float(timeout_config.get('connect', DEFAULT_TIMEOUT_CONNECT)),
+            read=float(timeout_config.get('read', DEFAULT_TIMEOUT_READ)),
+            write=float(timeout_config.get('write', DEFAULT_TIMEOUT_WRITE)),
+            pool=float(timeout_config.get('pool', DEFAULT_TIMEOUT_POOL)),
+        )
+        logger.debug(
+            f"Configured HTTP timeout: connect={timeout.connect}s, "
+            f"read={timeout.read}s, write={timeout.write}s, pool={timeout.pool}s"
+        )
+        return timeout
+
+    async def _make_http_request(
+        self,
+        client: httpx.AsyncClient,
+        method: str,
+        url: str,
+        params: Dict[str, Any],
+        headers: Dict[str, Any],
+        body: Optional[Any] = None,
+    ) -> httpx.Response:
+        """Make HTTP request with specified method."""
+        method = method.upper()
+
+        logger.debug(f"Making {method} request to {url}")
+
+        try:
+            if method == 'GET':
+                return await client.get(url, params=params, headers=headers)
+            elif method == 'POST':
+                if body:
+                    return await client.post(
+                        url,
+                        json=body if isinstance(body, dict) else body,
+                        params=params,
+                        headers=headers,
+                    )
+                else:
+                    return await client.post(url, params=params, headers=headers)
+            else:
+                raise ValueError(f"Unsupported HTTP method: {method}")
+        except httpx.TimeoutException as e:
+            timeout_info = ""
+            if hasattr(e, 'timeout') and isinstance(e.timeout, httpx.Timeout):
+                timeout_info = (
+                    f" (connect timeout: {e.timeout.connect}s, "
+                    f"read timeout: {e.timeout.read}s)"
+                )
+            logger.error(f"HTTP request timeout for {method} {url}{timeout_info}")
+            raise
+        except httpx.RequestError as e:
+            logger.error(f"HTTP request error for {method} {url}: {type(e).__name__}: {e}")
+            raise
+
+    def _extract_by_path(self, data: Any, path: str) -> List[Dict[str, Any]]:
+        """Extract data using a simple path notation (e.g., 'data.items')."""
+        current = data
+        for part in path.split('.'):
+            if isinstance(current, dict):
+                current = current.get(part)
+            elif isinstance(current, list) and part.isdigit():
+                current = current[int(part)]
+            else:
+                return []
+
+        if current is None:
+            return []
+
+        if isinstance(current, list):
+            return current
+        elif isinstance(current, dict):
+            return [current]
+        else:
+            return []
+
+    def _extract_data_array(self, data: Any) -> List[Dict[str, Any]]:
+        """Extract data array from response, handling common response structures."""
+        if isinstance(data, list):
+            return data
+        elif isinstance(data, dict):
+            # Try common keys for data arrays
+            for key in RESPONSE_DATA_KEYS:
+                if key in data and isinstance(data[key], list):
+                    return data[key]
+            # If no array found, return as single-item list
+            return [data]
+        else:
+            return []
+
+    def _check_stop_conditions(
+        self,
+        page_data: List[Dict[str, Any]],
+        stop_conditions: List[Dict[str, Any]],
+        params: Dict[str, Any],
+        response_data: Any = None,
+    ) -> bool:
+        """Check if pagination should stop based on configured stop conditions."""
+        if not stop_conditions:
+            # Default: stop if fewer records than limit
+            limit = params.get('limit', 100)
+            return len(page_data) < limit
+
+        for condition in stop_conditions:
+            if self._check_stop_condition(condition, page_data, params, response_data):
+                return True
+
+        return False
+
+    def _check_stop_condition(
+        self,
+        condition: Dict[str, Any],
+        page_data: List[Dict[str, Any]],
+        params: Dict[str, Any],
+        response_data: Any = None,
+    ) -> bool:
+        """Check a single stop condition."""
+        condition_type = condition.get('type')
+
+        if condition_type == 'empty_response':
+            if not page_data:
+                logger.debug("Stop condition 'empty_response' triggered: page is empty")
+                return True
+
+        elif condition_type == 'fewer_records':
+            limit = params.get('limit', 100)
+            record_count = len(page_data)
+            if record_count < limit:
+                logger.debug(
+                    f"Stop condition 'fewer_records' triggered: "
+                    f"page returned {record_count} records < limit {limit}"
+                )
+                return True
+
+        elif condition_type == 'max_pages':
+            max_pages = condition.get('value', 1000)
+            current_page = params.get('page', 0)
+            if current_page >= max_pages:
+                logger.debug(f"Stop condition 'max_pages' triggered: page {current_page} >= {max_pages}")
+                return True
+
+        elif condition_type == 'custom':
+            return self._check_custom_stop_condition(condition, response_data)
+
+        return False
+
+    def _check_custom_stop_condition(
+        self,
+        condition: Dict[str, Any],
+        response_data: Any,
+    ) -> bool:
+        """Check custom stop condition based on response path."""
+        response_path = condition.get('response_path')
+        expected_value = condition.get('value')
+
+        if not response_path or not response_data:
+            return False
+
+        try:
+            current = response_data
+            for part in response_path.split('.'):
+                if isinstance(current, dict):
+                    current = current.get(part)
+                elif isinstance(current, list) and part.isdigit():
+                    current = current[int(part)]
+                else:
+                    return False
+            return current == expected_value
+        except (KeyError, IndexError, TypeError):
+            return False
+
+    def _extract_link_header_url(self, response: httpx.Response) -> Optional[str]:
+        """Extract next URL from Link header (RFC 5988)."""
+        link_header = response.headers.get('Link', '')
+        if not link_header:
+            return None
+
+        # Parse Link header: <url>; rel="next"
+        pattern = r'<([^>]+)>;\s*rel=["\']?next["\']?'
+        match = re.search(pattern, link_header, re.IGNORECASE)
+        if match:
+            return match.group(1)
+
+        return None
```
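The new extractor is configuration-driven: `_extract_single_page` and `_extract_with_pagination` read keys such as `base_url`, `api_endpoint`, `method`, `retry`, `timeout`, `response_path`, and `pagination` from an extract config (the ETL templates added in this release, such as `extract_http_paginated.yaml`, presumably carry these same keys). Below is a minimal usage sketch, assuming `HTTPExtractor` can be constructed without arguments and that plain literal values pass through `resolve_values` unchanged; the host, endpoint, and limits are hypothetical:

```python
import asyncio

from pycharter.etl_generator.extractors.http import HTTPExtractor

# Hypothetical config mirroring the keys read by _extract_single_page and
# _extract_with_pagination above; the URL and limits are illustrative only.
extract_config = {
    "source_type": "http",
    "base_url": "https://api.example.com",  # assumption: placeholder host
    "api_endpoint": "/v1/items",
    "method": "GET",
    "response_path": "data.items",  # walked by _extract_by_path
    "retry": {"max_attempts": 3, "backoff_factor": 2.0},
    "timeout": {"connect": 10.0, "read": 30.0},
    "pagination": {
        "enabled": True,
        "strategy": "page",
        "page": {"start": 1, "increment": 1, "param_name": "page"},
        "stop_conditions": [
            {"type": "max_pages", "value": 50},
            {"type": "fewer_records"},
        ],
    },
}

async def main() -> None:
    extractor = HTTPExtractor()  # assumption: no constructor args required
    extractor.validate_config(extract_config)
    async for batch in extractor.extract_streaming(
        extract_config,
        params={"limit": 100},
        headers={"Accept": "application/json"},
        batch_size=500,
    ):
        print(f"got {len(batch)} records")

asyncio.run(main())
```

With the `page` strategy shown, the extractor injects the `page` query parameter on each request and stops on an empty page, when a page returns fewer records than the `limit` param, or after `max_pages` pages, yielding records in batches of `batch_size` as it goes.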