pycharter 0.0.20__py3-none-any.whl → 0.0.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- api/dependencies/__init__.py +2 -1
- api/dependencies/database.py +71 -5
- api/main.py +47 -8
- api/models/contracts.py +6 -4
- api/models/metadata.py +11 -7
- api/models/schemas.py +16 -10
- api/routes/v1/contracts.py +498 -226
- api/routes/v1/metadata.py +52 -211
- api/routes/v1/schemas.py +1 -1
- api/routes/v1/settings.py +88 -1
- api/utils.py +224 -0
- pycharter/__init__.py +149 -93
- pycharter/data/templates/template_transform_advanced.yaml +50 -0
- pycharter/data/templates/template_transform_simple.yaml +59 -0
- pycharter/db/models/base.py +1 -2
- pycharter/etl_generator/orchestrator.py +463 -487
- pycharter/metadata_store/postgres.py +16 -191
- pycharter/metadata_store/sqlite.py +12 -41
- {pycharter-0.0.20.dist-info → pycharter-0.0.22.dist-info}/METADATA +284 -62
- pycharter-0.0.22.dist-info/RECORD +358 -0
- ui/static/404/index.html +1 -1
- ui/static/404.html +1 -1
- ui/static/__next.__PAGE__.txt +1 -1
- ui/static/__next._full.txt +2 -2
- ui/static/__next._head.txt +1 -1
- ui/static/__next._index.txt +2 -2
- ui/static/__next._tree.txt +2 -2
- ui/static/_next/static/chunks/13d4a0fbd74c1ee4.js +1 -0
- ui/static/_next/static/chunks/2edb43b48432ac04.js +441 -0
- ui/static/_next/static/chunks/c4fa4f4114b7c352.js +1 -0
- ui/static/_next/static/chunks/d2363397e1b2bcab.css +1 -0
- ui/static/_next/static/chunks/f7d1a90dd75d2572.js +1 -0
- ui/static/_not-found/__next._full.txt +2 -2
- ui/static/_not-found/__next._head.txt +1 -1
- ui/static/_not-found/__next._index.txt +2 -2
- ui/static/_not-found/__next._not-found.__PAGE__.txt +1 -1
- ui/static/_not-found/__next._not-found.txt +1 -1
- ui/static/_not-found/__next._tree.txt +2 -2
- ui/static/_not-found/index.html +1 -1
- ui/static/_not-found/index.txt +2 -2
- ui/static/contracts/__next._full.txt +3 -3
- ui/static/contracts/__next._head.txt +1 -1
- ui/static/contracts/__next._index.txt +2 -2
- ui/static/contracts/__next._tree.txt +2 -2
- ui/static/contracts/__next.contracts.__PAGE__.txt +2 -2
- ui/static/contracts/__next.contracts.txt +1 -1
- ui/static/contracts/index.html +1 -1
- ui/static/contracts/index.txt +3 -3
- ui/static/documentation/__next._full.txt +3 -3
- ui/static/documentation/__next._head.txt +1 -1
- ui/static/documentation/__next._index.txt +2 -2
- ui/static/documentation/__next._tree.txt +2 -2
- ui/static/documentation/__next.documentation.__PAGE__.txt +2 -2
- ui/static/documentation/__next.documentation.txt +1 -1
- ui/static/documentation/index.html +2 -2
- ui/static/documentation/index.txt +3 -3
- ui/static/index.html +1 -1
- ui/static/index.txt +2 -2
- ui/static/metadata/__next._full.txt +2 -2
- ui/static/metadata/__next._head.txt +1 -1
- ui/static/metadata/__next._index.txt +2 -2
- ui/static/metadata/__next._tree.txt +2 -2
- ui/static/metadata/__next.metadata.__PAGE__.txt +1 -1
- ui/static/metadata/__next.metadata.txt +1 -1
- ui/static/metadata/index.html +1 -1
- ui/static/metadata/index.txt +2 -2
- ui/static/quality/__next._full.txt +2 -2
- ui/static/quality/__next._head.txt +1 -1
- ui/static/quality/__next._index.txt +2 -2
- ui/static/quality/__next._tree.txt +2 -2
- ui/static/quality/__next.quality.__PAGE__.txt +1 -1
- ui/static/quality/__next.quality.txt +1 -1
- ui/static/quality/index.html +2 -2
- ui/static/quality/index.txt +2 -2
- ui/static/rules/__next._full.txt +2 -2
- ui/static/rules/__next._head.txt +1 -1
- ui/static/rules/__next._index.txt +2 -2
- ui/static/rules/__next._tree.txt +2 -2
- ui/static/rules/__next.rules.__PAGE__.txt +1 -1
- ui/static/rules/__next.rules.txt +1 -1
- ui/static/rules/index.html +1 -1
- ui/static/rules/index.txt +2 -2
- ui/static/schemas/__next._full.txt +2 -2
- ui/static/schemas/__next._head.txt +1 -1
- ui/static/schemas/__next._index.txt +2 -2
- ui/static/schemas/__next._tree.txt +2 -2
- ui/static/schemas/__next.schemas.__PAGE__.txt +1 -1
- ui/static/schemas/__next.schemas.txt +1 -1
- ui/static/schemas/index.html +1 -1
- ui/static/schemas/index.txt +2 -2
- ui/static/settings/__next._full.txt +2 -2
- ui/static/settings/__next._head.txt +1 -1
- ui/static/settings/__next._index.txt +2 -2
- ui/static/settings/__next._tree.txt +2 -2
- ui/static/settings/__next.settings.__PAGE__.txt +1 -1
- ui/static/settings/__next.settings.txt +1 -1
- ui/static/settings/index.html +1 -1
- ui/static/settings/index.txt +2 -2
- ui/static/static/.gitkeep +0 -0
- ui/static/static/404/index.html +1 -0
- ui/static/static/404.html +1 -0
- ui/static/static/__next.__PAGE__.txt +10 -0
- ui/static/static/__next._full.txt +30 -0
- ui/static/static/__next._head.txt +7 -0
- ui/static/static/__next._index.txt +9 -0
- ui/static/static/__next._tree.txt +2 -0
- ui/static/static/_next/static/chunks/222442f6da32302a.js +1 -0
- ui/static/static/_next/static/chunks/247eb132b7f7b574.js +1 -0
- ui/static/static/_next/static/chunks/297d55555b71baba.js +1 -0
- ui/static/static/_next/static/chunks/2ab439ce003cd691.js +1 -0
- ui/static/static/_next/static/chunks/414e77373f8ff61c.js +1 -0
- ui/static/static/_next/static/chunks/49ca65abd26ae49e.js +1 -0
- ui/static/static/_next/static/chunks/5e04d10c4a7b58a3.js +1 -0
- ui/static/static/_next/static/chunks/652ad0aa26265c47.js +2 -0
- ui/static/static/_next/static/chunks/75d88a058d8ffaa6.js +1 -0
- ui/static/static/_next/static/chunks/8c89634cf6bad76f.js +1 -0
- ui/static/static/_next/static/chunks/9667e7a3d359eb39.js +1 -0
- ui/static/static/_next/static/chunks/9c23f44fff36548a.js +1 -0
- ui/static/static/_next/static/chunks/a6dad97d9634a72d.js +1 -0
- ui/static/static/_next/static/chunks/b32a0963684b9933.js +4 -0
- ui/static/static/_next/static/chunks/c69f6cba366bd988.js +1 -0
- ui/static/static/_next/static/chunks/db913959c675cea6.js +1 -0
- ui/static/static/_next/static/chunks/f061a4be97bfc3b3.js +1 -0
- ui/static/static/_next/static/chunks/f2e7afeab1178138.js +1 -0
- ui/static/static/_next/static/chunks/ff1a16fafef87110.js +1 -0
- ui/static/static/_next/static/chunks/turbopack-ffcb7ab6794027ef.js +3 -0
- ui/static/static/_next/static/tNTkVW6puVXC4bAm4WrHl/_buildManifest.js +11 -0
- ui/static/static/_next/static/tNTkVW6puVXC4bAm4WrHl/_ssgManifest.js +1 -0
- ui/static/static/_not-found/__next._full.txt +17 -0
- ui/static/static/_not-found/__next._head.txt +7 -0
- ui/static/static/_not-found/__next._index.txt +9 -0
- ui/static/static/_not-found/__next._not-found.__PAGE__.txt +5 -0
- ui/static/static/_not-found/__next._not-found.txt +4 -0
- ui/static/static/_not-found/__next._tree.txt +2 -0
- ui/static/static/_not-found/index.html +1 -0
- ui/static/static/_not-found/index.txt +17 -0
- ui/static/static/contracts/__next._full.txt +21 -0
- ui/static/static/contracts/__next._head.txt +7 -0
- ui/static/static/contracts/__next._index.txt +9 -0
- ui/static/static/contracts/__next._tree.txt +2 -0
- ui/static/static/contracts/__next.contracts.__PAGE__.txt +9 -0
- ui/static/static/contracts/__next.contracts.txt +4 -0
- ui/static/static/contracts/index.html +1 -0
- ui/static/static/contracts/index.txt +21 -0
- ui/static/static/documentation/__next._full.txt +21 -0
- ui/static/static/documentation/__next._head.txt +7 -0
- ui/static/static/documentation/__next._index.txt +9 -0
- ui/static/static/documentation/__next._tree.txt +2 -0
- ui/static/static/documentation/__next.documentation.__PAGE__.txt +9 -0
- ui/static/static/documentation/__next.documentation.txt +4 -0
- ui/static/static/documentation/index.html +93 -0
- ui/static/static/documentation/index.txt +21 -0
- ui/static/static/index.html +1 -0
- ui/static/static/index.txt +30 -0
- ui/static/static/metadata/__next._full.txt +21 -0
- ui/static/static/metadata/__next._head.txt +7 -0
- ui/static/static/metadata/__next._index.txt +9 -0
- ui/static/static/metadata/__next._tree.txt +2 -0
- ui/static/static/metadata/__next.metadata.__PAGE__.txt +9 -0
- ui/static/static/metadata/__next.metadata.txt +4 -0
- ui/static/static/metadata/index.html +1 -0
- ui/static/static/metadata/index.txt +21 -0
- ui/static/static/quality/__next._full.txt +21 -0
- ui/static/static/quality/__next._head.txt +7 -0
- ui/static/static/quality/__next._index.txt +9 -0
- ui/static/static/quality/__next._tree.txt +2 -0
- ui/static/static/quality/__next.quality.__PAGE__.txt +9 -0
- ui/static/static/quality/__next.quality.txt +4 -0
- ui/static/static/quality/index.html +2 -0
- ui/static/static/quality/index.txt +21 -0
- ui/static/static/rules/__next._full.txt +21 -0
- ui/static/static/rules/__next._head.txt +7 -0
- ui/static/static/rules/__next._index.txt +9 -0
- ui/static/static/rules/__next._tree.txt +2 -0
- ui/static/static/rules/__next.rules.__PAGE__.txt +9 -0
- ui/static/static/rules/__next.rules.txt +4 -0
- ui/static/static/rules/index.html +1 -0
- ui/static/static/rules/index.txt +21 -0
- ui/static/static/schemas/__next._full.txt +21 -0
- ui/static/static/schemas/__next._head.txt +7 -0
- ui/static/static/schemas/__next._index.txt +9 -0
- ui/static/static/schemas/__next._tree.txt +2 -0
- ui/static/static/schemas/__next.schemas.__PAGE__.txt +9 -0
- ui/static/static/schemas/__next.schemas.txt +4 -0
- ui/static/static/schemas/index.html +1 -0
- ui/static/static/schemas/index.txt +21 -0
- ui/static/static/settings/__next._full.txt +21 -0
- ui/static/static/settings/__next._head.txt +7 -0
- ui/static/static/settings/__next._index.txt +9 -0
- ui/static/static/settings/__next._tree.txt +2 -0
- ui/static/static/settings/__next.settings.__PAGE__.txt +9 -0
- ui/static/static/settings/__next.settings.txt +4 -0
- ui/static/static/settings/index.html +1 -0
- ui/static/static/settings/index.txt +21 -0
- ui/static/static/validation/__next._full.txt +21 -0
- ui/static/static/validation/__next._head.txt +7 -0
- ui/static/static/validation/__next._index.txt +9 -0
- ui/static/static/validation/__next._tree.txt +2 -0
- ui/static/static/validation/__next.validation.__PAGE__.txt +9 -0
- ui/static/static/validation/__next.validation.txt +4 -0
- ui/static/static/validation/index.html +1 -0
- ui/static/static/validation/index.txt +21 -0
- ui/static/validation/__next._full.txt +2 -2
- ui/static/validation/__next._head.txt +1 -1
- ui/static/validation/__next._index.txt +2 -2
- ui/static/validation/__next._tree.txt +2 -2
- ui/static/validation/__next.validation.__PAGE__.txt +1 -1
- ui/static/validation/__next.validation.txt +1 -1
- ui/static/validation/index.html +1 -1
- ui/static/validation/index.txt +2 -2
- pycharter/db/schemas/.ipynb_checkpoints/data_contract-checkpoint.py +0 -160
- pycharter-0.0.20.dist-info/RECORD +0 -247
- {pycharter-0.0.20.dist-info → pycharter-0.0.22.dist-info}/WHEEL +0 -0
- {pycharter-0.0.20.dist-info → pycharter-0.0.22.dist-info}/entry_points.txt +0 -0
- {pycharter-0.0.20.dist-info → pycharter-0.0.22.dist-info}/licenses/LICENSE +0 -0
- {pycharter-0.0.20.dist-info → pycharter-0.0.22.dist-info}/top_level.txt +0 -0
- /ui/static/_next/static/{tNTkVW6puVXC4bAm4WrHl → 0rYA78L88aUyD2Uh38hhX}/_buildManifest.js +0 -0
- /ui/static/_next/static/{tNTkVW6puVXC4bAm4WrHl → 0rYA78L88aUyD2Uh38hhX}/_ssgManifest.js +0 -0
- /ui/static/{_next → static/_next}/static/chunks/4e310fe5005770a3.css +0 -0
- /ui/static/{_next → static/_next}/static/chunks/5fc14c00a2779dc5.js +0 -0
- /ui/static/{_next → static/_next}/static/chunks/b584574fdc8ab13e.js +0 -0
- /ui/static/{_next → static/_next}/static/chunks/d5989c94d3614b3a.js +0 -0
|
@@ -1,29 +1,32 @@
|
|
|
1
1
|
"""
|
|
2
|
-
|
|
2
|
+
ETL Orchestrator - Streaming ETL pipeline with simple operations, JSONata, and custom functions.
|
|
3
3
|
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
4
|
+
Executes ETL pipelines: Extract → Transform (Simple Operations → JSONata → Custom Functions) → Load.
|
|
5
|
+
|
|
6
|
+
Transformation Pipeline:
|
|
7
|
+
1. Simple Operations: rename, convert, defaults, add, select, drop (declarative, easy to use)
|
|
8
|
+
2. JSONata: Powerful query language for complex transformations (full JSONata support)
|
|
9
|
+
3. Custom Functions: Import and run external Python modules/functions
|
|
7
10
|
"""
|
|
8
11
|
|
|
9
12
|
import asyncio
|
|
10
13
|
import gc
|
|
14
|
+
import importlib
|
|
11
15
|
import logging
|
|
16
|
+
import re
|
|
12
17
|
import uuid
|
|
13
18
|
import warnings
|
|
14
19
|
from collections import Counter, defaultdict
|
|
15
|
-
from datetime import
|
|
20
|
+
from datetime import datetime
|
|
16
21
|
from pathlib import Path
|
|
17
22
|
from typing import Any, AsyncIterator, Callable, Dict, List, Optional, Tuple
|
|
18
23
|
|
|
24
|
+
import jsonata
|
|
19
25
|
import yaml
|
|
20
26
|
|
|
21
27
|
from pycharter.contract_parser import ContractMetadata, parse_contract_file
|
|
22
28
|
from pycharter.etl_generator.checkpoint import CheckpointManager
|
|
23
|
-
from pycharter.etl_generator.database import
|
|
24
|
-
get_database_connection,
|
|
25
|
-
load_data,
|
|
26
|
-
)
|
|
29
|
+
from pycharter.etl_generator.database import get_database_connection, load_data
|
|
27
30
|
from pycharter.etl_generator.dlq import DeadLetterQueue, DLQReason
|
|
28
31
|
from pycharter.etl_generator.extraction import extract_with_pagination_streaming
|
|
29
32
|
from pycharter.etl_generator.progress import ETLProgress, ProgressTracker
|
|
@@ -31,32 +34,14 @@ from pycharter.utils.value_injector import resolve_values
|
|
|
31
34
|
|
|
32
35
|
logger = logging.getLogger(__name__)
|
|
33
36
|
|
|
34
|
-
# Optional
|
|
37
|
+
# Optional memory monitoring
|
|
35
38
|
try:
|
|
36
39
|
import psutil
|
|
37
40
|
PSUTIL_AVAILABLE = True
|
|
38
41
|
except ImportError:
|
|
39
42
|
PSUTIL_AVAILABLE = False
|
|
40
43
|
|
|
41
|
-
|
|
42
|
-
# ============================================================================
|
|
43
|
-
# CONSTANTS
|
|
44
|
-
# ============================================================================
|
|
45
|
-
|
|
46
|
-
COMPUTED_DATETIME_NOW = "@now"
|
|
47
|
-
COMPUTED_DATETIME_UTC_NOW = "@utcnow"
|
|
48
|
-
COMPUTED_WEEK_START = "@week_start"
|
|
49
|
-
COMPUTED_WEEK_END = "@week_end"
|
|
50
44
|
DEFAULT_BATCH_SIZE = 1000
|
|
51
|
-
DEFAULT_MAX_DEPTH = 10
|
|
52
|
-
DEFAULT_SEPARATOR = "_"
|
|
53
|
-
|
|
54
|
-
# Datetime parsing formats (in order of preference)
|
|
55
|
-
DATETIME_FORMATS = [
|
|
56
|
-
'%Y-%m-%dT%H:%M:%S',
|
|
57
|
-
'%Y-%m-%d %H:%M:%S',
|
|
58
|
-
'%Y-%m-%d'
|
|
59
|
-
]
|
|
60
45
|
|
|
61
46
|
|
|
62
47
|
class ETLOrchestrator:
|
|
@@ -473,526 +458,517 @@ class ETLOrchestrator:
|
|
|
473
458
|
yield batch
|
|
474
459
|
|
|
475
460
|
# ============================================================================
|
|
476
|
-
# TRANSFORMATION
|
|
461
|
+
# TRANSFORMATION (Simple Operations → JSONata → Custom Functions)
|
|
477
462
|
# ============================================================================
|
|
478
463
|
|
|
479
464
|
def transform(self, raw_data: List[Dict[str, Any]], **kwargs) -> List[Dict[str, Any]]:
|
|
480
465
|
"""
|
|
481
|
-
Transform
|
|
466
|
+
Transform data using simple operations, JSONata expressions, and/or custom Python functions.
|
|
482
467
|
|
|
483
|
-
|
|
484
|
-
1.
|
|
485
|
-
2.
|
|
486
|
-
3.
|
|
487
|
-
4. Apply type conversions
|
|
488
|
-
5. Apply fill_null rules
|
|
489
|
-
6. Drop specified fields
|
|
468
|
+
Pipeline order (applied sequentially):
|
|
469
|
+
1. Simple operations (rename, select, drop, convert, defaults, add)
|
|
470
|
+
2. JSONata transformation (if configured)
|
|
471
|
+
3. Custom function execution (if configured)
|
|
490
472
|
|
|
491
473
|
Args:
|
|
492
474
|
raw_data: Raw data from extraction
|
|
493
|
-
**kwargs: Additional
|
|
475
|
+
**kwargs: Additional parameters (passed to custom functions)
|
|
494
476
|
|
|
495
477
|
Returns:
|
|
496
478
|
Transformed data
|
|
479
|
+
|
|
480
|
+
Example - Simple operations:
|
|
481
|
+
transform_config:
|
|
482
|
+
rename:
|
|
483
|
+
oldName: new_name
|
|
484
|
+
camelCase: snake_case
|
|
485
|
+
select:
|
|
486
|
+
- field1
|
|
487
|
+
- field2
|
|
488
|
+
convert:
|
|
489
|
+
price: float
|
|
490
|
+
quantity: integer
|
|
491
|
+
defaults:
|
|
492
|
+
status: "pending"
|
|
493
|
+
|
|
494
|
+
Example - JSONata (advanced):
|
|
495
|
+
transform_config:
|
|
496
|
+
jsonata:
|
|
497
|
+
expression: |
|
|
498
|
+
$.{
|
|
499
|
+
"ticker": symbol,
|
|
500
|
+
"avg_price": $average(prices)
|
|
501
|
+
}
|
|
502
|
+
|
|
503
|
+
Example - Custom function:
|
|
504
|
+
transform_config:
|
|
505
|
+
custom_function:
|
|
506
|
+
module: "myproject.transforms"
|
|
507
|
+
function: "optimize_data"
|
|
508
|
+
mode: "batch"
|
|
497
509
|
"""
|
|
498
510
|
if not self.transform_config:
|
|
499
511
|
return raw_data
|
|
500
512
|
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
'
|
|
518
|
-
|
|
519
|
-
'
|
|
520
|
-
|
|
521
|
-
|
|
513
|
+
data = raw_data
|
|
514
|
+
|
|
515
|
+
# Step 1: Apply simple operations (in order)
|
|
516
|
+
# Support both new 'transform' key and legacy top-level keys for backward compatibility
|
|
517
|
+
simple_ops = {}
|
|
518
|
+
|
|
519
|
+
# New format: transform: { rename: {...}, select: [...] }
|
|
520
|
+
if 'transform' in self.transform_config:
|
|
521
|
+
simple_ops = self.transform_config.get('transform', {})
|
|
522
|
+
|
|
523
|
+
# Legacy format: rename: {...} at top level (for backward compatibility)
|
|
524
|
+
if 'rename' in self.transform_config and 'transform' not in self.transform_config:
|
|
525
|
+
simple_ops['rename'] = self.transform_config.get('rename')
|
|
526
|
+
if 'select' in self.transform_config and 'transform' not in self.transform_config:
|
|
527
|
+
simple_ops['select'] = self.transform_config.get('select')
|
|
528
|
+
if 'drop' in self.transform_config and 'transform' not in self.transform_config:
|
|
529
|
+
simple_ops['drop'] = self.transform_config.get('drop')
|
|
530
|
+
if 'convert' in self.transform_config and 'transform' not in self.transform_config:
|
|
531
|
+
simple_ops['convert'] = self.transform_config.get('convert')
|
|
532
|
+
if 'defaults' in self.transform_config and 'transform' not in self.transform_config:
|
|
533
|
+
simple_ops['defaults'] = self.transform_config.get('defaults')
|
|
534
|
+
if 'add' in self.transform_config and 'transform' not in self.transform_config:
|
|
535
|
+
simple_ops['add'] = self.transform_config.get('add')
|
|
536
|
+
|
|
537
|
+
if simple_ops:
|
|
538
|
+
data = self._apply_simple_operations(data, simple_ops)
|
|
539
|
+
|
|
540
|
+
# Step 2: Apply JSONata transformation (if configured)
|
|
541
|
+
jsonata_config = self.transform_config.get('jsonata')
|
|
542
|
+
if jsonata_config:
|
|
543
|
+
data = self._apply_jsonata(data, jsonata_config)
|
|
544
|
+
|
|
545
|
+
# Step 3: Apply custom function (if configured)
|
|
546
|
+
custom_func_config = self.transform_config.get('custom_function')
|
|
547
|
+
if custom_func_config:
|
|
548
|
+
data = self._apply_custom_function(data, custom_func_config, **kwargs)
|
|
549
|
+
|
|
550
|
+
return data
|
|
522
551
|
|
|
523
|
-
def
|
|
552
|
+
def _apply_simple_operations(
|
|
524
553
|
self,
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
rename_rules = transform_rules['rename']
|
|
531
|
-
flatten_rules = transform_rules['flatten']
|
|
532
|
-
type_rules = transform_rules['type']
|
|
533
|
-
fill_null_rules = transform_rules['fill_null']
|
|
534
|
-
drop_fields = transform_rules['drop']
|
|
535
|
-
|
|
536
|
-
transformed_record = {}
|
|
537
|
-
|
|
538
|
-
# Step 1: Apply rename transformations (with flattening if configured)
|
|
539
|
-
transformed_record.update(
|
|
540
|
-
self._apply_rename_transformations(record, rename_rules, flatten_rules)
|
|
541
|
-
)
|
|
554
|
+
data: List[Dict[str, Any]],
|
|
555
|
+
config: Dict[str, Any]
|
|
556
|
+
) -> List[Dict[str, Any]]:
|
|
557
|
+
"""
|
|
558
|
+
Apply simple declarative transformation operations.
|
|
542
559
|
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
560
|
+
Operations are applied in this order:
|
|
561
|
+
1. rename - Rename fields (old_name: new_name)
|
|
562
|
+
2. convert - Convert field types (field: type)
|
|
563
|
+
3. defaults - Set default values for missing fields
|
|
564
|
+
4. add - Add computed fields with expressions
|
|
565
|
+
5. select - Keep only specified fields
|
|
566
|
+
6. drop - Remove specified fields
|
|
547
567
|
|
|
548
|
-
|
|
549
|
-
|
|
568
|
+
Args:
|
|
569
|
+
data: Input data (list of records)
|
|
570
|
+
config: Simple operations configuration
|
|
550
571
|
|
|
551
|
-
|
|
552
|
-
|
|
572
|
+
Returns:
|
|
573
|
+
Transformed data
|
|
553
574
|
|
|
554
|
-
|
|
555
|
-
|
|
575
|
+
Example config:
|
|
576
|
+
transform:
|
|
577
|
+
rename:
|
|
578
|
+
oldName: new_name
|
|
579
|
+
camelCase: snake_case
|
|
580
|
+
convert:
|
|
581
|
+
price: float
|
|
582
|
+
quantity: integer
|
|
583
|
+
active: boolean
|
|
584
|
+
defaults:
|
|
585
|
+
status: "pending"
|
|
586
|
+
priority: 0
|
|
587
|
+
add:
|
|
588
|
+
full_name: "${first_name} ${last_name}"
|
|
589
|
+
created_at: "now()"
|
|
590
|
+
record_id: "uuid()"
|
|
591
|
+
select:
|
|
592
|
+
- field1
|
|
593
|
+
- field2
|
|
594
|
+
drop:
|
|
595
|
+
- internal_id
|
|
596
|
+
- debug_info
|
|
597
|
+
"""
|
|
598
|
+
if not data:
|
|
599
|
+
return data
|
|
600
|
+
|
|
601
|
+
result = []
|
|
602
|
+
|
|
603
|
+
# Get available fields from first record for validation
|
|
604
|
+
available_fields = set(data[0].keys()) if data else set()
|
|
605
|
+
|
|
606
|
+
# Step 1: Rename fields
|
|
607
|
+
rename_map = config.get('rename', {})
|
|
608
|
+
if rename_map:
|
|
609
|
+
# Validate rename mappings
|
|
610
|
+
missing_fields = [old for old in rename_map.keys() if old not in available_fields]
|
|
611
|
+
if missing_fields:
|
|
612
|
+
logger.warning(
|
|
613
|
+
f"Rename operation: Fields not found in data: {missing_fields}. "
|
|
614
|
+
f"Available fields: {sorted(available_fields)}"
|
|
615
|
+
)
|
|
556
616
|
|
|
557
|
-
# Step
|
|
558
|
-
|
|
617
|
+
# Step 2: Convert types
|
|
618
|
+
convert_map = config.get('convert', {})
|
|
559
619
|
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
def _apply_rename_transformations(
|
|
563
|
-
self,
|
|
564
|
-
record: Dict[str, Any],
|
|
565
|
-
rename_rules: Dict[str, str],
|
|
566
|
-
flatten_rules: Dict[str, Any]
|
|
567
|
-
) -> Dict[str, Any]:
|
|
568
|
-
"""Apply rename transformations, handling flattening if configured."""
|
|
569
|
-
transformed = {}
|
|
570
|
-
|
|
571
|
-
for source_field, target_field in rename_rules.items():
|
|
572
|
-
if source_field in record:
|
|
573
|
-
value = record[source_field]
|
|
574
|
-
flattened = self._maybe_flatten_field(source_field, value, flatten_rules)
|
|
575
|
-
if flattened is not None:
|
|
576
|
-
transformed.update(flattened)
|
|
577
|
-
else:
|
|
578
|
-
transformed[target_field] = value
|
|
579
|
-
elif target_field in record:
|
|
580
|
-
transformed[target_field] = record[target_field]
|
|
620
|
+
# Step 3: Defaults
|
|
621
|
+
defaults_map = config.get('defaults', {})
|
|
581
622
|
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
drop_fields
|
|
590
|
-
) -> Dict[str, Any]:
|
|
591
|
-
"""Copy remaining fields not in rename rules, handling flattening if configured."""
|
|
592
|
-
transformed = {}
|
|
593
|
-
|
|
594
|
-
for key, value in record.items():
|
|
595
|
-
if key not in rename_rules and key not in transformed:
|
|
596
|
-
if key not in drop_fields:
|
|
597
|
-
flattened = self._maybe_flatten_field(key, value, flatten_rules)
|
|
598
|
-
if flattened is not None:
|
|
599
|
-
transformed.update(flattened)
|
|
600
|
-
else:
|
|
601
|
-
transformed[key] = value
|
|
623
|
+
# Step 4: Add computed fields
|
|
624
|
+
add_map = config.get('add', {})
|
|
625
|
+
|
|
626
|
+
# Step 5: Select fields (keep only these)
|
|
627
|
+
select_fields = config.get('select')
|
|
628
|
+
|
|
629
|
+
# Step 6: Drop fields (remove these)
|
|
630
|
+
drop_fields = set(config.get('drop', []))
|
|
602
631
|
|
|
603
|
-
|
|
632
|
+
for record in data:
|
|
633
|
+
transformed = dict(record)
|
|
634
|
+
|
|
635
|
+
# 1. Rename
|
|
636
|
+
if rename_map:
|
|
637
|
+
for old_name, new_name in rename_map.items():
|
|
638
|
+
if old_name in transformed:
|
|
639
|
+
transformed[new_name] = transformed.pop(old_name)
|
|
640
|
+
|
|
641
|
+
# 2. Convert types
|
|
642
|
+
if convert_map:
|
|
643
|
+
for field_name, target_type in convert_map.items():
|
|
644
|
+
if field_name in transformed:
|
|
645
|
+
try:
|
|
646
|
+
transformed[field_name] = self._convert_type(
|
|
647
|
+
transformed[field_name], target_type
|
|
648
|
+
)
|
|
649
|
+
except (ValueError, TypeError) as e:
|
|
650
|
+
logger.warning(
|
|
651
|
+
f"Failed to convert field '{field_name}' to {target_type}: {e}. "
|
|
652
|
+
f"Keeping original value."
|
|
653
|
+
)
|
|
654
|
+
|
|
655
|
+
# 3. Apply defaults
|
|
656
|
+
if defaults_map:
|
|
657
|
+
for field_name, default_value in defaults_map.items():
|
|
658
|
+
if field_name not in transformed or transformed[field_name] is None:
|
|
659
|
+
transformed[field_name] = default_value
|
|
660
|
+
|
|
661
|
+
# 4. Add computed fields
|
|
662
|
+
if add_map:
|
|
663
|
+
for field_name, expression in add_map.items():
|
|
664
|
+
try:
|
|
665
|
+
transformed[field_name] = self._evaluate_expression(
|
|
666
|
+
expression, transformed
|
|
667
|
+
)
|
|
668
|
+
except Exception as e:
|
|
669
|
+
logger.warning(
|
|
670
|
+
f"Failed to compute field '{field_name}': {e}. "
|
|
671
|
+
f"Skipping this field."
|
|
672
|
+
)
|
|
673
|
+
|
|
674
|
+
# 5. Select (keep only specified fields)
|
|
675
|
+
if select_fields:
|
|
676
|
+
transformed = {
|
|
677
|
+
k: v for k, v in transformed.items()
|
|
678
|
+
if k in select_fields
|
|
679
|
+
}
|
|
680
|
+
|
|
681
|
+
# 6. Drop (remove specified fields)
|
|
682
|
+
if drop_fields:
|
|
683
|
+
transformed = {
|
|
684
|
+
k: v for k, v in transformed.items()
|
|
685
|
+
if k not in drop_fields
|
|
686
|
+
}
|
|
687
|
+
|
|
688
|
+
result.append(transformed)
|
|
689
|
+
|
|
690
|
+
return result
|
|
604
691
|
|
|
605
|
-
def
|
|
606
|
-
self,
|
|
607
|
-
field_name: str,
|
|
608
|
-
value: Any,
|
|
609
|
-
flatten_rules: Dict[str, Any]
|
|
610
|
-
) -> Optional[Dict[str, Any]]:
|
|
692
|
+
def _convert_type(self, value: Any, target_type: str) -> Any:
|
|
611
693
|
"""
|
|
612
|
-
|
|
694
|
+
Convert a value to the specified type.
|
|
695
|
+
|
|
696
|
+
Args:
|
|
697
|
+
value: Value to convert
|
|
698
|
+
target_type: Target type (string, integer, float, boolean, datetime, date)
|
|
613
699
|
|
|
614
700
|
Returns:
|
|
615
|
-
|
|
701
|
+
Converted value
|
|
616
702
|
"""
|
|
617
|
-
if
|
|
618
|
-
return None
|
|
619
|
-
|
|
620
|
-
flatten_config = flatten_rules[field_name]
|
|
621
|
-
if not flatten_config.get('enabled', True):
|
|
703
|
+
if value is None:
|
|
622
704
|
return None
|
|
623
705
|
|
|
624
|
-
|
|
625
|
-
return self._flatten_nested_object(value, field_name, flatten_config)
|
|
626
|
-
elif isinstance(value, list):
|
|
627
|
-
return self._flatten_array(value, field_name, flatten_config)
|
|
706
|
+
target_type_lower = target_type.lower().strip()
|
|
628
707
|
|
|
629
|
-
|
|
708
|
+
if target_type_lower in ('str', 'string'):
|
|
709
|
+
return str(value)
|
|
710
|
+
elif target_type_lower in ('int', 'integer'):
|
|
711
|
+
if isinstance(value, str):
|
|
712
|
+
# Try to parse as float first (handles "1.0" -> 1)
|
|
713
|
+
try:
|
|
714
|
+
return int(float(value))
|
|
715
|
+
except ValueError:
|
|
716
|
+
return int(value)
|
|
717
|
+
return int(value)
|
|
718
|
+
elif target_type_lower in ('float', 'number', 'numeric'):
|
|
719
|
+
if isinstance(value, str):
|
|
720
|
+
return float(value)
|
|
721
|
+
return float(value)
|
|
722
|
+
elif target_type_lower in ('bool', 'boolean'):
|
|
723
|
+
if isinstance(value, str):
|
|
724
|
+
return value.lower() in ('true', '1', 'yes', 'on')
|
|
725
|
+
return bool(value)
|
|
726
|
+
elif target_type_lower == 'datetime':
|
|
727
|
+
from datetime import datetime
|
|
728
|
+
if isinstance(value, str):
|
|
729
|
+
# Try common datetime formats
|
|
730
|
+
for fmt in [
|
|
731
|
+
'%Y-%m-%dT%H:%M:%S',
|
|
732
|
+
'%Y-%m-%dT%H:%M:%S.%f',
|
|
733
|
+
'%Y-%m-%dT%H:%M:%SZ',
|
|
734
|
+
'%Y-%m-%dT%H:%M:%S.%fZ',
|
|
735
|
+
'%Y-%m-%d %H:%M:%S',
|
|
736
|
+
'%Y-%m-%d %H:%M:%S.%f',
|
|
737
|
+
]:
|
|
738
|
+
try:
|
|
739
|
+
return datetime.strptime(value, fmt)
|
|
740
|
+
except ValueError:
|
|
741
|
+
continue
|
|
742
|
+
raise ValueError(f"Cannot parse datetime: {value}")
|
|
743
|
+
return value
|
|
744
|
+
elif target_type_lower == 'date':
|
|
745
|
+
from datetime import date, datetime
|
|
746
|
+
if isinstance(value, str):
|
|
747
|
+
# Try common date formats
|
|
748
|
+
for fmt in ['%Y-%m-%d', '%Y/%m/%d', '%m/%d/%Y']:
|
|
749
|
+
try:
|
|
750
|
+
dt = datetime.strptime(value, fmt)
|
|
751
|
+
return dt.date()
|
|
752
|
+
except ValueError:
|
|
753
|
+
continue
|
|
754
|
+
raise ValueError(f"Cannot parse date: {value}")
|
|
755
|
+
elif isinstance(value, datetime):
|
|
756
|
+
return value.date()
|
|
757
|
+
return value
|
|
758
|
+
else:
|
|
759
|
+
raise ValueError(f"Unsupported target type: {target_type}")
|
|
630
760
|
|
|
631
|
-
def
|
|
632
|
-
self,
|
|
633
|
-
transformed_record: Dict[str, Any],
|
|
634
|
-
rename_rules: Dict[str, str],
|
|
635
|
-
**kwargs
|
|
636
|
-
) -> None:
|
|
761
|
+
def _evaluate_expression(self, expression: str, record: Dict[str, Any]) -> Any:
|
|
637
762
|
"""
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
This method adds input parameters (defined in input_params) to transformed records,
|
|
641
|
-
overriding any values from the API response. It also applies rename rules to input
|
|
642
|
-
parameters, allowing them to be stored with different names (e.g., 'type' -> 'direction').
|
|
763
|
+
Evaluate a simple expression in the context of a record.
|
|
643
764
|
|
|
644
|
-
|
|
645
|
-
-
|
|
646
|
-
-
|
|
647
|
-
-
|
|
648
|
-
-
|
|
765
|
+
Supports:
|
|
766
|
+
- Field references: "${field_name}"
|
|
767
|
+
- String concatenation: "${field1} ${field2}"
|
|
768
|
+
- Simple functions: "now()", "uuid()"
|
|
769
|
+
- Literal values (if no placeholders)
|
|
649
770
|
|
|
650
771
|
Args:
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
**kwargs: Input parameters passed to the pipeline
|
|
654
|
-
"""
|
|
655
|
-
# Always override input parameters from kwargs to ensure consistency
|
|
656
|
-
# This ensures that request parameters (like symbol, period) always use
|
|
657
|
-
# the values from the input parameters, not from the API response
|
|
658
|
-
for param_name in self.input_params.keys():
|
|
659
|
-
if param_name in kwargs:
|
|
660
|
-
# Apply rename rule if one exists for this parameter
|
|
661
|
-
# This allows input parameters to be stored with different names
|
|
662
|
-
# (e.g., 'type' parameter -> 'direction' field)
|
|
663
|
-
target_name = rename_rules.get(param_name, param_name)
|
|
664
|
-
transformed_record[target_name] = kwargs[param_name]
|
|
665
|
-
|
|
666
|
-
def _apply_type_conversions(
|
|
667
|
-
self,
|
|
668
|
-
transformed_record: Dict[str, Any],
|
|
669
|
-
type_rules: Dict[str, str]
|
|
670
|
-
) -> None:
|
|
671
|
-
"""Apply type conversions to fields."""
|
|
672
|
-
for field, field_type in type_rules.items():
|
|
673
|
-
if field in transformed_record:
|
|
674
|
-
transformed_record[field] = self._convert_type(
|
|
675
|
-
transformed_record[field], field_type
|
|
676
|
-
)
|
|
677
|
-
|
|
678
|
-
def _apply_fill_null_rules(
|
|
679
|
-
self,
|
|
680
|
-
transformed_record: Dict[str, Any],
|
|
681
|
-
fill_null_rules: Dict[str, Any]
|
|
682
|
-
) -> None:
|
|
683
|
-
"""Apply fill_null rules, handling computed datetime values."""
|
|
684
|
-
for field, config in fill_null_rules.items():
|
|
685
|
-
if field not in transformed_record or transformed_record[field] is None:
|
|
686
|
-
default_value = config.get('default') if isinstance(config, dict) else config
|
|
687
|
-
computed_value = self._compute_datetime_value(default_value)
|
|
688
|
-
transformed_record[field] = computed_value if computed_value is not None else default_value
|
|
689
|
-
|
|
690
|
-
def _compute_datetime_value(self, value: Any) -> Optional[datetime]:
|
|
691
|
-
"""
|
|
692
|
-
Compute datetime value from special constants.
|
|
772
|
+
expression: Expression string
|
|
773
|
+
record: Record dictionary for context
|
|
693
774
|
|
|
694
|
-
Args:
|
|
695
|
-
value: Value to check (may be a computed datetime constant)
|
|
696
|
-
|
|
697
775
|
Returns:
|
|
698
|
-
|
|
776
|
+
Evaluated result
|
|
777
|
+
|
|
778
|
+
Examples:
|
|
779
|
+
"${first_name} ${last_name}" -> "John Doe"
|
|
780
|
+
"now()" -> "2024-01-01T12:00:00"
|
|
781
|
+
"uuid()" -> "123e4567-e89b-12d3-a456-426614174000"
|
|
782
|
+
"static_value" -> "static_value"
|
|
699
783
|
"""
|
|
700
|
-
if
|
|
701
|
-
return
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
return
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
784
|
+
if not isinstance(expression, str):
|
|
785
|
+
return expression
|
|
786
|
+
|
|
787
|
+
expression = expression.strip()
|
|
788
|
+
|
|
789
|
+
# Handle special functions
|
|
790
|
+
if expression == 'now()':
|
|
791
|
+
return datetime.now().isoformat()
|
|
792
|
+
elif expression == 'uuid()':
|
|
793
|
+
return str(uuid.uuid4())
|
|
794
|
+
|
|
795
|
+
# Handle field references and string interpolation
|
|
796
|
+
try:
|
|
797
|
+
# Simple string interpolation: "${field1} ${field2}"
|
|
798
|
+
result = expression
|
|
799
|
+
placeholders_found = False
|
|
800
|
+
|
|
801
|
+
# Find all ${...} placeholders
|
|
802
|
+
placeholder_pattern = r'\$\{([^}]+)\}'
|
|
803
|
+
matches = re.findall(placeholder_pattern, expression)
|
|
804
|
+
|
|
805
|
+
if matches:
|
|
806
|
+
placeholders_found = True
|
|
807
|
+
for field_name in matches:
|
|
808
|
+
if field_name in record:
|
|
809
|
+
value = record[field_name]
|
|
810
|
+
placeholder = f"${{{field_name}}}"
|
|
811
|
+
result = result.replace(placeholder, str(value) if value is not None else '')
|
|
812
|
+
else:
|
|
813
|
+
logger.warning(
|
|
814
|
+
f"Expression '{expression}': Field '{field_name}' not found in record. "
|
|
815
|
+
f"Available fields: {sorted(record.keys())}"
|
|
816
|
+
)
|
|
817
|
+
# Replace with empty string if field not found
|
|
818
|
+
placeholder = f"${{{field_name}}}"
|
|
819
|
+
result = result.replace(placeholder, '')
|
|
820
|
+
|
|
821
|
+
# If no placeholders were found and it's not a function, return as literal
|
|
822
|
+
if not placeholders_found and not expression.endswith('()'):
|
|
823
|
+
return expression
|
|
824
|
+
|
|
825
|
+
return result
|
|
826
|
+
except Exception as e:
|
|
827
|
+
raise ValueError(f"Failed to evaluate expression '{expression}': {e}") from e
|
|
726
828
|
|
|
727
|
-
def
|
|
728
|
-
self,
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
depth: int = 0,
|
|
733
|
-
max_depth: int = 10
|
|
734
|
-
) -> Dict[str, Any]:
|
|
829
|
+
def _apply_jsonata(
    self,
    data: List[Dict[str, Any]],
    config: Dict[str, Any]
) -> List[Dict[str, Any]]:
    """
    Apply a JSONata expression to transform data.

    Args:
        data: Input data (list of records)
        config: JSONata configuration with 'expression' and optional 'mode'
            ('batch' applies the expression to the whole list at once,
            'record' applies it to each record individually)

    Returns:
        Transformed data

    Raises:
        ValueError: If compiling or evaluating the expression fails.

    Example config:
        jsonata:
          expression: |
            $.{
              "ticker": symbol,
              "avg_price": $average(prices),
              "total_volume": $sum(volumes)
            }
          mode: "batch"  # or "record"
    """
    expression_str = config.get('expression')
    if not expression_str:
        # No expression configured: pass data through unchanged.
        return data

    mode = config.get('mode', 'batch')

    try:
        expr = jsonata.Jsonata(expression_str)

        if mode == 'batch':
            # Apply the expression to the entire dataset at once.
            result = expr.evaluate(data)
            if result is None:
                return []
            return result if isinstance(result, list) else [result]

        # Record mode: evaluate each record exactly once.
        # The previous implementation evaluated the expression twice per
        # record (once in the comprehension's filter and once for its
        # output), doubling the work and running any stateful expression
        # twice. Records whose evaluation yields None are dropped.
        results = []
        for record in data:
            evaluated = expr.evaluate(record)
            if evaluated is not None:
                results.append(evaluated)
        return results

    except Exception as e:
        logger.error(f"JSONata transformation failed: {e}")
        raise ValueError(f"JSONata transformation error: {e}") from e
|
|
864
876
|
|
|
865
|
-
def
|
|
877
|
+
def _apply_custom_function(
|
|
866
878
|
self,
|
|
867
|
-
|
|
868
|
-
field_name: str,
|
|
879
|
+
data: List[Dict[str, Any]],
|
|
869
880
|
config: Dict[str, Any],
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
) -> Dict[str, Any]:
|
|
881
|
+
**kwargs
|
|
882
|
+
) -> List[Dict[str, Any]]:
|
|
873
883
|
"""
|
|
874
|
-
|
|
884
|
+
Execute a custom Python function for transformation.
|
|
875
885
|
|
|
876
886
|
Args:
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
depth: Current recursion depth
|
|
881
|
-
max_depth: Maximum recursion depth
|
|
887
|
+
data: Input data
|
|
888
|
+
config: Custom function configuration
|
|
889
|
+
**kwargs: Additional parameters passed to the function
|
|
882
890
|
|
|
883
891
|
Returns:
|
|
884
|
-
|
|
885
|
-
"""
|
|
886
|
-
if not isinstance(array_obj, list):
|
|
887
|
-
return {field_name: array_obj}
|
|
888
|
-
|
|
889
|
-
flattened = {}
|
|
890
|
-
strategy = config.get('strategy', 'array_flatten')
|
|
891
|
-
separator = config.get('separator', DEFAULT_SEPARATOR)
|
|
892
|
-
max_depth_config = config.get('max_depth', max_depth)
|
|
893
|
-
item_flatten = config.get('item_flatten', {})
|
|
894
|
-
aggregate = config.get('aggregate', False) # If True, aggregate all items into single keys
|
|
895
|
-
|
|
896
|
-
if depth >= max_depth_config:
|
|
897
|
-
return {field_name: array_obj}
|
|
898
|
-
|
|
899
|
-
if strategy == 'array_flatten' or strategy == 'simple':
|
|
900
|
-
# Flatten each item in the array
|
|
901
|
-
for idx, item in enumerate(array_obj):
|
|
902
|
-
if isinstance(item, dict):
|
|
903
|
-
# Use item_flatten config if provided, otherwise use parent config
|
|
904
|
-
item_config = item_flatten if item_flatten else config
|
|
905
|
-
|
|
906
|
-
if aggregate:
|
|
907
|
-
# Aggregate: all items contribute to same keys (last wins or merge)
|
|
908
|
-
item_flattened = self._flatten_nested_object(
|
|
909
|
-
item,
|
|
910
|
-
field_name, # Same base name for all items
|
|
911
|
-
item_config,
|
|
912
|
-
depth + 1,
|
|
913
|
-
max_depth_config
|
|
914
|
-
)
|
|
915
|
-
# Merge or overwrite (last item wins)
|
|
916
|
-
flattened.update(item_flattened)
|
|
917
|
-
else:
|
|
918
|
-
# Indexed: each item gets its own keys with index
|
|
919
|
-
# Create a new config that uses the indexed field name as base
|
|
920
|
-
indexed_config = item_config.copy()
|
|
921
|
-
indexed_config['_base_field'] = f"{field_name}{separator}{idx}"
|
|
922
|
-
|
|
923
|
-
item_flattened = self._flatten_nested_object(
|
|
924
|
-
item,
|
|
925
|
-
f"{field_name}{separator}{idx}",
|
|
926
|
-
indexed_config,
|
|
927
|
-
depth + 1,
|
|
928
|
-
max_depth_config
|
|
929
|
-
)
|
|
930
|
-
flattened.update(item_flattened)
|
|
931
|
-
else:
|
|
932
|
-
# Simple value in array
|
|
933
|
-
if aggregate:
|
|
934
|
-
# For aggregate, use field name directly (last value wins)
|
|
935
|
-
flattened[field_name] = item
|
|
936
|
-
else:
|
|
937
|
-
flattened[f"{field_name}{separator}{idx}"] = item
|
|
938
|
-
else:
|
|
939
|
-
# Keep array as-is
|
|
940
|
-
flattened[field_name] = array_obj
|
|
892
|
+
Transformed data
|
|
941
893
|
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
894
|
+
Example config:
|
|
895
|
+
custom_function:
|
|
896
|
+
module: "pyoptima"
|
|
897
|
+
function: "optimize_from_etl_inputs"
|
|
898
|
+
mode: "batch"
|
|
899
|
+
kwargs:
|
|
900
|
+
method: "min_volatility"
|
|
901
|
+
solver: "ipopt"
|
|
902
|
+
|
|
903
|
+
Alternative config (using callable path):
|
|
904
|
+
custom_function:
|
|
905
|
+
callable: "myproject.transforms.optimize_portfolio"
|
|
906
|
+
mode: "batch"
|
|
907
|
+
"""
|
|
908
|
+
# Get module and function
|
|
909
|
+
callable_path = config.get('callable')
|
|
910
|
+
module_path = config.get('module')
|
|
911
|
+
func_name = config.get('function')
|
|
912
|
+
|
|
913
|
+
if callable_path:
|
|
914
|
+
# Parse "module.submodule.function" format
|
|
915
|
+
parts = callable_path.rsplit('.', 1)
|
|
916
|
+
if len(parts) != 2:
|
|
917
|
+
raise ValueError(f"Invalid callable path: {callable_path}. Use 'module.function' format.")
|
|
918
|
+
module_path, func_name = parts
|
|
919
|
+
|
|
920
|
+
if not module_path or not func_name:
|
|
921
|
+
raise ValueError("custom_function requires either 'callable' or 'module' + 'function'")
|
|
922
|
+
|
|
923
|
+
# Dynamic import
|
|
924
|
+
try:
|
|
925
|
+
module = importlib.import_module(module_path)
|
|
926
|
+
func = getattr(module, func_name)
|
|
927
|
+
except ImportError as e:
|
|
928
|
+
raise ValueError(f"Cannot import module '{module_path}': {e}") from e
|
|
929
|
+
except AttributeError as e:
|
|
930
|
+
raise ValueError(f"Function '{func_name}' not found in module '{module_path}'") from e
|
|
931
|
+
|
|
932
|
+
# Handle class-based methods (e.g., pyoptima optimization methods)
|
|
933
|
+
if isinstance(func, type):
|
|
934
|
+
instance = func()
|
|
935
|
+
if hasattr(instance, 'optimize'):
|
|
936
|
+
func = instance.optimize
|
|
937
|
+
elif hasattr(instance, 'run'):
|
|
938
|
+
func = instance.run
|
|
939
|
+
elif hasattr(instance, '__call__'):
|
|
940
|
+
func = instance
|
|
941
|
+
else:
|
|
942
|
+
raise ValueError(f"Class '{func_name}' has no 'optimize', 'run', or '__call__' method")
|
|
948
943
|
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
'int': int,
|
|
953
|
-
'float': float,
|
|
954
|
-
'double': float,
|
|
955
|
-
'boolean': bool,
|
|
956
|
-
'bool': bool,
|
|
957
|
-
'datetime': self._parse_datetime,
|
|
958
|
-
'timestamp': self._parse_datetime,
|
|
959
|
-
'date': self._parse_date,
|
|
960
|
-
}
|
|
944
|
+
# Get mode and kwargs
|
|
945
|
+
mode = config.get('mode', 'batch')
|
|
946
|
+
func_kwargs = config.get('kwargs', {})
|
|
961
947
|
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
try:
|
|
965
|
-
# Converter is either a type (callable) or a method (also callable)
|
|
966
|
-
return converter(value)
|
|
967
|
-
except (ValueError, TypeError):
|
|
968
|
-
return value
|
|
948
|
+
# Merge with runtime kwargs
|
|
949
|
+
merged_kwargs = {**func_kwargs, **kwargs}
|
|
969
950
|
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
try:
|
|
992
|
-
return datetime.strptime(value, '%Y-%m-%d').date()
|
|
993
|
-
except ValueError:
|
|
994
|
-
pass
|
|
995
|
-
return value
|
|
951
|
+
try:
|
|
952
|
+
if mode == 'batch':
|
|
953
|
+
result = func(data, **merged_kwargs)
|
|
954
|
+
if result is None:
|
|
955
|
+
return []
|
|
956
|
+
return result if isinstance(result, list) else [result]
|
|
957
|
+
else:
|
|
958
|
+
# Record mode
|
|
959
|
+
results = []
|
|
960
|
+
for record in data:
|
|
961
|
+
record_result = func(record, **merged_kwargs)
|
|
962
|
+
if record_result is not None:
|
|
963
|
+
if isinstance(record_result, list):
|
|
964
|
+
results.extend(record_result)
|
|
965
|
+
else:
|
|
966
|
+
results.append(record_result)
|
|
967
|
+
return results
|
|
968
|
+
|
|
969
|
+
except Exception as e:
|
|
970
|
+
logger.error(f"Custom function '{func_name}' failed: {e}")
|
|
971
|
+
raise ValueError(f"Custom function error: {e}") from e
|
|
996
972
|
|
|
997
973
|
# ============================================================================
|
|
998
974
|
# LOADING
|