informatica-python 1.9.2__py3-none-any.whl → 1.9.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- informatica_python/__init__.py +1 -1
- informatica_python/generators/mapping_gen.py +140 -58
- informatica_python/generators/workflow_gen.py +21 -4
- informatica_python/utils/expression_converter.py +320 -4
- {informatica_python-1.9.2.dist-info → informatica_python-1.9.3.dist-info}/METADATA +175 -47
- {informatica_python-1.9.2.dist-info → informatica_python-1.9.3.dist-info}/RECORD +10 -10
- {informatica_python-1.9.2.dist-info → informatica_python-1.9.3.dist-info}/WHEEL +0 -0
- {informatica_python-1.9.2.dist-info → informatica_python-1.9.3.dist-info}/entry_points.txt +0 -0
- {informatica_python-1.9.2.dist-info → informatica_python-1.9.3.dist-info}/licenses/LICENSE +0 -0
- {informatica_python-1.9.2.dist-info → informatica_python-1.9.3.dist-info}/top_level.txt +0 -0
informatica_python/__init__.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import os
|
|
1
2
|
from typing import List, Dict
|
|
2
3
|
from informatica_python.models import (
|
|
3
4
|
MappingDef, FolderDef, SourceDef, TargetDef,
|
|
@@ -228,7 +229,6 @@ def generate_mapping_code(mapping: MappingDef, folder: FolderDef,
|
|
|
228
229
|
lines.append("import logging")
|
|
229
230
|
lines.append("import numpy as np")
|
|
230
231
|
lines.append("import pandas as pd")
|
|
231
|
-
lines.append("from datetime import datetime")
|
|
232
232
|
lines.append("from helper_functions import *")
|
|
233
233
|
lines.append("")
|
|
234
234
|
lines.append("logger = logging.getLogger(__name__)")
|
|
@@ -375,7 +375,40 @@ def generate_mapping_code(mapping: MappingDef, folder: FolderDef,
|
|
|
375
375
|
lines.append(f" run_{_safe_name(mapping.name)}(config)")
|
|
376
376
|
lines.append("")
|
|
377
377
|
|
|
378
|
-
|
|
378
|
+
code = "\n".join(lines)
|
|
379
|
+
func_sig = f"def run_{_safe_name(mapping.name)}(config):"
|
|
380
|
+
sig_idx = code.index(func_sig) + len(func_sig)
|
|
381
|
+
docstring_end = code.index('"""', code.index('"""', sig_idx) + 3) + 3
|
|
382
|
+
before_body = code[:docstring_end]
|
|
383
|
+
after_docstring = code[docstring_end:]
|
|
384
|
+
main_sentinel = "\n\nif __name__"
|
|
385
|
+
body_end_idx = after_docstring.index(main_sentinel)
|
|
386
|
+
body = after_docstring[:body_end_idx]
|
|
387
|
+
rest = after_docstring[body_end_idx:]
|
|
388
|
+
body_lines = body.split("\n")
|
|
389
|
+
while body_lines and body_lines[0].strip() == "":
|
|
390
|
+
body_lines.pop(0)
|
|
391
|
+
while body_lines and body_lines[-1].strip() == "":
|
|
392
|
+
body_lines.pop()
|
|
393
|
+
wrapped = []
|
|
394
|
+
wrapped.append("")
|
|
395
|
+
wrapped.append(" try:")
|
|
396
|
+
prev_blank = False
|
|
397
|
+
for bl in body_lines:
|
|
398
|
+
if bl.strip() == "":
|
|
399
|
+
if not prev_blank:
|
|
400
|
+
wrapped.append("")
|
|
401
|
+
prev_blank = True
|
|
402
|
+
else:
|
|
403
|
+
wrapped.append(" " + bl)
|
|
404
|
+
prev_blank = False
|
|
405
|
+
wrapped.append("")
|
|
406
|
+
wrapped.append(" except Exception as _exc:")
|
|
407
|
+
wrapped.append(f" logger.error(f'Mapping {mapping.name} failed: {{_exc}}')")
|
|
408
|
+
wrapped.append(" raise")
|
|
409
|
+
wrapped.append("")
|
|
410
|
+
|
|
411
|
+
return before_body + "\n".join(wrapped) + rest
|
|
379
412
|
|
|
380
413
|
|
|
381
414
|
def _safe_name(name):
|
|
@@ -386,6 +419,22 @@ def _safe_name(name):
|
|
|
386
419
|
return safe.lower()
|
|
387
420
|
|
|
388
421
|
|
|
422
|
+
def _emit_sql_with_params(lines, sql_var_name, sql_text, indent=" "):
|
|
423
|
+
import re
|
|
424
|
+
params = re.findall(r'\$\$(\w+)', sql_text)
|
|
425
|
+
lines.append(f"{indent}{sql_var_name} = '''")
|
|
426
|
+
for sql_line in sql_text.strip().split("\n"):
|
|
427
|
+
lines.append(f"{indent}{sql_line}")
|
|
428
|
+
lines.append(f"{indent}'''")
|
|
429
|
+
if params:
|
|
430
|
+
seen = set()
|
|
431
|
+
for p in params:
|
|
432
|
+
if p in seen:
|
|
433
|
+
continue
|
|
434
|
+
seen.add(p)
|
|
435
|
+
lines.append(f"{indent}{sql_var_name} = {sql_var_name}.replace('$${p}', str(get_param(config, '{p}')))")
|
|
436
|
+
|
|
437
|
+
|
|
389
438
|
def _flatfile_config_dict(ff):
|
|
390
439
|
cfg = {}
|
|
391
440
|
if not ff:
|
|
@@ -504,7 +553,7 @@ def _emit_flatfile_write(lines, var_name, tgt_def, indent=" ", file_path_over
|
|
|
504
553
|
def _build_source_map(mapping, folder):
|
|
505
554
|
source_map = {}
|
|
506
555
|
for inst in mapping.instances:
|
|
507
|
-
if inst.type
|
|
556
|
+
if inst.type.upper() in ("SOURCE DEFINITION", "SOURCE"):
|
|
508
557
|
tx_name = inst.transformation_name or inst.name
|
|
509
558
|
for src in folder.sources:
|
|
510
559
|
if src.name == tx_name:
|
|
@@ -518,7 +567,7 @@ def _build_source_map(mapping, folder):
|
|
|
518
567
|
def _build_target_map(mapping, folder):
|
|
519
568
|
target_map = {}
|
|
520
569
|
for inst in mapping.instances:
|
|
521
|
-
if inst.type
|
|
570
|
+
if inst.type.upper() in ("TARGET DEFINITION", "TARGET"):
|
|
522
571
|
tx_name = inst.transformation_name or inst.name
|
|
523
572
|
for tgt in folder.targets:
|
|
524
573
|
if tgt.name == tx_name:
|
|
@@ -594,7 +643,9 @@ def _generate_source_qualifier(lines, sq, source_map, source_dfs, connector_grap
|
|
|
594
643
|
if not connected_sources and source_map:
|
|
595
644
|
connected_sources.add(next(iter(source_map)))
|
|
596
645
|
|
|
646
|
+
lines.append(f" # -------------------------------------------------------------------")
|
|
597
647
|
lines.append(f" # Source Qualifier: {sq.name}")
|
|
648
|
+
lines.append(f" # -------------------------------------------------------------------")
|
|
598
649
|
|
|
599
650
|
if pre_sql:
|
|
600
651
|
lines.append(f" # Pre-SQL")
|
|
@@ -606,10 +657,7 @@ def _generate_source_qualifier(lines, sq, source_map, source_dfs, connector_grap
|
|
|
606
657
|
if not connected_sources:
|
|
607
658
|
sq_src_name = sq.name[3:] if sq.name.upper().startswith("SQ_") else sq.name
|
|
608
659
|
if sql_override:
|
|
609
|
-
lines
|
|
610
|
-
for sql_line in sql_override.strip().split("\n"):
|
|
611
|
-
lines.append(f" {sql_line}")
|
|
612
|
-
lines.append(f" '''")
|
|
660
|
+
_emit_sql_with_params(lines, f"sql_{sq_safe}", sql_override)
|
|
613
661
|
lines.append(f" df_{sq_safe} = read_from_db(config, sql_{sq_safe}, 'default')")
|
|
614
662
|
else:
|
|
615
663
|
lines.append(f" df_{sq_safe} = read_file(config.get('sources', {{}}).get('{sq_src_name}', {{}}).get('file_path', '{sq_src_name}'),")
|
|
@@ -620,10 +668,7 @@ def _generate_source_qualifier(lines, sq, source_map, source_dfs, connector_grap
|
|
|
620
668
|
sq_override = (session_overrides or {}).get(sq.name, {}) or (session_overrides or {}).get(src_name, {})
|
|
621
669
|
conn_name = sq_override.get("connection_name") or (_safe_name(src_def.db_name) if src_def.db_name else "default")
|
|
622
670
|
|
|
623
|
-
lines
|
|
624
|
-
for sql_line in sql_override.strip().split("\n"):
|
|
625
|
-
lines.append(f" {sql_line}")
|
|
626
|
-
lines.append(f" '''")
|
|
671
|
+
_emit_sql_with_params(lines, f"sql_{sq_safe}", sql_override)
|
|
627
672
|
lines.append(f" df_{sq_safe} = read_from_db(config, sql_{sq_safe}, '{conn_name}')")
|
|
628
673
|
elif len(connected_sources) == 1:
|
|
629
674
|
src_name = next(iter(connected_sources))
|
|
@@ -656,10 +701,7 @@ def _generate_source_qualifier(lines, sq, source_map, source_dfs, connector_grap
|
|
|
656
701
|
lines.append(f" df_{sq_safe} = df_{_safe_name(next(iter(connected_sources)))}")
|
|
657
702
|
|
|
658
703
|
source_dfs[sq.name] = f"df_{sq_safe}"
|
|
659
|
-
lines.append(f"
|
|
660
|
-
lines.append(f" logger.info(f'Source {sq.name}: {{len(df_{sq_safe})}} rows read')")
|
|
661
|
-
lines.append(f" except Exception:")
|
|
662
|
-
lines.append(f" logger.info('Source {sq.name}: rows read (count unavailable)')")
|
|
704
|
+
lines.append(f" logger.info(f'Source {sq.name}: {{len(df_{sq_safe})}} rows read')")
|
|
663
705
|
|
|
664
706
|
if post_sql:
|
|
665
707
|
lines.append(f" # Post-SQL")
|
|
@@ -699,10 +741,10 @@ def _generate_transformation(lines, tx, connector_graph, source_dfs, transform_m
|
|
|
699
741
|
lines.append(f" # Input fields: {', '.join(in_fields[:10])}{' ...' if len(in_fields) > 10 else ''}")
|
|
700
742
|
lines.append(f" # Output fields: {', '.join(out_fields[:10])}{' ...' if len(out_fields) > 10 else ''}")
|
|
701
743
|
lines.append(f" # -------------------------------------------------------------------")
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
744
|
+
if input_df == "df_input":
|
|
745
|
+
lines.append(f" _input_rows_{tx_safe} = -1")
|
|
746
|
+
else:
|
|
747
|
+
lines.append(f" _input_rows_{tx_safe} = len({input_df})")
|
|
706
748
|
|
|
707
749
|
if tx_type == "expression":
|
|
708
750
|
_gen_expression_transform(lines, tx, tx_safe, input_df, source_dfs, data_lib)
|
|
@@ -742,28 +784,26 @@ def _generate_transformation(lines, tx, connector_graph, source_dfs, transform_m
|
|
|
742
784
|
lines.append(f" df_{tx_safe} = {copy_expr}")
|
|
743
785
|
source_dfs[tx.name] = f"df_{tx_safe}"
|
|
744
786
|
|
|
745
|
-
lines.append(f"
|
|
746
|
-
lines.append(f" _output_rows_{tx_safe} = len(df_{tx_safe})")
|
|
747
|
-
lines.append(f" except Exception:")
|
|
748
|
-
lines.append(f" _output_rows_{tx_safe} = -1")
|
|
787
|
+
lines.append(f" _output_rows_{tx_safe} = len(df_{tx_safe})")
|
|
749
788
|
lines.append(f" logger.info(f'{tx.name} ({tx.type}): {{_input_rows_{tx_safe}}} input rows -> {{_output_rows_{tx_safe}}} output rows')")
|
|
750
789
|
lines.append("")
|
|
751
790
|
|
|
752
791
|
|
|
753
792
|
def _gen_expression_transform(lines, tx, tx_safe, input_df, source_dfs, data_lib="pandas"):
|
|
754
|
-
|
|
755
|
-
lines.append(f" df_{tx_safe} = {copy_expr}")
|
|
756
|
-
has_expressions = False
|
|
793
|
+
active_fields = []
|
|
757
794
|
for fld in tx.fields:
|
|
758
|
-
if fld.expression and fld.expression.strip() and fld.expression.strip() != fld.name:
|
|
759
|
-
|
|
795
|
+
if fld.expression and fld.expression.strip() and fld.expression.strip().lower() != fld.name.lower():
|
|
796
|
+
active_fields.append(fld)
|
|
797
|
+
|
|
798
|
+
if active_fields:
|
|
799
|
+
copy_expr = lib_copy(data_lib, input_df)
|
|
800
|
+
lines.append(f" df_{tx_safe} = {copy_expr}")
|
|
801
|
+
for fld in active_fields:
|
|
760
802
|
expr_vec = convert_expression_vectorized(fld.expression, f"df_{tx_safe}")
|
|
761
803
|
lines.append(f" # {fld.name} = {fld.expression}")
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
lines.append(f" df_{tx_safe}['{fld.name}'] = {expr_vec}")
|
|
766
|
-
if not has_expressions:
|
|
804
|
+
lines.append(f" df_{tx_safe}['{fld.name}'] = {expr_vec}")
|
|
805
|
+
else:
|
|
806
|
+
lines.append(f" df_{tx_safe} = {input_df}")
|
|
767
807
|
lines.append(f" # Pass-through expression (no transformations)")
|
|
768
808
|
source_dfs[tx.name] = f"df_{tx_safe}"
|
|
769
809
|
|
|
@@ -842,7 +882,11 @@ def _gen_sorter_transform(lines, tx, tx_safe, input_df, source_dfs, data_lib="pa
|
|
|
842
882
|
sort_dirs = []
|
|
843
883
|
for fld in tx.fields:
|
|
844
884
|
sort_keys.append(fld.name)
|
|
845
|
-
|
|
885
|
+
direction = 'ASCENDING'
|
|
886
|
+
for fa in getattr(fld, 'field_attributes', []):
|
|
887
|
+
if isinstance(fa, dict) and fa.get('name', '').upper() == 'SORTDIRECTION':
|
|
888
|
+
direction = fa.get('value', 'ASCENDING') or 'ASCENDING'
|
|
889
|
+
sort_dirs.append(direction.upper() != 'DESCENDING')
|
|
846
890
|
if sort_keys:
|
|
847
891
|
sort_expr = lib_sort(data_lib, input_df, sort_keys, sort_dirs)
|
|
848
892
|
lines.append(f" df_{tx_safe} = {sort_expr}")
|
|
@@ -881,13 +925,23 @@ def _gen_joiner_transform(lines, tx, tx_safe, input_df, input_sources, source_df
|
|
|
881
925
|
master_src = None
|
|
882
926
|
detail_src = None
|
|
883
927
|
input_conns = connector_graph.get("to", {}).get(tx.name, []) if connector_graph else []
|
|
928
|
+
|
|
929
|
+
port_to_col = {}
|
|
930
|
+
master_fields_lower = {f.lower() for f in master_fields}
|
|
931
|
+
detail_fields_lower = {f.lower() for f in detail_fields}
|
|
884
932
|
for conn in input_conns:
|
|
885
933
|
to_field = conn.to_field
|
|
886
|
-
|
|
934
|
+
port_to_col[to_field] = conn.from_field
|
|
935
|
+
port_to_col[to_field.lower()] = conn.from_field
|
|
936
|
+
if to_field in master_fields or to_field.lower() in master_fields_lower:
|
|
887
937
|
master_src = conn.from_instance
|
|
888
|
-
elif to_field in detail_fields:
|
|
938
|
+
elif to_field in detail_fields or to_field.lower() in detail_fields_lower:
|
|
889
939
|
detail_src = conn.from_instance
|
|
890
940
|
|
|
941
|
+
if left_keys and right_keys and port_to_col:
|
|
942
|
+
left_keys = [port_to_col.get(k, port_to_col.get(k.lower(), k)) for k in left_keys]
|
|
943
|
+
right_keys = [port_to_col.get(k, port_to_col.get(k.lower(), k)) for k in right_keys]
|
|
944
|
+
|
|
891
945
|
src_list = list(input_sources)
|
|
892
946
|
if not master_src and not detail_src and len(src_list) >= 2:
|
|
893
947
|
master_src = src_list[0]
|
|
@@ -960,10 +1014,7 @@ def _gen_lookup_transform(lines, tx, tx_safe, input_df, source_dfs, data_lib="pa
|
|
|
960
1014
|
|
|
961
1015
|
lines.append(f" # Lookup: {lookup_table or tx.name}")
|
|
962
1016
|
if lookup_sql:
|
|
963
|
-
lines
|
|
964
|
-
for sql_line in lookup_sql.strip().split("\n"):
|
|
965
|
-
lines.append(f" {sql_line}")
|
|
966
|
-
lines.append(f" '''")
|
|
1017
|
+
_emit_sql_with_params(lines, f"lkp_sql_{tx_safe}", lookup_sql)
|
|
967
1018
|
lines.append(f" df_lkp_{tx_safe} = read_from_db(config, lkp_sql_{tx_safe}, 'default')")
|
|
968
1019
|
elif lookup_table:
|
|
969
1020
|
lines.append(f" df_lkp_{tx_safe} = read_from_db(config, 'SELECT * FROM {lookup_table}', 'default')")
|
|
@@ -998,7 +1049,11 @@ def _gen_lookup_transform(lines, tx, tx_safe, input_df, source_dfs, data_lib="pa
|
|
|
998
1049
|
lines.append(f" if _lkp_drop:")
|
|
999
1050
|
lines.append(f" df_{tx_safe} = df_{tx_safe}.drop(columns=_lkp_drop)")
|
|
1000
1051
|
|
|
1052
|
+
seen_output_cols = set()
|
|
1001
1053
|
for rf in all_output_fields:
|
|
1054
|
+
if rf.name in seen_output_cols:
|
|
1055
|
+
continue
|
|
1056
|
+
seen_output_cols.add(rf.name)
|
|
1002
1057
|
lines.append(f" if '{rf.name}' not in df_{tx_safe}.columns:")
|
|
1003
1058
|
lines.append(f" df_{tx_safe}['{rf.name}'] = None")
|
|
1004
1059
|
if rf.default_value:
|
|
@@ -1073,14 +1128,19 @@ def _gen_update_strategy(lines, tx, tx_safe, input_df, source_dfs):
|
|
|
1073
1128
|
for dd_const, label in dd_map.items():
|
|
1074
1129
|
expr = expr.replace(dd_const, f"'{label}'")
|
|
1075
1130
|
try:
|
|
1076
|
-
|
|
1131
|
+
expr_vec = convert_expression_vectorized(expr, f"df_{tx_safe}")
|
|
1077
1132
|
lines.append(f" # Original expression: {strategy_expr}")
|
|
1078
|
-
lines.append(f"
|
|
1079
|
-
lines.append(f" return {converted}")
|
|
1080
|
-
lines.append(f" df_{tx_safe}['_update_strategy'] = df_{tx_safe}.apply(_resolve_strategy, axis=1)")
|
|
1133
|
+
lines.append(f" df_{tx_safe}['_update_strategy'] = {expr_vec}")
|
|
1081
1134
|
except Exception:
|
|
1082
|
-
|
|
1083
|
-
|
|
1135
|
+
try:
|
|
1136
|
+
converted = convert_expression(expr)
|
|
1137
|
+
lines.append(f" # Original expression: {strategy_expr}")
|
|
1138
|
+
lines.append(f" def _resolve_strategy(row):")
|
|
1139
|
+
lines.append(f" return {converted}")
|
|
1140
|
+
lines.append(f" df_{tx_safe}['_update_strategy'] = df_{tx_safe}.apply(_resolve_strategy, axis=1)")
|
|
1141
|
+
except Exception:
|
|
1142
|
+
lines.append(f" # Could not parse strategy expression: {strategy_expr}")
|
|
1143
|
+
lines.append(f" df_{tx_safe}['_update_strategy'] = 'INSERT'")
|
|
1084
1144
|
source_dfs[tx.name] = f"df_{tx_safe}"
|
|
1085
1145
|
|
|
1086
1146
|
|
|
@@ -1343,7 +1403,7 @@ def _gen_sql_transform(lines, tx, tx_safe, input_df, source_dfs):
|
|
|
1343
1403
|
sql_query = convert_sql_expression(attr.value)
|
|
1344
1404
|
lines.append(f" # SQL Transformation: {tx.name}")
|
|
1345
1405
|
if sql_query:
|
|
1346
|
-
lines
|
|
1406
|
+
_emit_sql_with_params(lines, f"sql_{tx_safe}", sql_query)
|
|
1347
1407
|
lines.append(f" df_{tx_safe} = read_from_db(config, sql_{tx_safe}, 'default')")
|
|
1348
1408
|
else:
|
|
1349
1409
|
lines.append(f" df_{tx_safe} = {input_df}.copy()")
|
|
@@ -1371,12 +1431,21 @@ def _generate_target_write(lines, tgt_name, tgt_def, connector_graph, source_dfs
|
|
|
1371
1431
|
for c in to_conns:
|
|
1372
1432
|
col_mapping[c.to_field] = c.from_field
|
|
1373
1433
|
|
|
1434
|
+
lines.append(f" # -------------------------------------------------------------------")
|
|
1374
1435
|
lines.append(f" # Write to target: {tgt_def.name}")
|
|
1436
|
+
if tgt_def.database_type:
|
|
1437
|
+
lines.append(f" # Database type: {tgt_def.database_type}")
|
|
1438
|
+
target_field_names = [f.name for f in tgt_def.fields] if tgt_def.fields else []
|
|
1439
|
+
if target_field_names:
|
|
1440
|
+
lines.append(f" # Target fields: {', '.join(target_field_names[:10])}{' ...' if len(target_field_names) > 10 else ''}")
|
|
1441
|
+
lines.append(f" # -------------------------------------------------------------------")
|
|
1375
1442
|
if col_mapping:
|
|
1443
|
+
lines.append(f" # Column mapping: source -> target")
|
|
1376
1444
|
lines.append(f" target_columns_{tgt_safe} = {col_mapping}")
|
|
1377
1445
|
lines.append(f" df_target_{tgt_safe} = {input_df}.rename(columns={{v: k for k, v in target_columns_{tgt_safe}.items()}})")
|
|
1378
1446
|
target_cols = [f.name for f in tgt_def.fields] if tgt_def.fields else None
|
|
1379
1447
|
if target_cols:
|
|
1448
|
+
lines.append(f" # Select only target columns")
|
|
1380
1449
|
lines.append(f" available_cols = [c for c in {target_cols} if c in df_target_{tgt_safe}.columns]")
|
|
1381
1450
|
lines.append(f" if '_update_strategy' in df_target_{tgt_safe}.columns and '_update_strategy' not in available_cols:")
|
|
1382
1451
|
lines.append(f" available_cols.append('_update_strategy')")
|
|
@@ -1389,17 +1458,37 @@ def _generate_target_write(lines, tgt_name, tgt_def, connector_graph, source_dfs
|
|
|
1389
1458
|
tgt_override = (session_overrides or {}).get(tgt_name, {})
|
|
1390
1459
|
tgt_conn = tgt_override.get("connection_name")
|
|
1391
1460
|
|
|
1461
|
+
_FILE_EXTENSIONS = {".csv", ".dat", ".txt", ".xml", ".json", ".parquet", ".xlsx", ".xls", ".tsv", ".avro"}
|
|
1462
|
+
_is_file_target = bool(
|
|
1463
|
+
tgt_override.get("output_file_directory") or tgt_override.get("output_filename")
|
|
1464
|
+
or tgt_def.flatfile
|
|
1465
|
+
or (tgt_def.database_type and tgt_def.database_type == "Flat File")
|
|
1466
|
+
or os.path.splitext(tgt_def.name)[1].lower() in _FILE_EXTENSIONS
|
|
1467
|
+
)
|
|
1468
|
+
_is_db_target = bool(
|
|
1469
|
+
tgt_def.database_type and tgt_def.database_type != "Flat File"
|
|
1470
|
+
)
|
|
1471
|
+
|
|
1392
1472
|
if tgt_override.get("output_file_directory") or tgt_override.get("output_filename"):
|
|
1393
1473
|
out_dir = tgt_override.get("output_file_directory", ".")
|
|
1394
1474
|
out_file = tgt_override.get("output_filename", tgt_def.name)
|
|
1475
|
+
lines.append(f" # Write to file (session override path)")
|
|
1395
1476
|
lines.append(f" _tgt_path_{tgt_safe} = config.get('targets', {{}}).get('{tgt_def.name}', {{}}).get('file_path',")
|
|
1396
1477
|
lines.append(f" os.path.join('{out_dir}', '{out_file}'))")
|
|
1397
1478
|
if tgt_def.flatfile:
|
|
1398
1479
|
_emit_flatfile_write(lines, tgt_safe, tgt_def, file_path_override=True)
|
|
1399
1480
|
else:
|
|
1400
1481
|
lines.append(f" write_file(df_target_{tgt_safe}, _tgt_path_{tgt_safe}, config.get('targets', {{}}).get('{tgt_def.name}', {{}}))")
|
|
1401
|
-
elif tgt_def.
|
|
1482
|
+
elif tgt_def.flatfile:
|
|
1483
|
+
lines.append(f" # Write to flat file")
|
|
1484
|
+
_emit_flatfile_write(lines, tgt_safe, tgt_def)
|
|
1485
|
+
elif _is_file_target and not _is_db_target:
|
|
1486
|
+
lines.append(f" # Write to file")
|
|
1487
|
+
lines.append(f" write_file(df_target_{tgt_safe}, config.get('targets', {{}}).get('{tgt_def.name}', {{}}).get('file_path', '{tgt_def.name}'),")
|
|
1488
|
+
lines.append(f" config.get('targets', {{}}).get('{tgt_def.name}', {{}}))")
|
|
1489
|
+
else:
|
|
1402
1490
|
conn_label = tgt_conn or "target"
|
|
1491
|
+
lines.append(f" # Write to database table")
|
|
1403
1492
|
lines.append(f" if '_update_strategy' in df_target_{tgt_safe}.columns:")
|
|
1404
1493
|
key_cols = [f.name for f in tgt_def.fields if getattr(f, 'keytype', 'NOT A KEY') == 'PRIMARY KEY'] or None
|
|
1405
1494
|
if key_cols:
|
|
@@ -1408,15 +1497,8 @@ def _generate_target_write(lines, tgt_name, tgt_def, connector_graph, source_dfs
|
|
|
1408
1497
|
lines.append(f" write_with_update_strategy(config, df_target_{tgt_safe}, '{tgt_def.name}', '{conn_label}')")
|
|
1409
1498
|
lines.append(f" else:")
|
|
1410
1499
|
lines.append(f" write_to_db(config, df_target_{tgt_safe}, '{tgt_def.name}', '{conn_label}')")
|
|
1411
|
-
|
|
1412
|
-
|
|
1413
|
-
else:
|
|
1414
|
-
lines.append(f" write_file(df_target_{tgt_safe}, config.get('targets', {{}}).get('{tgt_def.name}', {{}}).get('file_path', '{tgt_def.name}'),")
|
|
1415
|
-
lines.append(f" config.get('targets', {{}}).get('{tgt_def.name}', {{}}))")
|
|
1416
|
-
lines.append(f" try:")
|
|
1417
|
-
lines.append(f" logger.info(f'Target {tgt_def.name}: {{len(df_target_{tgt_safe})}} rows written')")
|
|
1418
|
-
lines.append(f" except Exception:")
|
|
1419
|
-
lines.append(f" logger.info('Target {tgt_def.name}: rows written (count unavailable)')")
|
|
1500
|
+
lines.append(f" logger.info(f'Target {tgt_def.name}: {{len(df_target_{tgt_safe})}} rows written')")
|
|
1501
|
+
lines.append("")
|
|
1420
1502
|
|
|
1421
1503
|
|
|
1422
1504
|
CAST_MAP = {
|
|
@@ -179,24 +179,41 @@ def _generate_workflow_function(lines, wf: WorkflowDef, folder: FolderDef, workl
|
|
|
179
179
|
def _emit_task_code(lines, task, mapping_name_map, session_to_mapping, wf, worklets):
|
|
180
180
|
task_safe = _safe_name(task.name)
|
|
181
181
|
|
|
182
|
-
if task.task_type
|
|
182
|
+
if task.task_type in ("Start Task", "Start"):
|
|
183
183
|
lines.append(f" # Start Task: {task.name}")
|
|
184
184
|
lines.append(f" logger.info('Workflow started')")
|
|
185
185
|
lines.append("")
|
|
186
186
|
return
|
|
187
187
|
|
|
188
188
|
if task.task_type == "Session":
|
|
189
|
-
|
|
189
|
+
session_key = task.task_name or task.name
|
|
190
|
+
mapping_name = session_to_mapping.get(session_key, "")
|
|
190
191
|
run_func = mapping_name_map.get(mapping_name, None)
|
|
191
192
|
|
|
193
|
+
if not run_func:
|
|
194
|
+
best_match = None
|
|
195
|
+
best_len = 0
|
|
196
|
+
session_lower = session_key.lower()
|
|
197
|
+
for mname, rfunc in mapping_name_map.items():
|
|
198
|
+
safe_mname = _safe_name(mname)
|
|
199
|
+
if session_lower.endswith(safe_mname) and len(safe_mname) > best_len:
|
|
200
|
+
best_match = rfunc
|
|
201
|
+
best_len = len(safe_mname)
|
|
202
|
+
if not best_match:
|
|
203
|
+
for mname, rfunc in mapping_name_map.items():
|
|
204
|
+
safe_mname = _safe_name(mname)
|
|
205
|
+
if safe_mname in session_lower and len(safe_mname) > best_len:
|
|
206
|
+
best_match = rfunc
|
|
207
|
+
best_len = len(safe_mname)
|
|
208
|
+
run_func = best_match
|
|
209
|
+
|
|
192
210
|
lines.append(f" # Session: {task.name}")
|
|
193
211
|
lines.append(f" try:")
|
|
194
212
|
lines.append(f" logger.info('Executing session: {task.name}')")
|
|
195
213
|
if run_func:
|
|
196
214
|
lines.append(f" {run_func}(config)")
|
|
197
215
|
else:
|
|
198
|
-
lines.append(f"
|
|
199
|
-
lines.append(f" logger.warning('Session {task.name} has no mapped function')")
|
|
216
|
+
lines.append(f" logger.warning('Session {task.name}: no mapped function found — verify mapping linkage')")
|
|
200
217
|
lines.append(f" except Exception as e:")
|
|
201
218
|
lines.append(f" logger.error(f'Session {task.name} failed: {{e}}')")
|
|
202
219
|
|
|
@@ -297,12 +297,132 @@ def _resolve_char_arg(arg, df_var):
|
|
|
297
297
|
return arg
|
|
298
298
|
|
|
299
299
|
|
|
300
|
+
def _strip_inline_comments(text):
|
|
301
|
+
result = []
|
|
302
|
+
i = 0
|
|
303
|
+
in_string = False
|
|
304
|
+
str_char = None
|
|
305
|
+
depth = 0
|
|
306
|
+
while i < len(text):
|
|
307
|
+
ch = text[i]
|
|
308
|
+
if in_string:
|
|
309
|
+
result.append(ch)
|
|
310
|
+
if ch == str_char and (i == 0 or text[i - 1] != '\\'):
|
|
311
|
+
in_string = False
|
|
312
|
+
elif ch in ("'", '"'):
|
|
313
|
+
in_string = True
|
|
314
|
+
str_char = ch
|
|
315
|
+
result.append(ch)
|
|
316
|
+
elif ch == '(':
|
|
317
|
+
depth += 1
|
|
318
|
+
result.append(ch)
|
|
319
|
+
elif ch == ')':
|
|
320
|
+
depth -= 1
|
|
321
|
+
result.append(ch)
|
|
322
|
+
elif ch == '-' and i + 1 < len(text) and text[i + 1] == '-' and depth == 0:
|
|
323
|
+
break
|
|
324
|
+
else:
|
|
325
|
+
result.append(ch)
|
|
326
|
+
i += 1
|
|
327
|
+
return ''.join(result).strip()
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def _trunc_vec(va):
|
|
331
|
+
if len(va) == 1:
|
|
332
|
+
return f'np.trunc({va[0]})'
|
|
333
|
+
raw2 = va[1].strip()
|
|
334
|
+
if re.match(r'^-?\d+$', raw2):
|
|
335
|
+
n = int(raw2)
|
|
336
|
+
if n == 0:
|
|
337
|
+
return f'np.trunc({va[0]})'
|
|
338
|
+
return f'(np.trunc({va[0]} * 10**{n}) / 10**{n})'
|
|
339
|
+
fmt = raw2.strip("'\"").upper()
|
|
340
|
+
if fmt in ('DD', 'D'):
|
|
341
|
+
return f'{va[0]}.dt.floor("D")'
|
|
342
|
+
elif fmt in ('MM', 'MON', 'MONTH'):
|
|
343
|
+
return f'{va[0]}.dt.to_period("M").dt.to_timestamp()'
|
|
344
|
+
elif fmt in ('YY', 'YYYY', 'YEAR'):
|
|
345
|
+
return f'{va[0]}.dt.to_period("Y").dt.to_timestamp()'
|
|
346
|
+
elif fmt in ('HH', 'HH24'):
|
|
347
|
+
return f'{va[0]}.dt.floor("H")'
|
|
348
|
+
return f'{va[0]}.dt.floor("{fmt}")'
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
# Builders for functions that are emitted as inline pandas/NumPy expressions.
# Keys are upper-case function names. Each value is a callable that receives
# ``va`` (a list of argument expressions, already rendered as Python source
# strings) and returns the Python source for the whole call. Builders index
# ``va[0]`` unconditionally, so they must be called with at least one argument.
_VEC_INLINE = {
    # Numeric conversions: values that cannot be parsed become NaN ("coerce").
    # TO_INTEGER additionally replaces those NaN values with 0 before casting.
    'TO_INTEGER': lambda va: f'pd.to_numeric({va[0]}, errors="coerce").fillna(0).astype(int)',
    'TO_BIGINT': lambda va: f'pd.to_numeric({va[0]}, errors="coerce").astype("Int64")',
    'TO_FLOAT': lambda va: f'pd.to_numeric({va[0]}, errors="coerce")',
    'TO_DECIMAL': lambda va: f'pd.to_numeric({va[0]}, errors="coerce")',
    'LENGTH': lambda va: f'{va[0]}.str.len()',
    # Two-argument functions fall back to a reduced form when the second
    # argument is missing.
    'ROUND': lambda va: (f'np.round({va[0]}, {va[1]})' if len(va) >= 2 else f'np.round({va[0]})'),
    'ABS': lambda va: f'np.abs({va[0]})',
    'CEIL': lambda va: f'np.ceil({va[0]})',
    'CEILING': lambda va: f'np.ceil({va[0]})',
    'FLOOR': lambda va: f'np.floor({va[0]})',
    'MOD': lambda va: (f'({va[0]} % {va[1]})' if len(va) >= 2 else va[0]),
    'POWER': lambda va: (f'np.power({va[0]}, {va[1]})' if len(va) >= 2 else va[0]),
    'SQRT': lambda va: f'np.sqrt({va[0]})',
    # LOG is emitted as base-10, LN as the natural logarithm.
    'LOG': lambda va: f'np.log10({va[0]})',
    'LN': lambda va: f'np.log({va[0]})',
    'EXP': lambda va: f'np.exp({va[0]})',
    'SIGN': lambda va: f'np.sign({va[0]})',
    'NVL': lambda va: (f'{va[0]}.fillna({va[1]})' if len(va) >= 2 else va[0]),
    'ISNULL': lambda va: f'{va[0]}.isna()',
    # IIF with only two arguments uses None as the "else" value.
    'IIF': lambda va: (f'np.where({va[0]}, {va[1]}, {va[2]})' if len(va) >= 3
                       else (f'np.where({va[0]}, {va[1]}, None)' if len(va) >= 2 else va[0])),
    'IS_NUMBER': lambda va: f'pd.to_numeric({va[0]}, errors="coerce").notna()',
    'IS_SPACES': lambda va: f'{va[0]}.str.strip().eq("")',
    'UPPER': lambda va: f'{va[0]}.str.upper()',
    'LOWER': lambda va: f'{va[0]}.str.lower()',
    # TRUNC needs argument inspection, so it is handled by a named function.
    'TRUNC': _trunc_vec,
}
|
|
379
|
+
|
|
380
|
+
# Every function name known to either _VEC_INLINE or INFA_FUNC_MAP, without
# duplicates, ordered longest name first (the sort key is the negated length).
# NOTE(review): presumably longest-first so a longer name is processed before
# a shorter name it contains -- confirm against _find_func_call's matching.
_VEC_FUNC_ORDER = sorted(
    set(list(_VEC_INLINE.keys()) + list(INFA_FUNC_MAP.keys())),
    key=lambda x: -len(x),
)
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
def _convert_remaining_funcs(text, df_var):
    """Rewrite known function calls found anywhere in ``text`` into Python source.

    Function names are tried in ``_VEC_FUNC_ORDER`` order. For each name the
    text is scanned left to right; every call found is replaced either by the
    matching ``_VEC_INLINE`` builder's output or by a call to the mapped
    ``INFA_FUNC_MAP`` function, with each argument converted through
    ``_vec_recursive`` first. A match directly preceded by ``.`` is left
    untouched and scanning resumes after it.

    Args:
        text: Expression text to convert.
        df_var: Passed through to ``_vec_recursive`` for every argument.

    Returns:
        The text with the recognised calls replaced.
    """
    out = text
    for func_name in _VEC_FUNC_ORDER:
        cursor = 0
        # At most ten scan steps per function name; skipped matches count too.
        for _ in range(10):
            # NOTE(review): _find_func_call is assumed to return a falsy value
            # or (start, end, args) relative to the slice it is given -- confirm.
            match = _find_func_call(out[cursor:], func_name)
            if not match:
                break
            start, end, call_args = match
            start += cursor
            end += cursor

            preceded_by_dot = start > 0 and out[start - 1] == '.'
            if preceded_by_dot:
                cursor = end
                continue

            vec_args = [_vec_recursive(arg.strip(), df_var) for arg in call_args] if call_args else []
            if func_name in _VEC_INLINE and vec_args:
                replacement = _VEC_INLINE[func_name](vec_args)
            elif func_name in INFA_FUNC_MAP:
                replacement = f'{INFA_FUNC_MAP[func_name]}({", ".join(vec_args)})'
            else:
                # Nothing can be emitted for this name; stop scanning for it.
                break

            out = out[:start] + replacement + out[end:]
            # Resume after the inserted text so it is not scanned again.
            cursor = start + len(replacement)
    return out
|
|
413
|
+
|
|
414
|
+
|
|
300
415
|
def _vec_recursive(expr, df_var):
|
|
301
416
|
if not expr or not expr.strip():
|
|
302
417
|
return "None"
|
|
303
418
|
|
|
304
419
|
cleaned = expr.strip()
|
|
305
420
|
|
|
421
|
+
if '--' in cleaned:
|
|
422
|
+
cleaned = _strip_inline_comments(cleaned)
|
|
423
|
+
if not cleaned:
|
|
424
|
+
return "None"
|
|
425
|
+
|
|
306
426
|
if re.match(r'^-?\d+(\.\d+)?$', cleaned):
|
|
307
427
|
return cleaned
|
|
308
428
|
|
|
@@ -317,6 +437,12 @@ def _vec_recursive(expr, df_var):
|
|
|
317
437
|
return 'True'
|
|
318
438
|
if upper == 'FALSE':
|
|
319
439
|
return 'False'
|
|
440
|
+
if upper == 'SYSDATE' or upper == 'SYSTIMESTAMP':
|
|
441
|
+
return 'pd.Timestamp.now()'
|
|
442
|
+
if re.match(r'^SYSTIMESTAMP\s*\(\s*\)$', cleaned, re.IGNORECASE):
|
|
443
|
+
return 'pd.Timestamp.now()'
|
|
444
|
+
if re.match(r'^SYSDATE\s*\(\s*\)$', cleaned, re.IGNORECASE):
|
|
445
|
+
return 'pd.Timestamp.now()'
|
|
320
446
|
|
|
321
447
|
if re.match(r'^[A-Za-z_][A-Za-z0-9_]*$', cleaned):
|
|
322
448
|
return f'{df_var}["{cleaned}"]'
|
|
@@ -338,6 +464,22 @@ def _vec_recursive(expr, df_var):
|
|
|
338
464
|
vec_args = ', '.join(_vec_recursive(a, df_var) for a in args)
|
|
339
465
|
return f'lookup_func("{lkp_name}", {vec_args})'
|
|
340
466
|
|
|
467
|
+
unconnected_lkp = re.match(r'^:(\w+)\.(\w+)\s*\(', cleaned, re.IGNORECASE)
|
|
468
|
+
if unconnected_lkp:
|
|
469
|
+
port_group = unconnected_lkp.group(1)
|
|
470
|
+
lkp_name = unconnected_lkp.group(2)
|
|
471
|
+
paren_start = cleaned.index('(')
|
|
472
|
+
paren_end = _find_matching_paren(cleaned, paren_start)
|
|
473
|
+
if paren_end != -1:
|
|
474
|
+
rest = cleaned[paren_end + 1:].strip()
|
|
475
|
+
inner = cleaned[paren_start + 1:paren_end]
|
|
476
|
+
args = _split_args(inner)
|
|
477
|
+
vec_args = ', '.join(_vec_recursive(a, df_var) for a in args)
|
|
478
|
+
result = f'lookup_func("{lkp_name}", {vec_args})'
|
|
479
|
+
if rest:
|
|
480
|
+
result = result + ' ' + _vec_recursive(rest, df_var)
|
|
481
|
+
return result
|
|
482
|
+
|
|
341
483
|
iif_result = _find_func_call(cleaned, 'IIF')
|
|
342
484
|
if iif_result and iif_result[0] == 0 and iif_result[1] == len(cleaned):
|
|
343
485
|
_, _, args = iif_result
|
|
@@ -346,6 +488,43 @@ def _vec_recursive(expr, df_var):
|
|
|
346
488
|
true_val = _vec_recursive(args[1], df_var)
|
|
347
489
|
false_val = _vec_recursive(args[2], df_var)
|
|
348
490
|
return f"np.where({cond}, {true_val}, {false_val})"
|
|
491
|
+
elif len(args) == 2:
|
|
492
|
+
cond = _vectorize_condition(args[0], df_var)
|
|
493
|
+
true_val = _vec_recursive(args[1], df_var)
|
|
494
|
+
return f"np.where({cond}, {true_val}, None)"
|
|
495
|
+
|
|
496
|
+
decode_result = _find_func_call(cleaned, 'DECODE')
|
|
497
|
+
if decode_result and decode_result[0] == 0 and decode_result[1] == len(cleaned):
|
|
498
|
+
_, _, args = decode_result
|
|
499
|
+
if len(args) >= 3:
|
|
500
|
+
first_arg = args[0].strip().upper()
|
|
501
|
+
if first_arg == 'TRUE':
|
|
502
|
+
pairs = args[1:]
|
|
503
|
+
if len(pairs) % 2 == 1:
|
|
504
|
+
default_val = _vec_recursive(pairs[-1], df_var)
|
|
505
|
+
pairs = pairs[:-1]
|
|
506
|
+
else:
|
|
507
|
+
default_val = 'None'
|
|
508
|
+
result = default_val
|
|
509
|
+
for i in range(len(pairs) - 2, -1, -2):
|
|
510
|
+
cond = _vectorize_condition(pairs[i], df_var)
|
|
511
|
+
val = _vec_recursive(pairs[i + 1], df_var)
|
|
512
|
+
result = f"np.where({cond}, {val}, {result})"
|
|
513
|
+
return result
|
|
514
|
+
else:
|
|
515
|
+
switch_val = _vec_recursive(args[0], df_var)
|
|
516
|
+
pairs = args[1:]
|
|
517
|
+
if len(pairs) % 2 == 1:
|
|
518
|
+
default_val = _vec_recursive(pairs[-1], df_var)
|
|
519
|
+
pairs = pairs[:-1]
|
|
520
|
+
else:
|
|
521
|
+
default_val = 'None'
|
|
522
|
+
result = default_val
|
|
523
|
+
for i in range(len(pairs) - 2, -1, -2):
|
|
524
|
+
case_val = _vec_recursive(pairs[i], df_var)
|
|
525
|
+
then_val = _vec_recursive(pairs[i + 1], df_var)
|
|
526
|
+
result = f"np.where({switch_val} == {case_val}, {then_val}, {result})"
|
|
527
|
+
return result
|
|
349
528
|
|
|
350
529
|
nvl_result = _find_func_call(cleaned, 'NVL')
|
|
351
530
|
if nvl_result and nvl_result[0] == 0 and nvl_result[1] == len(cleaned):
|
|
@@ -581,6 +760,99 @@ def _vec_recursive(expr, df_var):
|
|
|
581
760
|
parts.append(f'{v}.astype(str)')
|
|
582
761
|
return ' + '.join(parts)
|
|
583
762
|
|
|
763
|
+
reverse_result = _find_func_call(cleaned, 'REVERSE')
|
|
764
|
+
if reverse_result and reverse_result[0] == 0 and reverse_result[1] == len(cleaned):
|
|
765
|
+
_, _, args = reverse_result
|
|
766
|
+
if len(args) >= 1:
|
|
767
|
+
inner_val = _vec_recursive(args[0], df_var)
|
|
768
|
+
return f'{inner_val}.str[::-1]'
|
|
769
|
+
|
|
770
|
+
is_spaces_result = _find_func_call(cleaned, 'IS_SPACES')
|
|
771
|
+
if is_spaces_result and is_spaces_result[0] == 0 and is_spaces_result[1] == len(cleaned):
|
|
772
|
+
_, _, args = is_spaces_result
|
|
773
|
+
if len(args) >= 1:
|
|
774
|
+
inner_val = _vec_recursive(args[0], df_var)
|
|
775
|
+
return f'{inner_val}.str.strip().eq("")'
|
|
776
|
+
|
|
777
|
+
is_number_result = _find_func_call(cleaned, 'IS_NUMBER')
|
|
778
|
+
if is_number_result and is_number_result[0] == 0 and is_number_result[1] == len(cleaned):
|
|
779
|
+
_, _, args = is_number_result
|
|
780
|
+
if len(args) >= 1:
|
|
781
|
+
inner_val = _vec_recursive(args[0], df_var)
|
|
782
|
+
return f'pd.to_numeric({inner_val}, errors="coerce").notna()'
|
|
783
|
+
|
|
784
|
+
trunc_result = _find_func_call(cleaned, 'TRUNC')
|
|
785
|
+
if trunc_result and trunc_result[0] == 0 and trunc_result[1] == len(cleaned):
|
|
786
|
+
_, _, args = trunc_result
|
|
787
|
+
if len(args) >= 1:
|
|
788
|
+
field_val = _vec_recursive(args[0], df_var)
|
|
789
|
+
if len(args) >= 2:
|
|
790
|
+
raw_arg2 = args[1].strip().strip("'\"")
|
|
791
|
+
try:
|
|
792
|
+
precision = int(raw_arg2)
|
|
793
|
+
if precision == 0:
|
|
794
|
+
return f'np.trunc({field_val})'
|
|
795
|
+
return f'(np.trunc({field_val} * 10**{precision}) / 10**{precision})'
|
|
796
|
+
except ValueError:
|
|
797
|
+
pass
|
|
798
|
+
fmt = raw_arg2.upper()
|
|
799
|
+
if fmt in ('DD', 'D'):
|
|
800
|
+
return f'{field_val}.dt.floor("D")'
|
|
801
|
+
elif fmt in ('MM', 'MON', 'MONTH'):
|
|
802
|
+
return f'{field_val}.dt.to_period("M").dt.to_timestamp()'
|
|
803
|
+
elif fmt in ('YY', 'YYYY', 'YEAR'):
|
|
804
|
+
return f'{field_val}.dt.to_period("Y").dt.to_timestamp()'
|
|
805
|
+
elif fmt in ('HH', 'HH24'):
|
|
806
|
+
return f'{field_val}.dt.floor("H")'
|
|
807
|
+
return f'{field_val}.dt.floor("{fmt}")'
|
|
808
|
+
return f'np.trunc({field_val})'
|
|
809
|
+
|
|
810
|
+
add_to_date_result = _find_func_call(cleaned, 'ADD_TO_DATE')
|
|
811
|
+
if add_to_date_result and add_to_date_result[0] == 0 and add_to_date_result[1] == len(cleaned):
|
|
812
|
+
_, _, args = add_to_date_result
|
|
813
|
+
if len(args) >= 3:
|
|
814
|
+
date_val = _vec_recursive(args[0], df_var)
|
|
815
|
+
part = args[1].strip().strip("'\"").upper()
|
|
816
|
+
amount = _vec_recursive(args[2], df_var)
|
|
817
|
+
if part in ('YY', 'YYYY', 'YEAR'):
|
|
818
|
+
return f'{date_val} + pd.DateOffset(years={amount})'
|
|
819
|
+
elif part in ('MM', 'MON', 'MONTH'):
|
|
820
|
+
return f'{date_val} + pd.DateOffset(months={amount})'
|
|
821
|
+
else:
|
|
822
|
+
unit_map = {
|
|
823
|
+
'DD': 'D', 'DAY': 'D', 'D': 'D', 'DDD': 'D',
|
|
824
|
+
'HH': 'h', 'HH24': 'h', 'HOUR': 'h',
|
|
825
|
+
'MI': 'min', 'MIN': 'min', 'MINUTE': 'min',
|
|
826
|
+
'SS': 's', 'SEC': 's', 'SECOND': 's',
|
|
827
|
+
}
|
|
828
|
+
pd_unit = unit_map.get(part, 'D')
|
|
829
|
+
return f'{date_val} + pd.to_timedelta({amount}, unit="{pd_unit}")'
|
|
830
|
+
|
|
831
|
+
date_diff_result = _find_func_call(cleaned, 'DATE_DIFF')
|
|
832
|
+
if date_diff_result and date_diff_result[0] == 0 and date_diff_result[1] == len(cleaned):
|
|
833
|
+
_, _, args = date_diff_result
|
|
834
|
+
if len(args) >= 3:
|
|
835
|
+
date1 = _vec_recursive(args[0], df_var)
|
|
836
|
+
date2 = _vec_recursive(args[1], df_var)
|
|
837
|
+
part = args[2].strip().strip("'\"").upper()
|
|
838
|
+
if part in ('DD', 'DAY', 'D', 'DDD'):
|
|
839
|
+
return f'({date1} - {date2}).dt.days'
|
|
840
|
+
elif part in ('HH', 'HH24', 'HOUR'):
|
|
841
|
+
return f'({date1} - {date2}).dt.total_seconds() / 3600'
|
|
842
|
+
elif part in ('MI', 'MIN', 'MINUTE'):
|
|
843
|
+
return f'({date1} - {date2}).dt.total_seconds() / 60'
|
|
844
|
+
elif part in ('SS', 'SEC', 'SECOND'):
|
|
845
|
+
return f'({date1} - {date2}).dt.total_seconds()'
|
|
846
|
+
return f'({date1} - {date2}).dt.days'
|
|
847
|
+
|
|
848
|
+
in_result = _find_func_call(cleaned, 'IN')
|
|
849
|
+
if in_result and in_result[0] == 0 and in_result[1] == len(cleaned):
|
|
850
|
+
_, _, args = in_result
|
|
851
|
+
if len(args) >= 2:
|
|
852
|
+
field_val = _vec_recursive(args[0], df_var)
|
|
853
|
+
vals = ', '.join(_vec_recursive(a, df_var) for a in args[1:])
|
|
854
|
+
return f'{field_val}.isin([{vals}])'
|
|
855
|
+
|
|
584
856
|
if "||" in cleaned:
|
|
585
857
|
parts = _split_concat_parts(cleaned)
|
|
586
858
|
vec_parts = []
|
|
@@ -616,6 +888,8 @@ def _vec_recursive(expr, df_var):
|
|
|
616
888
|
converted = re.sub(r'\bFALSE\b', 'False', converted, flags=re.IGNORECASE)
|
|
617
889
|
converted = re.sub(r'\bNULL\b', 'None', converted, flags=re.IGNORECASE)
|
|
618
890
|
|
|
891
|
+
converted = _convert_remaining_funcs(converted, df_var)
|
|
892
|
+
|
|
619
893
|
skip_words = {
|
|
620
894
|
'True', 'False', 'None', 'and', 'or', 'not', 'np', 'pd', 'get_variable',
|
|
621
895
|
'str', 'int', 'float', 'bool', 'len', 'abs', 'round',
|
|
@@ -629,6 +903,7 @@ def _vec_recursive(expr, df_var):
|
|
|
629
903
|
converted = re.sub(r'\bNOT\b', ' ~', converted, flags=re.IGNORECASE)
|
|
630
904
|
converted = re.sub(r'<>', '!=', converted)
|
|
631
905
|
converted = re.sub(r'(?<![<>!=])=(?!=)', '==', converted)
|
|
906
|
+
converted = re.sub(r'\berrors\s*==\s*(["\'])', r'errors=\1', converted)
|
|
632
907
|
|
|
633
908
|
converted = re.sub(r'\s+', ' ', converted).strip()
|
|
634
909
|
|
|
@@ -691,6 +966,39 @@ def _vectorize_value(val, df_var="df"):
|
|
|
691
966
|
def _vectorize_simple(part, df_var):
|
|
692
967
|
c = part.strip()
|
|
693
968
|
|
|
969
|
+
lkp_match = re.search(r':(\w+)\.(\w+)\s*\(', c)
|
|
970
|
+
if lkp_match:
|
|
971
|
+
start = lkp_match.start()
|
|
972
|
+
paren_start = c.index('(', start)
|
|
973
|
+
paren_end = _find_matching_paren(c, paren_start)
|
|
974
|
+
if paren_end != -1:
|
|
975
|
+
before = c[:start].strip()
|
|
976
|
+
lkp_expr = c[start:paren_end + 1]
|
|
977
|
+
after = c[paren_end + 1:].strip()
|
|
978
|
+
vec_lkp = _vec_recursive(lkp_expr, df_var)
|
|
979
|
+
c = f'{before}{vec_lkp}{after}'.strip()
|
|
980
|
+
|
|
981
|
+
in_result = _find_func_call(c, 'IN')
|
|
982
|
+
if in_result:
|
|
983
|
+
start, end, args = in_result
|
|
984
|
+
if len(args) >= 2:
|
|
985
|
+
before = c[:start].strip()
|
|
986
|
+
after = c[end:].strip()
|
|
987
|
+
vec = _vec_recursive(c[start:end], df_var)
|
|
988
|
+
c = f'{before}{vec}{after}'.strip()
|
|
989
|
+
if not before and not after:
|
|
990
|
+
return c
|
|
991
|
+
|
|
992
|
+
is_spaces_result = _find_func_call(c, 'IS_SPACES')
|
|
993
|
+
if is_spaces_result:
|
|
994
|
+
start, end, args = is_spaces_result
|
|
995
|
+
before = c[:start].strip()
|
|
996
|
+
after = c[end:].strip()
|
|
997
|
+
vec = _vec_recursive(c[start:end], df_var)
|
|
998
|
+
c = f'{before}{vec}{after}'.strip()
|
|
999
|
+
if not before and not after:
|
|
1000
|
+
return c
|
|
1001
|
+
|
|
694
1002
|
for func_name in sorted(INFA_FUNC_MAP.keys(), key=lambda x: -len(x)):
|
|
695
1003
|
result = _find_func_call(c, func_name)
|
|
696
1004
|
if result:
|
|
@@ -701,7 +1009,7 @@ def _vectorize_simple(part, df_var):
|
|
|
701
1009
|
c = f'{before}{vec_inner}{after}'
|
|
702
1010
|
break
|
|
703
1011
|
|
|
704
|
-
for func_name in ('UPPER', 'LOWER', 'LTRIM', 'RTRIM', 'TRIM', 'SUBSTR', 'INSTR', 'LENGTH', 'INITCAP'):
|
|
1012
|
+
for func_name in ('UPPER', 'LOWER', 'LTRIM', 'RTRIM', 'TRIM', 'SUBSTR', 'INSTR', 'LENGTH', 'INITCAP', 'REVERSE', 'IS_NUMBER'):
|
|
705
1013
|
result = _find_func_call(c, func_name)
|
|
706
1014
|
if result:
|
|
707
1015
|
start, end, _ = result
|
|
@@ -710,8 +1018,16 @@ def _vectorize_simple(part, df_var):
|
|
|
710
1018
|
vec_inner = _vec_recursive(c[start:end], df_var)
|
|
711
1019
|
c = f'{before}{vec_inner}{after}'
|
|
712
1020
|
|
|
713
|
-
|
|
714
|
-
|
|
1021
|
+
isnull_result = _find_func_call(c, 'ISNULL')
|
|
1022
|
+
if isnull_result:
|
|
1023
|
+
start, end, args = isnull_result
|
|
1024
|
+
before = c[:start]
|
|
1025
|
+
after = c[end:]
|
|
1026
|
+
vec_inner = _vec_recursive(c[start:end], df_var)
|
|
1027
|
+
c = f'{before}{vec_inner}{after}'
|
|
1028
|
+
else:
|
|
1029
|
+
c = re.sub(r'\bISNULL\s*\(\s*([A-Za-z_]\w*)\s*\)',
|
|
1030
|
+
lambda m: f'{df_var}["{m.group(1)}"].isna()', c, flags=re.IGNORECASE)
|
|
715
1031
|
c = re.sub(r'\b([A-Za-z_]\w*)\s*IS\s+NOT\s+NULL\b',
|
|
716
1032
|
lambda m: f'{df_var}["{m.group(1)}"].notna()', c, flags=re.IGNORECASE)
|
|
717
1033
|
c = re.sub(r'\b([A-Za-z_]\w*)\s*IS\s+NULL\b',
|
|
@@ -727,7 +1043,7 @@ def _vectorize_simple(part, df_var):
|
|
|
727
1043
|
skip_words = {
|
|
728
1044
|
'True', 'False', 'None', 'and', 'or', 'not', 'np', 'pd',
|
|
729
1045
|
'str', 'int', 'float', 'isna', 'notna', 'fillna',
|
|
730
|
-
'get_variable', 'lookup_func',
|
|
1046
|
+
'get_variable', 'lookup_func', 'isin', 'eq',
|
|
731
1047
|
}
|
|
732
1048
|
c = _substitute_fields(c, df_var, skip_words)
|
|
733
1049
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: informatica-python
|
|
3
|
-
Version: 1.9.
|
|
3
|
+
Version: 1.9.3
|
|
4
4
|
Summary: Convert Informatica PowerCenter workflow XML to Python/PySpark code
|
|
5
5
|
Author: Nick
|
|
6
6
|
License: MIT
|
|
@@ -79,25 +79,26 @@ from informatica_python import InformaticaConverter
|
|
|
79
79
|
|
|
80
80
|
converter = InformaticaConverter()
|
|
81
81
|
|
|
82
|
-
# Parse and generate files
|
|
83
|
-
converter.
|
|
82
|
+
# Parse and generate files to a directory
|
|
83
|
+
converter.convert("workflow_export.xml", output_dir="output_dir")
|
|
84
84
|
|
|
85
|
-
# Parse and generate zip
|
|
86
|
-
converter.
|
|
85
|
+
# Parse and generate zip archive
|
|
86
|
+
converter.convert("workflow_export.xml", output_zip="output.zip")
|
|
87
87
|
|
|
88
|
-
# Parse to structured dict
|
|
88
|
+
# Parse to structured dict (no code generation)
|
|
89
89
|
result = converter.parse_file("workflow_export.xml")
|
|
90
90
|
|
|
91
91
|
# Use a different data library
|
|
92
|
-
converter
|
|
92
|
+
converter = InformaticaConverter(data_lib="polars")
|
|
93
|
+
converter.convert("workflow_export.xml", output_dir="output_dir")
|
|
93
94
|
```
|
|
94
95
|
|
|
95
96
|
## Generated Output Files
|
|
96
97
|
|
|
97
98
|
| File | Description |
|
|
98
99
|
|------|-------------|
|
|
99
|
-
| `helper_functions.py` | Database/file I/O helpers, Informatica expression equivalents
|
|
100
|
-
| `mapping_{name}.py` | One per mapping, named after the real Informatica mapping name — transformation logic with row-count logging,
|
|
100
|
+
| `helper_functions.py` | Database/file I/O helpers, 90+ Informatica expression equivalents, window/analytic functions, stored procedure execution, state persistence |
|
|
101
|
+
| `mapping_{name}.py` | One per mapping, named after the real Informatica mapping name — transformation logic with vectorized expressions, row-count logging, type casting, inline documentation |
|
|
101
102
|
| `workflow.py` | Task orchestration with topological ordering, decision branching, worklet calls, and error handling |
|
|
102
103
|
| `config.yml` | Connection configs, source/target metadata, runtime parameters |
|
|
103
104
|
| `all_sql_queries.sql` | All SQL extracted from Source Qualifiers, Lookups, SQL transforms (with ANSI-translated variants) |
|
|
@@ -119,23 +120,22 @@ Select via `--data-lib` CLI flag or `data_lib` parameter:
|
|
|
119
120
|
|
|
120
121
|
The code generator produces real, runnable Python for these transformation types:
|
|
121
122
|
|
|
122
|
-
- **Source Qualifier** — SQL override, pre/post SQL, column selection, session connection overrides
|
|
123
|
-
- **Expression** — Field-level expressions converted to vectorized pandas operations (`df["COL"]` style)
|
|
123
|
+
- **Source Qualifier** — SQL override, pre/post SQL, column selection, session connection overrides, `$$PARAM` substitution in SQL
|
|
124
|
+
- **Expression** — Field-level expressions converted to vectorized pandas operations (`df["COL"]` style) with 40+ vectorized function handlers
|
|
124
125
|
- **Filter** — Row filtering with vectorized converted conditions
|
|
125
126
|
- **Joiner** — `pd.merge()` with join type and condition parsing (inner/left/right/outer)
|
|
126
|
-
- **Lookup** — `pd.merge()` lookups with connection-aware DB
|
|
127
|
+
- **Lookup** — `pd.merge()` lookups with connection-aware DB reads, multiple match policies, default values, `$$PARAM` substitution
|
|
127
128
|
- **Aggregator** — `groupby().agg()` with SUM/COUNT/AVG/MIN/MAX/FIRST/LAST, computed aggregates
|
|
128
|
-
- **Sorter** — `sort_values()` with multi-key ascending/descending
|
|
129
|
+
- **Sorter** — `sort_values()` with multi-key sorting; per-field ascending/descending direction is read from the `SORTDIRECTION` attribute
|
|
129
130
|
- **Router** — Multi-group conditional routing with named groups
|
|
130
131
|
- **Union** — `pd.concat()` across multiple input groups
|
|
131
|
-
- **Update Strategy** — DD_INSERT/DD_UPDATE/DD_DELETE/DD_REJECT routing with actual target INSERT/UPDATE/DELETE operations, dialect-aware SQL placeholders, auto-detected primary keys
|
|
132
|
+
- **Update Strategy** — DD_INSERT/DD_UPDATE/DD_DELETE/DD_REJECT routing with actual target INSERT/UPDATE/DELETE operations, dialect-aware SQL placeholders, auto-detected primary keys; vectorized expression parsing with row-level fallback
|
|
132
133
|
- **Sequence Generator** — Auto-incrementing ID columns
|
|
133
134
|
- **Normalizer** — `pd.melt()` with auto-detected id/value vars
|
|
134
135
|
- **Rank** — `groupby().rank()` with Top-N filtering
|
|
135
136
|
- **Stored Procedure** — Full code generation with Oracle/MSSQL/generic support, input/output parameter mapping
|
|
136
|
-
- **Transaction Control** — Commit/rollback logic
|
|
137
137
|
- **Custom / Java** — Placeholder stubs with TODO markers
|
|
138
|
-
- **SQL Transform** — Direct SQL execution pass-through
|
|
138
|
+
- **SQL Transform** — Direct SQL execution pass-through with `$$PARAM` substitution
|
|
139
139
|
|
|
140
140
|
## Supported XML Tags (72 Tags)
|
|
141
141
|
|
|
@@ -153,6 +153,86 @@ The code generator produces real, runnable Python for these transformation types
|
|
|
153
153
|
|
|
154
154
|
## Key Features
|
|
155
155
|
|
|
156
|
+
### Generated Code Quality (v1.9.3+)
|
|
157
|
+
|
|
158
|
+
Generated code follows clean formatting and commenting standards:
|
|
159
|
+
- Consistent section headers (`# ---`) for Source Qualifiers, Transformations, and Target Writes
|
|
160
|
+
- Each section includes metadata: database type, field lists, descriptions
|
|
161
|
+
- Column mapping comments (`# Column mapping: source -> target`) and write operation type comments (`# Write to database table` / `# Write to file`)
|
|
162
|
+
- Expression inline comments showing original Informatica expression (e.g., `# FULL_NAME = UPPER(FIRST_NAME) || ' ' || UPPER(LAST_NAME)`)
|
|
163
|
+
- Clean indentation: no blank line after `try:`, no consecutive blank lines inside function body
|
|
164
|
+
- Mapping-level `try:/except` wrapper with `logger.error()` for runtime visibility
|
|
165
|
+
|
|
166
|
+
### Smart Target Write Detection (v1.9.3+)
|
|
167
|
+
|
|
168
|
+
Targets are automatically classified as database or file writes:
|
|
169
|
+
- Targets with `database_type` set (Oracle, SQL Server, etc.) generate `write_to_db()` calls
|
|
170
|
+
- Targets with flatfile metadata or file extensions (`.csv`, `.dat`, `.txt`, `.xml`, `.json`, `.parquet`, `.xlsx`, `.xls`, `.tsv`, `.avro`) generate `write_file()` calls
|
|
171
|
+
- Bare targets (no metadata) default to `write_to_db()` since Informatica targets are typically database tables
|
|
172
|
+
- Schema-qualified names (e.g., `dbo.MY_TABLE`) correctly route to database writes
|
|
173
|
+
- Session file path overrides take priority when present
|
|
174
|
+
|
|
175
|
+
### Vectorized Expression Engine (v1.9.2+)
|
|
176
|
+
|
|
177
|
+
Expressions are converted to column-level pandas operations instead of row-level iteration. The expression converter uses a recursive parenthesis-aware parser that handles:
|
|
178
|
+
|
|
179
|
+
**Conditional / Null:**
|
|
180
|
+
- `IIF(cond, val, else_val)` → `np.where()` — supports 2-arg form (missing else defaults to `None`)
|
|
181
|
+
- `DECODE(TRUE, cond1, val1, ..., default)` → nested `np.where()` chains
|
|
182
|
+
- `DECODE(field, val1, res1, ..., default)` → value-matching `np.where()`
|
|
183
|
+
- `NVL(val, default)` → `.fillna()`
|
|
184
|
+
- `IS_SPACES(field)` → `field.str.strip().eq("")`
|
|
185
|
+
- `IS_NUMBER(field)` → `pd.to_numeric(field, errors="coerce").notna()`
|
|
186
|
+
- `IN(field, val1, val2, ...)` → `field.isin([...])`
|
|
187
|
+
|
|
188
|
+
**String:**
|
|
189
|
+
- `UPPER/LOWER` → `.str.upper()/.str.lower()`
|
|
190
|
+
- `LTRIM/RTRIM/TRIM` → `.str.lstrip()/.str.rstrip()/.str.strip()` with custom char support
|
|
191
|
+
- `SUBSTR(val, start, len)` → `.str[start:end]`
|
|
192
|
+
- `INSTR(val, search)` → `.str.find()`
|
|
193
|
+
- `LPAD/RPAD` → `.str.pad()`
|
|
194
|
+
- `REVERSE(val)` → `.str[::-1]`
|
|
195
|
+
- `INITCAP(val)` → `.str.title()`
|
|
196
|
+
- `REPLACECHR/REPLACESTR` → `.str.replace()`
|
|
197
|
+
- `REG_EXTRACT/REG_REPLACE` → `.str.extract()/.str.replace(regex=True)`
|
|
198
|
+
- `CHR(code)` → `chr(int(code))`
|
|
199
|
+
- `||` concatenation → `+` with `.astype(str)` on non-literals
|
|
200
|
+
|
|
201
|
+
**Date/Time:**
|
|
202
|
+
- `TO_DATE(val, fmt)` → `pd.to_datetime()` with Informatica→Python format conversion
|
|
203
|
+
- `TO_CHAR(val, fmt)` → `.dt.strftime()`
|
|
204
|
+
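- `ADD_TO_DATE(date, part, amount)` → `date + pd.DateOffset()` for year/month parts (YY/YYYY/YEAR, MM/MON/MONTH) and `date + pd.to_timedelta()` for day/hour/minute/second parts (DD/HH/MI/SS); unrecognized parts fall back to days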
|
|
205
|
+
- `DATE_DIFF(date1, date2, part)` → `(date1 - date2).dt.days` / `.dt.total_seconds() / 3600` etc.; parts other than day/hour/minute/second fall back to `.dt.days`
|
|
206
|
+
- `SYSDATE/SYSTIMESTAMP` → `pd.Timestamp.now()`
|
|
207
|
+
- `TRUNC(date, 'DD')` → date truncation via `.dt.floor()/.dt.to_period()`
|
|
208
|
+
- `MAKE_DATE_TIME(y, m, d, h, mi, s)` → `pd.Timestamp()`
|
|
209
|
+
|
|
210
|
+
**Numeric:**
|
|
211
|
+
- `TO_INTEGER/TO_BIGINT/TO_FLOAT/TO_DECIMAL` → `pd.to_numeric()`
|
|
212
|
+
- `TRUNC(val)` → `np.trunc()` for numeric truncation
|
|
213
|
+
- `ROUND/ABS/CEIL/FLOOR/POWER/SQRT/MOD/LOG/SIGN` → `np.*` equivalents
|
|
214
|
+
|
|
215
|
+
**Special:**
|
|
216
|
+
- `:LKP.TABLE(args)` — Connected lookup references → `df_lkp_table` merge
|
|
217
|
+
- `:PORT.FUNC(args)` — Unconnected lookups → `lookup_func("FUNC", args)` calls
|
|
218
|
+
- Inline `--` comment stripping (respects string literals)
|
|
219
|
+
- String-literal-aware field substitution
|
|
220
|
+
|
|
221
|
+
### Expression Converter (90+ Row-Level Functions)
|
|
222
|
+
|
|
223
|
+
All Informatica expression functions are available as row-level Python equivalents in `helper_functions.py`:
|
|
224
|
+
|
|
225
|
+
- **String:** `substr`, `ltrim`, `rtrim`, `upper`, `lower`, `lpad`, `rpad`, `instr`, `length`, `concat`, `replacechr`, `replacestr`, `reg_extract`, `reg_replace`, `reg_match`, `reverse_str`, `initcap`, `chr_func`, `ascii_func`, `left_str`, `right_str`, `trim_func`, `indexof`, `metaphone_func`, `soundex_func`, `compress_func`, `decompress_func`
|
|
226
|
+
- **Date:** `add_to_date`, `date_diff`, `date_compare`, `get_date_part`, `set_date_part`, `last_day`, `make_date_time`, `to_date`, `to_char`, `to_timestamp_func`, `current_timestamp`, `session_start_time`
|
|
227
|
+
- **Numeric:** `round_val`, `trunc`, `mod_val`, `abs_val`, `ceil_val`, `floor_val`, `power_val`, `sqrt_val`, `log_val`, `ln_val`, `exp_val`, `sign_val`, `rand_val`, `greatest_val`, `least_val`
|
|
228
|
+
- **Conversion:** `to_integer`, `to_bigint`, `to_float`, `to_decimal`, `cast_func`
|
|
229
|
+
- **Null/Conditional:** `iif_expr`, `decode_expr`, `nvl`, `nvl2`, `isnull`, `is_spaces`, `is_number`, `is_date`, `in_expr`, `choose_expr`
|
|
230
|
+
- **Aggregate:** `sum_val`, `avg_val`, `count_val`, `min_val`, `max_val`, `first_val`, `last_val`, `median_val`, `stddev_val`, `variance_val`, `percentile_val`
|
|
231
|
+
- **Window/Analytic:** `moving_avg`, `moving_avg_df`, `moving_sum`, `moving_sum_df`, `cume`, `cume_df`, `percentile_df`
|
|
232
|
+
- **Lookup:** `lookup_func` — Placeholder for runtime lookup resolution
|
|
233
|
+
- **Variable:** `get_variable`, `set_variable`, `set_count_variable`
|
|
234
|
+
- **Control:** `raise_error`, `abort_func`
|
|
235
|
+
|
|
156
236
|
### Row-Count Logging (v1.8+)
|
|
157
237
|
|
|
158
238
|
Generated code automatically logs row counts at every step of the data pipeline:
|
|
@@ -165,8 +245,6 @@ AGG_TOTALS (Aggregator): 8542 input rows -> 150 output rows
|
|
|
165
245
|
Target TGT_SUMMARY: 150 rows written
|
|
166
246
|
```
|
|
167
247
|
|
|
168
|
-
All row-count operations are backend-safe (wrapped in try/except), so Dask and other lazy-evaluation backends won't fail.
|
|
169
|
-
|
|
170
248
|
### Generated Code Documentation (v1.8+)
|
|
171
249
|
|
|
172
250
|
Every generated mapping function includes a rich docstring describing:
|
|
@@ -179,14 +257,6 @@ Each transformation block is annotated with:
|
|
|
179
257
|
- Transform type and description (from Informatica XML)
|
|
180
258
|
- Input and output field lists (truncated at 10 for readability)
|
|
181
259
|
|
|
182
|
-
### Window / Analytic Functions (v1.7+)
|
|
183
|
-
|
|
184
|
-
DataFrame-level analytic functions for aggregation transforms:
|
|
185
|
-
- `moving_avg_df(df, col, window)` — rolling mean via `.rolling().mean()`
|
|
186
|
-
- `moving_sum_df(df, col, window)` — rolling sum via `.rolling().sum()`
|
|
187
|
-
- `cume_df(df, col)` — cumulative sum via `.expanding().sum()`
|
|
188
|
-
- `percentile_df(df, col, pct)` — quantile via `.quantile()`
|
|
189
|
-
|
|
190
260
|
### Update Strategy with Target Operations (v1.7+)
|
|
191
261
|
|
|
192
262
|
Update Strategy transforms now generate real INSERT/UPDATE/DELETE operations:
|
|
@@ -196,6 +266,14 @@ Update Strategy transforms now generate real INSERT/UPDATE/DELETE operations:
|
|
|
196
266
|
- Dialect-aware SQL placeholders (`?` for MSSQL, `%s` for PostgreSQL/Oracle)
|
|
197
267
|
- Primary key columns auto-detected from target field definitions
|
|
198
268
|
|
|
269
|
+
### Window / Analytic Functions (v1.7+)
|
|
270
|
+
|
|
271
|
+
DataFrame-level analytic functions for aggregation transforms:
|
|
272
|
+
- `moving_avg_df(df, col, window)` — rolling mean via `.rolling().mean()`
|
|
273
|
+
- `moving_sum_df(df, col, window)` — rolling sum via `.rolling().sum()`
|
|
274
|
+
- `cume_df(df, col)` — cumulative sum via `.expanding().sum()`
|
|
275
|
+
- `percentile_df(df, col, pct)` — quantile via `.quantile()`
|
|
276
|
+
|
|
199
277
|
### Stored Procedure Execution (v1.7+)
|
|
200
278
|
|
|
201
279
|
Full stored procedure code generation (not just stubs):
|
|
@@ -241,19 +319,13 @@ Optional `--validate-casts` flag generates null-count checks before/after type c
|
|
|
241
319
|
- Logs warnings when coercion introduces new nulls
|
|
242
320
|
- Helps identify data quality issues during test runs
|
|
243
321
|
|
|
244
|
-
### Vectorized Expression Generation (v1.5+)
|
|
245
|
-
|
|
246
|
-
Column-level pandas operations instead of row-level iteration:
|
|
247
|
-
- IIF → `np.where()`, NVL → `.fillna()`, UPPER/LOWER → `.str.upper()/.str.lower()`
|
|
248
|
-
- SUBSTR → `.str[start:end]`, TO_INTEGER → `pd.to_numeric()`, TO_DATE → `pd.to_datetime()`
|
|
249
|
-
- IS NULL/IS NOT NULL → `.isna()`/`.notna()`
|
|
250
|
-
|
|
251
322
|
### Parameter File Support (v1.5+)
|
|
252
323
|
|
|
253
324
|
Standard Informatica `.param` file parsing:
|
|
254
325
|
- `[Global]` and `[folder.WF:workflow.ST:session]` section support
|
|
255
326
|
- `get_param(config, var_name)` resolution chain: config → env vars → defaults
|
|
256
327
|
- CLI `--param-file` flag for specifying parameter files
|
|
328
|
+
- `$$PARAM` variables in SQL automatically substituted with `.replace()` calls
|
|
257
329
|
|
|
258
330
|
### Session Connection Overrides (v1.4+)
|
|
259
331
|
|
|
@@ -283,18 +355,49 @@ Expands Mapplet instances into prefixed transforms, rewires connectors, and elim
|
|
|
283
355
|
|
|
284
356
|
Converts Informatica decision conditions to Python if/else branches with proper variable substitution.
|
|
285
357
|
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
358
|
+
## Helper Functions Library
|
|
359
|
+
|
|
360
|
+
The generated `helper_functions.py` provides a complete runtime library:
|
|
361
|
+
|
|
362
|
+
### Configuration & Parameters
|
|
363
|
+
| Function | Description |
|
|
364
|
+
|----------|-------------|
|
|
365
|
+
| `load_config(path, param_file)` | Load YAML config with optional `.param` file merge |
|
|
366
|
+
| `parse_param_file(path)` | Parse Informatica `.param` files (`[Global]`, `[folder.WF:...]` sections) |
|
|
367
|
+
| `get_param(config, var_name, default)` | Resolve parameter: config → env vars → default |
|
|
368
|
+
| `get_variable(var_name, config)` | Get workflow/mapping variable from params, env vars, or param store |
|
|
369
|
+
| `set_variable(var_name, value)` | Set workflow/mapping variable in param store and env |
|
|
370
|
+
|
|
371
|
+
### Database Operations
|
|
372
|
+
| Function | Description |
|
|
373
|
+
|----------|-------------|
|
|
374
|
+
| `get_db_connection(config, conn_name)` | Create DB connection (pyodbc/pymssql/sqlalchemy fallback for MSSQL) |
|
|
375
|
+
| `read_from_db(config, query, conn_name)` | Execute SQL query and return DataFrame |
|
|
376
|
+
| `write_to_db(config, df, table, conn_name)` | Write DataFrame to database table via `.to_sql()` |
|
|
377
|
+
| `execute_sql(config, sql, conn_name)` | Execute DDL/DML statement (INSERT, UPDATE, DELETE) |
|
|
378
|
+
| `write_with_update_strategy(config, df, table, ...)` | Split rows by `_update_strategy` column into INSERT/UPDATE/DELETE/REJECT operations |
|
|
379
|
+
| `call_stored_procedure(config, proc, params, ...)` | Execute stored procedure with input/output parameter mapping (Oracle/MSSQL/generic) |
|
|
380
|
+
|
|
381
|
+
### File Operations
|
|
382
|
+
| Function | Description |
|
|
383
|
+
|----------|-------------|
|
|
384
|
+
| `read_file(path, file_config)` | Read CSV/DAT/TXT/XML/XLSX/JSON/Parquet with auto-detection |
|
|
385
|
+
| `write_file(df, path, file_config)` | Write DataFrame to file with format auto-detection |
|
|
386
|
+
|
|
387
|
+
### State Persistence
|
|
388
|
+
| Function | Description |
|
|
389
|
+
|----------|-------------|
|
|
390
|
+
| `load_persistent_state(file)` | Load JSON state file for persistent variables |
|
|
391
|
+
| `save_persistent_state(file)` | Save persistent variables to JSON state file |
|
|
392
|
+
| `get_persistent_variable(scope, var, default)` | Get scoped persistent variable |
|
|
393
|
+
| `set_persistent_variable(scope, var, value)` | Set scoped persistent variable |
|
|
394
|
+
|
|
395
|
+
### Logging & Monitoring
|
|
396
|
+
| Function | Description |
|
|
397
|
+
|----------|-------------|
|
|
398
|
+
| `log_mapping_start(name)` | Log mapping start with timestamp |
|
|
399
|
+
| `log_mapping_end(name, start_time, row_count)` | Log mapping completion with elapsed time |
|
|
400
|
+
| `validate_row_count(df, name, min_rows)` | Validate minimum row count threshold |
|
|
298
401
|
|
|
299
402
|
## Requirements
|
|
300
403
|
|
|
@@ -304,7 +407,32 @@ Converts Informatica expressions to Python equivalents:
|
|
|
304
407
|
|
|
305
408
|
## Changelog
|
|
306
409
|
|
|
307
|
-
### v1.9.
|
|
410
|
+
### v1.9.3 (Current)
|
|
411
|
+
- **Smart target write detection**: Bare targets default to `write_to_db()` instead of `write_file()`; file extension allowlist (`.csv`, `.dat`, `.txt`, `.xml`, `.json`, `.parquet`, `.xlsx`, `.xls`, `.tsv`, `.avro`) for file targets; schema-qualified names (`dbo.TABLE`) correctly route to database
|
|
412
|
+
- **DECODE vectorization**: `DECODE(TRUE, cond1, val1, ..., default)` → nested `np.where()` chains; value-matching DECODE; handles IN() conditions and complex boolean nesting
|
|
413
|
+
- **IS_SPACES vectorization**: `IS_SPACES(field)` → `field.str.strip().eq("")`
|
|
414
|
+
- **2-arg IIF**: `IIF(cond, val)` without else clause defaults to `None`
|
|
415
|
+
- **REVERSE vectorization**: `REVERSE(field)` → `field.str[::-1]`
|
|
416
|
+
- **IN() vectorization**: `IN(field, val1, val2, ...)` → `field.isin([...])`
|
|
417
|
+
- **IS_NUMBER vectorization**: `IS_NUMBER(field)` → `pd.to_numeric(field, errors="coerce").notna()`
|
|
418
|
+
- **SYSDATE/SYSTIMESTAMP**: Bare `SYSDATE`/`SYSTIMESTAMP` → `pd.Timestamp.now()` in vectorized mode
|
|
419
|
+
- **TRUNC vectorization**: Numeric `TRUNC(field)` → `np.trunc()`; date `TRUNC(field, 'DD')` → `.dt.floor()`
|
|
420
|
+
- **ADD_TO_DATE vectorization**: `ADD_TO_DATE(date, part, amount)` → `pd.DateOffset()` for YY/MM parts and `pd.to_timedelta()` for DD/HH/MI/SS parts
|
|
421
|
+
- **DATE_DIFF vectorization**: `DATE_DIFF(date1, date2, part)` → arithmetic on timedelta components
|
|
422
|
+
- **Unconnected lookup support**: `:PORT.FUNC_NAME(args)` → `lookup_func("FUNC_NAME", args)`
|
|
423
|
+
- **Inline comment stripping**: `--` comments removed from expressions (respects string literals)
|
|
424
|
+
- **`$$PARAM` SQL substitution**: Source Qualifier, Lookup, and SQL Transform SQL strings auto-substitute `$$VAR` with `get_param(config, 'VAR')` calls
|
|
425
|
+
- **Sorter direction**: Reads `SORTDIRECTION` from field attributes, generates per-field `ascending=[True, False, ...]`
|
|
426
|
+
- **Pass-through optimization**: Identity expressions skip `.copy()` and use direct reference
|
|
427
|
+
- **Duplicate lookup deduplication**: `_gen_lookup_transform` uses `seen_output_cols` set to avoid duplicate column checks
|
|
428
|
+
- **Mapping-level error handling**: Generated function body wrapped in `try:/except` with `logger.error()`
|
|
429
|
+
- **Update strategy vectorized**: Tries vectorized expression first, falls back to row-level `apply()`
|
|
430
|
+
- **Generated code formatting**: Consistent `# ---` section headers for Source Qualifiers, Transforms, and Target Writes; metadata comments (database type, field lists); column mapping and write operation comments; clean blank line handling
|
|
431
|
+
- **Source/target detection**: Case-insensitive instance type matching
|
|
432
|
+
- **Session→mapping inference**: Longest-suffix-match strategy for ambiguous mapping names
|
|
433
|
+
- **646 tests** across unit, integration, expression, and formatting test suites
|
|
434
|
+
|
|
435
|
+
### v1.9.2 (Phase 8)
|
|
308
436
|
- Mapping output files now use real mapping names (e.g., `mapping_m_customer_load.py`) instead of generic numeric indices (`mapping_1.py`)
|
|
309
437
|
- Workflow imports automatically match the named mapping files
|
|
310
438
|
- **Expression converter rewrite**: Recursive parenthesis-aware parser replacing simple regex; fixes nested IIF/INSTR/LTRIM/RTRIM/REPLACECHR/REPLACESTR/SUBSTR/TO_CHAR/CHR/MAKE_DATE_TIME
|
|
@@ -367,7 +495,7 @@ Converts Informatica expressions to Python equivalents:
|
|
|
367
495
|
cd informatica_python
|
|
368
496
|
pip install -e ".[dev]"
|
|
369
497
|
|
|
370
|
-
# Run tests (
|
|
498
|
+
# Run tests (646 tests)
|
|
371
499
|
pytest tests/ -v
|
|
372
500
|
```
|
|
373
501
|
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
informatica_python/__init__.py,sha256=
|
|
1
|
+
informatica_python/__init__.py,sha256=o9kEVkHnEwXAD7hhY8YbN6G8RP4Mqby_q8CpjfbiknQ,337
|
|
2
2
|
informatica_python/cli.py,sha256=gFwg0O99vKM-OLO0HoHA4emd-6qrgjMNqa9T59e4e_s,2905
|
|
3
3
|
informatica_python/converter.py,sha256=xCuWrYzDji0yN72D3QqOgZCVVM2j3k2_CvlGplCWxLU,22779
|
|
4
4
|
informatica_python/models.py,sha256=G_C2WfQL-ykKjNj23m8vKFtLZYrQozp99HJzrLTKG1Y,17293
|
|
@@ -7,17 +7,17 @@ informatica_python/generators/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5N
|
|
|
7
7
|
informatica_python/generators/config_gen.py,sha256=4tqcNKTB06kyGZIiM4yl0q97q_i3zeCHXTjuE1dNFKY,5726
|
|
8
8
|
informatica_python/generators/error_log_gen.py,sha256=2cc0rEcblydHkb9VAMXlrH7WdSQ-CNqAXcwVk3FYZeM,21319
|
|
9
9
|
informatica_python/generators/helper_gen.py,sha256=D6-UqNh09Qy2V7RimNgP-SzK_uB9YqAlsa0-cgLhf5o,72209
|
|
10
|
-
informatica_python/generators/mapping_gen.py,sha256=
|
|
10
|
+
informatica_python/generators/mapping_gen.py,sha256=gBVArcb8uODbgY3epdsldCbUywS-qo8CiKr7hcNjMnc,70654
|
|
11
11
|
informatica_python/generators/sql_gen.py,sha256=O8Y-aJz9EyFJ0DXeuISRt5yKwC3wlp2K3B0BHrmxrXw,4872
|
|
12
|
-
informatica_python/generators/workflow_gen.py,sha256=
|
|
12
|
+
informatica_python/generators/workflow_gen.py,sha256=_uSlBg31ZRMhMlCYk4hWDRBPaBROrepD8_v3QGEWJxE,18089
|
|
13
13
|
informatica_python/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
14
|
informatica_python/utils/datatype_map.py,sha256=iLOYg-iBKT4rMecGbrFkTpJj4yqs5S9HeBOTLUIWhX0,2809
|
|
15
|
-
informatica_python/utils/expression_converter.py,sha256=
|
|
15
|
+
informatica_python/utils/expression_converter.py,sha256=CqkkTESMKxcYmVsDpNfn7VcZZe771uCIMy_0YQYq6pc,45946
|
|
16
16
|
informatica_python/utils/lib_adapters.py,sha256=1ZtuMbgDg9Ukf-OF_EG1L_BeeR-6JQk8Kx3WwMfvNRU,6516
|
|
17
17
|
informatica_python/utils/sql_dialect.py,sha256=_IHJbfu8a3mT_OvHpybgSfZKqz6mwVy5ItTKDRChqnU,5461
|
|
18
|
-
informatica_python-1.9.
|
|
19
|
-
informatica_python-1.9.
|
|
20
|
-
informatica_python-1.9.
|
|
21
|
-
informatica_python-1.9.
|
|
22
|
-
informatica_python-1.9.
|
|
23
|
-
informatica_python-1.9.
|
|
18
|
+
informatica_python-1.9.3.dist-info/licenses/LICENSE,sha256=77RaRDdXgey1D90YZAjXqEQdBxWfvUQqLQX3pC1qjUE,1061
|
|
19
|
+
informatica_python-1.9.3.dist-info/METADATA,sha256=VbfZWdzKE382RnkR7F2rs7PNL397g3PfglvugN4XVTw,26097
|
|
20
|
+
informatica_python-1.9.3.dist-info/WHEEL,sha256=PovZm1ExVWmrRefZoXCfejlbKLnQI5SVIf1SWRV4QQI,97
|
|
21
|
+
informatica_python-1.9.3.dist-info/entry_points.txt,sha256=030jjTrx-1oRRQ16HZz52rdcKS8R8_llnymsTUtn_Xc,67
|
|
22
|
+
informatica_python-1.9.3.dist-info/top_level.txt,sha256=Dngg-WNteYi22XAJU2XKAQS8aZ52yM2LYC0tzxrlbVQ,19
|
|
23
|
+
informatica_python-1.9.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|