informatica-python 1.4.2__tar.gz → 1.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. {informatica_python-1.4.2 → informatica_python-1.5.1}/PKG-INFO +1 -1
  2. {informatica_python-1.4.2 → informatica_python-1.5.1}/informatica_python/__init__.py +1 -1
  3. {informatica_python-1.4.2 → informatica_python-1.5.1}/informatica_python/cli.py +6 -0
  4. {informatica_python-1.4.2 → informatica_python-1.5.1}/informatica_python/converter.py +6 -4
  5. {informatica_python-1.4.2 → informatica_python-1.5.1}/informatica_python/generators/helper_gen.py +81 -7
  6. {informatica_python-1.4.2 → informatica_python-1.5.1}/informatica_python/generators/mapping_gen.py +96 -88
  7. {informatica_python-1.4.2 → informatica_python-1.5.1}/informatica_python/generators/workflow_gen.py +6 -1
  8. informatica_python-1.5.1/informatica_python/utils/expression_converter.py +444 -0
  9. informatica_python-1.5.1/informatica_python/utils/lib_adapters.py +164 -0
  10. {informatica_python-1.4.2 → informatica_python-1.5.1}/informatica_python.egg-info/PKG-INFO +1 -1
  11. {informatica_python-1.4.2 → informatica_python-1.5.1}/informatica_python.egg-info/SOURCES.txt +3 -1
  12. {informatica_python-1.4.2 → informatica_python-1.5.1}/pyproject.toml +1 -1
  13. informatica_python-1.5.1/tests/test_integration.py +540 -0
  14. informatica_python-1.4.2/informatica_python/utils/expression_converter.py +0 -264
  15. {informatica_python-1.4.2 → informatica_python-1.5.1}/LICENSE +0 -0
  16. {informatica_python-1.4.2 → informatica_python-1.5.1}/README.md +0 -0
  17. {informatica_python-1.4.2 → informatica_python-1.5.1}/informatica_python/generators/__init__.py +0 -0
  18. {informatica_python-1.4.2 → informatica_python-1.5.1}/informatica_python/generators/config_gen.py +0 -0
  19. {informatica_python-1.4.2 → informatica_python-1.5.1}/informatica_python/generators/error_log_gen.py +0 -0
  20. {informatica_python-1.4.2 → informatica_python-1.5.1}/informatica_python/generators/sql_gen.py +0 -0
  21. {informatica_python-1.4.2 → informatica_python-1.5.1}/informatica_python/models.py +0 -0
  22. {informatica_python-1.4.2 → informatica_python-1.5.1}/informatica_python/parser.py +0 -0
  23. {informatica_python-1.4.2 → informatica_python-1.5.1}/informatica_python/utils/__init__.py +0 -0
  24. {informatica_python-1.4.2 → informatica_python-1.5.1}/informatica_python/utils/datatype_map.py +0 -0
  25. {informatica_python-1.4.2 → informatica_python-1.5.1}/informatica_python.egg-info/dependency_links.txt +0 -0
  26. {informatica_python-1.4.2 → informatica_python-1.5.1}/informatica_python.egg-info/entry_points.txt +0 -0
  27. {informatica_python-1.4.2 → informatica_python-1.5.1}/informatica_python.egg-info/requires.txt +0 -0
  28. {informatica_python-1.4.2 → informatica_python-1.5.1}/informatica_python.egg-info/top_level.txt +0 -0
  29. {informatica_python-1.4.2 → informatica_python-1.5.1}/setup.cfg +0 -0
  30. {informatica_python-1.4.2 → informatica_python-1.5.1}/tests/test_converter.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: informatica-python
3
- Version: 1.4.2
3
+ Version: 1.5.1
4
4
  Summary: Convert Informatica PowerCenter workflow XML to Python/PySpark code
5
5
  Author: Nick
6
6
  License: MIT
@@ -7,7 +7,7 @@ Licensed under the MIT License.
7
7
 
8
8
  from informatica_python.converter import InformaticaConverter
9
9
 
10
- __version__ = "1.4.2"
10
+ __version__ = "1.5.1"
11
11
  __author__ = "Nick"
12
12
  __license__ = "MIT"
13
13
  __all__ = ["InformaticaConverter"]
@@ -41,6 +41,11 @@ def main():
41
41
  default=None,
42
42
  help="Save parsed JSON to a file",
43
43
  )
44
+ parser.add_argument(
45
+ "--param-file",
46
+ default=None,
47
+ help="Path to Informatica .param file for variable substitution",
48
+ )
44
49
 
45
50
  args = parser.parse_args()
46
51
 
@@ -61,6 +66,7 @@ def main():
61
66
  args.input_file,
62
67
  output_dir=args.output,
63
68
  output_zip=args.zip,
69
+ param_file=args.param_file,
64
70
  )
65
71
  print(f"Conversion complete! Output: {output_path}")
66
72
  print(f"Files generated:")
@@ -33,7 +33,8 @@ class InformaticaConverter:
33
33
  return self._powermart_to_dict(self.powermart)
34
34
 
35
35
  def convert(self, file_path: str, output_dir: str = "output",
36
- output_zip: Optional[str] = None) -> str:
36
+ output_zip: Optional[str] = None,
37
+ param_file: Optional[str] = None) -> str:
37
38
  self.powermart = self.parser.parse_file(file_path)
38
39
 
39
40
  if not self.powermart.repositories:
@@ -47,7 +48,7 @@ class InformaticaConverter:
47
48
  raise ValueError("No folder found in XML file")
48
49
 
49
50
  if len(all_folders) == 1:
50
- return self._convert_folder(all_folders[0], output_dir, output_zip)
51
+ return self._convert_folder(all_folders[0], output_dir, output_zip, param_file)
51
52
 
52
53
  result_path = output_dir if not output_zip else os.path.dirname(output_zip) or "."
53
54
  for folder in all_folders:
@@ -56,7 +57,7 @@ class InformaticaConverter:
56
57
  if output_zip:
57
58
  base, ext = os.path.splitext(output_zip)
58
59
  folder_zip = f"{base}_{folder.name}{ext}"
59
- self._convert_folder(folder, folder_dir, folder_zip)
60
+ self._convert_folder(folder, folder_dir, folder_zip, param_file)
60
61
  return result_path
61
62
 
62
63
  def convert_string(self, xml_string: str, output_dir: str = "output",
@@ -87,7 +88,8 @@ class InformaticaConverter:
87
88
  return result_path
88
89
 
89
90
  def _convert_folder(self, folder: FolderDef, output_dir: str,
90
- output_zip: Optional[str] = None) -> str:
91
+ output_zip: Optional[str] = None,
92
+ param_file: Optional[str] = None) -> str:
91
93
  files = {}
92
94
 
93
95
  files["helper_functions.py"] = generate_helper_functions(folder, self.data_lib)
@@ -44,13 +44,20 @@ def generate_helper_functions(folder: FolderDef, data_lib: str = "pandas") -> st
44
44
  lines.append('logger = logging.getLogger("informatica_converter")')
45
45
  lines.append("")
46
46
  lines.append("")
47
- lines.append("def load_config(config_path='config.yml'):")
48
- lines.append(' """Load configuration from YAML file."""')
47
+ lines.append("def load_config(config_path='config.yml', param_file=None):")
48
+ lines.append(' """Load configuration from YAML file, optionally merging Informatica .param file."""')
49
49
  lines.append(" with open(config_path, 'r') as f:")
50
- lines.append(" return yaml.safe_load(f)")
50
+ lines.append(" config = yaml.safe_load(f) or {}")
51
+ lines.append(" if param_file:")
52
+ lines.append(" params = parse_param_file(param_file)")
53
+ lines.append(" config['params'] = params")
54
+ lines.append(" for key, val in params.items():")
55
+ lines.append(" os.environ[f'INFA_VAR_{key}'] = str(val)")
56
+ lines.append(" return config")
51
57
  lines.append("")
52
58
  lines.append("")
53
59
 
60
+ _add_param_file_functions(lines)
54
61
  _add_db_functions(lines, data_lib)
55
62
  _add_file_functions(lines, data_lib)
56
63
  _add_expression_helpers(lines)
@@ -59,6 +66,61 @@ def generate_helper_functions(folder: FolderDef, data_lib: str = "pandas") -> st
59
66
  return "\n".join(lines)
60
67
 
61
68
 
69
+ def _add_param_file_functions(lines):
70
+ lines.append("# ============================================================")
71
+ lines.append("# Informatica Parameter File Support")
72
+ lines.append("# ============================================================")
73
+ lines.append("")
74
+ lines.append("")
75
+ lines.append("def parse_param_file(param_path):")
76
+ lines.append(' """')
77
+ lines.append(" Parse an Informatica .param file into a flat dict of variable names to values.")
78
+ lines.append(" Supports standard Informatica parameter file format:")
79
+ lines.append(" [Global]")
80
+ lines.append(" $$VAR_NAME=value")
81
+ lines.append(" [folder_name.WF:workflow_name.ST:session_name]")
82
+ lines.append(" $$CONN_NAME=value")
83
+ lines.append(' """')
84
+ lines.append(" params = {}")
85
+ lines.append(" if not os.path.exists(param_path):")
86
+ lines.append(" logger.warning(f'Parameter file not found: {param_path}')")
87
+ lines.append(" return params")
88
+ lines.append("")
89
+ lines.append(" current_section = 'Global'")
90
+ lines.append(" with open(param_path, 'r') as f:")
91
+ lines.append(" for line_num, line in enumerate(f, 1):")
92
+ lines.append(" line = line.strip()")
93
+ lines.append(" if not line or line.startswith('#'):")
94
+ lines.append(" continue")
95
+ lines.append(" if line.startswith('[') and line.endswith(']'):")
96
+ lines.append(" current_section = line[1:-1].strip()")
97
+ lines.append(" continue")
98
+ lines.append(" if '=' in line:")
99
+ lines.append(" key, _, value = line.partition('=')")
100
+ lines.append(" key = key.strip()")
101
+ lines.append(" value = value.strip()")
102
+ lines.append(" clean_key = key.lstrip('$')")
103
+ lines.append(" params[clean_key] = value")
104
+ lines.append(" if current_section != 'Global':")
105
+ lines.append(" params[f'{current_section}.{clean_key}'] = value")
106
+ lines.append(" logger.info(f'Loaded {len(params)} parameters from {param_path}')")
107
+ lines.append(" return params")
108
+ lines.append("")
109
+ lines.append("")
110
+ lines.append("def get_param(config, var_name, default=''):")
111
+ lines.append(' """Get a parameter value from config params, then env vars, then default."""')
112
+ lines.append(" clean = var_name.lstrip('$')")
113
+ lines.append(" params = config.get('params', {})")
114
+ lines.append(" if clean in params:")
115
+ lines.append(" return params[clean]")
116
+ lines.append(" env_val = os.environ.get(f'INFA_VAR_{clean}')")
117
+ lines.append(" if env_val is not None:")
118
+ lines.append(" return env_val")
119
+ lines.append(" return default")
120
+ lines.append("")
121
+ lines.append("")
122
+
123
+
62
124
  def _add_db_functions(lines, data_lib):
63
125
  lines.append("# ============================================================")
64
126
  lines.append("# Database Operations")
@@ -1060,14 +1122,26 @@ def _add_expression_helpers(lines):
1060
1122
  lines.append(" return None")
1061
1123
  lines.append("")
1062
1124
  lines.append("")
1063
- lines.append("def get_variable(var_name):")
1064
- lines.append(' """Get workflow/mapping variable value."""')
1065
- lines.append(" return os.environ.get(f'INFA_VAR_{var_name}', '')")
1125
+ lines.append("_param_store = {}")
1126
+ lines.append("")
1127
+ lines.append("")
1128
+ lines.append("def get_variable(var_name, config=None):")
1129
+ lines.append(' """Get workflow/mapping variable value from params, env vars, or param store."""')
1130
+ lines.append(" clean = var_name.lstrip('$')")
1131
+ lines.append(" if config and 'params' in config:")
1132
+ lines.append(" val = config['params'].get(clean)")
1133
+ lines.append(" if val is not None:")
1134
+ lines.append(" return val")
1135
+ lines.append(" if clean in _param_store:")
1136
+ lines.append(" return _param_store[clean]")
1137
+ lines.append(" return os.environ.get(f'INFA_VAR_{clean}', '')")
1066
1138
  lines.append("")
1067
1139
  lines.append("")
1068
1140
  lines.append("def set_variable(var_name, value):")
1069
1141
  lines.append(' """Set workflow/mapping variable value."""')
1070
- lines.append(" os.environ[f'INFA_VAR_{var_name}'] = str(value)")
1142
+ lines.append(" clean = var_name.lstrip('$')")
1143
+ lines.append(" _param_store[clean] = value")
1144
+ lines.append(" os.environ[f'INFA_VAR_{clean}'] = str(value)")
1071
1145
  lines.append(" return value")
1072
1146
  lines.append("")
1073
1147
  lines.append("")
@@ -4,11 +4,16 @@ from informatica_python.models import (
4
4
  TransformationDef, ConnectorDef, InstanceDef, MappletDef,
5
5
  )
6
6
  from informatica_python.utils.expression_converter import (
7
- convert_expression, convert_sql_expression,
7
+ convert_expression, convert_expression_vectorized,
8
+ convert_sql_expression, convert_filter_vectorized,
8
9
  parse_join_condition, parse_lookup_condition,
9
10
  parse_aggregate_expression, PANDAS_AGG_MAP,
10
11
  )
11
12
  from informatica_python.utils.datatype_map import get_python_type
13
+ from informatica_python.utils.lib_adapters import (
14
+ lib_merge, lib_sort, lib_groupby_agg, lib_groupby_first,
15
+ lib_concat, lib_empty_df, lib_copy, lib_rank,
16
+ )
12
17
 
13
18
 
14
19
  def _inline_mapplets(mapping, folder):
@@ -184,6 +189,7 @@ def generate_mapping_code(mapping: MappingDef, folder: FolderDef,
184
189
  lines.append(f"Auto-generated by informatica-python")
185
190
  lines.append('"""')
186
191
  lines.append("")
192
+ lines.append("import numpy as np")
187
193
  lines.append("from helper_functions import *")
188
194
  lines.append("")
189
195
  lines.append("")
@@ -266,7 +272,7 @@ def generate_mapping_code(mapping: MappingDef, folder: FolderDef,
266
272
  for tx in processing_order:
267
273
  if tx.type in ("Source Qualifier", "Application Source Qualifier"):
268
274
  continue
269
- _generate_transformation(lines, tx, connector_graph, source_dfs, transform_map, instance_map)
275
+ _generate_transformation(lines, tx, connector_graph, source_dfs, transform_map, instance_map, data_lib)
270
276
 
271
277
  for tgt_name, tgt_def in target_map.items():
272
278
  _generate_target_write(lines, tgt_name, tgt_def, connector_graph, source_dfs, transform_map, instance_map, session_overrides)
@@ -277,7 +283,12 @@ def generate_mapping_code(mapping: MappingDef, folder: FolderDef,
277
283
  lines.append("")
278
284
  lines.append("")
279
285
  lines.append("if __name__ == '__main__':")
280
- lines.append(" config = load_config()")
286
+ lines.append(" import argparse as _ap")
287
+ lines.append(" _parser = _ap.ArgumentParser()")
288
+ lines.append(" _parser.add_argument('--param-file', default=None)")
289
+ lines.append(" _parser.add_argument('--config', default='config.yml')")
290
+ lines.append(" _args = _parser.parse_args()")
291
+ lines.append(" config = load_config(_args.config, param_file=_args.param_file)")
281
292
  lines.append(f" run_{_safe_name(mapping.name)}(config)")
282
293
  lines.append("")
283
294
 
@@ -560,7 +571,7 @@ def _generate_source_qualifier(lines, sq, source_map, source_dfs, connector_grap
560
571
  lines.append("")
561
572
 
562
573
 
563
- def _generate_transformation(lines, tx, connector_graph, source_dfs, transform_map, instance_map):
574
+ def _generate_transformation(lines, tx, connector_graph, source_dfs, transform_map, instance_map, data_lib="pandas"):
564
575
  tx_safe = _safe_name(tx.name)
565
576
  tx_type = tx.type.lower().strip()
566
577
 
@@ -584,21 +595,21 @@ def _generate_transformation(lines, tx, connector_graph, source_dfs, transform_m
584
595
  lines.append(f" # Transformation: {tx.name} (Type: {tx.type})")
585
596
 
586
597
  if tx_type == "expression":
587
- _gen_expression_transform(lines, tx, tx_safe, input_df, source_dfs)
598
+ _gen_expression_transform(lines, tx, tx_safe, input_df, source_dfs, data_lib)
588
599
  elif tx_type == "filter":
589
- _gen_filter_transform(lines, tx, tx_safe, input_df, source_dfs)
600
+ _gen_filter_transform(lines, tx, tx_safe, input_df, source_dfs, data_lib)
590
601
  elif tx_type in ("aggregator",):
591
- _gen_aggregator_transform(lines, tx, tx_safe, input_df, source_dfs)
602
+ _gen_aggregator_transform(lines, tx, tx_safe, input_df, source_dfs, data_lib)
592
603
  elif tx_type == "sorter":
593
- _gen_sorter_transform(lines, tx, tx_safe, input_df, source_dfs)
604
+ _gen_sorter_transform(lines, tx, tx_safe, input_df, source_dfs, data_lib)
594
605
  elif tx_type in ("joiner",):
595
- _gen_joiner_transform(lines, tx, tx_safe, input_df, input_sources, source_dfs, connector_graph)
606
+ _gen_joiner_transform(lines, tx, tx_safe, input_df, input_sources, source_dfs, connector_graph, data_lib)
596
607
  elif tx_type in ("lookup procedure", "lookup"):
597
- _gen_lookup_transform(lines, tx, tx_safe, input_df, source_dfs)
608
+ _gen_lookup_transform(lines, tx, tx_safe, input_df, source_dfs, data_lib)
598
609
  elif tx_type == "router":
599
610
  _gen_router_transform(lines, tx, tx_safe, input_df, source_dfs)
600
611
  elif tx_type in ("union",):
601
- _gen_union_transform(lines, tx, tx_safe, input_sources, source_dfs)
612
+ _gen_union_transform(lines, tx, tx_safe, input_sources, source_dfs, data_lib)
602
613
  elif tx_type in ("update strategy",):
603
614
  _gen_update_strategy(lines, tx, tx_safe, input_df, source_dfs)
604
615
  elif tx_type == "sequence generator":
@@ -606,9 +617,9 @@ def _generate_transformation(lines, tx, connector_graph, source_dfs, transform_m
606
617
  elif tx_type in ("normalizer",):
607
618
  _gen_normalizer_transform(lines, tx, tx_safe, input_df, source_dfs)
608
619
  elif tx_type in ("rank",):
609
- _gen_rank_transform(lines, tx, tx_safe, input_df, source_dfs)
620
+ _gen_rank_transform(lines, tx, tx_safe, input_df, source_dfs, data_lib)
610
621
  elif tx_type in ("custom transformation",):
611
- _gen_custom_transform(lines, tx, tx_safe, input_df, input_sources, source_dfs)
622
+ _gen_custom_transform(lines, tx, tx_safe, input_df, input_sources, source_dfs, data_lib)
612
623
  elif tx_type in ("stored procedure",):
613
624
  _gen_stored_proc(lines, tx, tx_safe, input_df, source_dfs)
614
625
  elif tx_type in ("java",):
@@ -617,44 +628,48 @@ def _generate_transformation(lines, tx, connector_graph, source_dfs, transform_m
617
628
  _gen_sql_transform(lines, tx, tx_safe, input_df, source_dfs)
618
629
  else:
619
630
  lines.append(f" # TODO: Unsupported transformation type '{tx.type}' - passing through")
620
- lines.append(f" df_{tx_safe} = {input_df}.copy() if hasattr({input_df}, 'copy') else {input_df}")
631
+ copy_expr = lib_copy(data_lib, input_df)
632
+ lines.append(f" df_{tx_safe} = {copy_expr}")
621
633
  source_dfs[tx.name] = f"df_{tx_safe}"
622
634
 
623
635
  lines.append("")
624
636
 
625
637
 
626
- def _gen_expression_transform(lines, tx, tx_safe, input_df, source_dfs):
627
- lines.append(f" df_{tx_safe} = {input_df}.copy()")
638
+ def _gen_expression_transform(lines, tx, tx_safe, input_df, source_dfs, data_lib="pandas"):
639
+ copy_expr = lib_copy(data_lib, input_df)
640
+ lines.append(f" df_{tx_safe} = {copy_expr}")
628
641
  has_expressions = False
629
642
  for fld in tx.fields:
630
643
  if fld.expression and fld.expression.strip() and fld.expression.strip() != fld.name:
631
644
  has_expressions = True
632
- expr_py = convert_expression(fld.expression)
645
+ expr_vec = convert_expression_vectorized(fld.expression, f"df_{tx_safe}")
633
646
  lines.append(f" # {fld.name} = {fld.expression}")
634
647
  if fld.porttype and "OUTPUT" in fld.porttype.upper() and "INPUT" not in fld.porttype.upper():
635
- lines.append(f" df_{tx_safe}['{fld.name}'] = {expr_py} # output-only port")
648
+ lines.append(f" df_{tx_safe}['{fld.name}'] = {expr_vec}")
636
649
  else:
637
- lines.append(f" df_{tx_safe}['{fld.name}'] = {expr_py}")
650
+ lines.append(f" df_{tx_safe}['{fld.name}'] = {expr_vec}")
638
651
  if not has_expressions:
639
652
  lines.append(f" # Pass-through expression (no transformations)")
640
653
  source_dfs[tx.name] = f"df_{tx_safe}"
641
654
 
642
655
 
643
- def _gen_filter_transform(lines, tx, tx_safe, input_df, source_dfs):
656
+ def _gen_filter_transform(lines, tx, tx_safe, input_df, source_dfs, data_lib="pandas"):
644
657
  filter_condition = ""
645
658
  for attr in tx.attributes:
646
659
  if attr.name == "Filter Condition":
647
660
  filter_condition = attr.value
648
661
  if filter_condition:
649
- expr_py = convert_expression(filter_condition)
662
+ expr_vec = convert_filter_vectorized(filter_condition, input_df)
650
663
  lines.append(f" # Filter: {filter_condition}")
651
- lines.append(f" df_{tx_safe} = {input_df}[{expr_py}].copy()")
664
+ copy_expr = lib_copy(data_lib, f"{input_df}[{expr_vec}]")
665
+ lines.append(f" df_{tx_safe} = {copy_expr}")
652
666
  else:
653
- lines.append(f" df_{tx_safe} = {input_df}.copy()")
667
+ copy_expr = lib_copy(data_lib, input_df)
668
+ lines.append(f" df_{tx_safe} = {copy_expr}")
654
669
  source_dfs[tx.name] = f"df_{tx_safe}"
655
670
 
656
671
 
657
- def _gen_aggregator_transform(lines, tx, tx_safe, input_df, source_dfs):
672
+ def _gen_aggregator_transform(lines, tx, tx_safe, input_df, source_dfs, data_lib="pandas"):
658
673
  group_by_ports = []
659
674
  agg_ports = []
660
675
  for fld in tx.fields:
@@ -686,22 +701,18 @@ def _gen_aggregator_transform(lines, tx, tx_safe, input_df, source_dfs):
686
701
 
687
702
  if group_by_ports and agg_dict:
688
703
  lines.append(f" # Aggregator: group by {group_by_ports}")
689
- agg_spec = {}
690
- for out_name, (col, func) in agg_dict.items():
691
- agg_spec[out_name] = f"pd.NamedAgg(column='{col}', aggfunc='{func}')"
692
-
693
- lines.append(f" df_{tx_safe} = {input_df}.groupby({group_by_ports}, as_index=False).agg(")
694
- for out_name, spec in agg_spec.items():
695
- lines.append(f" {out_name}={spec},")
696
- lines.append(f" )")
704
+ agg_expr = lib_groupby_agg(data_lib, input_df, group_by_ports, agg_dict)
705
+ lines.append(f" df_{tx_safe} = {agg_expr}")
697
706
 
698
707
  if rename_map:
699
708
  lines.append(f" df_{tx_safe} = df_{tx_safe}.rename(columns={rename_map})")
700
709
  elif group_by_ports:
701
710
  lines.append(f" # Aggregator: group by {group_by_ports}")
702
- lines.append(f" df_{tx_safe} = {input_df}.groupby({group_by_ports}, as_index=False).agg('first')")
711
+ first_expr = lib_groupby_first(data_lib, input_df, group_by_ports)
712
+ lines.append(f" df_{tx_safe} = {first_expr}")
703
713
  else:
704
- lines.append(f" df_{tx_safe} = {input_df}.copy()")
714
+ copy_expr = lib_copy(data_lib, input_df)
715
+ lines.append(f" df_{tx_safe} = {copy_expr}")
705
716
 
706
717
  for col_name, expr_text in computed_aggs:
707
718
  expr_py = convert_expression(expr_text)
@@ -711,20 +722,22 @@ def _gen_aggregator_transform(lines, tx, tx_safe, input_df, source_dfs):
711
722
  source_dfs[tx.name] = f"df_{tx_safe}"
712
723
 
713
724
 
714
- def _gen_sorter_transform(lines, tx, tx_safe, input_df, source_dfs):
725
+ def _gen_sorter_transform(lines, tx, tx_safe, input_df, source_dfs, data_lib="pandas"):
715
726
  sort_keys = []
716
727
  sort_dirs = []
717
728
  for fld in tx.fields:
718
729
  sort_keys.append(fld.name)
719
730
  sort_dirs.append(True)
720
731
  if sort_keys:
721
- lines.append(f" df_{tx_safe} = {input_df}.sort_values(by={sort_keys}, ascending={sort_dirs}).reset_index(drop=True)")
732
+ sort_expr = lib_sort(data_lib, input_df, sort_keys, sort_dirs)
733
+ lines.append(f" df_{tx_safe} = {sort_expr}")
722
734
  else:
723
- lines.append(f" df_{tx_safe} = {input_df}.copy()")
735
+ copy_expr = lib_copy(data_lib, input_df)
736
+ lines.append(f" df_{tx_safe} = {copy_expr}")
724
737
  source_dfs[tx.name] = f"df_{tx_safe}"
725
738
 
726
739
 
727
- def _gen_joiner_transform(lines, tx, tx_safe, input_df, input_sources, source_dfs, connector_graph=None):
740
+ def _gen_joiner_transform(lines, tx, tx_safe, input_df, input_sources, source_dfs, connector_graph=None, data_lib="pandas"):
728
741
  join_type = "inner"
729
742
  join_condition = ""
730
743
  for attr in tx.attributes:
@@ -778,33 +791,29 @@ def _gen_joiner_transform(lines, tx, tx_safe, input_df, input_sources, source_df
778
791
 
779
792
  lines.append(f" # Join ({join_type}): {join_condition or 'auto'}")
780
793
  if left_keys and right_keys:
781
- lines.append(f" df_{tx_safe} = {df_detail}.merge(")
782
- lines.append(f" {df_master},")
783
- lines.append(f" left_on={left_keys},")
784
- lines.append(f" right_on={right_keys},")
785
- lines.append(f" how='{join_type}',")
786
- lines.append(f" suffixes=('', '_master')")
787
- lines.append(f" )")
794
+ merge_expr = lib_merge(data_lib, df_detail, df_master,
795
+ left_on=left_keys, right_on=right_keys, how=join_type)
796
+ lines.append(f" df_{tx_safe} = {merge_expr}")
788
797
  else:
789
798
  common_cols = [f for f in detail_fields if f in master_fields]
790
799
  if common_cols:
791
- lines.append(f" df_{tx_safe} = {df_detail}.merge(")
792
- lines.append(f" {df_master},")
793
- lines.append(f" on={common_cols},")
794
- lines.append(f" how='{join_type}',")
795
- lines.append(f" suffixes=('', '_master')")
796
- lines.append(f" )")
800
+ merge_expr = lib_merge(data_lib, df_detail, df_master,
801
+ on=common_cols, how=join_type)
802
+ lines.append(f" df_{tx_safe} = {merge_expr}")
797
803
  else:
798
- lines.append(f" df_{tx_safe} = {df_detail}.merge({df_master}, how='{join_type}', suffixes=('', '_master'))")
804
+ merge_expr = lib_merge(data_lib, df_detail, df_master, how=join_type)
805
+ lines.append(f" df_{tx_safe} = {merge_expr}")
799
806
  elif len(src_list) == 1:
800
807
  df1 = source_dfs.get(src_list[0], f"df_{_safe_name(src_list[0])}")
801
- lines.append(f" df_{tx_safe} = {df1}.copy()")
808
+ copy_expr = lib_copy(data_lib, df1)
809
+ lines.append(f" df_{tx_safe} = {copy_expr}")
802
810
  else:
803
- lines.append(f" df_{tx_safe} = {input_df}.copy()")
811
+ copy_expr = lib_copy(data_lib, input_df)
812
+ lines.append(f" df_{tx_safe} = {copy_expr}")
804
813
  source_dfs[tx.name] = f"df_{tx_safe}"
805
814
 
806
815
 
807
- def _gen_lookup_transform(lines, tx, tx_safe, input_df, source_dfs):
816
+ def _gen_lookup_transform(lines, tx, tx_safe, input_df, source_dfs, data_lib="pandas"):
808
817
  lookup_table = ""
809
818
  lookup_sql = ""
810
819
  lookup_condition = ""
@@ -844,7 +853,8 @@ def _gen_lookup_transform(lines, tx, tx_safe, input_df, source_dfs):
844
853
  elif lookup_table:
845
854
  lines.append(f" df_lkp_{tx_safe} = read_from_db(config, 'SELECT * FROM {lookup_table}', 'default')")
846
855
  else:
847
- lines.append(f" df_lkp_{tx_safe} = pd.DataFrame()")
856
+ empty_expr = lib_empty_df(data_lib)
857
+ lines.append(f" df_lkp_{tx_safe} = {empty_expr}")
848
858
 
849
859
  input_keys, lookup_keys = parse_lookup_condition(lookup_condition)
850
860
 
@@ -862,13 +872,10 @@ def _gen_lookup_transform(lines, tx, tx_safe, input_df, source_dfs):
862
872
  else:
863
873
  lines.append(f" df_lkp_{tx_safe} = df_lkp_{tx_safe}[lkp_select_cols_{tx_safe}].drop_duplicates(subset={lookup_keys}, keep='first')")
864
874
 
865
- lines.append(f" df_{tx_safe} = {input_df}.merge(")
866
- lines.append(f" df_lkp_{tx_safe},")
867
- lines.append(f" left_on={input_keys},")
868
- lines.append(f" right_on={lookup_keys},")
869
- lines.append(f" how='left',")
870
- lines.append(f" suffixes=('', '_lkp')")
871
- lines.append(f" )")
875
+ merge_expr = lib_merge(data_lib, input_df, f"df_lkp_{tx_safe}",
876
+ left_on=input_keys, right_on=lookup_keys,
877
+ how="left", suffixes=("", "_lkp"))
878
+ lines.append(f" df_{tx_safe} = {merge_expr}")
872
879
 
873
880
  drop_cols = [k for k in lookup_keys if k not in input_keys]
874
881
  if drop_cols:
@@ -910,7 +917,7 @@ def _gen_router_transform(lines, tx, tx_safe, input_df, source_dfs):
910
917
  source_dfs[tx.name] = f"df_{tx_safe}"
911
918
 
912
919
 
913
- def _gen_union_transform(lines, tx, tx_safe, input_sources, source_dfs):
920
+ def _gen_union_transform(lines, tx, tx_safe, input_sources, source_dfs, data_lib="pandas"):
914
921
  dfs_to_union = []
915
922
  for src in input_sources:
916
923
  df_name = source_dfs.get(src, f"df_{_safe_name(src)}")
@@ -918,11 +925,14 @@ def _gen_union_transform(lines, tx, tx_safe, input_sources, source_dfs):
918
925
 
919
926
  if len(dfs_to_union) > 1:
920
927
  df_list = ", ".join(dfs_to_union)
921
- lines.append(f" df_{tx_safe} = pd.concat([{df_list}], ignore_index=True)")
928
+ concat_expr = lib_concat(data_lib, df_list)
929
+ lines.append(f" df_{tx_safe} = {concat_expr}")
922
930
  elif dfs_to_union:
923
- lines.append(f" df_{tx_safe} = {dfs_to_union[0]}.copy()")
931
+ copy_expr = lib_copy(data_lib, dfs_to_union[0])
932
+ lines.append(f" df_{tx_safe} = {copy_expr}")
924
933
  else:
925
- lines.append(f" df_{tx_safe} = pd.DataFrame()")
934
+ empty_expr = lib_empty_df(data_lib)
935
+ lines.append(f" df_{tx_safe} = {empty_expr}")
926
936
  source_dfs[tx.name] = f"df_{tx_safe}"
927
937
 
928
938
 
@@ -1037,7 +1047,7 @@ def _gen_normalizer_transform(lines, tx, tx_safe, input_df, source_dfs):
1037
1047
  source_dfs[tx.name] = f"df_{tx_safe}"
1038
1048
 
1039
1049
 
1040
- def _gen_rank_transform(lines, tx, tx_safe, input_df, source_dfs):
1050
+ def _gen_rank_transform(lines, tx, tx_safe, input_df, source_dfs, data_lib="pandas"):
1041
1051
  rank_port = None
1042
1052
  group_by_ports = []
1043
1053
  top_bottom = "TOP"
@@ -1080,19 +1090,15 @@ def _gen_rank_transform(lines, tx, tx_safe, input_df, source_dfs):
1080
1090
  rank_out_field = fld.name
1081
1091
  break
1082
1092
 
1083
- lines.append(f" df_{tx_safe} = {input_df}.copy()")
1084
- if rank_port and group_by_ports:
1085
- lines.append(f" # Rank by '{rank_port}' within groups {group_by_ports}")
1086
- lines.append(f" _rank_vals = df_{tx_safe}.groupby({group_by_ports})['{rank_port}'].rank(")
1087
- lines.append(f" method='min', ascending={ascending}")
1088
- lines.append(f" )")
1089
- lines.append(f" df_{tx_safe}['{rank_out_field}'] = _rank_vals.fillna(0).astype(int)")
1090
- if top_n:
1091
- lines.append(f" df_{tx_safe} = df_{tx_safe}[df_{tx_safe}['{rank_out_field}'] <= {top_n}].reset_index(drop=True)")
1092
- elif rank_port:
1093
- lines.append(f" # Rank by '{rank_port}' (no group-by)")
1094
- lines.append(f" _rank_vals = df_{tx_safe}['{rank_port}'].rank(method='min', ascending={ascending})")
1095
- lines.append(f" df_{tx_safe}['{rank_out_field}'] = _rank_vals.fillna(0).astype(int)")
1093
+ copy_expr = lib_copy(data_lib, input_df)
1094
+ lines.append(f" df_{tx_safe} = {copy_expr}")
1095
+ if rank_port:
1096
+ rank_code = lib_rank(data_lib, f"df_{tx_safe}", group_by_ports, rank_port, ascending, rank_out_field)
1097
+ if group_by_ports:
1098
+ lines.append(f" # Rank by '{rank_port}' within groups {group_by_ports}")
1099
+ else:
1100
+ lines.append(f" # Rank by '{rank_port}' (no group-by)")
1101
+ lines.append(f" {rank_code}")
1096
1102
  if top_n:
1097
1103
  lines.append(f" df_{tx_safe} = df_{tx_safe}[df_{tx_safe}['{rank_out_field}'] <= {top_n}].reset_index(drop=True)")
1098
1104
  else:
@@ -1100,7 +1106,7 @@ def _gen_rank_transform(lines, tx, tx_safe, input_df, source_dfs):
1100
1106
  source_dfs[tx.name] = f"df_{tx_safe}"
1101
1107
 
1102
1108
 
1103
- def _gen_custom_transform(lines, tx, tx_safe, input_df, input_sources, source_dfs):
1109
+ def _gen_custom_transform(lines, tx, tx_safe, input_df, input_sources, source_dfs, data_lib="pandas"):
1104
1110
  is_union = False
1105
1111
  output_fields = []
1106
1112
  input_groups = {}
@@ -1108,11 +1114,9 @@ def _gen_custom_transform(lines, tx, tx_safe, input_df, input_sources, source_df
1108
1114
  for fld in tx.fields:
1109
1115
  if "OUTPUT" in (fld.porttype or "").upper():
1110
1116
  output_fields.append(fld)
1111
- group_suffix_match = None
1112
1117
  import re
1113
1118
  m = re.match(r'^(.+?)(\d+)$', fld.name)
1114
1119
  if m and "INPUT" in (fld.porttype or "").upper():
1115
- base_name = m.group(1)
1116
1120
  group_idx = m.group(2)
1117
1121
  if group_idx not in input_groups:
1118
1122
  input_groups[group_idx] = []
@@ -1128,14 +1132,18 @@ def _gen_custom_transform(lines, tx, tx_safe, input_df, input_sources, source_df
1128
1132
  dfs_to_union.append(df_name)
1129
1133
  if len(dfs_to_union) > 1:
1130
1134
  df_list = ", ".join(dfs_to_union)
1131
- lines.append(f" df_{tx_safe} = pd.concat([{df_list}], ignore_index=True)")
1135
+ concat_expr = lib_concat(data_lib, df_list)
1136
+ lines.append(f" df_{tx_safe} = {concat_expr}")
1132
1137
  elif dfs_to_union:
1133
- lines.append(f" df_{tx_safe} = {dfs_to_union[0]}.copy()")
1138
+ copy_expr = lib_copy(data_lib, dfs_to_union[0])
1139
+ lines.append(f" df_{tx_safe} = {copy_expr}")
1134
1140
  else:
1135
- lines.append(f" df_{tx_safe} = pd.DataFrame()")
1141
+ empty_expr = lib_empty_df(data_lib)
1142
+ lines.append(f" df_{tx_safe} = {empty_expr}")
1136
1143
  else:
1137
1144
  lines.append(f" # Custom transformation: {tx.name}")
1138
- lines.append(f" df_{tx_safe} = {input_df}.copy()")
1145
+ copy_expr = lib_copy(data_lib, input_df)
1146
+ lines.append(f" df_{tx_safe} = {copy_expr}")
1139
1147
 
1140
1148
  source_dfs[tx.name] = f"df_{tx_safe}"
1141
1149
 
@@ -51,7 +51,12 @@ def generate_workflow_code(folder: FolderDef) -> str:
51
51
 
52
52
  lines.append("")
53
53
  lines.append("if __name__ == '__main__':")
54
- lines.append(" config = load_config()")
54
+ lines.append(" import argparse as _ap")
55
+ lines.append(" _parser = _ap.ArgumentParser()")
56
+ lines.append(" _parser.add_argument('--param-file', default=None)")
57
+ lines.append(" _parser.add_argument('--config', default='config.yml')")
58
+ lines.append(" _args = _parser.parse_args()")
59
+ lines.append(" config = load_config(_args.config, param_file=_args.param_file)")
55
60
  lines.append(" success = run_workflow(config)")
56
61
  lines.append(" sys.exit(0 if success else 1)")
57
62
  lines.append("")