informatica-python 1.9.3__tar.gz → 1.9.5__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (31)
  1. {informatica_python-1.9.3 → informatica_python-1.9.5}/PKG-INFO +3 -3
  2. {informatica_python-1.9.3 → informatica_python-1.9.5}/README.md +2 -2
  3. {informatica_python-1.9.3 → informatica_python-1.9.5}/informatica_python/__init__.py +1 -1
  4. {informatica_python-1.9.3 → informatica_python-1.9.5}/informatica_python/generators/helper_gen.py +218 -49
  5. {informatica_python-1.9.3 → informatica_python-1.9.5}/informatica_python/generators/mapping_gen.py +35 -8
  6. {informatica_python-1.9.3 → informatica_python-1.9.5}/informatica_python/utils/expression_converter.py +19 -5
  7. {informatica_python-1.9.3 → informatica_python-1.9.5}/informatica_python.egg-info/PKG-INFO +3 -3
  8. {informatica_python-1.9.3 → informatica_python-1.9.5}/pyproject.toml +1 -1
  9. {informatica_python-1.9.3 → informatica_python-1.9.5}/tests/test_integration.py +462 -0
  10. {informatica_python-1.9.3 → informatica_python-1.9.5}/LICENSE +0 -0
  11. {informatica_python-1.9.3 → informatica_python-1.9.5}/informatica_python/cli.py +0 -0
  12. {informatica_python-1.9.3 → informatica_python-1.9.5}/informatica_python/converter.py +0 -0
  13. {informatica_python-1.9.3 → informatica_python-1.9.5}/informatica_python/generators/__init__.py +0 -0
  14. {informatica_python-1.9.3 → informatica_python-1.9.5}/informatica_python/generators/config_gen.py +0 -0
  15. {informatica_python-1.9.3 → informatica_python-1.9.5}/informatica_python/generators/error_log_gen.py +0 -0
  16. {informatica_python-1.9.3 → informatica_python-1.9.5}/informatica_python/generators/sql_gen.py +0 -0
  17. {informatica_python-1.9.3 → informatica_python-1.9.5}/informatica_python/generators/workflow_gen.py +0 -0
  18. {informatica_python-1.9.3 → informatica_python-1.9.5}/informatica_python/models.py +0 -0
  19. {informatica_python-1.9.3 → informatica_python-1.9.5}/informatica_python/parser.py +0 -0
  20. {informatica_python-1.9.3 → informatica_python-1.9.5}/informatica_python/utils/__init__.py +0 -0
  21. {informatica_python-1.9.3 → informatica_python-1.9.5}/informatica_python/utils/datatype_map.py +0 -0
  22. {informatica_python-1.9.3 → informatica_python-1.9.5}/informatica_python/utils/lib_adapters.py +0 -0
  23. {informatica_python-1.9.3 → informatica_python-1.9.5}/informatica_python/utils/sql_dialect.py +0 -0
  24. {informatica_python-1.9.3 → informatica_python-1.9.5}/informatica_python.egg-info/SOURCES.txt +0 -0
  25. {informatica_python-1.9.3 → informatica_python-1.9.5}/informatica_python.egg-info/dependency_links.txt +0 -0
  26. {informatica_python-1.9.3 → informatica_python-1.9.5}/informatica_python.egg-info/entry_points.txt +0 -0
  27. {informatica_python-1.9.3 → informatica_python-1.9.5}/informatica_python.egg-info/requires.txt +0 -0
  28. {informatica_python-1.9.3 → informatica_python-1.9.5}/informatica_python.egg-info/top_level.txt +0 -0
  29. {informatica_python-1.9.3 → informatica_python-1.9.5}/setup.cfg +0 -0
  30. {informatica_python-1.9.3 → informatica_python-1.9.5}/tests/test_converter.py +0 -0
  31. {informatica_python-1.9.3 → informatica_python-1.9.5}/tests/test_expressions.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: informatica-python
3
- Version: 1.9.3
3
+ Version: 1.9.5
4
4
  Summary: Convert Informatica PowerCenter workflow XML to Python/PySpark code
5
5
  Author: Nick
6
6
  License: MIT
@@ -430,7 +430,7 @@ The generated `helper_functions.py` provides a complete runtime library:
430
430
  - **Generated code formatting**: Consistent `# ---` section headers for Source Qualifiers, Transforms, and Target Writes; metadata comments (database type, field lists); column mapping and write operation comments; clean blank line handling
431
431
  - **Source/target detection**: Case-insensitive instance type matching
432
432
  - **Session→mapping inference**: Longest-suffix-match strategy for ambiguous mapping names
433
- - **646 tests** across unit, integration, expression, and formatting test suites
433
+ - **663 tests** across unit, integration, expression, and formatting test suites
434
434
 
435
435
  ### v1.9.2 (Phase 8)
436
436
  - Mapping output files now use real mapping names (e.g., `mapping_m_customer_load.py`) instead of generic numeric indices (`mapping_1.py`)
@@ -495,7 +495,7 @@ The generated `helper_functions.py` provides a complete runtime library:
495
495
  cd informatica_python
496
496
  pip install -e ".[dev]"
497
497
 
498
- # Run tests (646 tests)
498
+ # Run tests (663 tests)
499
499
  pytest tests/ -v
500
500
  ```
501
501
 
@@ -403,7 +403,7 @@ The generated `helper_functions.py` provides a complete runtime library:
403
403
  - **Generated code formatting**: Consistent `# ---` section headers for Source Qualifiers, Transforms, and Target Writes; metadata comments (database type, field lists); column mapping and write operation comments; clean blank line handling
404
404
  - **Source/target detection**: Case-insensitive instance type matching
405
405
  - **Session→mapping inference**: Longest-suffix-match strategy for ambiguous mapping names
406
- - **646 tests** across unit, integration, expression, and formatting test suites
406
+ - **663 tests** across unit, integration, expression, and formatting test suites
407
407
 
408
408
  ### v1.9.2 (Phase 8)
409
409
  - Mapping output files now use real mapping names (e.g., `mapping_m_customer_load.py`) instead of generic numeric indices (`mapping_1.py`)
@@ -468,7 +468,7 @@ The generated `helper_functions.py` provides a complete runtime library:
468
468
  cd informatica_python
469
469
  pip install -e ".[dev]"
470
470
 
471
- # Run tests (646 tests)
471
+ # Run tests (663 tests)
472
472
  pytest tests/ -v
473
473
  ```
474
474
 
@@ -7,7 +7,7 @@ Licensed under the MIT License.
7
7
 
8
8
  from informatica_python.converter import InformaticaConverter
9
9
 
10
- __version__ = "1.9.3"
10
+ __version__ = "1.9.5"
11
11
  __author__ = "Nick"
12
12
  __license__ = "MIT"
13
13
  __all__ = ["InformaticaConverter"]
@@ -11,6 +11,7 @@ def generate_helper_functions(folder: FolderDef, data_lib: str = "pandas") -> st
11
11
  lines.append("")
12
12
 
13
13
  lines.append("import os")
14
+ lines.append("import re")
14
15
  lines.append("import sys")
15
16
  lines.append("import logging")
16
17
  lines.append("import yaml")
@@ -58,6 +59,7 @@ def generate_helper_functions(folder: FolderDef, data_lib: str = "pandas") -> st
58
59
  lines.append("")
59
60
 
60
61
  _add_param_file_functions(lines)
62
+ _add_env_resolution(lines)
61
63
  _add_db_functions(lines, data_lib)
62
64
  _add_file_functions(lines, data_lib)
63
65
  _add_expression_helpers(lines)
@@ -121,23 +123,143 @@ def _add_param_file_functions(lines):
121
123
  lines.append("")
122
124
 
123
125
 
126
+ def _add_env_resolution(lines):
127
+ lines.append("# ============================================================")
128
+ lines.append("# Environment Variable Resolution")
129
+ lines.append("# ============================================================")
130
+ lines.append("")
131
+ lines.append("")
132
+ lines.append("def resolve_env(value, config=None):")
133
+ lines.append(' """')
134
+ lines.append(" Resolve ${VAR} placeholders in a string.")
135
+ lines.append(" Lookup order: OS environment variable -> config connections/variables -> literal.")
136
+ lines.append(' """')
137
+ lines.append(" if not isinstance(value, str):")
138
+ lines.append(" return value")
139
+ lines.append(" def _replace(m):")
140
+ lines.append(" var = m.group(1)")
141
+ lines.append(" env_val = os.environ.get(var)")
142
+ lines.append(" if env_val is not None:")
143
+ lines.append(" return env_val")
144
+ lines.append(" if config:")
145
+ lines.append(" for section in ('variables', 'connections', 'params'):")
146
+ lines.append(" sect = config.get(section, {})")
147
+ lines.append(" if isinstance(sect, dict) and var in sect:")
148
+ lines.append(" v = sect[var]")
149
+ lines.append(" return str(v) if not isinstance(v, dict) else str(v.get('default_value', ''))")
150
+ lines.append(" return m.group(0)")
151
+ lines.append(r" return re.sub(r'\$\{(\w+)\}', _replace, value)")
152
+ lines.append("")
153
+ lines.append("")
154
+ lines.append("def rename_with_duplicates(df, col_mapping):")
155
+ lines.append(' """')
156
+ lines.append(" Rename DataFrame columns supporting one-source-to-many-target mapping.")
157
+ lines.append(" col_mapping is {target_col: source_col}.")
158
+ lines.append(" When multiple target cols map to the same source col, we duplicate the column.")
159
+ lines.append(' """')
160
+ lines.append(" result = df.copy()")
161
+ lines.append(" from collections import Counter")
162
+ lines.append(" src_counts = Counter(col_mapping.values())")
163
+ lines.append(" simple_rename = {}")
164
+ lines.append(" for tgt, src in col_mapping.items():")
165
+ lines.append(" if src_counts[src] == 1 and src in result.columns:")
166
+ lines.append(" simple_rename[src] = tgt")
167
+ lines.append(" elif src in result.columns:")
168
+ lines.append(" result[tgt] = result[src].copy()")
169
+ lines.append(" if simple_rename:")
170
+ lines.append(" result = result.rename(columns=simple_rename)")
171
+ lines.append(" return result")
172
+ lines.append("")
173
+ lines.append("")
174
+
175
+ lines.append("def resolve_builtin_variable(var_name, mapping_name='', session_name='', folder_name=''):")
176
+ lines.append(' """Resolve Informatica built-in variables like $PMMappingName, $PMSessionName."""')
177
+ lines.append(" builtins = {")
178
+ lines.append(" 'PMMappingName': mapping_name,")
179
+ lines.append(" 'PMSessionName': session_name,")
180
+ lines.append(" 'PMFolderName': folder_name,")
181
+ lines.append(" 'PMWorkflowName': os.environ.get('INFA_VAR_PMWorkflowName', ''),")
182
+ lines.append(" 'PMWorkflowRunId': os.environ.get('INFA_VAR_PMWorkflowRunId', '0'),")
183
+ lines.append(" 'PMSessionRunId': os.environ.get('INFA_VAR_PMSessionRunId', '0'),")
184
+ lines.append(" 'PMIntegrationServiceName': os.environ.get('INFA_VAR_PMIntegrationServiceName', ''),")
185
+ lines.append(" 'PMRepositoryServiceName': os.environ.get('INFA_VAR_PMRepositoryServiceName', ''),")
186
+ lines.append(" 'PMSourceDBConnection': os.environ.get('INFA_VAR_PMSourceDBConnection', ''),")
187
+ lines.append(" 'PMTargetDBConnection': os.environ.get('INFA_VAR_PMTargetDBConnection', ''),")
188
+ lines.append(" }")
189
+ lines.append(" clean = var_name.lstrip('$').lstrip('PM')")
190
+ lines.append(" for key, val in builtins.items():")
191
+ lines.append(" if key.lower() == ('PM' + clean).lower() or key.lower() == var_name.lstrip('$').lower():")
192
+ lines.append(" return val")
193
+ lines.append(" return os.environ.get(f'INFA_VAR_{var_name.lstrip(\"$\")}', '')")
194
+ lines.append("")
195
+ lines.append("")
196
+
197
+
124
198
  def _add_db_functions(lines, data_lib):
125
199
  lines.append("# ============================================================")
126
200
  lines.append("# Database Operations")
127
201
  lines.append("# ============================================================")
128
202
  lines.append("")
129
203
  lines.append("")
204
+ lines.append("_engine_cache = {}")
205
+ lines.append("")
206
+ lines.append("")
130
207
  lines.append("def get_db_connection(config, connection_name='default'):")
131
- lines.append(' """Create database connection from config."""')
208
+ lines.append(' """')
209
+ lines.append(" Create database connection from config.")
210
+ lines.append(" Prefers SQLAlchemy engine (with connection pooling) over raw drivers.")
211
+ lines.append(" Config values support ${VAR} env-var placeholders via resolve_env().")
212
+ lines.append(' """')
132
213
  lines.append(" conn_config = config.get('connections', {}).get(connection_name, {})")
133
- lines.append(" db_type = conn_config.get('type', 'mssql')")
134
- lines.append(" host = conn_config.get('host', 'localhost')")
135
- lines.append(" port = conn_config.get('port', 1433)")
136
- lines.append(" database = conn_config.get('database', '')")
137
- lines.append(" username = conn_config.get('username', '')")
138
- lines.append(" password = conn_config.get('password', '')")
139
- lines.append(" schema = conn_config.get('schema', 'dbo')")
214
+ lines.append(" db_type = resolve_env(conn_config.get('type', 'mssql'), config)")
215
+ lines.append(" host = resolve_env(conn_config.get('host', 'localhost'), config)")
216
+ lines.append(" port = resolve_env(conn_config.get('port', 1433), config)")
217
+ lines.append(" database = resolve_env(conn_config.get('database', ''), config)")
218
+ lines.append(" username = resolve_env(conn_config.get('username', ''), config)")
219
+ lines.append(" password = resolve_env(conn_config.get('password', ''), config)")
220
+ lines.append(" schema = resolve_env(conn_config.get('schema', 'dbo'), config)")
221
+ lines.append("")
222
+ lines.append(" cache_key = f'{db_type}://{username}@{host}:{port}/{database}'")
223
+ lines.append(" if cache_key in _engine_cache:")
224
+ lines.append(" return _engine_cache[cache_key].connect()")
225
+ lines.append("")
226
+ lines.append(" try:")
227
+ lines.append(" from sqlalchemy import create_engine")
228
+ lines.append(" url = _build_sqlalchemy_url(db_type, host, port, database, username, password)")
229
+ lines.append(" if url:")
230
+ lines.append(" engine = create_engine(url, pool_pre_ping=True, pool_size=5)")
231
+ lines.append(" _engine_cache[cache_key] = engine")
232
+ lines.append(" return engine.connect()")
233
+ lines.append(" except ImportError:")
234
+ lines.append(" logger.info('SQLAlchemy not available, falling back to raw drivers')")
235
+ lines.append(" except Exception as e:")
236
+ lines.append(" logger.warning(f'SQLAlchemy connection failed: {e}, falling back to raw drivers')")
237
+ lines.append("")
238
+ lines.append(" return _get_raw_connection(db_type, host, port, database, username, password, conn_config)")
140
239
  lines.append("")
240
+ lines.append("")
241
+ lines.append("def _build_sqlalchemy_url(db_type, host, port, database, username, password):")
242
+ lines.append(" from urllib.parse import quote_plus")
243
+ lines.append(" pw = quote_plus(str(password)) if password else ''")
244
+ lines.append(" if db_type == 'mssql':")
245
+ lines.append(" try:")
246
+ lines.append(" import pyodbc")
247
+ lines.append(" conn_str = quote_plus(")
248
+ lines.append(" f'DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={host},{port};DATABASE={database};UID={username};PWD={password}'")
249
+ lines.append(" )")
250
+ lines.append(" return f'mssql+pyodbc:///?odbc_connect={conn_str}'")
251
+ lines.append(" except ImportError:")
252
+ lines.append(" return f'mssql+pymssql://{username}:{pw}@{host}:{port}/{database}'")
253
+ lines.append(" elif db_type == 'postgresql':")
254
+ lines.append(" return f'postgresql://{username}:{pw}@{host}:{port}/{database}'")
255
+ lines.append(" elif db_type == 'oracle':")
256
+ lines.append(" return f'oracle+cx_oracle://{username}:{pw}@{host}:{port}/{database}'")
257
+ lines.append(" elif db_type == 'mysql':")
258
+ lines.append(" return f'mysql+pymysql://{username}:{pw}@{host}:{port}/{database}'")
259
+ lines.append(" return None")
260
+ lines.append("")
261
+ lines.append("")
262
+ lines.append("def _get_raw_connection(db_type, host, port, database, username, password, conn_config):")
141
263
  lines.append(" if db_type == 'mssql':")
142
264
  lines.append(" try:")
143
265
  lines.append(" import pyodbc")
@@ -156,39 +278,19 @@ def _add_db_functions(lines, data_lib):
156
278
  lines.append(" return pymssql.connect(server=host, port=int(port), database=database, user=username, password=password)")
157
279
  lines.append(" except ImportError:")
158
280
  lines.append(" pass")
159
- lines.append(" try:")
160
- lines.append(" from sqlalchemy import create_engine")
161
- lines.append(" engine = create_engine(f'mssql+pymssql://{username}:{password}@{host}:{port}/{database}')")
162
- lines.append(" return engine.connect()")
163
- lines.append(" except ImportError:")
164
- lines.append(" pass")
165
- lines.append("")
166
- lines.append(" if db_type == 'postgresql':")
167
- lines.append(" try:")
168
- lines.append(" import psycopg2")
169
- lines.append(" return psycopg2.connect(")
170
- lines.append(" host=host, port=port, dbname=database,")
171
- lines.append(" user=username, password=password")
172
- lines.append(" )")
173
- lines.append(" except ImportError:")
174
- lines.append(" pass")
175
- lines.append("")
176
- lines.append(" if db_type == 'oracle':")
177
- lines.append(" try:")
178
- lines.append(" import cx_Oracle")
179
- lines.append(" dsn = cx_Oracle.makedsn(host, port, service_name=database)")
180
- lines.append(" return cx_Oracle.connect(username, password, dsn)")
181
- lines.append(" except ImportError:")
182
- lines.append(" pass")
281
+ lines.append(" elif db_type == 'postgresql':")
282
+ lines.append(" import psycopg2")
283
+ lines.append(" return psycopg2.connect(host=host, port=port, dbname=database, user=username, password=password)")
284
+ lines.append(" elif db_type == 'oracle':")
285
+ lines.append(" import cx_Oracle")
286
+ lines.append(" dsn = cx_Oracle.makedsn(host, port, service_name=database)")
287
+ lines.append(" return cx_Oracle.connect(username, password, dsn)")
183
288
  lines.append("")
184
289
  lines.append(" jdbc_url = conn_config.get('jdbc_url', '')")
185
290
  lines.append(" if jdbc_url:")
186
- lines.append(" try:")
187
- lines.append(" import jaydebeapi")
188
- lines.append(" driver = conn_config.get('jdbc_driver', '')")
189
- lines.append(" return jaydebeapi.connect(driver, jdbc_url, [username, password])")
190
- lines.append(" except ImportError:")
191
- lines.append(" pass")
291
+ lines.append(" import jaydebeapi")
292
+ lines.append(" driver = conn_config.get('jdbc_driver', '')")
293
+ lines.append(" return jaydebeapi.connect(driver, jdbc_url, [username, password])")
192
294
  lines.append("")
193
295
  lines.append(" raise ConnectionError(f'Cannot create connection for type: {db_type}')")
194
296
  lines.append("")
@@ -203,6 +305,15 @@ def _add_db_functions(lines, data_lib):
203
305
  else:
204
306
  read_func = "pd.read_sql"
205
307
 
308
+ lines.append("def _safe_close(conn):")
309
+ lines.append(' """Close connection safely — handles both SQLAlchemy and raw connections."""')
310
+ lines.append(" try:")
311
+ lines.append(" if hasattr(conn, 'close'):")
312
+ lines.append(" conn.close()")
313
+ lines.append(" except Exception:")
314
+ lines.append(" pass")
315
+ lines.append("")
316
+ lines.append("")
206
317
  lines.append("def read_from_db(config, query, connection_name='default'):")
207
318
  lines.append(' """Read data from database using SQL query."""')
208
319
  lines.append(" conn = get_db_connection(config, connection_name)")
@@ -220,7 +331,7 @@ def _add_db_functions(lines, data_lib):
220
331
  lines.append(" logger.error(f'DB read error on {{connection_name}}: {{e}}')")
221
332
  lines.append(" raise")
222
333
  lines.append(" finally:")
223
- lines.append(" conn.close()")
334
+ lines.append(" _safe_close(conn)")
224
335
  lines.append("")
225
336
  lines.append("")
226
337
  lines.append("def write_to_db(config, df, table_name, connection_name='default', if_exists='append', schema=None):")
@@ -242,23 +353,31 @@ def _add_db_functions(lines, data_lib):
242
353
  lines.append(" logger.error(f'DB write error to {{schema}}.{{table_name}}: {{e}}')")
243
354
  lines.append(" raise")
244
355
  lines.append(" finally:")
245
- lines.append(" conn.close()")
356
+ lines.append(" _safe_close(conn)")
246
357
  lines.append("")
247
358
  lines.append("")
248
359
  lines.append("def execute_sql(config, sql, connection_name='default'):")
249
360
  lines.append(' """Execute a SQL statement (INSERT, UPDATE, DELETE, DDL)."""')
250
361
  lines.append(" conn = get_db_connection(config, connection_name)")
251
362
  lines.append(" try:")
252
- lines.append(" cursor = conn.cursor()")
253
- lines.append(" cursor.execute(sql)")
254
- lines.append(" conn.commit()")
363
+ lines.append(" if hasattr(conn, 'execute'):")
364
+ lines.append(" from sqlalchemy import text")
365
+ lines.append(" conn.execute(text(sql))")
366
+ lines.append(" conn.commit()")
367
+ lines.append(" else:")
368
+ lines.append(" cursor = conn.cursor()")
369
+ lines.append(" cursor.execute(sql)")
370
+ lines.append(" conn.commit()")
255
371
  lines.append(" logger.info(f'Executed SQL on {{connection_name}}')")
256
372
  lines.append(" except Exception as e:")
257
373
  lines.append(" logger.error(f'SQL execution error: {{e}}')")
258
- lines.append(" conn.rollback()")
374
+ lines.append(" try:")
375
+ lines.append(" conn.rollback()")
376
+ lines.append(" except Exception:")
377
+ lines.append(" pass")
259
378
  lines.append(" raise")
260
379
  lines.append(" finally:")
261
- lines.append(" conn.close()")
380
+ lines.append(" _safe_close(conn)")
262
381
  lines.append("")
263
382
  lines.append("")
264
383
 
@@ -1150,10 +1269,60 @@ def _add_expression_helpers(lines):
1150
1269
  lines.append(" raise SystemExit(message)")
1151
1270
  lines.append("")
1152
1271
  lines.append("")
1153
- lines.append("def lookup_func(table, condition, *fields):")
1154
- lines.append(' """Placeholder for Informatica LOOKUP function."""')
1155
- lines.append(" logger.warning(f'LOOKUP called for table {table} - implement in mapping-specific code')")
1156
- lines.append(" return None")
1272
+ lines.append("_lookup_cache = {}")
1273
+ lines.append("")
1274
+ lines.append("")
1275
+ lines.append("def lookup_func(table, condition, *fields, config=None, connection_name='default'):")
1276
+ lines.append(' """')
1277
+ lines.append(" Informatica unconnected LOOKUP function.")
1278
+ lines.append(" Loads and caches the lookup table, then filters by condition.")
1279
+ lines.append(" Returns the first matching value of the first return field, or None.")
1280
+ lines.append(' """')
1281
+ lines.append(" global _lookup_cache")
1282
+ lines.append(" if table not in _lookup_cache:")
1283
+ lines.append(" if config is not None:")
1284
+ lines.append(" try:")
1285
+ lines.append(" lkp_conn = connection_name")
1286
+ lines.append(" conns = config.get('connections', {})")
1287
+ lines.append(" for cname, cval in conns.items():")
1288
+ lines.append(" if isinstance(cval, dict) and cval.get('connection_name', '') == table:")
1289
+ lines.append(" lkp_conn = cname")
1290
+ lines.append(" break")
1291
+ lines.append(" df_lkp = read_from_db(config, f'SELECT * FROM {table}', lkp_conn)")
1292
+ lines.append(" _lookup_cache[table] = df_lkp")
1293
+ lines.append(" logger.info(f'Cached lookup table {table}: {len(df_lkp)} rows')")
1294
+ lines.append(" except Exception as e:")
1295
+ lines.append(" logger.warning(f'Could not load lookup table {table}: {e}')")
1296
+ lines.append(" _lookup_cache[table] = None")
1297
+ lines.append(" else:")
1298
+ lines.append(" logger.warning(f'LOOKUP called for {table} without config - returning None')")
1299
+ lines.append(" return None")
1300
+ lines.append(" df_lkp = _lookup_cache.get(table)")
1301
+ lines.append(" if df_lkp is None or df_lkp.empty:")
1302
+ lines.append(" return None")
1303
+ lines.append(" try:")
1304
+ lines.append(" if callable(condition):")
1305
+ lines.append(" matches = df_lkp[condition(df_lkp)]")
1306
+ lines.append(" elif isinstance(condition, str) and '=' in condition:")
1307
+ lines.append(" col, _, val = condition.partition('=')")
1308
+ lines.append(" col = col.strip()")
1309
+ lines.append(" val = val.strip().strip(\"'\")")
1310
+ lines.append(" if col in df_lkp.columns:")
1311
+ lines.append(" matches = df_lkp[df_lkp[col].astype(str) == str(val)]")
1312
+ lines.append(" else:")
1313
+ lines.append(" return None")
1314
+ lines.append(" else:")
1315
+ lines.append(" return None")
1316
+ lines.append(" if matches.empty:")
1317
+ lines.append(" return None")
1318
+ lines.append(" if fields:")
1319
+ lines.append(" field = str(fields[0]).strip()")
1320
+ lines.append(" if field in matches.columns:")
1321
+ lines.append(" return matches.iloc[0][field]")
1322
+ lines.append(" return matches.iloc[0].to_dict()")
1323
+ lines.append(" except Exception as e:")
1324
+ lines.append(" logger.warning(f'LOOKUP error on {table}: {e}')")
1325
+ lines.append(" return None")
1157
1326
  lines.append("")
1158
1327
  lines.append("")
1159
1328
  lines.append("_param_store = {}")
@@ -419,9 +419,10 @@ def _safe_name(name):
419
419
  return safe.lower()
420
420
 
421
421
 
422
- def _emit_sql_with_params(lines, sql_var_name, sql_text, indent=" "):
422
+ def _emit_sql_with_params(lines, sql_var_name, sql_text, indent=" ", mapping_name="", session_name="", folder_name=""):
423
423
  import re
424
424
  params = re.findall(r'\$\$(\w+)', sql_text)
425
+ pm_vars = re.findall(r'\$(PM\w+)', sql_text)
425
426
  lines.append(f"{indent}{sql_var_name} = '''")
426
427
  for sql_line in sql_text.strip().split("\n"):
427
428
  lines.append(f"{indent}{sql_line}")
@@ -433,6 +434,13 @@ def _emit_sql_with_params(lines, sql_var_name, sql_text, indent=" "):
433
434
  continue
434
435
  seen.add(p)
435
436
  lines.append(f"{indent}{sql_var_name} = {sql_var_name}.replace('$${p}', str(get_param(config, '{p}')))")
437
+ if pm_vars:
438
+ seen_pm = set()
439
+ for pm in pm_vars:
440
+ if pm in seen_pm:
441
+ continue
442
+ seen_pm.add(pm)
443
+ lines.append(f"{indent}{sql_var_name} = {sql_var_name}.replace('${pm}', str(resolve_builtin_variable('{pm}', mapping_name='{mapping_name}', session_name='{session_name}', folder_name='{folder_name}')))")
436
444
 
437
445
 
438
446
  def _flatfile_config_dict(ff):
@@ -757,7 +765,7 @@ def _generate_transformation(lines, tx, connector_graph, source_dfs, transform_m
757
765
  elif tx_type in ("joiner",):
758
766
  _gen_joiner_transform(lines, tx, tx_safe, input_df, input_sources, source_dfs, connector_graph, data_lib)
759
767
  elif tx_type in ("lookup procedure", "lookup"):
760
- _gen_lookup_transform(lines, tx, tx_safe, input_df, source_dfs, data_lib)
768
+ _gen_lookup_transform(lines, tx, tx_safe, input_df, source_dfs, connector_graph, data_lib)
761
769
  elif tx_type == "router":
762
770
  _gen_router_transform(lines, tx, tx_safe, input_df, source_dfs)
763
771
  elif tx_type in ("union",):
@@ -982,7 +990,7 @@ def _gen_joiner_transform(lines, tx, tx_safe, input_df, input_sources, source_df
982
990
  source_dfs[tx.name] = f"df_{tx_safe}"
983
991
 
984
992
 
985
- def _gen_lookup_transform(lines, tx, tx_safe, input_df, source_dfs, data_lib="pandas"):
993
+ def _gen_lookup_transform(lines, tx, tx_safe, input_df, source_dfs, connector_graph=None, data_lib="pandas"):
986
994
  lookup_table = ""
987
995
  lookup_sql = ""
988
996
  lookup_condition = ""
@@ -1012,6 +1020,11 @@ def _gen_lookup_transform(lines, tx, tx_safe, input_df, source_dfs, data_lib="pa
1012
1020
 
1013
1021
  all_output_fields = return_fields + lookup_output_fields
1014
1022
 
1023
+ port_to_col = {}
1024
+ if connector_graph and tx.name in connector_graph.get("to", {}):
1025
+ for conn in connector_graph["to"][tx.name]:
1026
+ port_to_col[conn.to_field.lower()] = conn.from_field
1027
+
1015
1028
  lines.append(f" # Lookup: {lookup_table or tx.name}")
1016
1029
  if lookup_sql:
1017
1030
  _emit_sql_with_params(lines, f"lkp_sql_{tx_safe}", lookup_sql)
@@ -1020,10 +1033,13 @@ def _gen_lookup_transform(lines, tx, tx_safe, input_df, source_dfs, data_lib="pa
1020
1033
  lines.append(f" df_lkp_{tx_safe} = read_from_db(config, 'SELECT * FROM {lookup_table}', 'default')")
1021
1034
  else:
1022
1035
  empty_expr = lib_empty_df(data_lib)
1023
- lines.append(f" df_lkp_{tx_safe} = {empty_expr}")
1036
+ lines.append(f" df_lkp_{tx_safe} = {empty_expr} # WARNING: no lookup table/SQL override found")
1024
1037
 
1025
1038
  input_keys, lookup_keys = parse_lookup_condition(lookup_condition)
1026
1039
 
1040
+ if input_keys and port_to_col:
1041
+ input_keys = [port_to_col.get(k.lower(), k) for k in input_keys]
1042
+
1027
1043
  if input_keys and lookup_keys:
1028
1044
  lines.append(f" # Lookup condition: {lookup_condition}")
1029
1045
 
@@ -1078,12 +1094,23 @@ def _gen_router_transform(lines, tx, tx_safe, input_df, source_dfs):
1078
1094
  if "Group Filter Condition" in attr.name:
1079
1095
  group_conditions[attr.name] = attr.value
1080
1096
 
1097
+ remaining_mask_parts = []
1081
1098
  if group_conditions:
1082
1099
  for i, (gname, cond) in enumerate(group_conditions.items()):
1083
- expr_py = convert_expression(cond) if cond else "True"
1084
- lines.append(f" df_{tx_safe}_group{i} = {input_df}[{expr_py}].copy() # {gname}")
1100
+ if cond and cond.strip():
1101
+ expr_py = convert_filter_vectorized(cond, input_df)
1102
+ else:
1103
+ expr_py = f"pd.Series(True, index={input_df}.index)"
1104
+ mask_var = f"_router_mask_{tx_safe}_{i}"
1105
+ lines.append(f" {mask_var} = {expr_py} # {gname}")
1106
+ lines.append(f" df_{tx_safe}_group{i} = {input_df}[{mask_var}].copy()")
1085
1107
  source_dfs[f"{tx.name}_group{i}"] = f"df_{tx_safe}_group{i}"
1086
- lines.append(f" df_{tx_safe} = {input_df}.copy() # Default group")
1108
+ remaining_mask_parts.append(f"~{mask_var}")
1109
+ if remaining_mask_parts:
1110
+ lines.append(f" _router_default_mask = {' & '.join(remaining_mask_parts)}")
1111
+ lines.append(f" df_{tx_safe} = {input_df}[_router_default_mask].copy() # Default group")
1112
+ else:
1113
+ lines.append(f" df_{tx_safe} = {input_df}.copy() # Default group")
1087
1114
  source_dfs[tx.name] = f"df_{tx_safe}"
1088
1115
 
1089
1116
 
@@ -1442,7 +1469,7 @@ def _generate_target_write(lines, tgt_name, tgt_def, connector_graph, source_dfs
1442
1469
  if col_mapping:
1443
1470
  lines.append(f" # Column mapping: source -> target")
1444
1471
  lines.append(f" target_columns_{tgt_safe} = {col_mapping}")
1445
- lines.append(f" df_target_{tgt_safe} = {input_df}.rename(columns={{v: k for k, v in target_columns_{tgt_safe}.items()}})")
1472
+ lines.append(f" df_target_{tgt_safe} = rename_with_duplicates({input_df}, target_columns_{tgt_safe})")
1446
1473
  target_cols = [f.name for f in tgt_def.fields] if tgt_def.fields else None
1447
1474
  if target_cols:
1448
1475
  lines.append(f" # Select only target columns")
@@ -248,6 +248,7 @@ def _convert_infa_date_format(fmt_str):
248
248
  fmt = fmt.replace("Mon", "%b").replace("MON", "%b")
249
249
  fmt = fmt.replace("HH24", "%H").replace("HH12", "%I").replace("HH", "%H")
250
250
  fmt = fmt.replace("MI", "%M").replace("SS", "%S")
251
+ fmt = fmt.replace("US", "%f").replace("NS", "%f").replace("MS", "%f")
251
252
  return fmt
252
253
 
253
254
 
@@ -548,7 +549,7 @@ def _vec_recursive(expr, df_var):
548
549
  'RTRIM': f'.str.rstrip("{char_arg}")',
549
550
  'TRIM': f'.str.strip("{char_arg}")',
550
551
  }
551
- return f'{inner_val}{method_map[func_name.upper()]}'
552
+ return f'{inner_val}.astype(str){method_map[func_name.upper()]}'
552
553
 
553
554
  upper_result = _find_func_call(cleaned, 'UPPER')
554
555
  if upper_result and upper_result[0] == 0 and upper_result[1] == len(cleaned):
@@ -584,7 +585,7 @@ def _vec_recursive(expr, df_var):
584
585
  if len(args) >= 2:
585
586
  field_val = _vec_recursive(args[0], df_var)
586
587
  try:
587
- start = int(args[1].strip()) - 1
588
+ start = max(int(args[1].strip()) - 1, 0)
588
589
  except ValueError:
589
590
  start_val = _vec_recursive(args[1], df_var)
590
591
  if len(args) >= 3:
@@ -722,7 +723,11 @@ def _vec_recursive(expr, df_var):
722
723
  field_val = _vec_recursive(args[0], df_var)
723
724
  pattern_val = args[1].strip().strip("'\"")
724
725
  if func_name == 'REG_EXTRACT':
725
- return f'{field_val}.str.extract(r"({pattern_val})", expand=False)'
726
+ if re.search(r'(?<!\\)\((?!\?)', pattern_val):
727
+ extract_pat = pattern_val
728
+ else:
729
+ extract_pat = f'({pattern_val})'
730
+ return f'{field_val}.str.extract(r"{extract_pat}", expand=False)'
726
731
  elif func_name == 'REG_REPLACE':
727
732
  replace_val = args[2].strip().strip("'\"") if len(args) >= 3 else ''
728
733
  return f'{field_val}.str.replace(r"{pattern_val}", "{replace_val}", regex=True)'
@@ -862,7 +867,7 @@ def _vec_recursive(expr, df_var):
862
867
  if v.startswith("'") and v.endswith("'"):
863
868
  vec_parts.append(v)
864
869
  else:
865
- vec_parts.append(f'{v}.astype(str)')
870
+ vec_parts.append(f'{v}.fillna(\'\').astype(str)')
866
871
  return " + ".join(vec_parts)
867
872
 
868
873
  for func_name in sorted(INFA_FUNC_MAP.keys(), key=lambda x: -len(x)):
@@ -894,7 +899,8 @@ def _vec_recursive(expr, df_var):
894
899
  'True', 'False', 'None', 'and', 'or', 'not', 'np', 'pd', 'get_variable',
895
900
  'str', 'int', 'float', 'bool', 'len', 'abs', 'round',
896
901
  'fillna', 'astype', 'isna', 'notna', 'where', 'errors', 'coerce',
897
- 'lookup_func',
902
+ 'lookup_func', 'expand', 'extract', 'regex', 'contains', 'replace',
903
+ 'upper', 'lower', 'strip', 'lstrip', 'rstrip', 'dt', 'copy',
898
904
  }
899
905
  converted = _substitute_fields(converted, df_var, skip_words)
900
906
 
@@ -904,6 +910,8 @@ def _vec_recursive(expr, df_var):
904
910
  converted = re.sub(r'<>', '!=', converted)
905
911
  converted = re.sub(r'(?<![<>!=])=(?!=)', '==', converted)
906
912
  converted = re.sub(r'\berrors\s*==\s*(["\'])', r'errors=\1', converted)
913
+ converted = re.sub(r'\bexpand\s*==\s*', 'expand=', converted)
914
+ converted = re.sub(r'\bregex\s*==\s*', 'regex=', converted)
907
915
 
908
916
  converted = re.sub(r'\s+', ' ', converted).strip()
909
917
 
@@ -1044,8 +1052,14 @@ def _vectorize_simple(part, df_var):
1044
1052
  'True', 'False', 'None', 'and', 'or', 'not', 'np', 'pd',
1045
1053
  'str', 'int', 'float', 'isna', 'notna', 'fillna',
1046
1054
  'get_variable', 'lookup_func', 'isin', 'eq',
1055
+ 'expand', 'extract', 'astype', 'errors', 'coerce', 'regex',
1056
+ 'contains', 'replace', 'upper', 'lower', 'strip', 'lstrip', 'rstrip',
1057
+ 'dt', 'len', 'copy', 'abs', 'round', 'where', 'bool',
1047
1058
  }
1048
1059
  c = _substitute_fields(c, df_var, skip_words)
1060
+ c = re.sub(r'\bexpand\s*==\s*', 'expand=', c)
1061
+ c = re.sub(r'\berrors\s*==\s*', 'errors=', c)
1062
+ c = re.sub(r'\bregex\s*==\s*', 'regex=', c)
1049
1063
 
1050
1064
  return c
1051
1065
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: informatica-python
3
- Version: 1.9.3
3
+ Version: 1.9.5
4
4
  Summary: Convert Informatica PowerCenter workflow XML to Python/PySpark code
5
5
  Author: Nick
6
6
  License: MIT
@@ -430,7 +430,7 @@ The generated `helper_functions.py` provides a complete runtime library:
430
430
  - **Generated code formatting**: Consistent `# ---` section headers for Source Qualifiers, Transforms, and Target Writes; metadata comments (database type, field lists); column mapping and write operation comments; clean blank line handling
431
431
  - **Source/target detection**: Case-insensitive instance type matching
432
432
  - **Session→mapping inference**: Longest-suffix-match strategy for ambiguous mapping names
433
- - **646 tests** across unit, integration, expression, and formatting test suites
433
+ - **663 tests** across unit, integration, expression, and formatting test suites
434
434
 
435
435
  ### v1.9.2 (Phase 8)
436
436
  - Mapping output files now use real mapping names (e.g., `mapping_m_customer_load.py`) instead of generic numeric indices (`mapping_1.py`)
@@ -495,7 +495,7 @@ The generated `helper_functions.py` provides a complete runtime library:
495
495
  cd informatica_python
496
496
  pip install -e ".[dev]"
497
497
 
498
- # Run tests (646 tests)
498
+ # Run tests (663 tests)
499
499
  pytest tests/ -v
500
500
  ```
501
501
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "informatica-python"
7
- version = "1.9.3"
7
+ version = "1.9.5"
8
8
  description = "Convert Informatica PowerCenter workflow XML to Python/PySpark code"
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
@@ -2246,3 +2246,465 @@ class TestJoinerFieldRemapping(unittest.TestCase):
2246
2246
  if "left_on" in line and "right_on" in line:
2247
2247
  assert "Table_Name" in line, \
2248
2248
  "Merge should use source column name Table_Name"
2249
+
2250
+
2251
+ class TestRegExtractConversion(unittest.TestCase):
2252
+ """Tests for REG_EXTRACT capture group and expand parameter handling."""
2253
+
2254
+ def test_no_double_capture_group(self):
2255
+ r = convert_expression_vectorized(r"REG_EXTRACT(col,'(\s+)')", "df")
2256
+ assert r.count("(") - r.count("str.extract") <= 2
2257
+ assert '((\\s+))' not in r
2258
+
2259
+ def test_adds_capture_group_when_missing(self):
2260
+ r = convert_expression_vectorized(r"REG_EXTRACT(col,'\\d+')", "df")
2261
+ assert 'expand=False' in r
2262
+ assert '.str.extract' in r
2263
+
2264
+ def test_expand_is_boolean_not_series(self):
2265
+ r = convert_expression_vectorized(r"REG_EXTRACT(col,'(\s+)')", "df")
2266
+ assert 'expand=False' in r
2267
+ assert 'expand==False' not in r
2268
+ assert 'df["expand"]' not in r
2269
+
2270
+ def test_isnull_reg_extract_nested(self):
2271
+ r = convert_expression_vectorized(
2272
+ "IIF(ISNULL(REG_EXTRACT(PART_BIRTH_DTE,'(\\s+)')),PART_BIRTH_DTE,NULL)", "df_exp"
2273
+ )
2274
+ assert "np.where" in r
2275
+ assert ".isna()" in r
2276
+ assert "expand=False" in r
2277
+ assert 'expand==False' not in r
2278
+ assert 'df_exp["expand"]' not in r
2279
+
2280
+
2281
+ class TestDatetimeFormatMask(unittest.TestCase):
2282
+ """Tests for datetime format mask conversion (US/microseconds)."""
2283
+
2284
+ def test_us_to_percent_f(self):
2285
+ from informatica_python.utils.expression_converter import _convert_infa_date_format
2286
+ fmt = _convert_infa_date_format("YYYY-MM-DD HH24.MI.SS.US")
2287
+ assert "%f" in fmt
2288
+ assert "US" not in fmt
2289
+
2290
+ def test_full_format_mask(self):
2291
+ from informatica_python.utils.expression_converter import _convert_infa_date_format
2292
+ fmt = _convert_infa_date_format("YYYY-MM-DD HH24:MI:SS")
2293
+ assert fmt == "%Y-%m-%d %H:%M:%S"
2294
+
2295
+ def test_to_date_with_us_format(self):
2296
+ r = convert_expression_vectorized(
2297
+ "TO_DATE(x, 'YYYY-MM-DD HH24.MI.SS.US')", "df"
2298
+ )
2299
+ assert "%f" in r
2300
+ assert "US" not in r
2301
+
2302
+
2303
+ class TestSubstrZeroIndex(unittest.TestCase):
2304
+ """Tests for SUBSTR with 0-based start position."""
2305
+
2306
+ def test_substr_start_0(self):
2307
+ r = convert_expression_vectorized("SUBSTR(x, 0, 11)", "df")
2308
+ assert "str[0:" in r
2309
+ assert "str[-1:" not in r
2310
+
2311
+ def test_substr_start_1(self):
2312
+ r = convert_expression_vectorized("SUBSTR(x, 1, 5)", "df")
2313
+ assert "str[0:" in r
2314
+
2315
+ def test_substr_start_5(self):
2316
+ r = convert_expression_vectorized("SUBSTR(x, 5, 3)", "df")
2317
+ assert "str[4:7]" in r
2318
+
2319
+
2320
+ class TestStringOpSafety(unittest.TestCase):
2321
+ """Tests for string operations adding .astype(str) for safety."""
2322
+
2323
+ def test_ltrim_has_astype_str(self):
2324
+ r = convert_expression_vectorized("LTRIM(name)", "df")
2325
+ assert ".astype(str)" in r
2326
+ assert ".str.lstrip()" in r
2327
+
2328
+ def test_rtrim_has_astype_str(self):
2329
+ r = convert_expression_vectorized("RTRIM(name)", "df")
2330
+ assert ".astype(str)" in r
2331
+ assert ".str.rstrip()" in r
2332
+
2333
+ def test_trim_has_astype_str(self):
2334
+ r = convert_expression_vectorized("TRIM(name)", "df")
2335
+ assert ".astype(str)" in r
2336
+ assert ".str.strip()" in r
2337
+
2338
+ def test_ltrim_with_char(self):
2339
+ r = convert_expression_vectorized("LTRIM(name, '0')", "df")
2340
+ assert ".astype(str)" in r
2341
+ assert '.str.lstrip("0")' in r
2342
+
2343
+
2344
+ class TestRouterVectorized(unittest.TestCase):
2345
+ """Tests for Router transformation generating vectorized conditions."""
2346
+
2347
+ ROUTER_XML = '''<?xml version="1.0" encoding="UTF-8"?>
2348
+ <!DOCTYPE POWERMART SYSTEM "powrmart.dtd">
2349
+ <POWERMART CREATION_DATE="01/01/2025" REPOSITORY_VERSION="1">
2350
+ <REPOSITORY NAME="repo" VERSION="1" CODEPAGE="UTF-8" DATABASETYPE="Oracle">
2351
+ <FOLDER NAME="TEST" OWNER="admin">
2352
+ <SOURCE NAME="SRC" DATABASETYPE="Flat File" DBDNAME="SRC">
2353
+ <FLATFILE DELIMITEDBY="COMMA" HEADERROWPRESENT="YES" PADBYTES="NO" ROWDELIMITER="\\n"/>
2354
+ <SOURCEFIELD NAME="ID" DATATYPE="integer" PRECISION="10" SCALE="0" NULLABLE="NOTNULL" KEYTYPE="PRIMARY KEY" FIELDNUMBER="1"/>
2355
+ <SOURCEFIELD NAME="STATUS" DATATYPE="string" PRECISION="20" SCALE="0" NULLABLE="NULL" KEYTYPE="NOT A KEY" FIELDNUMBER="2"/>
2356
+ </SOURCE>
2357
+ <TARGET NAME="TGT" DATABASETYPE="Flat File">
2358
+ <TARGETFIELD NAME="ID" DATATYPE="integer" PRECISION="10" SCALE="0" NULLABLE="NULL" KEYTYPE="NOT A KEY" FIELDNUMBER="1"/>
2359
+ </TARGET>
2360
+ <MAPPING NAME="m_router_test" ISVALID="YES">
2361
+ <TRANSFORMATION NAME="SQ_SRC" TYPE="Source Qualifier" REUSABLE="NO">
2362
+ <TRANSFORMFIELD NAME="ID" DATATYPE="integer" PRECISION="10" SCALE="0" PORTTYPE="OUTPUT"/>
2363
+ <TRANSFORMFIELD NAME="STATUS" DATATYPE="string" PRECISION="20" SCALE="0" PORTTYPE="OUTPUT"/>
2364
+ </TRANSFORMATION>
2365
+ <TRANSFORMATION NAME="RTR_STATUS" TYPE="Router" REUSABLE="NO">
2366
+ <TRANSFORMFIELD NAME="ID" DATATYPE="integer" PRECISION="10" SCALE="0" PORTTYPE="INPUT/OUTPUT"/>
2367
+ <TRANSFORMFIELD NAME="STATUS" DATATYPE="string" PRECISION="20" SCALE="0" PORTTYPE="INPUT/OUTPUT"/>
2368
+ <TABLEATTRIBUTE NAME="Group Filter Condition_ACTIVE" VALUE="STATUS = 'ACTIVE'"/>
2369
+ <TABLEATTRIBUTE NAME="Group Filter Condition_INACTIVE" VALUE="STATUS = 'INACTIVE'"/>
2370
+ </TRANSFORMATION>
2371
+ <INSTANCE NAME="SRC" TYPE="Source Definition" TRANSFORMATION_NAME="SRC"/>
2372
+ <INSTANCE NAME="SQ_SRC" TYPE="Source Qualifier" TRANSFORMATION_NAME="SQ_SRC"/>
2373
+ <INSTANCE NAME="RTR_STATUS" TYPE="Router" TRANSFORMATION_NAME="RTR_STATUS"/>
2374
+ <INSTANCE NAME="TGT" TYPE="Target Definition" TRANSFORMATION_NAME="TGT"/>
2375
+ <CONNECTOR FROMINSTANCE="SRC" FROMFIELD="ID" TOINSTANCE="SQ_SRC" TOFIELD="ID"/>
2376
+ <CONNECTOR FROMINSTANCE="SRC" FROMFIELD="STATUS" TOINSTANCE="SQ_SRC" TOFIELD="STATUS"/>
2377
+ <CONNECTOR FROMINSTANCE="SQ_SRC" FROMFIELD="ID" TOINSTANCE="RTR_STATUS" TOFIELD="ID"/>
2378
+ <CONNECTOR FROMINSTANCE="SQ_SRC" FROMFIELD="STATUS" TOINSTANCE="RTR_STATUS" TOFIELD="STATUS"/>
2379
+ <CONNECTOR FROMINSTANCE="RTR_STATUS" FROMFIELD="ID" TOINSTANCE="TGT" TOFIELD="ID"/>
2380
+ </MAPPING>
2381
+ <CONFIG NAME="default_session_config"/>
2382
+ <WORKFLOW NAME="wf_router_test" ISVALID="YES">
2383
+ <TASK NAME="Start" REUSABLE="NO" TYPE="Start"/>
2384
+ <SESSION NAME="s_m_router_test" ISVALID="YES" REUSABLE="NO" MAPPINGNAME="m_router_test">
2385
+ <CONFIGREFERENCE REFOBJECTNAME="default_session_config" TYPE="Session config"/>
2386
+ </SESSION>
2387
+ <TASKINSTANCE NAME="Start" TASKNAME="Start" TASKTYPE="Start"/>
2388
+ <TASKINSTANCE NAME="s_m_router_test" TASKNAME="s_m_router_test" TASKTYPE="Session"/>
2389
+ <WORKFLOWLINK FROMTASK="Start" TOTASK="s_m_router_test"/>
2390
+ </WORKFLOW>
2391
+ </FOLDER>
2392
+ </REPOSITORY>
2393
+ </POWERMART>'''
2394
+
2395
+ def test_router_generates_group_filters(self):
2396
+ converter = InformaticaConverter()
2397
+ tmpdir = tempfile.mkdtemp()
2398
+ try:
2399
+ converter.convert_string(self.ROUTER_XML, output_dir=tmpdir)
2400
+ for fn in os.listdir(tmpdir):
2401
+ if fn.startswith("mapping_") and fn.endswith(".py"):
2402
+ with open(os.path.join(tmpdir, fn)) as f:
2403
+ code = f.read()
2404
+ assert "_router_mask_" in code or "group0" in code, \
2405
+ "Router should generate group filter masks"
2406
+ assert "Default group" in code
2407
+ break
2408
+ finally:
2409
+ shutil.rmtree(tmpdir)
2410
+
2411
+ def test_router_default_excludes_matched_rows(self):
2412
+ converter = InformaticaConverter()
2413
+ tmpdir = tempfile.mkdtemp()
2414
+ try:
2415
+ converter.convert_string(self.ROUTER_XML, output_dir=tmpdir)
2416
+ for fn in os.listdir(tmpdir):
2417
+ if fn.startswith("mapping_") and fn.endswith(".py"):
2418
+ with open(os.path.join(tmpdir, fn)) as f:
2419
+ code = f.read()
2420
+ assert "_router_default_mask" in code or "~" in code, \
2421
+ "Default group should exclude rows matching other groups"
2422
+ break
2423
+ finally:
2424
+ shutil.rmtree(tmpdir)
2425
+
2426
+
2427
+ class TestLookupWarning(unittest.TestCase):
2428
+ """Tests for lookup empty DataFrame warning."""
2429
+
2430
+ LOOKUP_XML = '''<?xml version="1.0" encoding="UTF-8"?>
2431
+ <!DOCTYPE POWERMART SYSTEM "powrmart.dtd">
2432
+ <POWERMART CREATION_DATE="01/01/2025" REPOSITORY_VERSION="1">
2433
+ <REPOSITORY NAME="repo" VERSION="1" CODEPAGE="UTF-8" DATABASETYPE="Oracle">
2434
+ <FOLDER NAME="TEST" OWNER="admin">
2435
+ <SOURCE NAME="SRC" DATABASETYPE="Flat File" DBDNAME="SRC">
2436
+ <FLATFILE DELIMITEDBY="COMMA" HEADERROWPRESENT="YES" PADBYTES="NO" ROWDELIMITER="\\n"/>
2437
+ <SOURCEFIELD NAME="ID" DATATYPE="integer" PRECISION="10" SCALE="0" NULLABLE="NOTNULL" KEYTYPE="PRIMARY KEY" FIELDNUMBER="1"/>
2438
+ </SOURCE>
2439
+ <TARGET NAME="TGT" DATABASETYPE="Flat File">
2440
+ <TARGETFIELD NAME="ID" DATATYPE="integer" PRECISION="10" SCALE="0" NULLABLE="NULL" KEYTYPE="NOT A KEY" FIELDNUMBER="1"/>
2441
+ </TARGET>
2442
+ <MAPPING NAME="m_lkp_test" ISVALID="YES">
2443
+ <TRANSFORMATION NAME="SQ_SRC" TYPE="Source Qualifier" REUSABLE="NO">
2444
+ <TRANSFORMFIELD NAME="ID" DATATYPE="integer" PRECISION="10" SCALE="0" PORTTYPE="OUTPUT"/>
2445
+ </TRANSFORMATION>
2446
+ <TRANSFORMATION NAME="LKP_TEST" TYPE="Lookup Procedure" REUSABLE="NO">
2447
+ <TRANSFORMFIELD NAME="ID" DATATYPE="integer" PRECISION="10" SCALE="0" PORTTYPE="INPUT"/>
2448
+ <TRANSFORMFIELD NAME="RESULT" DATATYPE="string" PRECISION="100" SCALE="0" PORTTYPE="OUTPUT/RETURN"/>
2449
+ <TABLEATTRIBUTE NAME="Lookup table name" VALUE="DIM_TABLE"/>
2450
+ <TABLEATTRIBUTE NAME="Lookup condition" VALUE="ID = ID"/>
2451
+ </TRANSFORMATION>
2452
+ <INSTANCE NAME="SRC" TYPE="Source Definition" TRANSFORMATION_NAME="SRC"/>
2453
+ <INSTANCE NAME="SQ_SRC" TYPE="Source Qualifier" TRANSFORMATION_NAME="SQ_SRC"/>
2454
+ <INSTANCE NAME="LKP_TEST" TYPE="Lookup Procedure" TRANSFORMATION_NAME="LKP_TEST"/>
2455
+ <INSTANCE NAME="TGT" TYPE="Target Definition" TRANSFORMATION_NAME="TGT"/>
2456
+ <CONNECTOR FROMINSTANCE="SRC" FROMFIELD="ID" TOINSTANCE="SQ_SRC" TOFIELD="ID"/>
2457
+ <CONNECTOR FROMINSTANCE="SQ_SRC" FROMFIELD="ID" TOINSTANCE="LKP_TEST" TOFIELD="ID"/>
2458
+ <CONNECTOR FROMINSTANCE="LKP_TEST" FROMFIELD="RESULT" TOINSTANCE="TGT" TOFIELD="ID"/>
2459
+ </MAPPING>
2460
+ <CONFIG NAME="default_session_config"/>
2461
+ <WORKFLOW NAME="wf_lkp_test" ISVALID="YES">
2462
+ <TASK NAME="Start" REUSABLE="NO" TYPE="Start"/>
2463
+ <SESSION NAME="s_m_lkp_test" ISVALID="YES" REUSABLE="NO" MAPPINGNAME="m_lkp_test">
2464
+ <CONFIGREFERENCE REFOBJECTNAME="default_session_config" TYPE="Session config"/>
2465
+ </SESSION>
2466
+ <TASKINSTANCE NAME="Start" TASKNAME="Start" TASKTYPE="Start"/>
2467
+ <TASKINSTANCE NAME="s_m_lkp_test" TASKNAME="s_m_lkp_test" TASKTYPE="Session"/>
2468
+ <WORKFLOWLINK FROMTASK="Start" TOTASK="s_m_lkp_test"/>
2469
+ </WORKFLOW>
2470
+ </FOLDER>
2471
+ </REPOSITORY>
2472
+ </POWERMART>'''
2473
+
2474
+ def test_lookup_with_table_reads_from_db(self):
2475
+ converter = InformaticaConverter()
2476
+ tmpdir = tempfile.mkdtemp()
2477
+ try:
2478
+ converter.convert_string(self.LOOKUP_XML, output_dir=tmpdir)
2479
+ for fn in os.listdir(tmpdir):
2480
+ if fn.startswith("mapping_") and fn.endswith(".py"):
2481
+ with open(os.path.join(tmpdir, fn)) as f:
2482
+ code = f.read()
2483
+ assert "read_from_db" in code, "Lookup with table should use read_from_db"
2484
+ assert "DIM_TABLE" in code
2485
+ break
2486
+ finally:
2487
+ shutil.rmtree(tmpdir)
2488
+
2489
+
2490
+ class TestRenameWithDuplicates(unittest.TestCase):
2491
+
2492
+ def test_helper_contains_rename_with_duplicates(self):
2493
+ converter = InformaticaConverter()
2494
+ tmpdir = tempfile.mkdtemp()
2495
+ try:
2496
+ converter.convert_string(MINIMAL_XML, output_dir=tmpdir)
2497
+ with open(os.path.join(tmpdir, "helper_functions.py")) as f:
2498
+ code = f.read()
2499
+ assert "def rename_with_duplicates(" in code
2500
+ finally:
2501
+ shutil.rmtree(tmpdir)
2502
+
2503
+ def test_target_uses_rename_with_duplicates(self):
2504
+ converter = InformaticaConverter()
2505
+ tmpdir = tempfile.mkdtemp()
2506
+ try:
2507
+ converter.convert_string(MINIMAL_XML, output_dir=tmpdir)
2508
+ for fn in os.listdir(tmpdir):
2509
+ if fn.startswith("mapping_") and fn.endswith(".py"):
2510
+ with open(os.path.join(tmpdir, fn)) as f:
2511
+ code = f.read()
2512
+ if "target_columns_" in code:
2513
+ assert "rename_with_duplicates(" in code, \
2514
+ "Target rename should use rename_with_duplicates"
2515
+ finally:
2516
+ shutil.rmtree(tmpdir)
2517
+
2518
+
2519
+ class TestResolveEnv(unittest.TestCase):
2520
+
2521
+ def test_helper_contains_resolve_env(self):
2522
+ converter = InformaticaConverter()
2523
+ tmpdir = tempfile.mkdtemp()
2524
+ try:
2525
+ converter.convert_string(MINIMAL_XML, output_dir=tmpdir)
2526
+ with open(os.path.join(tmpdir, "helper_functions.py")) as f:
2527
+ code = f.read()
2528
+ assert "def resolve_env(" in code
2529
+ finally:
2530
+ shutil.rmtree(tmpdir)
2531
+
2532
+ def test_helper_contains_resolve_builtin_variable(self):
2533
+ converter = InformaticaConverter()
2534
+ tmpdir = tempfile.mkdtemp()
2535
+ try:
2536
+ converter.convert_string(MINIMAL_XML, output_dir=tmpdir)
2537
+ with open(os.path.join(tmpdir, "helper_functions.py")) as f:
2538
+ code = f.read()
2539
+ assert "def resolve_builtin_variable(" in code
2540
+ assert "PMMappingName" in code
2541
+ finally:
2542
+ shutil.rmtree(tmpdir)
2543
+
2544
+
2545
+ class TestGetDbConnectionSQLAlchemy(unittest.TestCase):
2546
+
2547
+ def test_helper_sqlalchemy_primary(self):
2548
+ converter = InformaticaConverter()
2549
+ tmpdir = tempfile.mkdtemp()
2550
+ try:
2551
+ converter.convert_string(MINIMAL_XML, output_dir=tmpdir)
2552
+ with open(os.path.join(tmpdir, "helper_functions.py")) as f:
2553
+ code = f.read()
2554
+ sa_pos = code.index("create_engine")
2555
+ pyodbc_pos = code.index("pyodbc")
2556
+ assert sa_pos < pyodbc_pos, "SQLAlchemy should be tried before raw pyodbc"
2557
+ finally:
2558
+ shutil.rmtree(tmpdir)
2559
+
2560
+ def test_helper_engine_cache(self):
2561
+ converter = InformaticaConverter()
2562
+ tmpdir = tempfile.mkdtemp()
2563
+ try:
2564
+ converter.convert_string(MINIMAL_XML, output_dir=tmpdir)
2565
+ with open(os.path.join(tmpdir, "helper_functions.py")) as f:
2566
+ code = f.read()
2567
+ assert "_engine_cache" in code
2568
+ assert "pool_pre_ping=True" in code
2569
+ finally:
2570
+ shutil.rmtree(tmpdir)
2571
+
2572
+ def test_helper_safe_close(self):
2573
+ converter = InformaticaConverter()
2574
+ tmpdir = tempfile.mkdtemp()
2575
+ try:
2576
+ converter.convert_string(MINIMAL_XML, output_dir=tmpdir)
2577
+ with open(os.path.join(tmpdir, "helper_functions.py")) as f:
2578
+ code = f.read()
2579
+ assert "def _safe_close(" in code
2580
+ assert "_safe_close(conn)" in code
2581
+ finally:
2582
+ shutil.rmtree(tmpdir)
2583
+
2584
+ def test_helper_resolve_env_in_db(self):
2585
+ converter = InformaticaConverter()
2586
+ tmpdir = tempfile.mkdtemp()
2587
+ try:
2588
+ converter.convert_string(MINIMAL_XML, output_dir=tmpdir)
2589
+ with open(os.path.join(tmpdir, "helper_functions.py")) as f:
2590
+ code = f.read()
2591
+ assert "resolve_env(" in code
2592
+ finally:
2593
+ shutil.rmtree(tmpdir)
2594
+
2595
+
2596
+ class TestLookupFuncImpl(unittest.TestCase):
2597
+
2598
+ def test_helper_lookup_func_full_impl(self):
2599
+ converter = InformaticaConverter()
2600
+ tmpdir = tempfile.mkdtemp()
2601
+ try:
2602
+ converter.convert_string(MINIMAL_XML, output_dir=tmpdir)
2603
+ with open(os.path.join(tmpdir, "helper_functions.py")) as f:
2604
+ code = f.read()
2605
+ assert "_lookup_cache" in code
2606
+ assert "def lookup_func(" in code
2607
+ assert "config=None" in code
2608
+ assert "read_from_db" in code.split("def lookup_func")[1]
2609
+ finally:
2610
+ shutil.rmtree(tmpdir)
2611
+
2612
+
2613
+ class TestNullSafeConcat(unittest.TestCase):
2614
+
2615
+ def test_concat_fillna(self):
2616
+ result = convert_expression_vectorized("A || B", "df")
2617
+ assert ".fillna('')" in result, f"Concat should use fillna, got: {result}"
2618
+ assert ".astype(str)" in result
2619
+
2620
+ def test_concat_literal_no_fillna(self):
2621
+ result = convert_expression_vectorized("A || '-' || B", "df")
2622
+ assert "'-'" in result
2623
+ parts = result.split(" + ")
2624
+ for part in parts:
2625
+ if part.strip().startswith("'") and part.strip().endswith("'"):
2626
+ assert ".fillna" not in part
2627
+ else:
2628
+ assert ".fillna('')" in part
2629
+
2630
+ def test_concat_three_fields_all_fillna(self):
2631
+ result = convert_expression_vectorized("X || Y || Z", "df")
2632
+ assert result.count(".fillna('')") == 3
2633
+
2634
+
2635
+ class TestPMVariableHandling(unittest.TestCase):
2636
+
2637
+ PM_VAR_XML = '''<?xml version="1.0" encoding="UTF-8"?>
2638
+ <!DOCTYPE POWERMART SYSTEM "powrmart.dtd">
2639
+ <POWERMART CREATION_DATE="01/01/2025" REPOSITORY_VERSION="1">
2640
+ <REPOSITORY NAME="repo" VERSION="1" CODEPAGE="UTF-8" DATABASETYPE="Oracle">
2641
+ <FOLDER NAME="TEST_FOLDER" OWNER="admin">
2642
+ <SOURCE NAME="SRC_PM" DATABASETYPE="Microsoft SQL Server" DBDNAME="TestDB" OWNERNAME="dbo">
2643
+ <SOURCEFIELD NAME="ID" DATATYPE="integer" PRECISION="10" SCALE="0" NULLABLE="NOTNULL" KEYTYPE="PRIMARY KEY" FIELDNUMBER="1"/>
2644
+ </SOURCE>
2645
+ <TARGET NAME="TGT_PM" DATABASETYPE="Microsoft SQL Server">
2646
+ <TARGETFIELD NAME="ID" DATATYPE="integer" PRECISION="10" SCALE="0" NULLABLE="NOTNULL" KEYTYPE="PRIMARY KEY" FIELDNUMBER="1"/>
2647
+ </TARGET>
2648
+ <MAPPING NAME="m_pm_vars" ISVALID="YES">
2649
+ <TRANSFORMATION NAME="SQ_SRC_PM" TYPE="Source Qualifier" REUSABLE="NO">
2650
+ <TRANSFORMFIELD NAME="ID" DATATYPE="integer" PORTTYPE="INPUT/OUTPUT" PRECISION="10" SCALE="0"/>
2651
+ <TABLEATTRIBUTE NAME="Sql Query" VALUE="SELECT ID FROM dbo.SRC_PM WHERE mapping_name = &apos;$PMMappingName&apos;"/>
2652
+ </TRANSFORMATION>
2653
+ <INSTANCE NAME="SQ_SRC_PM" TRANSFORMATION_NAME="SQ_SRC_PM" TYPE="Source Qualifier"/>
2654
+ <INSTANCE NAME="SRC_PM" TRANSFORMATION_NAME="SRC_PM" TYPE="Source Definition"/>
2655
+ <INSTANCE NAME="TGT_PM" TRANSFORMATION_NAME="TGT_PM" TYPE="Target Definition"/>
2656
+ <CONNECTOR FROMINSTANCE="SRC_PM" FROMFIELD="ID" TOINSTANCE="SQ_SRC_PM" TOFIELD="ID" FROMINSTANCETYPE="Source Definition" TOINSTANCETYPE="Source Qualifier"/>
2657
+ <CONNECTOR FROMINSTANCE="SQ_SRC_PM" FROMFIELD="ID" TOINSTANCE="TGT_PM" TOFIELD="ID" FROMINSTANCETYPE="Source Qualifier" TOINSTANCETYPE="Target Definition"/>
2658
+ </MAPPING>
2659
+ <CONFIG NAME="default_session_config"/>
2660
+ <WORKFLOW NAME="wf_pm_vars" ISVALID="YES">
2661
+ <SESSION NAME="s_pm_vars" ISVALID="YES" MAPPINGNAME="m_pm_vars"/>
2662
+ </WORKFLOW>
2663
+ </FOLDER>
2664
+ </REPOSITORY>
2665
+ </POWERMART>'''
2666
+
2667
+ def test_pm_variable_resolved_in_sql(self):
2668
+ converter = InformaticaConverter()
2669
+ tmpdir = tempfile.mkdtemp()
2670
+ try:
2671
+ converter.convert_string(self.PM_VAR_XML, output_dir=tmpdir)
2672
+ for fn in os.listdir(tmpdir):
2673
+ if fn.startswith("mapping_") and fn.endswith(".py"):
2674
+ with open(os.path.join(tmpdir, fn)) as f:
2675
+ code = f.read()
2676
+ if "$PMMappingName" in code:
2677
+ assert "resolve_builtin_variable" in code, \
2678
+ "SQL with $PMMappingName should call resolve_builtin_variable"
2679
+ break
2680
+ finally:
2681
+ shutil.rmtree(tmpdir)
2682
+
2683
+
2684
+ class TestExecuteSqlAlchemy(unittest.TestCase):
2685
+
2686
+ def test_execute_sql_handles_sqlalchemy(self):
2687
+ converter = InformaticaConverter()
2688
+ tmpdir = tempfile.mkdtemp()
2689
+ try:
2690
+ converter.convert_string(MINIMAL_XML, output_dir=tmpdir)
2691
+ with open(os.path.join(tmpdir, "helper_functions.py")) as f:
2692
+ code = f.read()
2693
+ exec_block = code.split("def execute_sql(")[1]
2694
+ assert "sqlalchemy" in exec_block or "text(sql)" in exec_block
2695
+ finally:
2696
+ shutil.rmtree(tmpdir)
2697
+
2698
+
2699
+ class TestImportRe(unittest.TestCase):
2700
+
2701
+ def test_helper_imports_re(self):
2702
+ converter = InformaticaConverter()
2703
+ tmpdir = tempfile.mkdtemp()
2704
+ try:
2705
+ converter.convert_string(MINIMAL_XML, output_dir=tmpdir)
2706
+ with open(os.path.join(tmpdir, "helper_functions.py")) as f:
2707
+ code = f.read()
2708
+ assert "import re" in code
2709
+ finally:
2710
+ shutil.rmtree(tmpdir)