informatica-python 1.4.1__tar.gz → 1.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. informatica_python-1.5.0/LICENSE +21 -0
  2. informatica_python-1.5.0/PKG-INFO +228 -0
  3. informatica_python-1.5.0/README.md +201 -0
  4. informatica_python-1.5.0/informatica_python/__init__.py +13 -0
  5. {informatica_python-1.4.1 → informatica_python-1.5.0}/informatica_python/cli.py +5 -0
  6. {informatica_python-1.4.1 → informatica_python-1.5.0}/informatica_python/generators/helper_gen.py +81 -7
  7. {informatica_python-1.4.1 → informatica_python-1.5.0}/informatica_python/generators/mapping_gen.py +79 -80
  8. informatica_python-1.5.0/informatica_python/utils/expression_converter.py +437 -0
  9. informatica_python-1.5.0/informatica_python/utils/lib_adapters.py +164 -0
  10. informatica_python-1.5.0/informatica_python.egg-info/PKG-INFO +228 -0
  11. {informatica_python-1.4.1 → informatica_python-1.5.0}/informatica_python.egg-info/SOURCES.txt +4 -1
  12. informatica_python-1.5.0/pyproject.toml +41 -0
  13. informatica_python-1.5.0/tests/test_integration.py +518 -0
  14. informatica_python-1.4.1/PKG-INFO +0 -118
  15. informatica_python-1.4.1/README.md +0 -106
  16. informatica_python-1.4.1/informatica_python/__init__.py +0 -4
  17. informatica_python-1.4.1/informatica_python/utils/expression_converter.py +0 -264
  18. informatica_python-1.4.1/informatica_python.egg-info/PKG-INFO +0 -118
  19. informatica_python-1.4.1/pyproject.toml +0 -24
  20. {informatica_python-1.4.1 → informatica_python-1.5.0}/informatica_python/converter.py +0 -0
  21. {informatica_python-1.4.1 → informatica_python-1.5.0}/informatica_python/generators/__init__.py +0 -0
  22. {informatica_python-1.4.1 → informatica_python-1.5.0}/informatica_python/generators/config_gen.py +0 -0
  23. {informatica_python-1.4.1 → informatica_python-1.5.0}/informatica_python/generators/error_log_gen.py +0 -0
  24. {informatica_python-1.4.1 → informatica_python-1.5.0}/informatica_python/generators/sql_gen.py +0 -0
  25. {informatica_python-1.4.1 → informatica_python-1.5.0}/informatica_python/generators/workflow_gen.py +0 -0
  26. {informatica_python-1.4.1 → informatica_python-1.5.0}/informatica_python/models.py +0 -0
  27. {informatica_python-1.4.1 → informatica_python-1.5.0}/informatica_python/parser.py +0 -0
  28. {informatica_python-1.4.1 → informatica_python-1.5.0}/informatica_python/utils/__init__.py +0 -0
  29. {informatica_python-1.4.1 → informatica_python-1.5.0}/informatica_python/utils/datatype_map.py +0 -0
  30. {informatica_python-1.4.1 → informatica_python-1.5.0}/informatica_python.egg-info/dependency_links.txt +0 -0
  31. {informatica_python-1.4.1 → informatica_python-1.5.0}/informatica_python.egg-info/entry_points.txt +0 -0
  32. {informatica_python-1.4.1 → informatica_python-1.5.0}/informatica_python.egg-info/requires.txt +0 -0
  33. {informatica_python-1.4.1 → informatica_python-1.5.0}/informatica_python.egg-info/top_level.txt +0 -0
  34. {informatica_python-1.4.1 → informatica_python-1.5.0}/setup.cfg +0 -0
  35. {informatica_python-1.4.1 → informatica_python-1.5.0}/tests/test_converter.py +0 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Nick
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,228 @@
1
+ Metadata-Version: 2.4
2
+ Name: informatica-python
3
+ Version: 1.5.0
4
+ Summary: Convert Informatica PowerCenter workflow XML to Python/PySpark code
5
+ Author: Nick
6
+ License: MIT
7
+ Keywords: informatica,powercenter,etl,code-generator,pandas,pyspark,data-engineering
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.8
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Software Development :: Code Generators
18
+ Classifier: Topic :: Database :: Database Engines/Servers
19
+ Requires-Python: >=3.8
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: lxml>=4.9.0
23
+ Requires-Dist: pyyaml>=6.0
24
+ Provides-Extra: dev
25
+ Requires-Dist: pytest>=7.0; extra == "dev"
26
+ Dynamic: license-file
27
+
28
+ # informatica-python
29
+
30
+ Convert Informatica PowerCenter workflow XML exports into clean, runnable Python/PySpark code.
31
+
32
+ **Author:** Nick
33
+ **License:** MIT
34
+ **PyPI:** [informatica-python](https://pypi.org/project/informatica-python/)
35
+
36
+ ---
37
+
38
+ ## Overview
39
+
40
+ `informatica-python` parses Informatica PowerCenter XML export files and generates equivalent Python code using your choice of data library. It handles all 72 DTD tags from the PowerCenter XML schema and produces a complete, ready-to-run Python project.
41
+
42
+ ## Installation
43
+
44
+ ```bash
45
+ pip install informatica-python
46
+ ```
47
+
48
+ ## Quick Start
49
+
50
+ ### Command Line
51
+
52
+ ```bash
53
+ # Generate Python files to a directory
54
+ informatica-python workflow_export.xml -o output_dir
55
+
56
+ # Generate as a zip archive
57
+ informatica-python workflow_export.xml -z output.zip
58
+
59
+ # Use a different data library
60
+ informatica-python workflow_export.xml -o output_dir --data-lib polars
61
+
62
+ # Parse to JSON only (no code generation)
63
+ informatica-python workflow_export.xml --json
64
+
65
+ # Save parsed JSON to file
66
+ informatica-python workflow_export.xml --json-file parsed.json
67
+ ```
68
+
69
+ ### Python API
70
+
71
+ ```python
72
+ from informatica_python import InformaticaConverter
73
+
74
+ converter = InformaticaConverter()
75
+
76
+ # Parse and generate files
77
+ converter.convert_to_files("workflow_export.xml", "output_dir")
78
+
79
+ # Parse and generate zip
80
+ converter.convert_to_zip("workflow_export.xml", "output.zip")
81
+
82
+ # Parse to structured dict
83
+ result = converter.parse_file("workflow_export.xml")
84
+
85
+ # Use a different data library
86
+ converter.convert_to_files("workflow_export.xml", "output_dir", data_lib="polars")
87
+ ```
88
+
89
+ ## Generated Output Files
90
+
91
+ | File | Description |
92
+ |------|-------------|
93
+ | `helper_functions.py` | Database/file I/O helpers, Informatica expression equivalents (80+ functions) |
94
+ | `mapping_N.py` | One per mapping — transformation logic, source reads, target writes |
95
+ | `workflow.py` | Task orchestration with topological ordering and error handling |
96
+ | `config.yml` | Connection configs, source/target metadata, runtime parameters |
97
+ | `all_sql_queries.sql` | All SQL extracted from Source Qualifiers, Lookups, SQL transforms |
98
+ | `error_log.txt` | Conversion summary, warnings, and unsupported feature notes |
99
+
100
+ ## Supported Data Libraries
101
+
102
+ Select via `--data-lib` CLI flag or `data_lib` parameter:
103
+
104
+ | Library | Flag | Best For |
105
+ |---------|------|----------|
106
+ | **pandas** | `pandas` (default) | General-purpose, most compatible |
107
+ | **dask** | `dask` | Large datasets, parallel processing |
108
+ | **polars** | `polars` | High performance, Rust-backed |
109
+ | **vaex** | `vaex` | Out-of-core, billion-row datasets |
110
+ | **modin** | `modin` | Drop-in pandas replacement, multi-core |
111
+
112
+ ## Supported Transformations
113
+
114
+ The code generator produces real, runnable Python for these transformation types:
115
+
116
+ - **Source Qualifier** — SQL override, pre/post SQL, column selection
117
+ - **Expression** — Field-level expressions converted to pandas operations
118
+ - **Filter** — Row filtering with converted conditions
119
+ - **Joiner** — `pd.merge()` with join type and condition parsing
120
+ - **Lookup** — `pd.merge()` lookups with connection-aware DB/file reads
121
+ - **Aggregator** — `groupby().agg()` with SUM/COUNT/AVG/MIN/MAX/FIRST/LAST
122
+ - **Sorter** — `sort_values()` with multi-key ascending/descending
123
+ - **Router** — Multi-group conditional routing with if/elif/else
124
+ - **Union** — `pd.concat()` across multiple input groups
125
+ - **Update Strategy** — Insert/Update/Delete/Reject flag generation
126
+ - **Sequence Generator** — Auto-incrementing ID columns
127
+ - **Normalizer** — `pd.melt()` with auto-detected id/value vars
128
+ - **Rank** — `groupby().rank()` with Top-N filtering
129
+ - **Stored Procedure** — Stub generation with SP name and parameters
130
+ - **Transaction Control** — Commit/rollback logic stubs
131
+ - **Custom / Java** — Placeholder stubs with TODO markers
132
+ - **SQL Transform** — Direct SQL execution pass-through
133
+
134
+ ## Supported XML Tags (72 Tags)
135
+
136
+ **Top-level:** POWERMART, REPOSITORY, FOLDER, FOLDERVERSION
137
+
138
+ **Source/Target:** SOURCE, SOURCEFIELD, TARGET, TARGETFIELD, TARGETINDEX, TARGETINDEXFIELD, FLATFILE, XMLINFO, XMLTEXT, GROUP, TABLEATTRIBUTE, FIELDATTRIBUTE, METADATAEXTENSION, KEYWORD, ERPSRCINFO
139
+
140
+ **Mapping/Mapplet:** MAPPING, MAPPLET, TRANSFORMATION, TRANSFORMFIELD, TRANSFORMFIELDATTR, TRANSFORMFIELDATTRDEF, INSTANCE, ASSOCIATED_SOURCE_INSTANCE, CONNECTOR, MAPDEPENDENCY, TARGETLOADORDER, MAPPINGVARIABLE, FIELDDEPENDENCY, INITPROP, ERPINFO
141
+
142
+ **Task/Session/Workflow:** TASK, TIMER, VALUEPAIR, SCHEDULER, SCHEDULEINFO, STARTOPTIONS, ENDOPTIONS, SCHEDULEOPTIONS, RECURRING, CUSTOM, DAILYFREQUENCY, REPEAT, FILTER, SESSION, CONFIGREFERENCE, SESSTRANSFORMATIONINST, SESSTRANSFORMATIONGROUP, PARTITION, HASHKEY, KEYRANGE, CONFIG, SESSIONCOMPONENT, CONNECTIONREFERENCE, TASKINSTANCE, WORKFLOWLINK, WORKFLOWVARIABLE, WORKFLOWEVENT, WORKLET, WORKFLOW, ATTRIBUTE
143
+
144
+ **Shortcut:** SHORTCUT
145
+
146
+ **SAP:** SAPFUNCTION, SAPSTRUCTURE, SAPPROGRAM, SAPOUTPUTPORT, SAPVARIABLE, SAPPROGRAMFLOWOBJECT, SAPTABLEPARAM
147
+
148
+ ## Key Features
149
+
150
+ ### Session Connection Overrides (v1.4+)
151
+ When sessions define per-transform connection overrides (different database, file directory, or filename), the generated code uses those overrides instead of source/target defaults.
152
+
153
+ ### Worklet Support (v1.4+)
154
+ Worklets are detected and each one is emitted as a separate `run_worklet_NAME(config)` function. The main workflow calls these functions automatically for Worklet task types.
155
+
156
+ ### Type Casting at Target Writes (v1.4+)
157
+ Target field datatypes are mapped to pandas types, and the generator emits the corresponding casting code:
158
+ - Integers: nullable `Int64`/`Int32` or `fillna(0).astype(int)` for NOT NULL
159
+ - Dates: `pd.to_datetime(errors='coerce')`
160
+ - Decimals/Floats: `pd.to_numeric(errors='coerce')`
161
+ - Booleans: `.astype('boolean')`
162
+
163
+ ### Flat File Handling (v1.3+)
164
+ Parses FLATFILE metadata for delimiter, fixed-width, header lines, skip rows, quote/escape chars. Generates `pd.read_fwf()` for fixed-width or enriched `read_file()` for delimited.
165
+
166
+ ### Mapplet Inlining (v1.3+)
167
+ Expands Mapplet instances into prefixed transforms, rewires connectors, and eliminates duplication.
168
+
169
+ ### Decision Tasks (v1.3+)
170
+ Converts Informatica decision conditions to Python if/else branches with proper variable substitution.
171
+
172
+ ### Expression Converter (80+ Functions)
173
+
174
+ Converts Informatica expressions to Python equivalents:
175
+
176
+ - **String:** SUBSTR, LTRIM, RTRIM, UPPER, LOWER, LPAD, RPAD, INSTR, LENGTH, CONCAT, REPLACE, REG_EXTRACT, REG_REPLACE, REVERSE, INITCAP, CHR, ASCII
177
+ - **Date:** ADD_TO_DATE, DATE_DIFF, GET_DATE_PART, SYSDATE, SYSTIMESTAMP, TO_DATE, TO_CHAR, TRUNC (date)
178
+ - **Numeric:** ROUND, TRUNC, MOD, ABS, CEIL, FLOOR, POWER, SQRT, LOG, EXP, SIGN
179
+ - **Conversion:** TO_INTEGER, TO_BIGINT, TO_FLOAT, TO_DECIMAL, TO_CHAR, TO_DATE
180
+ - **Null handling:** IIF, DECODE, NVL, NVL2, ISNULL, IS_SPACES, IS_NUMBER
181
+ - **Aggregate:** SUM, AVG, COUNT, MIN, MAX, FIRST, LAST, MEDIAN, STDDEV, VARIANCE
182
+ - **Lookup:** :LKP expressions with dynamic lookup references
183
+ - **Variable:** SETVARIABLE / mapping variable assignment
184
+
185
+ ## Requirements
186
+
187
+ - Python >= 3.8
188
+ - lxml >= 4.9.0
189
+ - PyYAML >= 6.0
190
+
191
+ ## Changelog
192
+
193
+ ### v1.4.x (Phase 3)
194
+ - Session connection overrides for sources and targets
195
+ - Worklet function generation with safe invocation
196
+ - Type casting at target writes based on TARGETFIELD datatypes
197
+ - Flat-file session path overrides properly wired
198
+
199
+ ### v1.3.x (Phase 2)
200
+ - FLATFILE metadata in source reads and target writes
201
+ - Normalizer with `pd.melt()`
202
+ - Rank with group-by and Top-N filtering
203
+ - Decision tasks with real if/else branches
204
+ - Mapplet instance inlining
205
+
206
+ ### v1.2.x (Phase 1)
207
+ - Core parser for all 72 XML tags
208
+ - Expression converter with 80+ functions
209
+ - Aggregator, Joiner, Lookup code generation
210
+ - Workflow orchestration with topological task ordering
211
+ - Multi-library support (pandas, dask, polars, vaex, modin)
212
+
213
+ ## Development
214
+
215
+ ```bash
216
+ # From a clone of the repository, install in development mode
217
+ cd informatica_python
218
+ pip install -e ".[dev]"
219
+
220
+ # Run the full test suite (tests/test_converter.py and tests/test_integration.py)
221
+ pytest tests/ -v
222
+ ```
223
+
224
+ ## License
225
+
226
+ MIT License - Copyright (c) 2025 Nick
227
+
228
+ See [LICENSE](LICENSE) for details.
@@ -0,0 +1,201 @@
1
+ # informatica-python
2
+
3
+ Convert Informatica PowerCenter workflow XML exports into clean, runnable Python/PySpark code.
4
+
5
+ **Author:** Nick
6
+ **License:** MIT
7
+ **PyPI:** [informatica-python](https://pypi.org/project/informatica-python/)
8
+
9
+ ---
10
+
11
+ ## Overview
12
+
13
+ `informatica-python` parses Informatica PowerCenter XML export files and generates equivalent Python code using your choice of data library. It handles all 72 DTD tags from the PowerCenter XML schema and produces a complete, ready-to-run Python project.
14
+
15
+ ## Installation
16
+
17
+ ```bash
18
+ pip install informatica-python
19
+ ```
20
+
21
+ ## Quick Start
22
+
23
+ ### Command Line
24
+
25
+ ```bash
26
+ # Generate Python files to a directory
27
+ informatica-python workflow_export.xml -o output_dir
28
+
29
+ # Generate as a zip archive
30
+ informatica-python workflow_export.xml -z output.zip
31
+
32
+ # Use a different data library
33
+ informatica-python workflow_export.xml -o output_dir --data-lib polars
34
+
35
+ # Parse to JSON only (no code generation)
36
+ informatica-python workflow_export.xml --json
37
+
38
+ # Save parsed JSON to file
39
+ informatica-python workflow_export.xml --json-file parsed.json
40
+ ```
41
+
42
+ ### Python API
43
+
44
+ ```python
45
+ from informatica_python import InformaticaConverter
46
+
47
+ converter = InformaticaConverter()
48
+
49
+ # Parse and generate files
50
+ converter.convert_to_files("workflow_export.xml", "output_dir")
51
+
52
+ # Parse and generate zip
53
+ converter.convert_to_zip("workflow_export.xml", "output.zip")
54
+
55
+ # Parse to structured dict
56
+ result = converter.parse_file("workflow_export.xml")
57
+
58
+ # Use a different data library
59
+ converter.convert_to_files("workflow_export.xml", "output_dir", data_lib="polars")
60
+ ```
61
+
62
+ ## Generated Output Files
63
+
64
+ | File | Description |
65
+ |------|-------------|
66
+ | `helper_functions.py` | Database/file I/O helpers, Informatica expression equivalents (80+ functions) |
67
+ | `mapping_N.py` | One per mapping — transformation logic, source reads, target writes |
68
+ | `workflow.py` | Task orchestration with topological ordering and error handling |
69
+ | `config.yml` | Connection configs, source/target metadata, runtime parameters |
70
+ | `all_sql_queries.sql` | All SQL extracted from Source Qualifiers, Lookups, SQL transforms |
71
+ | `error_log.txt` | Conversion summary, warnings, and unsupported feature notes |
72
+
73
+ ## Supported Data Libraries
74
+
75
+ Select via `--data-lib` CLI flag or `data_lib` parameter:
76
+
77
+ | Library | Flag | Best For |
78
+ |---------|------|----------|
79
+ | **pandas** | `pandas` (default) | General-purpose, most compatible |
80
+ | **dask** | `dask` | Large datasets, parallel processing |
81
+ | **polars** | `polars` | High performance, Rust-backed |
82
+ | **vaex** | `vaex` | Out-of-core, billion-row datasets |
83
+ | **modin** | `modin` | Drop-in pandas replacement, multi-core |
84
+
85
+ ## Supported Transformations
86
+
87
+ The code generator produces real, runnable Python for these transformation types:
88
+
89
+ - **Source Qualifier** — SQL override, pre/post SQL, column selection
90
+ - **Expression** — Field-level expressions converted to pandas operations
91
+ - **Filter** — Row filtering with converted conditions
92
+ - **Joiner** — `pd.merge()` with join type and condition parsing
93
+ - **Lookup** — `pd.merge()` lookups with connection-aware DB/file reads
94
+ - **Aggregator** — `groupby().agg()` with SUM/COUNT/AVG/MIN/MAX/FIRST/LAST
95
+ - **Sorter** — `sort_values()` with multi-key ascending/descending
96
+ - **Router** — Multi-group conditional routing with if/elif/else
97
+ - **Union** — `pd.concat()` across multiple input groups
98
+ - **Update Strategy** — Insert/Update/Delete/Reject flag generation
99
+ - **Sequence Generator** — Auto-incrementing ID columns
100
+ - **Normalizer** — `pd.melt()` with auto-detected id/value vars
101
+ - **Rank** — `groupby().rank()` with Top-N filtering
102
+ - **Stored Procedure** — Stub generation with SP name and parameters
103
+ - **Transaction Control** — Commit/rollback logic stubs
104
+ - **Custom / Java** — Placeholder stubs with TODO markers
105
+ - **SQL Transform** — Direct SQL execution pass-through
106
+
107
+ ## Supported XML Tags (72 Tags)
108
+
109
+ **Top-level:** POWERMART, REPOSITORY, FOLDER, FOLDERVERSION
110
+
111
+ **Source/Target:** SOURCE, SOURCEFIELD, TARGET, TARGETFIELD, TARGETINDEX, TARGETINDEXFIELD, FLATFILE, XMLINFO, XMLTEXT, GROUP, TABLEATTRIBUTE, FIELDATTRIBUTE, METADATAEXTENSION, KEYWORD, ERPSRCINFO
112
+
113
+ **Mapping/Mapplet:** MAPPING, MAPPLET, TRANSFORMATION, TRANSFORMFIELD, TRANSFORMFIELDATTR, TRANSFORMFIELDATTRDEF, INSTANCE, ASSOCIATED_SOURCE_INSTANCE, CONNECTOR, MAPDEPENDENCY, TARGETLOADORDER, MAPPINGVARIABLE, FIELDDEPENDENCY, INITPROP, ERPINFO
114
+
115
+ **Task/Session/Workflow:** TASK, TIMER, VALUEPAIR, SCHEDULER, SCHEDULEINFO, STARTOPTIONS, ENDOPTIONS, SCHEDULEOPTIONS, RECURRING, CUSTOM, DAILYFREQUENCY, REPEAT, FILTER, SESSION, CONFIGREFERENCE, SESSTRANSFORMATIONINST, SESSTRANSFORMATIONGROUP, PARTITION, HASHKEY, KEYRANGE, CONFIG, SESSIONCOMPONENT, CONNECTIONREFERENCE, TASKINSTANCE, WORKFLOWLINK, WORKFLOWVARIABLE, WORKFLOWEVENT, WORKLET, WORKFLOW, ATTRIBUTE
116
+
117
+ **Shortcut:** SHORTCUT
118
+
119
+ **SAP:** SAPFUNCTION, SAPSTRUCTURE, SAPPROGRAM, SAPOUTPUTPORT, SAPVARIABLE, SAPPROGRAMFLOWOBJECT, SAPTABLEPARAM
120
+
121
+ ## Key Features
122
+
123
+ ### Session Connection Overrides (v1.4+)
124
+ When sessions define per-transform connection overrides (different database, file directory, or filename), the generated code uses those overrides instead of source/target defaults.
125
+
126
+ ### Worklet Support (v1.4+)
127
+ Worklets are detected and each one is emitted as a separate `run_worklet_NAME(config)` function. The main workflow calls these functions automatically for Worklet task types.
128
+
129
+ ### Type Casting at Target Writes (v1.4+)
130
+ Target field datatypes are mapped to pandas types, and the generator emits the corresponding casting code:
131
+ - Integers: nullable `Int64`/`Int32` or `fillna(0).astype(int)` for NOT NULL
132
+ - Dates: `pd.to_datetime(errors='coerce')`
133
+ - Decimals/Floats: `pd.to_numeric(errors='coerce')`
134
+ - Booleans: `.astype('boolean')`
135
+
136
+ ### Flat File Handling (v1.3+)
137
+ Parses FLATFILE metadata for delimiter, fixed-width, header lines, skip rows, quote/escape chars. Generates `pd.read_fwf()` for fixed-width or enriched `read_file()` for delimited.
138
+
139
+ ### Mapplet Inlining (v1.3+)
140
+ Expands Mapplet instances into prefixed transforms, rewires connectors, and eliminates duplication.
141
+
142
+ ### Decision Tasks (v1.3+)
143
+ Converts Informatica decision conditions to Python if/else branches with proper variable substitution.
144
+
145
+ ### Expression Converter (80+ Functions)
146
+
147
+ Converts Informatica expressions to Python equivalents:
148
+
149
+ - **String:** SUBSTR, LTRIM, RTRIM, UPPER, LOWER, LPAD, RPAD, INSTR, LENGTH, CONCAT, REPLACE, REG_EXTRACT, REG_REPLACE, REVERSE, INITCAP, CHR, ASCII
150
+ - **Date:** ADD_TO_DATE, DATE_DIFF, GET_DATE_PART, SYSDATE, SYSTIMESTAMP, TO_DATE, TO_CHAR, TRUNC (date)
151
+ - **Numeric:** ROUND, TRUNC, MOD, ABS, CEIL, FLOOR, POWER, SQRT, LOG, EXP, SIGN
152
+ - **Conversion:** TO_INTEGER, TO_BIGINT, TO_FLOAT, TO_DECIMAL, TO_CHAR, TO_DATE
153
+ - **Null handling:** IIF, DECODE, NVL, NVL2, ISNULL, IS_SPACES, IS_NUMBER
154
+ - **Aggregate:** SUM, AVG, COUNT, MIN, MAX, FIRST, LAST, MEDIAN, STDDEV, VARIANCE
155
+ - **Lookup:** :LKP expressions with dynamic lookup references
156
+ - **Variable:** SETVARIABLE / mapping variable assignment
157
+
158
+ ## Requirements
159
+
160
+ - Python >= 3.8
161
+ - lxml >= 4.9.0
162
+ - PyYAML >= 6.0
163
+
164
+ ## Changelog
165
+
166
+ ### v1.4.x (Phase 3)
167
+ - Session connection overrides for sources and targets
168
+ - Worklet function generation with safe invocation
169
+ - Type casting at target writes based on TARGETFIELD datatypes
170
+ - Flat-file session path overrides properly wired
171
+
172
+ ### v1.3.x (Phase 2)
173
+ - FLATFILE metadata in source reads and target writes
174
+ - Normalizer with `pd.melt()`
175
+ - Rank with group-by and Top-N filtering
176
+ - Decision tasks with real if/else branches
177
+ - Mapplet instance inlining
178
+
179
+ ### v1.2.x (Phase 1)
180
+ - Core parser for all 72 XML tags
181
+ - Expression converter with 80+ functions
182
+ - Aggregator, Joiner, Lookup code generation
183
+ - Workflow orchestration with topological task ordering
184
+ - Multi-library support (pandas, dask, polars, vaex, modin)
185
+
186
+ ## Development
187
+
188
+ ```bash
189
+ # From a clone of the repository, install in development mode
190
+ cd informatica_python
191
+ pip install -e ".[dev]"
192
+
193
+ # Run the full test suite (tests/test_converter.py and tests/test_integration.py)
194
+ pytest tests/ -v
195
+ ```
196
+
197
+ ## License
198
+
199
+ MIT License - Copyright (c) 2025 Nick
200
+
201
+ See [LICENSE](LICENSE) for details.
@@ -0,0 +1,13 @@
1
+ """
2
+ informatica-python: Convert Informatica PowerCenter workflow XML to Python/PySpark code.
3
+
4
+ Copyright (c) 2025 Nick. All rights reserved.
5
+ Licensed under the MIT License.
6
+ """
7
+
8
+ from informatica_python.converter import InformaticaConverter
9
+
10
+ __version__ = "1.5.0"
11
+ __author__ = "Nick"
12
+ __license__ = "MIT"
13
+ __all__ = ["InformaticaConverter"]
@@ -41,6 +41,11 @@ def main():
41
41
  default=None,
42
42
  help="Save parsed JSON to a file",
43
43
  )
44
+ parser.add_argument(
45
+ "--param-file",
46
+ default=None,
47
+ help="Path to Informatica .param file for variable substitution",
48
+ )
44
49
 
45
50
  args = parser.parse_args()
46
51
 
@@ -44,13 +44,20 @@ def generate_helper_functions(folder: FolderDef, data_lib: str = "pandas") -> st
44
44
  lines.append('logger = logging.getLogger("informatica_converter")')
45
45
  lines.append("")
46
46
  lines.append("")
47
- lines.append("def load_config(config_path='config.yml'):")
48
- lines.append(' """Load configuration from YAML file."""')
47
+ lines.append("def load_config(config_path='config.yml', param_file=None):")
48
+ lines.append(' """Load configuration from YAML file, optionally merging Informatica .param file."""')
49
49
  lines.append(" with open(config_path, 'r') as f:")
50
- lines.append(" return yaml.safe_load(f)")
50
+ lines.append(" config = yaml.safe_load(f) or {}")
51
+ lines.append(" if param_file:")
52
+ lines.append(" params = parse_param_file(param_file)")
53
+ lines.append(" config['params'] = params")
54
+ lines.append(" for key, val in params.items():")
55
+ lines.append(" os.environ[f'INFA_VAR_{key}'] = str(val)")
56
+ lines.append(" return config")
51
57
  lines.append("")
52
58
  lines.append("")
53
59
 
60
+ _add_param_file_functions(lines)
54
61
  _add_db_functions(lines, data_lib)
55
62
  _add_file_functions(lines, data_lib)
56
63
  _add_expression_helpers(lines)
@@ -59,6 +66,61 @@ def generate_helper_functions(folder: FolderDef, data_lib: str = "pandas") -> st
59
66
  return "\n".join(lines)
60
67
 
61
68
 
69
+ def _add_param_file_functions(lines):
70
+ lines.append("# ============================================================")
71
+ lines.append("# Informatica Parameter File Support")
72
+ lines.append("# ============================================================")
73
+ lines.append("")
74
+ lines.append("")
75
+ lines.append("def parse_param_file(param_path):")
76
+ lines.append(' """')
77
+ lines.append(" Parse an Informatica .param file into a flat dict of variable names to values.")
78
+ lines.append(" Supports standard Informatica parameter file format:")
79
+ lines.append(" [Global]")
80
+ lines.append(" $$VAR_NAME=value")
81
+ lines.append(" [folder_name.WF:workflow_name.ST:session_name]")
82
+ lines.append(" $$CONN_NAME=value")
83
+ lines.append(' """')
84
+ lines.append(" params = {}")
85
+ lines.append(" if not os.path.exists(param_path):")
86
+ lines.append(" logger.warning(f'Parameter file not found: {param_path}')")
87
+ lines.append(" return params")
88
+ lines.append("")
89
+ lines.append(" current_section = 'Global'")
90
+ lines.append(" with open(param_path, 'r') as f:")
91
+ lines.append(" for line_num, line in enumerate(f, 1):")
92
+ lines.append(" line = line.strip()")
93
+ lines.append(" if not line or line.startswith('#'):")
94
+ lines.append(" continue")
95
+ lines.append(" if line.startswith('[') and line.endswith(']'):")
96
+ lines.append(" current_section = line[1:-1].strip()")
97
+ lines.append(" continue")
98
+ lines.append(" if '=' in line:")
99
+ lines.append(" key, _, value = line.partition('=')")
100
+ lines.append(" key = key.strip()")
101
+ lines.append(" value = value.strip()")
102
+ lines.append(" clean_key = key.lstrip('$')")
103
+ lines.append(" params[clean_key] = value")
104
+ lines.append(" if current_section != 'Global':")
105
+ lines.append(" params[f'{current_section}.{clean_key}'] = value")
106
+ lines.append(" logger.info(f'Loaded {len(params)} parameters from {param_path}')")
107
+ lines.append(" return params")
108
+ lines.append("")
109
+ lines.append("")
110
+ lines.append("def get_param(config, var_name, default=''):")
111
+ lines.append(' """Get a parameter value from config params, then env vars, then default."""')
112
+ lines.append(" clean = var_name.lstrip('$')")
113
+ lines.append(" params = config.get('params', {})")
114
+ lines.append(" if clean in params:")
115
+ lines.append(" return params[clean]")
116
+ lines.append(" env_val = os.environ.get(f'INFA_VAR_{clean}')")
117
+ lines.append(" if env_val is not None:")
118
+ lines.append(" return env_val")
119
+ lines.append(" return default")
120
+ lines.append("")
121
+ lines.append("")
122
+
123
+
62
124
  def _add_db_functions(lines, data_lib):
63
125
  lines.append("# ============================================================")
64
126
  lines.append("# Database Operations")
@@ -1060,14 +1122,26 @@ def _add_expression_helpers(lines):
1060
1122
  lines.append(" return None")
1061
1123
  lines.append("")
1062
1124
  lines.append("")
1063
- lines.append("def get_variable(var_name):")
1064
- lines.append(' """Get workflow/mapping variable value."""')
1065
- lines.append(" return os.environ.get(f'INFA_VAR_{var_name}', '')")
1125
+ lines.append("_param_store = {}")
1126
+ lines.append("")
1127
+ lines.append("")
1128
+ lines.append("def get_variable(var_name, config=None):")
1129
+ lines.append(' """Get workflow/mapping variable value from params, env vars, or param store."""')
1130
+ lines.append(" clean = var_name.lstrip('$')")
1131
+ lines.append(" if config and 'params' in config:")
1132
+ lines.append(" val = config['params'].get(clean)")
1133
+ lines.append(" if val is not None:")
1134
+ lines.append(" return val")
1135
+ lines.append(" if clean in _param_store:")
1136
+ lines.append(" return _param_store[clean]")
1137
+ lines.append(" return os.environ.get(f'INFA_VAR_{clean}', '')")
1066
1138
  lines.append("")
1067
1139
  lines.append("")
1068
1140
  lines.append("def set_variable(var_name, value):")
1069
1141
  lines.append(' """Set workflow/mapping variable value."""')
1070
- lines.append(" os.environ[f'INFA_VAR_{var_name}'] = str(value)")
1142
+ lines.append(" clean = var_name.lstrip('$')")
1143
+ lines.append(" _param_store[clean] = value")
1144
+ lines.append(" os.environ[f'INFA_VAR_{clean}'] = str(value)")
1071
1145
  lines.append(" return value")
1072
1146
  lines.append("")
1073
1147
  lines.append("")