informatica-python 1.4.0.tar.gz → 1.4.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. informatica_python-1.4.2/LICENSE +21 -0
  2. informatica_python-1.4.2/PKG-INFO +228 -0
  3. informatica_python-1.4.2/README.md +201 -0
  4. informatica_python-1.4.2/informatica_python/__init__.py +13 -0
  5. {informatica_python-1.4.0 → informatica_python-1.4.2}/informatica_python/generators/mapping_gen.py +25 -11
  6. {informatica_python-1.4.0 → informatica_python-1.4.2}/informatica_python/generators/workflow_gen.py +4 -3
  7. informatica_python-1.4.2/informatica_python.egg-info/PKG-INFO +228 -0
  8. {informatica_python-1.4.0 → informatica_python-1.4.2}/informatica_python.egg-info/SOURCES.txt +1 -0
  9. informatica_python-1.4.2/pyproject.toml +41 -0
  10. informatica_python-1.4.0/PKG-INFO +0 -118
  11. informatica_python-1.4.0/README.md +0 -106
  12. informatica_python-1.4.0/informatica_python/__init__.py +0 -4
  13. informatica_python-1.4.0/informatica_python.egg-info/PKG-INFO +0 -118
  14. informatica_python-1.4.0/pyproject.toml +0 -24
  15. {informatica_python-1.4.0 → informatica_python-1.4.2}/informatica_python/cli.py +0 -0
  16. {informatica_python-1.4.0 → informatica_python-1.4.2}/informatica_python/converter.py +0 -0
  17. {informatica_python-1.4.0 → informatica_python-1.4.2}/informatica_python/generators/__init__.py +0 -0
  18. {informatica_python-1.4.0 → informatica_python-1.4.2}/informatica_python/generators/config_gen.py +0 -0
  19. {informatica_python-1.4.0 → informatica_python-1.4.2}/informatica_python/generators/error_log_gen.py +0 -0
  20. {informatica_python-1.4.0 → informatica_python-1.4.2}/informatica_python/generators/helper_gen.py +0 -0
  21. {informatica_python-1.4.0 → informatica_python-1.4.2}/informatica_python/generators/sql_gen.py +0 -0
  22. {informatica_python-1.4.0 → informatica_python-1.4.2}/informatica_python/models.py +0 -0
  23. {informatica_python-1.4.0 → informatica_python-1.4.2}/informatica_python/parser.py +0 -0
  24. {informatica_python-1.4.0 → informatica_python-1.4.2}/informatica_python/utils/__init__.py +0 -0
  25. {informatica_python-1.4.0 → informatica_python-1.4.2}/informatica_python/utils/datatype_map.py +0 -0
  26. {informatica_python-1.4.0 → informatica_python-1.4.2}/informatica_python/utils/expression_converter.py +0 -0
  27. {informatica_python-1.4.0 → informatica_python-1.4.2}/informatica_python.egg-info/dependency_links.txt +0 -0
  28. {informatica_python-1.4.0 → informatica_python-1.4.2}/informatica_python.egg-info/entry_points.txt +0 -0
  29. {informatica_python-1.4.0 → informatica_python-1.4.2}/informatica_python.egg-info/requires.txt +0 -0
  30. {informatica_python-1.4.0 → informatica_python-1.4.2}/informatica_python.egg-info/top_level.txt +0 -0
  31. {informatica_python-1.4.0 → informatica_python-1.4.2}/setup.cfg +0 -0
  32. {informatica_python-1.4.0 → informatica_python-1.4.2}/tests/test_converter.py +0 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Nick
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,228 @@
1
+ Metadata-Version: 2.4
2
+ Name: informatica-python
3
+ Version: 1.4.2
4
+ Summary: Convert Informatica PowerCenter workflow XML to Python/PySpark code
5
+ Author: Nick
6
+ License: MIT
7
+ Keywords: informatica,powercenter,etl,code-generator,pandas,pyspark,data-engineering
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.8
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Software Development :: Code Generators
18
+ Classifier: Topic :: Database :: Database Engines/Servers
19
+ Requires-Python: >=3.8
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: lxml>=4.9.0
23
+ Requires-Dist: pyyaml>=6.0
24
+ Provides-Extra: dev
25
+ Requires-Dist: pytest>=7.0; extra == "dev"
26
+ Dynamic: license-file
27
+
28
+ # informatica-python
29
+
30
+ Convert Informatica PowerCenter workflow XML exports into clean, runnable Python/PySpark code.
31
+
32
+ **Author:** Nick
33
+ **License:** MIT
34
+ **PyPI:** [informatica-python](https://pypi.org/project/informatica-python/)
35
+
36
+ ---
37
+
38
+ ## Overview
39
+
40
+ `informatica-python` parses Informatica PowerCenter XML export files and generates equivalent Python code using your choice of data library. It handles all 72 DTD tags from the PowerCenter XML schema and produces a complete, ready-to-run Python project.
41
+
42
+ ## Installation
43
+
44
+ ```bash
45
+ pip install informatica-python
46
+ ```
47
+
48
+ ## Quick Start
49
+
50
+ ### Command Line
51
+
52
+ ```bash
53
+ # Generate Python files to a directory
54
+ informatica-python workflow_export.xml -o output_dir
55
+
56
+ # Generate as a zip archive
57
+ informatica-python workflow_export.xml -z output.zip
58
+
59
+ # Use a different data library
60
+ informatica-python workflow_export.xml -o output_dir --data-lib polars
61
+
62
+ # Parse to JSON only (no code generation)
63
+ informatica-python workflow_export.xml --json
64
+
65
+ # Save parsed JSON to file
66
+ informatica-python workflow_export.xml --json-file parsed.json
67
+ ```
68
+
69
+ ### Python API
70
+
71
+ ```python
72
+ from informatica_python import InformaticaConverter
73
+
74
+ converter = InformaticaConverter()
75
+
76
+ # Parse and generate files
77
+ converter.convert_to_files("workflow_export.xml", "output_dir")
78
+
79
+ # Parse and generate zip
80
+ converter.convert_to_zip("workflow_export.xml", "output.zip")
81
+
82
+ # Parse to structured dict
83
+ result = converter.parse_file("workflow_export.xml")
84
+
85
+ # Use a different data library
86
+ converter.convert_to_files("workflow_export.xml", "output_dir", data_lib="polars")
87
+ ```
88
+
89
+ ## Generated Output Files
90
+
91
+ | File | Description |
92
+ |------|-------------|
93
+ | `helper_functions.py` | Database/file I/O helpers, Informatica expression equivalents (80+ functions) |
94
+ | `mapping_N.py` | One per mapping — transformation logic, source reads, target writes |
95
+ | `workflow.py` | Task orchestration with topological ordering and error handling |
96
+ | `config.yml` | Connection configs, source/target metadata, runtime parameters |
97
+ | `all_sql_queries.sql` | All SQL extracted from Source Qualifiers, Lookups, SQL transforms |
98
+ | `error_log.txt` | Conversion summary, warnings, and unsupported feature notes |
99
+
100
+ ## Supported Data Libraries
101
+
102
+ Select via `--data-lib` CLI flag or `data_lib` parameter:
103
+
104
+ | Library | Flag | Best For |
105
+ |---------|------|----------|
106
+ | **pandas** | `pandas` (default) | General-purpose, most compatible |
107
+ | **dask** | `dask` | Large datasets, parallel processing |
108
+ | **polars** | `polars` | High performance, Rust-backed |
109
+ | **vaex** | `vaex` | Out-of-core, billion-row datasets |
110
+ | **modin** | `modin` | Drop-in pandas replacement, multi-core |
111
+
112
+ ## Supported Transformations
113
+
114
+ The code generator produces real, runnable Python for these transformation types:
115
+
116
+ - **Source Qualifier** — SQL override, pre/post SQL, column selection
117
+ - **Expression** — Field-level expressions converted to pandas operations
118
+ - **Filter** — Row filtering with converted conditions
119
+ - **Joiner** — `pd.merge()` with join type and condition parsing
120
+ - **Lookup** — `pd.merge()` lookups with connection-aware DB/file reads
121
+ - **Aggregator** — `groupby().agg()` with SUM/COUNT/AVG/MIN/MAX/FIRST/LAST
122
+ - **Sorter** — `sort_values()` with multi-key ascending/descending
123
+ - **Router** — Multi-group conditional routing with if/elif/else
124
+ - **Union** — `pd.concat()` across multiple input groups
125
+ - **Update Strategy** — Insert/Update/Delete/Reject flag generation
126
+ - **Sequence Generator** — Auto-incrementing ID columns
127
+ - **Normalizer** — `pd.melt()` with auto-detected id/value vars
128
+ - **Rank** — `groupby().rank()` with Top-N filtering
129
+ - **Stored Procedure** — Stub generation with SP name and parameters
130
+ - **Transaction Control** — Commit/rollback logic stubs
131
+ - **Custom / Java** — Placeholder stubs with TODO markers
132
+ - **SQL Transform** — Direct SQL execution pass-through
133
+
134
+ ## Supported XML Tags (72 Tags)
135
+
136
+ **Top-level:** POWERMART, REPOSITORY, FOLDER, FOLDERVERSION
137
+
138
+ **Source/Target:** SOURCE, SOURCEFIELD, TARGET, TARGETFIELD, TARGETINDEX, TARGETINDEXFIELD, FLATFILE, XMLINFO, XMLTEXT, GROUP, TABLEATTRIBUTE, FIELDATTRIBUTE, METADATAEXTENSION, KEYWORD, ERPSRCINFO
139
+
140
+ **Mapping/Mapplet:** MAPPING, MAPPLET, TRANSFORMATION, TRANSFORMFIELD, TRANSFORMFIELDATTR, TRANSFORMFIELDATTRDEF, INSTANCE, ASSOCIATED_SOURCE_INSTANCE, CONNECTOR, MAPDEPENDENCY, TARGETLOADORDER, MAPPINGVARIABLE, FIELDDEPENDENCY, INITPROP, ERPINFO
141
+
142
+ **Task/Session/Workflow:** TASK, TIMER, VALUEPAIR, SCHEDULER, SCHEDULEINFO, STARTOPTIONS, ENDOPTIONS, SCHEDULEOPTIONS, RECURRING, CUSTOM, DAILYFREQUENCY, REPEAT, FILTER, SESSION, CONFIGREFERENCE, SESSTRANSFORMATIONINST, SESSTRANSFORMATIONGROUP, PARTITION, HASHKEY, KEYRANGE, CONFIG, SESSIONCOMPONENT, CONNECTIONREFERENCE, TASKINSTANCE, WORKFLOWLINK, WORKFLOWVARIABLE, WORKFLOWEVENT, WORKLET, WORKFLOW, ATTRIBUTE
143
+
144
+ **Shortcut:** SHORTCUT
145
+
146
+ **SAP:** SAPFUNCTION, SAPSTRUCTURE, SAPPROGRAM, SAPOUTPUTPORT, SAPVARIABLE, SAPPROGRAMFLOWOBJECT, SAPTABLEPARAM
147
+
148
+ ## Key Features
149
+
150
+ ### Session Connection Overrides (v1.4+)
151
+ When sessions define per-transform connection overrides (different database, file directory, or filename), the generated code uses those overrides instead of source/target defaults.
152
+
153
+ ### Worklet Support (v1.4+)
154
+ Worklet workflows are detected and generate separate `run_worklet_NAME(config)` functions. The main workflow calls these automatically for Worklet task types.
155
+
156
+ ### Type Casting at Target Writes (v1.4+)
157
+ Target field datatypes are mapped to pandas types and generate proper casting code:
158
+ - Integers: nullable `Int64`/`Int32` or `fillna(0).astype(int)` for NOT NULL
159
+ - Dates: `pd.to_datetime(errors='coerce')`
160
+ - Decimals/Floats: `pd.to_numeric(errors='coerce')`
161
+ - Booleans: `.astype('boolean')`
162
+
163
+ ### Flat File Handling (v1.3+)
164
+ Parses FLATFILE metadata for delimiter, fixed-width, header lines, skip rows, quote/escape chars. Generates `pd.read_fwf()` for fixed-width or enriched `read_file()` for delimited.
165
+
166
+ ### Mapplet Inlining (v1.3+)
167
+ Expands Mapplet instances into prefixed transforms, rewires connectors, and eliminates duplication.
168
+
169
+ ### Decision Tasks (v1.3+)
170
+ Converts Informatica decision conditions to Python if/else branches with proper variable substitution.
171
+
172
+ ### Expression Converter (80+ Functions)
173
+
174
+ Converts Informatica expressions to Python equivalents:
175
+
176
+ - **String:** SUBSTR, LTRIM, RTRIM, UPPER, LOWER, LPAD, RPAD, INSTR, LENGTH, CONCAT, REPLACE, REG_EXTRACT, REG_REPLACE, REVERSE, INITCAP, CHR, ASCII
177
+ - **Date:** ADD_TO_DATE, DATE_DIFF, GET_DATE_PART, SYSDATE, SYSTIMESTAMP, TO_DATE, TO_CHAR, TRUNC (date)
178
+ - **Numeric:** ROUND, TRUNC, MOD, ABS, CEIL, FLOOR, POWER, SQRT, LOG, EXP, SIGN
179
+ - **Conversion:** TO_INTEGER, TO_BIGINT, TO_FLOAT, TO_DECIMAL, TO_CHAR, TO_DATE
180
+ - **Null handling:** IIF, DECODE, NVL, NVL2, ISNULL, IS_SPACES, IS_NUMBER
181
+ - **Aggregate:** SUM, AVG, COUNT, MIN, MAX, FIRST, LAST, MEDIAN, STDDEV, VARIANCE
182
+ - **Lookup:** :LKP expressions with dynamic lookup references
183
+ - **Variable:** SETVARIABLE / mapping variable assignment
184
+
185
+ ## Requirements
186
+
187
+ - Python >= 3.8
188
+ - lxml >= 4.9.0
189
+ - PyYAML >= 6.0
190
+
191
+ ## Changelog
192
+
193
+ ### v1.4.x (Phase 3)
194
+ - Session connection overrides for sources and targets
195
+ - Worklet function generation with safe invocation
196
+ - Type casting at target writes based on TARGETFIELD datatypes
197
+ - Flat-file session path overrides properly wired
198
+
199
+ ### v1.3.x (Phase 2)
200
+ - FLATFILE metadata in source reads and target writes
201
+ - Normalizer with `pd.melt()`
202
+ - Rank with group-by and Top-N filtering
203
+ - Decision tasks with real if/else branches
204
+ - Mapplet instance inlining
205
+
206
+ ### v1.2.x (Phase 1)
207
+ - Core parser for all 72 XML tags
208
+ - Expression converter with 80+ functions
209
+ - Aggregator, Joiner, Lookup code generation
210
+ - Workflow orchestration with topological task ordering
211
+ - Multi-library support (pandas, dask, polars, vaex, modin)
212
+
213
+ ## Development
214
+
215
+ ```bash
216
+ # Clone and install in development mode
217
+ cd informatica_python
218
+ pip install -e ".[dev]"
219
+
220
+ # Run tests (25 tests)
221
+ pytest tests/test_converter.py -v
222
+ ```
223
+
224
+ ## License
225
+
226
+ MIT License - Copyright (c) 2025 Nick
227
+
228
+ See [LICENSE](LICENSE) for details.
@@ -0,0 +1,201 @@
1
+ # informatica-python
2
+
3
+ Convert Informatica PowerCenter workflow XML exports into clean, runnable Python/PySpark code.
4
+
5
+ **Author:** Nick
6
+ **License:** MIT
7
+ **PyPI:** [informatica-python](https://pypi.org/project/informatica-python/)
8
+
9
+ ---
10
+
11
+ ## Overview
12
+
13
+ `informatica-python` parses Informatica PowerCenter XML export files and generates equivalent Python code using your choice of data library. It handles all 72 DTD tags from the PowerCenter XML schema and produces a complete, ready-to-run Python project.
14
+
15
+ ## Installation
16
+
17
+ ```bash
18
+ pip install informatica-python
19
+ ```
20
+
21
+ ## Quick Start
22
+
23
+ ### Command Line
24
+
25
+ ```bash
26
+ # Generate Python files to a directory
27
+ informatica-python workflow_export.xml -o output_dir
28
+
29
+ # Generate as a zip archive
30
+ informatica-python workflow_export.xml -z output.zip
31
+
32
+ # Use a different data library
33
+ informatica-python workflow_export.xml -o output_dir --data-lib polars
34
+
35
+ # Parse to JSON only (no code generation)
36
+ informatica-python workflow_export.xml --json
37
+
38
+ # Save parsed JSON to file
39
+ informatica-python workflow_export.xml --json-file parsed.json
40
+ ```
41
+
42
+ ### Python API
43
+
44
+ ```python
45
+ from informatica_python import InformaticaConverter
46
+
47
+ converter = InformaticaConverter()
48
+
49
+ # Parse and generate files
50
+ converter.convert_to_files("workflow_export.xml", "output_dir")
51
+
52
+ # Parse and generate zip
53
+ converter.convert_to_zip("workflow_export.xml", "output.zip")
54
+
55
+ # Parse to structured dict
56
+ result = converter.parse_file("workflow_export.xml")
57
+
58
+ # Use a different data library
59
+ converter.convert_to_files("workflow_export.xml", "output_dir", data_lib="polars")
60
+ ```
61
+
62
+ ## Generated Output Files
63
+
64
+ | File | Description |
65
+ |------|-------------|
66
+ | `helper_functions.py` | Database/file I/O helpers, Informatica expression equivalents (80+ functions) |
67
+ | `mapping_N.py` | One per mapping — transformation logic, source reads, target writes |
68
+ | `workflow.py` | Task orchestration with topological ordering and error handling |
69
+ | `config.yml` | Connection configs, source/target metadata, runtime parameters |
70
+ | `all_sql_queries.sql` | All SQL extracted from Source Qualifiers, Lookups, SQL transforms |
71
+ | `error_log.txt` | Conversion summary, warnings, and unsupported feature notes |
72
+
73
+ ## Supported Data Libraries
74
+
75
+ Select via `--data-lib` CLI flag or `data_lib` parameter:
76
+
77
+ | Library | Flag | Best For |
78
+ |---------|------|----------|
79
+ | **pandas** | `pandas` (default) | General-purpose, most compatible |
80
+ | **dask** | `dask` | Large datasets, parallel processing |
81
+ | **polars** | `polars` | High performance, Rust-backed |
82
+ | **vaex** | `vaex` | Out-of-core, billion-row datasets |
83
+ | **modin** | `modin` | Drop-in pandas replacement, multi-core |
84
+
85
+ ## Supported Transformations
86
+
87
+ The code generator produces real, runnable Python for these transformation types:
88
+
89
+ - **Source Qualifier** — SQL override, pre/post SQL, column selection
90
+ - **Expression** — Field-level expressions converted to pandas operations
91
+ - **Filter** — Row filtering with converted conditions
92
+ - **Joiner** — `pd.merge()` with join type and condition parsing
93
+ - **Lookup** — `pd.merge()` lookups with connection-aware DB/file reads
94
+ - **Aggregator** — `groupby().agg()` with SUM/COUNT/AVG/MIN/MAX/FIRST/LAST
95
+ - **Sorter** — `sort_values()` with multi-key ascending/descending
96
+ - **Router** — Multi-group conditional routing with if/elif/else
97
+ - **Union** — `pd.concat()` across multiple input groups
98
+ - **Update Strategy** — Insert/Update/Delete/Reject flag generation
99
+ - **Sequence Generator** — Auto-incrementing ID columns
100
+ - **Normalizer** — `pd.melt()` with auto-detected id/value vars
101
+ - **Rank** — `groupby().rank()` with Top-N filtering
102
+ - **Stored Procedure** — Stub generation with SP name and parameters
103
+ - **Transaction Control** — Commit/rollback logic stubs
104
+ - **Custom / Java** — Placeholder stubs with TODO markers
105
+ - **SQL Transform** — Direct SQL execution pass-through
106
+
107
+ ## Supported XML Tags (72 Tags)
108
+
109
+ **Top-level:** POWERMART, REPOSITORY, FOLDER, FOLDERVERSION
110
+
111
+ **Source/Target:** SOURCE, SOURCEFIELD, TARGET, TARGETFIELD, TARGETINDEX, TARGETINDEXFIELD, FLATFILE, XMLINFO, XMLTEXT, GROUP, TABLEATTRIBUTE, FIELDATTRIBUTE, METADATAEXTENSION, KEYWORD, ERPSRCINFO
112
+
113
+ **Mapping/Mapplet:** MAPPING, MAPPLET, TRANSFORMATION, TRANSFORMFIELD, TRANSFORMFIELDATTR, TRANSFORMFIELDATTRDEF, INSTANCE, ASSOCIATED_SOURCE_INSTANCE, CONNECTOR, MAPDEPENDENCY, TARGETLOADORDER, MAPPINGVARIABLE, FIELDDEPENDENCY, INITPROP, ERPINFO
114
+
115
+ **Task/Session/Workflow:** TASK, TIMER, VALUEPAIR, SCHEDULER, SCHEDULEINFO, STARTOPTIONS, ENDOPTIONS, SCHEDULEOPTIONS, RECURRING, CUSTOM, DAILYFREQUENCY, REPEAT, FILTER, SESSION, CONFIGREFERENCE, SESSTRANSFORMATIONINST, SESSTRANSFORMATIONGROUP, PARTITION, HASHKEY, KEYRANGE, CONFIG, SESSIONCOMPONENT, CONNECTIONREFERENCE, TASKINSTANCE, WORKFLOWLINK, WORKFLOWVARIABLE, WORKFLOWEVENT, WORKLET, WORKFLOW, ATTRIBUTE
116
+
117
+ **Shortcut:** SHORTCUT
118
+
119
+ **SAP:** SAPFUNCTION, SAPSTRUCTURE, SAPPROGRAM, SAPOUTPUTPORT, SAPVARIABLE, SAPPROGRAMFLOWOBJECT, SAPTABLEPARAM
120
+
121
+ ## Key Features
122
+
123
+ ### Session Connection Overrides (v1.4+)
124
+ When sessions define per-transform connection overrides (different database, file directory, or filename), the generated code uses those overrides instead of source/target defaults.
125
+
126
+ ### Worklet Support (v1.4+)
127
+ Worklet workflows are detected and generate separate `run_worklet_NAME(config)` functions. The main workflow calls these automatically for Worklet task types.
128
+
129
+ ### Type Casting at Target Writes (v1.4+)
130
+ Target field datatypes are mapped to pandas types and generate proper casting code:
131
+ - Integers: nullable `Int64`/`Int32` or `fillna(0).astype(int)` for NOT NULL
132
+ - Dates: `pd.to_datetime(errors='coerce')`
133
+ - Decimals/Floats: `pd.to_numeric(errors='coerce')`
134
+ - Booleans: `.astype('boolean')`
135
+
136
+ ### Flat File Handling (v1.3+)
137
+ Parses FLATFILE metadata for delimiter, fixed-width, header lines, skip rows, quote/escape chars. Generates `pd.read_fwf()` for fixed-width or enriched `read_file()` for delimited.
138
+
139
+ ### Mapplet Inlining (v1.3+)
140
+ Expands Mapplet instances into prefixed transforms, rewires connectors, and eliminates duplication.
141
+
142
+ ### Decision Tasks (v1.3+)
143
+ Converts Informatica decision conditions to Python if/else branches with proper variable substitution.
144
+
145
+ ### Expression Converter (80+ Functions)
146
+
147
+ Converts Informatica expressions to Python equivalents:
148
+
149
+ - **String:** SUBSTR, LTRIM, RTRIM, UPPER, LOWER, LPAD, RPAD, INSTR, LENGTH, CONCAT, REPLACE, REG_EXTRACT, REG_REPLACE, REVERSE, INITCAP, CHR, ASCII
150
+ - **Date:** ADD_TO_DATE, DATE_DIFF, GET_DATE_PART, SYSDATE, SYSTIMESTAMP, TO_DATE, TO_CHAR, TRUNC (date)
151
+ - **Numeric:** ROUND, TRUNC, MOD, ABS, CEIL, FLOOR, POWER, SQRT, LOG, EXP, SIGN
152
+ - **Conversion:** TO_INTEGER, TO_BIGINT, TO_FLOAT, TO_DECIMAL, TO_CHAR, TO_DATE
153
+ - **Null handling:** IIF, DECODE, NVL, NVL2, ISNULL, IS_SPACES, IS_NUMBER
154
+ - **Aggregate:** SUM, AVG, COUNT, MIN, MAX, FIRST, LAST, MEDIAN, STDDEV, VARIANCE
155
+ - **Lookup:** :LKP expressions with dynamic lookup references
156
+ - **Variable:** SETVARIABLE / mapping variable assignment
157
+
158
+ ## Requirements
159
+
160
+ - Python >= 3.8
161
+ - lxml >= 4.9.0
162
+ - PyYAML >= 6.0
163
+
164
+ ## Changelog
165
+
166
+ ### v1.4.x (Phase 3)
167
+ - Session connection overrides for sources and targets
168
+ - Worklet function generation with safe invocation
169
+ - Type casting at target writes based on TARGETFIELD datatypes
170
+ - Flat-file session path overrides properly wired
171
+
172
+ ### v1.3.x (Phase 2)
173
+ - FLATFILE metadata in source reads and target writes
174
+ - Normalizer with `pd.melt()`
175
+ - Rank with group-by and Top-N filtering
176
+ - Decision tasks with real if/else branches
177
+ - Mapplet instance inlining
178
+
179
+ ### v1.2.x (Phase 1)
180
+ - Core parser for all 72 XML tags
181
+ - Expression converter with 80+ functions
182
+ - Aggregator, Joiner, Lookup code generation
183
+ - Workflow orchestration with topological task ordering
184
+ - Multi-library support (pandas, dask, polars, vaex, modin)
185
+
186
+ ## Development
187
+
188
+ ```bash
189
+ # Clone and install in development mode
190
+ cd informatica_python
191
+ pip install -e ".[dev]"
192
+
193
+ # Run tests (25 tests)
194
+ pytest tests/test_converter.py -v
195
+ ```
196
+
197
+ ## License
198
+
199
+ MIT License - Copyright (c) 2025 Nick
200
+
201
+ See [LICENSE](LICENSE) for details.
@@ -0,0 +1,13 @@
1
+ """
2
+ informatica-python: Convert Informatica PowerCenter workflow XML to Python/PySpark code.
3
+
4
+ Copyright (c) 2025 Nick. All rights reserved.
5
+ Licensed under the MIT License.
6
+ """
7
+
8
+ from informatica_python.converter import InformaticaConverter
9
+
10
+ __version__ = "1.4.2"
11
+ __author__ = "Nick"
12
+ __license__ = "MIT"
13
+ __all__ = ["InformaticaConverter"]
@@ -247,7 +247,7 @@ def generate_mapping_code(mapping: MappingDef, folder: FolderDef,
247
247
  lines.append(f" _src_path_{safe} = config.get('sources', {{}}).get('{src_def.name}', {{}}).get('file_path',")
248
248
  lines.append(f" os.path.join('{src_dir}', '{src_file}'))")
249
249
  if src_def.flatfile:
250
- _emit_flatfile_read(lines, safe, src_def)
250
+ _emit_flatfile_read(lines, safe, src_def, file_path_override=True)
251
251
  else:
252
252
  lines.append(f" df_{safe} = read_file(_src_path_{safe}, config.get('sources', {{}}).get('{src_def.name}', {{}}))")
253
253
  elif src_def.database_type and src_def.database_type != "Flat File":
@@ -323,15 +323,16 @@ def _flatfile_config_dict(ff):
323
323
  return cfg
324
324
 
325
325
 
326
- def _emit_flatfile_read(lines, var_name, src_def, indent=" "):
326
+ def _emit_flatfile_read(lines, var_name, src_def, indent=" ", file_path_override=None):
327
327
  ff = src_def.flatfile
328
328
  fc = _flatfile_config_dict(ff)
329
+ default_path = f"_src_path_{var_name}" if file_path_override else f"config.get('sources', {{}}).get('{src_def.name}', {{}}).get('file_path', '{src_def.name}')"
329
330
  if fc.get("fixed_width"):
330
331
  widths = []
331
332
  for fld in src_def.fields:
332
333
  widths.append(fld.precision if fld.precision else 10)
333
334
  lines.append(f"{indent}df_{var_name} = pd.read_fwf(")
334
- lines.append(f"{indent} config.get('sources', {{}}).get('{src_def.name}', {{}}).get('file_path', '{src_def.name}'),")
335
+ lines.append(f"{indent} {default_path},")
335
336
  lines.append(f"{indent} widths={widths},")
336
337
  hdr = fc.get("header_lines", 0)
337
338
  if hdr:
@@ -367,15 +368,22 @@ def _emit_flatfile_read(lines, var_name, src_def, indent=" "):
367
368
  if file_cfg:
368
369
  lines.append(f"{indent}ff_cfg_{var_name} = {repr(file_cfg)}")
369
370
  lines.append(f"{indent}ff_cfg_{var_name}.update(config.get('sources', {{}}).get('{src_def.name}', {{}}))")
370
- lines.append(f"{indent}df_{var_name} = read_file(ff_cfg_{var_name}.get('file_path', '{src_def.name}'), ff_cfg_{var_name})")
371
+ if file_path_override:
372
+ lines.append(f"{indent}df_{var_name} = read_file({default_path}, ff_cfg_{var_name})")
373
+ else:
374
+ lines.append(f"{indent}df_{var_name} = read_file(ff_cfg_{var_name}.get('file_path', '{src_def.name}'), ff_cfg_{var_name})")
371
375
  else:
372
- lines.append(f"{indent}df_{var_name} = read_file(config.get('sources', {{}}).get('{src_def.name}', {{}}).get('file_path', '{src_def.name}'),")
373
- lines.append(f"{indent} config.get('sources', {{}}).get('{src_def.name}', {{}}))")
376
+ if file_path_override:
377
+ lines.append(f"{indent}df_{var_name} = read_file({default_path}, config.get('sources', {{}}).get('{src_def.name}', {{}}))")
378
+ else:
379
+ lines.append(f"{indent}df_{var_name} = read_file(config.get('sources', {{}}).get('{src_def.name}', {{}}).get('file_path', '{src_def.name}'),")
380
+ lines.append(f"{indent} config.get('sources', {{}}).get('{src_def.name}', {{}}))")
374
381
 
375
382
 
376
- def _emit_flatfile_write(lines, var_name, tgt_def, indent=" "):
383
+ def _emit_flatfile_write(lines, var_name, tgt_def, indent=" ", file_path_override=None):
377
384
  ff = tgt_def.flatfile
378
385
  fc = _flatfile_config_dict(ff)
386
+ default_path = f"_tgt_path_{var_name}" if file_path_override else f"config.get('targets', {{}}).get('{tgt_def.name}', {{}}).get('file_path', '{tgt_def.name}')"
379
387
  file_cfg = {}
380
388
  if "delimiter" in fc:
381
389
  file_cfg["delimiter"] = fc["delimiter"]
@@ -387,10 +395,16 @@ def _emit_flatfile_write(lines, var_name, tgt_def, indent=" "):
387
395
  if file_cfg:
388
396
  lines.append(f"{indent}ff_cfg_{var_name} = {repr(file_cfg)}")
389
397
  lines.append(f"{indent}ff_cfg_{var_name}.update(config.get('targets', {{}}).get('{tgt_def.name}', {{}}))")
390
- lines.append(f"{indent}write_file(df_target_{var_name}, ff_cfg_{var_name}.get('file_path', '{tgt_def.name}'), ff_cfg_{var_name})")
398
+ if file_path_override:
399
+ lines.append(f"{indent}write_file(df_target_{var_name}, {default_path}, ff_cfg_{var_name})")
400
+ else:
401
+ lines.append(f"{indent}write_file(df_target_{var_name}, ff_cfg_{var_name}.get('file_path', '{tgt_def.name}'), ff_cfg_{var_name})")
391
402
  else:
392
- lines.append(f"{indent}write_file(df_target_{var_name}, config.get('targets', {{}}).get('{tgt_def.name}', {{}}).get('file_path', '{tgt_def.name}'),")
393
- lines.append(f"{indent} config.get('targets', {{}}).get('{tgt_def.name}', {{}}))")
403
+ if file_path_override:
404
+ lines.append(f"{indent}write_file(df_target_{var_name}, {default_path}, config.get('targets', {{}}).get('{tgt_def.name}', {{}}))")
405
+ else:
406
+ lines.append(f"{indent}write_file(df_target_{var_name}, config.get('targets', {{}}).get('{tgt_def.name}', {{}}).get('file_path', '{tgt_def.name}'),")
407
+ lines.append(f"{indent} config.get('targets', {{}}).get('{tgt_def.name}', {{}}))")
394
408
 
395
409
 
396
410
  def _build_source_map(mapping, folder):
@@ -1202,7 +1216,7 @@ def _generate_target_write(lines, tgt_name, tgt_def, connector_graph, source_dfs
1202
1216
  lines.append(f" _tgt_path_{tgt_safe} = config.get('targets', {{}}).get('{tgt_def.name}', {{}}).get('file_path',")
1203
1217
  lines.append(f" os.path.join('{out_dir}', '{out_file}'))")
1204
1218
  if tgt_def.flatfile:
1205
- _emit_flatfile_write(lines, tgt_safe, tgt_def)
1219
+ _emit_flatfile_write(lines, tgt_safe, tgt_def, file_path_override=True)
1206
1220
  else:
1207
1221
  lines.append(f" write_file(df_target_{tgt_safe}, _tgt_path_{tgt_safe}, config.get('targets', {{}}).get('{tgt_def.name}', {{}}))")
1208
1222
  elif tgt_def.database_type and tgt_def.database_type != "Flat File":
@@ -195,10 +195,11 @@ def _emit_task_code(lines, task, mapping_name_map, session_to_mapping, wf, workl
195
195
  lines.append(f" logger.info('Executing worklet: {task.name}')")
196
196
  if matched_worklet:
197
197
  lines.append(f" worklet_result_{task_safe} = run_worklet_{worklet_safe}(config)")
198
+ lines.append(f" if not worklet_result_{task_safe}:")
199
+ lines.append(f" raise RuntimeError('Worklet {worklet_name} returned failure')")
198
200
  else:
199
- lines.append(f" worklet_result_{task_safe} = run_worklet_{worklet_safe}(config)")
200
- lines.append(f" if not worklet_result_{task_safe}:")
201
- lines.append(f" raise RuntimeError('Worklet {worklet_name} returned failure')")
201
+ lines.append(f" # WARNING: Worklet '{worklet_name}' definition not found in folder")
202
+ lines.append(f" logger.warning('Worklet {worklet_name} not found — skipping')")
202
203
  lines.append(f" except Exception as e:")
203
204
  lines.append(f" logger.error(f'Worklet {task.name} failed: {{e}}')")
204
205
  if task.fail_parent_if_instance_fails == "YES":
@@ -0,0 +1,228 @@
1
+ Metadata-Version: 2.4
2
+ Name: informatica-python
3
+ Version: 1.4.2
4
+ Summary: Convert Informatica PowerCenter workflow XML to Python/PySpark code
5
+ Author: Nick
6
+ License: MIT
7
+ Keywords: informatica,powercenter,etl,code-generator,pandas,pyspark,data-engineering
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.8
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Software Development :: Code Generators
18
+ Classifier: Topic :: Database :: Database Engines/Servers
19
+ Requires-Python: >=3.8
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: lxml>=4.9.0
23
+ Requires-Dist: pyyaml>=6.0
24
+ Provides-Extra: dev
25
+ Requires-Dist: pytest>=7.0; extra == "dev"
26
+ Dynamic: license-file
27
+
28
+ # informatica-python
29
+
30
+ Convert Informatica PowerCenter workflow XML exports into clean, runnable Python/PySpark code.
31
+
32
+ **Author:** Nick
33
+ **License:** MIT
34
+ **PyPI:** [informatica-python](https://pypi.org/project/informatica-python/)
35
+
36
+ ---
37
+
38
+ ## Overview
39
+
40
+ `informatica-python` parses Informatica PowerCenter XML export files and generates equivalent Python code using your choice of data library. It handles all 72 DTD tags from the PowerCenter XML schema and produces a complete, ready-to-run Python project.
41
+
42
+ ## Installation
43
+
44
+ ```bash
45
+ pip install informatica-python
46
+ ```
47
+
48
+ ## Quick Start
49
+
50
+ ### Command Line
51
+
52
+ ```bash
53
+ # Generate Python files to a directory
54
+ informatica-python workflow_export.xml -o output_dir
55
+
56
+ # Generate as a zip archive
57
+ informatica-python workflow_export.xml -z output.zip
58
+
59
+ # Use a different data library
60
+ informatica-python workflow_export.xml -o output_dir --data-lib polars
61
+
62
+ # Parse to JSON only (no code generation)
63
+ informatica-python workflow_export.xml --json
64
+
65
+ # Save parsed JSON to file
66
+ informatica-python workflow_export.xml --json-file parsed.json
67
+ ```
68
+
69
+ ### Python API
70
+
71
+ ```python
72
+ from informatica_python import InformaticaConverter
73
+
74
+ converter = InformaticaConverter()
75
+
76
+ # Parse and generate files
77
+ converter.convert_to_files("workflow_export.xml", "output_dir")
78
+
79
+ # Parse and generate zip
80
+ converter.convert_to_zip("workflow_export.xml", "output.zip")
81
+
82
+ # Parse to structured dict
83
+ result = converter.parse_file("workflow_export.xml")
84
+
85
+ # Use a different data library
86
+ converter.convert_to_files("workflow_export.xml", "output_dir", data_lib="polars")
87
+ ```
88
+
89
+ ## Generated Output Files
90
+
91
+ | File | Description |
92
+ |------|-------------|
93
+ | `helper_functions.py` | Database/file I/O helpers, Informatica expression equivalents (80+ functions) |
94
+ | `mapping_N.py` | One per mapping — transformation logic, source reads, target writes |
95
+ | `workflow.py` | Task orchestration with topological ordering and error handling |
96
+ | `config.yml` | Connection configs, source/target metadata, runtime parameters |
97
+ | `all_sql_queries.sql` | All SQL extracted from Source Qualifiers, Lookups, SQL transforms |
98
+ | `error_log.txt` | Conversion summary, warnings, and unsupported feature notes |
99
+
100
+ ## Supported Data Libraries
101
+
102
+ Select via `--data-lib` CLI flag or `data_lib` parameter:
103
+
104
+ | Library | Flag | Best For |
105
+ |---------|------|----------|
106
+ | **pandas** | `pandas` (default) | General-purpose, most compatible |
107
+ | **dask** | `dask` | Large datasets, parallel processing |
108
+ | **polars** | `polars` | High performance, Rust-backed |
109
+ | **vaex** | `vaex` | Out-of-core, billion-row datasets |
110
+ | **modin** | `modin` | Drop-in pandas replacement, multi-core |
111
+
112
+ ## Supported Transformations
113
+
114
+ The code generator produces real, runnable Python for these transformation types:
115
+
116
+ - **Source Qualifier** — SQL override, pre/post SQL, column selection
117
+ - **Expression** — Field-level expressions converted to pandas operations
118
+ - **Filter** — Row filtering with converted conditions
119
+ - **Joiner** — `pd.merge()` with join type and condition parsing
120
+ - **Lookup** — `pd.merge()` lookups with connection-aware DB/file reads
121
+ - **Aggregator** — `groupby().agg()` with SUM/COUNT/AVG/MIN/MAX/FIRST/LAST
122
+ - **Sorter** — `sort_values()` with multi-key ascending/descending
123
+ - **Router** — Multi-group conditional routing with if/elif/else
124
+ - **Union** — `pd.concat()` across multiple input groups
125
+ - **Update Strategy** — Insert/Update/Delete/Reject flag generation
126
+ - **Sequence Generator** — Auto-incrementing ID columns
127
+ - **Normalizer** — `pd.melt()` with auto-detected id/value vars
128
+ - **Rank** — `groupby().rank()` with Top-N filtering
129
+ - **Stored Procedure** — Stub generation with SP name and parameters
130
+ - **Transaction Control** — Commit/rollback logic stubs
131
+ - **Custom / Java** — Placeholder stubs with TODO markers
132
+ - **SQL Transform** — Direct SQL execution pass-through
133
+
134
+ ## Supported XML Tags (72 Tags)
135
+
136
+ **Top-level:** POWERMART, REPOSITORY, FOLDER, FOLDERVERSION
137
+
138
+ **Source/Target:** SOURCE, SOURCEFIELD, TARGET, TARGETFIELD, TARGETINDEX, TARGETINDEXFIELD, FLATFILE, XMLINFO, XMLTEXT, GROUP, TABLEATTRIBUTE, FIELDATTRIBUTE, METADATAEXTENSION, KEYWORD, ERPSRCINFO
139
+
140
+ **Mapping/Mapplet:** MAPPING, MAPPLET, TRANSFORMATION, TRANSFORMFIELD, TRANSFORMFIELDATTR, TRANSFORMFIELDATTRDEF, INSTANCE, ASSOCIATED_SOURCE_INSTANCE, CONNECTOR, MAPDEPENDENCY, TARGETLOADORDER, MAPPINGVARIABLE, FIELDDEPENDENCY, INITPROP, ERPINFO
141
+
142
+ **Task/Session/Workflow:** TASK, TIMER, VALUEPAIR, SCHEDULER, SCHEDULEINFO, STARTOPTIONS, ENDOPTIONS, SCHEDULEOPTIONS, RECURRING, CUSTOM, DAILYFREQUENCY, REPEAT, FILTER, SESSION, CONFIGREFERENCE, SESSTRANSFORMATIONINST, SESSTRANSFORMATIONGROUP, PARTITION, HASHKEY, KEYRANGE, CONFIG, SESSIONCOMPONENT, CONNECTIONREFERENCE, TASKINSTANCE, WORKFLOWLINK, WORKFLOWVARIABLE, WORKFLOWEVENT, WORKLET, WORKFLOW, ATTRIBUTE
143
+
144
+ **Shortcut:** SHORTCUT
145
+
146
+ **SAP:** SAPFUNCTION, SAPSTRUCTURE, SAPPROGRAM, SAPOUTPUTPORT, SAPVARIABLE, SAPPROGRAMFLOWOBJECT, SAPTABLEPARAM
147
+
148
+ ## Key Features
149
+
150
+ ### Session Connection Overrides (v1.4+)
151
+ When sessions define per-transform connection overrides (different database, file directory, or filename), the generated code uses those overrides instead of source/target defaults.
152
+
153
+ ### Worklet Support (v1.4+)
154
+ Worklet workflows are detected and generate separate `run_worklet_NAME(config)` functions. The main workflow calls these automatically for Worklet task types.
155
+
156
+ ### Type Casting at Target Writes (v1.4+)
157
+ Target field datatypes are mapped to pandas types and generate proper casting code:
158
+ - Integers: nullable `Int64`/`Int32` or `fillna(0).astype(int)` for NOT NULL
159
+ - Dates: `pd.to_datetime(errors='coerce')`
160
+ - Decimals/Floats: `pd.to_numeric(errors='coerce')`
161
+ - Booleans: `.astype('boolean')`
162
+
163
+ ### Flat File Handling (v1.3+)
164
+ Parses FLATFILE metadata for delimiter, fixed-width, header lines, skip rows, quote/escape chars. Generates `pd.read_fwf()` for fixed-width or enriched `read_file()` for delimited.
165
+
166
+ ### Mapplet Inlining (v1.3+)
167
+ Expands Mapplet instances into prefixed transforms, rewires connectors, and eliminates duplication.
168
+
169
+ ### Decision Tasks (v1.3+)
170
+ Converts Informatica decision conditions to Python if/else branches with proper variable substitution.
171
+
172
+ ### Expression Converter (80+ Functions)
173
+
174
+ Converts Informatica expressions to Python equivalents:
175
+
176
+ - **String:** SUBSTR, LTRIM, RTRIM, UPPER, LOWER, LPAD, RPAD, INSTR, LENGTH, CONCAT, REPLACE, REG_EXTRACT, REG_REPLACE, REVERSE, INITCAP, CHR, ASCII
177
+ - **Date:** ADD_TO_DATE, DATE_DIFF, GET_DATE_PART, SYSDATE, SYSTIMESTAMP, TO_DATE, TO_CHAR, TRUNC (date)
178
+ - **Numeric:** ROUND, TRUNC, MOD, ABS, CEIL, FLOOR, POWER, SQRT, LOG, EXP, SIGN
179
+ - **Conversion:** TO_INTEGER, TO_BIGINT, TO_FLOAT, TO_DECIMAL, TO_CHAR, TO_DATE
180
+ - **Null handling:** IIF, DECODE, NVL, NVL2, ISNULL, IS_SPACES, IS_NUMBER
181
+ - **Aggregate:** SUM, AVG, COUNT, MIN, MAX, FIRST, LAST, MEDIAN, STDDEV, VARIANCE
182
+ - **Lookup:** :LKP expressions with dynamic lookup references
183
+ - **Variable:** SETVARIABLE / mapping variable assignment
184
+
185
+ ## Requirements
186
+
187
+ - Python >= 3.8
188
+ - lxml >= 4.9.0
189
+ - PyYAML >= 6.0
190
+
191
+ ## Changelog
192
+
193
+ ### v1.4.x (Phase 3)
194
+ - Session connection overrides for sources and targets
195
+ - Worklet function generation with safe invocation
196
+ - Type casting at target writes based on TARGETFIELD datatypes
197
+ - Flat-file session path overrides properly wired
198
+
199
+ ### v1.3.x (Phase 2)
200
+ - FLATFILE metadata in source reads and target writes
201
+ - Normalizer with `pd.melt()`
202
+ - Rank with group-by and Top-N filtering
203
+ - Decision tasks with real if/else branches
204
+ - Mapplet instance inlining
205
+
206
+ ### v1.2.x (Phase 1)
207
+ - Core parser for all 72 XML tags
208
+ - Expression converter with 80+ functions
209
+ - Aggregator, Joiner, Lookup code generation
210
+ - Workflow orchestration with topological task ordering
211
+ - Multi-library support (pandas, dask, polars, vaex, modin)
212
+
213
+ ## Development
214
+
215
+ ```bash
216
+ # Clone and install in development mode
217
+ cd informatica_python
218
+ pip install -e ".[dev]"
219
+
220
+ # Run tests (25 tests)
221
+ pytest tests/test_converter.py -v
222
+ ```
223
+
224
+ ## License
225
+
226
+ MIT License - Copyright (c) 2025 Nick
227
+
228
+ See [LICENSE](LICENSE) for details.
@@ -1,3 +1,4 @@
1
+ LICENSE
1
2
  README.md
2
3
  pyproject.toml
3
4
  informatica_python/__init__.py
@@ -0,0 +1,41 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "informatica-python"
7
+ version = "1.4.2"
8
+ description = "Convert Informatica PowerCenter workflow XML to Python/PySpark code"
9
+ readme = "README.md"
10
+ license = {text = "MIT"}
11
+ requires-python = ">=3.8"
12
+ authors = [
13
+ { name = "Nick" },
14
+ ]
15
+ classifiers = [
16
+ "Development Status :: 4 - Beta",
17
+ "Intended Audience :: Developers",
18
+ "License :: OSI Approved :: MIT License",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.8",
21
+ "Programming Language :: Python :: 3.9",
22
+ "Programming Language :: Python :: 3.10",
23
+ "Programming Language :: Python :: 3.11",
24
+ "Programming Language :: Python :: 3.12",
25
+ "Topic :: Software Development :: Code Generators",
26
+ "Topic :: Database :: Database Engines/Servers",
27
+ ]
28
+ keywords = ["informatica", "powercenter", "etl", "code-generator", "pandas", "pyspark", "data-engineering"]
29
+ dependencies = [
30
+ "lxml>=4.9.0",
31
+ "pyyaml>=6.0",
32
+ ]
33
+
34
+ [project.scripts]
35
+ informatica-python = "informatica_python.cli:main"
36
+
37
+ [project.optional-dependencies]
38
+ dev = ["pytest>=7.0"]
39
+
40
+ [tool.setuptools.packages.find]
41
+ include = ["informatica_python*"]
@@ -1,118 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: informatica-python
3
- Version: 1.4.0
4
- Summary: Convert Informatica PowerCenter workflow XML to Python/PySpark code
5
- License-Expression: MIT
6
- Requires-Python: >=3.8
7
- Description-Content-Type: text/markdown
8
- Requires-Dist: lxml>=4.9.0
9
- Requires-Dist: pyyaml>=6.0
10
- Provides-Extra: dev
11
- Requires-Dist: pytest>=7.0; extra == "dev"
12
-
13
- # informatica-python
14
-
15
- Convert Informatica PowerCenter workflow XML files to Python/PySpark code.
16
-
17
- ## Installation
18
-
19
- ```bash
20
- pip install informatica-python
21
- ```
22
-
23
- ## Quick Start
24
-
25
- ### Command Line
26
-
27
- ```bash
28
- # Convert XML to Python files in a directory
29
- informatica-python workflow.xml -o output_dir
30
-
31
- # Convert XML to a zip file
32
- informatica-python workflow.xml -z output.zip
33
-
34
- # Use a different data library (pandas, dask, polars, vaex, modin)
35
- informatica-python workflow.xml -o output_dir --data-lib polars
36
-
37
- # Parse XML to JSON (no code generation)
38
- informatica-python workflow.xml --json
39
-
40
- # Save parsed JSON to file
41
- informatica-python workflow.xml --json-file parsed.json
42
- ```
43
-
44
- ### Python API
45
-
46
- ```python
47
- from informatica_python import InformaticaConverter
48
-
49
- # Convert XML to Python files
50
- converter = InformaticaConverter(data_lib="pandas")
51
- converter.convert("workflow.xml", output_dir="output")
52
-
53
- # Convert to zip
54
- converter.convert("workflow.xml", output_zip="output.zip")
55
-
56
- # Parse XML to JSON dict
57
- result = converter.parse_file("workflow.xml")
58
-
59
- # Parse XML string
60
- result = converter.parse_string(xml_string)
61
- ```
62
-
63
- ## Generated Output Files
64
-
65
- | File | Description |
66
- |------|-------------|
67
- | `helper_functions.py` | Database/file I/O functions plus Python equivalents for 50+ Informatica expression functions |
68
- | `mapping_N.py` | One file per mapping with full transformation logic |
69
- | `workflow.py` | Task orchestration with topological ordering |
70
- | `config.yml` | Connection configs, source/target metadata, variables |
71
- | `all_sql_queries.sql` | All extracted SQL queries (source qualifiers, lookups, pre/post SQL) |
72
- | `error_log.txt` | Conversion summary, warnings, and coverage statistics |
73
-
74
- ## Supported Transformation Types
75
-
76
- - Source Qualifier / Application Source Qualifier
77
- - Expression
78
- - Filter
79
- - Aggregator
80
- - Sorter
81
- - Joiner
82
- - Lookup Procedure
83
- - Router
84
- - Union
85
- - Update Strategy
86
- - Sequence Generator
87
- - Normalizer
88
- - Rank
89
- - Stored Procedure (placeholder)
90
- - Custom Transformation (placeholder)
91
- - Java Transformation (placeholder)
92
- - SQL Transformation
93
-
94
- ## Supported Data Libraries
95
-
96
- Choose your preferred data manipulation library with `--data-lib`:
97
-
98
- - **pandas** (default) — Standard Python data analysis
99
- - **dask** — Parallel computing with pandas-like API
100
- - **polars** — Fast DataFrame library written in Rust
101
- - **vaex** — Out-of-core DataFrames for large datasets
102
- - **modin** — Drop-in pandas replacement with parallel execution
103
-
104
- ## Informatica Expression Functions
105
-
106
- The generated `helper_functions.py` includes Python equivalents for:
107
-
108
- `IIF`, `DECODE`, `NVL`, `NVL2`, `ISNULL`, `LTRIM`, `RTRIM`, `UPPER`, `LOWER`, `SUBSTR`, `LPAD`, `RPAD`, `TO_CHAR`, `TO_DATE`, `TO_INTEGER`, `TO_BIGINT`, `TO_FLOAT`, `TO_DECIMAL`, `REPLACECHR`, `REPLACESTR`, `INSTR`, `LENGTH`, `CONCAT`, `REG_EXTRACT`, `REG_MATCH`, `REG_REPLACE`, `GET_DATE_PART`, `ADD_TO_DATE`, `IS_DATE`, `IS_NUMBER`, `IS_SPACES`, `SYSDATE`, `ERROR`, `ABORT`, and more.
109
-
110
- ## Requirements
111
-
112
- - Python >= 3.8
113
- - lxml >= 4.9.0
114
- - PyYAML >= 6.0
115
-
116
- ## License
117
-
118
- MIT
@@ -1,106 +0,0 @@
1
- # informatica-python
2
-
3
- Convert Informatica PowerCenter workflow XML files to Python/PySpark code.
4
-
5
- ## Installation
6
-
7
- ```bash
8
- pip install informatica-python
9
- ```
10
-
11
- ## Quick Start
12
-
13
- ### Command Line
14
-
15
- ```bash
16
- # Convert XML to Python files in a directory
17
- informatica-python workflow.xml -o output_dir
18
-
19
- # Convert XML to a zip file
20
- informatica-python workflow.xml -z output.zip
21
-
22
- # Use a different data library (pandas, dask, polars, vaex, modin)
23
- informatica-python workflow.xml -o output_dir --data-lib polars
24
-
25
- # Parse XML to JSON (no code generation)
26
- informatica-python workflow.xml --json
27
-
28
- # Save parsed JSON to file
29
- informatica-python workflow.xml --json-file parsed.json
30
- ```
31
-
32
- ### Python API
33
-
34
- ```python
35
- from informatica_python import InformaticaConverter
36
-
37
- # Convert XML to Python files
38
- converter = InformaticaConverter(data_lib="pandas")
39
- converter.convert("workflow.xml", output_dir="output")
40
-
41
- # Convert to zip
42
- converter.convert("workflow.xml", output_zip="output.zip")
43
-
44
- # Parse XML to JSON dict
45
- result = converter.parse_file("workflow.xml")
46
-
47
- # Parse XML string
48
- result = converter.parse_string(xml_string)
49
- ```
50
-
51
- ## Generated Output Files
52
-
53
- | File | Description |
54
- |------|-------------|
55
- | `helper_functions.py` | Database/file I/O functions plus Python equivalents for 50+ Informatica expression functions |
56
- | `mapping_N.py` | One file per mapping with full transformation logic |
57
- | `workflow.py` | Task orchestration with topological ordering |
58
- | `config.yml` | Connection configs, source/target metadata, variables |
59
- | `all_sql_queries.sql` | All extracted SQL queries (source qualifiers, lookups, pre/post SQL) |
60
- | `error_log.txt` | Conversion summary, warnings, and coverage statistics |
61
-
62
- ## Supported Transformation Types
63
-
64
- - Source Qualifier / Application Source Qualifier
65
- - Expression
66
- - Filter
67
- - Aggregator
68
- - Sorter
69
- - Joiner
70
- - Lookup Procedure
71
- - Router
72
- - Union
73
- - Update Strategy
74
- - Sequence Generator
75
- - Normalizer
76
- - Rank
77
- - Stored Procedure (placeholder)
78
- - Custom Transformation (placeholder)
79
- - Java Transformation (placeholder)
80
- - SQL Transformation
81
-
82
- ## Supported Data Libraries
83
-
84
- Choose your preferred data manipulation library with `--data-lib`:
85
-
86
- - **pandas** (default) — Standard Python data analysis
87
- - **dask** — Parallel computing with pandas-like API
88
- - **polars** — Fast DataFrame library written in Rust
89
- - **vaex** — Out-of-core DataFrames for large datasets
90
- - **modin** — Drop-in pandas replacement with parallel execution
91
-
92
- ## Informatica Expression Functions
93
-
94
- The generated `helper_functions.py` includes Python equivalents for:
95
-
96
- `IIF`, `DECODE`, `NVL`, `NVL2`, `ISNULL`, `LTRIM`, `RTRIM`, `UPPER`, `LOWER`, `SUBSTR`, `LPAD`, `RPAD`, `TO_CHAR`, `TO_DATE`, `TO_INTEGER`, `TO_BIGINT`, `TO_FLOAT`, `TO_DECIMAL`, `REPLACECHR`, `REPLACESTR`, `INSTR`, `LENGTH`, `CONCAT`, `REG_EXTRACT`, `REG_MATCH`, `REG_REPLACE`, `GET_DATE_PART`, `ADD_TO_DATE`, `IS_DATE`, `IS_NUMBER`, `IS_SPACES`, `SYSDATE`, `ERROR`, `ABORT`, and more.
97
-
98
- ## Requirements
99
-
100
- - Python >= 3.8
101
- - lxml >= 4.9.0
102
- - PyYAML >= 6.0
103
-
104
- ## License
105
-
106
- MIT
@@ -1,4 +0,0 @@
1
- from informatica_python.converter import InformaticaConverter
2
-
3
- __version__ = "1.0.0"
4
- __all__ = ["InformaticaConverter"]
@@ -1,118 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: informatica-python
3
- Version: 1.4.0
4
- Summary: Convert Informatica PowerCenter workflow XML to Python/PySpark code
5
- License-Expression: MIT
6
- Requires-Python: >=3.8
7
- Description-Content-Type: text/markdown
8
- Requires-Dist: lxml>=4.9.0
9
- Requires-Dist: pyyaml>=6.0
10
- Provides-Extra: dev
11
- Requires-Dist: pytest>=7.0; extra == "dev"
12
-
13
- # informatica-python
14
-
15
- Convert Informatica PowerCenter workflow XML files to Python/PySpark code.
16
-
17
- ## Installation
18
-
19
- ```bash
20
- pip install informatica-python
21
- ```
22
-
23
- ## Quick Start
24
-
25
- ### Command Line
26
-
27
- ```bash
28
- # Convert XML to Python files in a directory
29
- informatica-python workflow.xml -o output_dir
30
-
31
- # Convert XML to a zip file
32
- informatica-python workflow.xml -z output.zip
33
-
34
- # Use a different data library (pandas, dask, polars, vaex, modin)
35
- informatica-python workflow.xml -o output_dir --data-lib polars
36
-
37
- # Parse XML to JSON (no code generation)
38
- informatica-python workflow.xml --json
39
-
40
- # Save parsed JSON to file
41
- informatica-python workflow.xml --json-file parsed.json
42
- ```
43
-
44
- ### Python API
45
-
46
- ```python
47
- from informatica_python import InformaticaConverter
48
-
49
- # Convert XML to Python files
50
- converter = InformaticaConverter(data_lib="pandas")
51
- converter.convert("workflow.xml", output_dir="output")
52
-
53
- # Convert to zip
54
- converter.convert("workflow.xml", output_zip="output.zip")
55
-
56
- # Parse XML to JSON dict
57
- result = converter.parse_file("workflow.xml")
58
-
59
- # Parse XML string
60
- result = converter.parse_string(xml_string)
61
- ```
62
-
63
- ## Generated Output Files
64
-
65
- | File | Description |
66
- |------|-------------|
67
- | `helper_functions.py` | Database/file I/O functions plus Python equivalents for 50+ Informatica expression functions |
68
- | `mapping_N.py` | One file per mapping with full transformation logic |
69
- | `workflow.py` | Task orchestration with topological ordering |
70
- | `config.yml` | Connection configs, source/target metadata, variables |
71
- | `all_sql_queries.sql` | All extracted SQL queries (source qualifiers, lookups, pre/post SQL) |
72
- | `error_log.txt` | Conversion summary, warnings, and coverage statistics |
73
-
74
- ## Supported Transformation Types
75
-
76
- - Source Qualifier / Application Source Qualifier
77
- - Expression
78
- - Filter
79
- - Aggregator
80
- - Sorter
81
- - Joiner
82
- - Lookup Procedure
83
- - Router
84
- - Union
85
- - Update Strategy
86
- - Sequence Generator
87
- - Normalizer
88
- - Rank
89
- - Stored Procedure (placeholder)
90
- - Custom Transformation (placeholder)
91
- - Java Transformation (placeholder)
92
- - SQL Transformation
93
-
94
- ## Supported Data Libraries
95
-
96
- Choose your preferred data manipulation library with `--data-lib`:
97
-
98
- - **pandas** (default) — Standard Python data analysis
99
- - **dask** — Parallel computing with pandas-like API
100
- - **polars** — Fast DataFrame library written in Rust
101
- - **vaex** — Out-of-core DataFrames for large datasets
102
- - **modin** — Drop-in pandas replacement with parallel execution
103
-
104
- ## Informatica Expression Functions
105
-
106
- The generated `helper_functions.py` includes Python equivalents for:
107
-
108
- `IIF`, `DECODE`, `NVL`, `NVL2`, `ISNULL`, `LTRIM`, `RTRIM`, `UPPER`, `LOWER`, `SUBSTR`, `LPAD`, `RPAD`, `TO_CHAR`, `TO_DATE`, `TO_INTEGER`, `TO_BIGINT`, `TO_FLOAT`, `TO_DECIMAL`, `REPLACECHR`, `REPLACESTR`, `INSTR`, `LENGTH`, `CONCAT`, `REG_EXTRACT`, `REG_MATCH`, `REG_REPLACE`, `GET_DATE_PART`, `ADD_TO_DATE`, `IS_DATE`, `IS_NUMBER`, `IS_SPACES`, `SYSDATE`, `ERROR`, `ABORT`, and more.
109
-
110
- ## Requirements
111
-
112
- - Python >= 3.8
113
- - lxml >= 4.9.0
114
- - PyYAML >= 6.0
115
-
116
- ## License
117
-
118
- MIT
@@ -1,24 +0,0 @@
1
- [build-system]
2
- requires = ["setuptools>=68.0", "wheel"]
3
- build-backend = "setuptools.build_meta"
4
-
5
- [project]
6
- name = "informatica-python"
7
- version = "1.4.0"
8
- description = "Convert Informatica PowerCenter workflow XML to Python/PySpark code"
9
- readme = "README.md"
10
- license = "MIT"
11
- requires-python = ">=3.8"
12
- dependencies = [
13
- "lxml>=4.9.0",
14
- "pyyaml>=6.0",
15
- ]
16
-
17
- [project.scripts]
18
- informatica-python = "informatica_python.cli:main"
19
-
20
- [project.optional-dependencies]
21
- dev = ["pytest>=7.0"]
22
-
23
- [tool.setuptools.packages.find]
24
- include = ["informatica_python*"]