informatica-python 1.0.0__tar.gz → 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30):
  1. {informatica_python-1.0.0 → informatica_python-1.1.0}/PKG-INFO +2 -2
  2. informatica_python-1.1.0/informatica_python/converter.py +425 -0
  3. informatica_python-1.1.0/informatica_python/generators/error_log_gen.py +315 -0
  4. informatica_python-1.1.0/informatica_python/models.py +653 -0
  5. informatica_python-1.1.0/informatica_python/parser.py +1031 -0
  6. {informatica_python-1.0.0 → informatica_python-1.1.0}/informatica_python.egg-info/PKG-INFO +2 -2
  7. {informatica_python-1.0.0 → informatica_python-1.1.0}/pyproject.toml +2 -2
  8. informatica_python-1.0.0/informatica_python/converter.py +0 -285
  9. informatica_python-1.0.0/informatica_python/generators/error_log_gen.py +0 -140
  10. informatica_python-1.0.0/informatica_python/models.py +0 -281
  11. informatica_python-1.0.0/informatica_python/parser.py +0 -468
  12. {informatica_python-1.0.0 → informatica_python-1.1.0}/README.md +0 -0
  13. {informatica_python-1.0.0 → informatica_python-1.1.0}/informatica_python/__init__.py +0 -0
  14. {informatica_python-1.0.0 → informatica_python-1.1.0}/informatica_python/cli.py +0 -0
  15. {informatica_python-1.0.0 → informatica_python-1.1.0}/informatica_python/generators/__init__.py +0 -0
  16. {informatica_python-1.0.0 → informatica_python-1.1.0}/informatica_python/generators/config_gen.py +0 -0
  17. {informatica_python-1.0.0 → informatica_python-1.1.0}/informatica_python/generators/helper_gen.py +0 -0
  18. {informatica_python-1.0.0 → informatica_python-1.1.0}/informatica_python/generators/mapping_gen.py +0 -0
  19. {informatica_python-1.0.0 → informatica_python-1.1.0}/informatica_python/generators/sql_gen.py +0 -0
  20. {informatica_python-1.0.0 → informatica_python-1.1.0}/informatica_python/generators/workflow_gen.py +0 -0
  21. {informatica_python-1.0.0 → informatica_python-1.1.0}/informatica_python/utils/__init__.py +0 -0
  22. {informatica_python-1.0.0 → informatica_python-1.1.0}/informatica_python/utils/datatype_map.py +0 -0
  23. {informatica_python-1.0.0 → informatica_python-1.1.0}/informatica_python/utils/expression_converter.py +0 -0
  24. {informatica_python-1.0.0 → informatica_python-1.1.0}/informatica_python.egg-info/SOURCES.txt +0 -0
  25. {informatica_python-1.0.0 → informatica_python-1.1.0}/informatica_python.egg-info/dependency_links.txt +0 -0
  26. {informatica_python-1.0.0 → informatica_python-1.1.0}/informatica_python.egg-info/entry_points.txt +0 -0
  27. {informatica_python-1.0.0 → informatica_python-1.1.0}/informatica_python.egg-info/requires.txt +0 -0
  28. {informatica_python-1.0.0 → informatica_python-1.1.0}/informatica_python.egg-info/top_level.txt +0 -0
  29. {informatica_python-1.0.0 → informatica_python-1.1.0}/setup.cfg +0 -0
  30. {informatica_python-1.0.0 → informatica_python-1.1.0}/tests/test_converter.py +0 -0
@@ -1,8 +1,8 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: informatica-python
3
- Version: 1.0.0
3
+ Version: 1.1.0
4
4
  Summary: Convert Informatica PowerCenter workflow XML to Python/PySpark code
5
- License: MIT
5
+ License-Expression: MIT
6
6
  Requires-Python: >=3.8
7
7
  Description-Content-Type: text/markdown
8
8
  Requires-Dist: lxml>=4.9.0
@@ -0,0 +1,425 @@
1
+ import os
2
+ import json
3
+ import zipfile
4
+ import tempfile
5
+ from typing import Optional
6
+ from informatica_python.parser import InformaticaParser
7
+ from informatica_python.models import PowermartDef, FolderDef
8
+ from informatica_python.generators.helper_gen import generate_helper_functions
9
+ from informatica_python.generators.mapping_gen import generate_mapping_code
10
+ from informatica_python.generators.workflow_gen import generate_workflow_code
11
+ from informatica_python.generators.config_gen import generate_config
12
+ from informatica_python.generators.sql_gen import generate_sql_file
13
+ from informatica_python.generators.error_log_gen import generate_error_log
14
+
15
+
16
class InformaticaConverter:
    """Convert an Informatica PowerCenter workflow XML export into generated
    Python/PySpark code artifacts (mapping modules, workflow driver, config,
    SQL file, error log).

    Parsing (XML -> model objects) is delegated to ``InformaticaParser``;
    code generation is delegated to the per-artifact generator functions.
    """

    def __init__(self, data_lib: str = "pandas"):
        """
        :param data_lib: dataframe library the generated code should target;
            passed through to the generators (e.g. ``"pandas"``).
        """
        self.data_lib = data_lib
        self.parser = InformaticaParser()
        # Last parsed PowermartDef model; set by parse_file/parse_string/convert.
        self.powermart = None

    def parse_file(self, file_path: str) -> dict:
        """Parse an Informatica XML file and return its JSON-serializable dict."""
        self.powermart = self.parser.parse_file(file_path)
        return self.to_json()

    def parse_string(self, xml_string: str) -> dict:
        """Parse Informatica XML given as a string and return its dict form."""
        self.powermart = self.parser.parse_string(xml_string)
        return self.to_json()

    def to_json(self) -> dict:
        """Return the last parsed model as a plain dict; ``{}`` if nothing parsed."""
        if not self.powermart:
            return {}
        return self._powermart_to_dict(self.powermart)

    def convert(self, file_path: str, output_dir: str = "output",
                output_zip: Optional[str] = None) -> str:
        """Parse *file_path* and generate code for every folder it contains.

        :param file_path: path to the Informatica PowerCenter XML export.
        :param output_dir: directory to write generated files into.
        :param output_zip: optional zip path; when set, output is zipped instead.
        :returns: the output directory (or zip path for a single-folder export).
        :raises ValueError: if the XML contains no repository or no folder.
        """
        self.powermart = self.parser.parse_file(file_path)
        return self._convert_all(output_dir, output_zip, source_desc="XML file")

    def convert_string(self, xml_string: str, output_dir: str = "output",
                       output_zip: Optional[str] = None) -> str:
        """Same as :meth:`convert`, but takes the XML content as a string."""
        self.powermart = self.parser.parse_string(xml_string)
        return self._convert_all(output_dir, output_zip, source_desc="XML")

    def _convert_all(self, output_dir: str, output_zip: Optional[str],
                     source_desc: str) -> str:
        """Shared body of convert()/convert_string().

        Validates the parsed model, then generates output for each folder.
        *source_desc* is interpolated into error messages ("XML file" / "XML")
        so both public entry points keep their original wording.
        """
        if not self.powermart.repositories:
            raise ValueError(f"No repository found in {source_desc}")

        all_folders = []
        for repo in self.powermart.repositories:
            all_folders.extend(repo.folders)

        if not all_folders:
            raise ValueError(f"No folder found in {source_desc}")

        # Single folder: write directly to the requested destination.
        if len(all_folders) == 1:
            return self._convert_folder(all_folders[0], output_dir, output_zip)

        # Multiple folders: one sub-directory (or name-suffixed zip) per folder.
        result_path = output_dir if not output_zip else os.path.dirname(output_zip) or "."
        for folder in all_folders:
            folder_dir = os.path.join(output_dir, folder.name)
            folder_zip = None
            if output_zip:
                base, ext = os.path.splitext(output_zip)
                folder_zip = f"{base}_{folder.name}{ext}"
            self._convert_folder(folder, folder_dir, folder_zip)
        return result_path

    def _convert_folder(self, folder: "FolderDef", output_dir: str,
                        output_zip: Optional[str] = None) -> str:
        """Generate every artifact for one folder and write it out.

        Returns the directory (or zip path) the artifacts were written to.
        """
        files = {}

        files["helper_functions.py"] = generate_helper_functions(folder, self.data_lib)

        # One module per mapping, numbered in folder order (1-based).
        for i, mapping in enumerate(folder.mappings, 1):
            code = generate_mapping_code(mapping, folder, self.data_lib, i)
            files[f"mapping_{i}.py"] = code

        files["workflow.py"] = generate_workflow_code(folder)
        files["config.yml"] = generate_config(folder, self.data_lib)
        files["all_sql_queries.sql"] = generate_sql_file(folder)
        files["error_log.txt"] = generate_error_log(
            folder,
            parser_errors=self.parser.errors,
            parser_warnings=self.parser.warnings,
        )

        if output_zip:
            return self._write_zip(files, output_zip)
        return self._write_files(files, output_dir)

    def _write_files(self, files: dict, output_dir: str) -> str:
        """Write *files* ({filename: content}) under *output_dir*; return it."""
        os.makedirs(output_dir, exist_ok=True)
        for filename, content in files.items():
            filepath = os.path.join(output_dir, filename)
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(content)
        return output_dir

    def _write_zip(self, files: dict, zip_path: str) -> str:
        """Write *files* into one deflate-compressed zip; return its path."""
        os.makedirs(os.path.dirname(zip_path) or ".", exist_ok=True)
        with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
            for filename, content in files.items():
                zf.writestr(filename, content)
        return zip_path

    # ------------------------------------------------------------------
    # Model -> dict serialization helpers (used by to_json)
    # ------------------------------------------------------------------

    def _powermart_to_dict(self, pm: "PowermartDef") -> dict:
        """Serialize the top-level POWERMART element and its repositories."""
        result = {
            "creation_date": pm.creation_date,
            "repository_version": pm.repository_version,
            "repositories": [],
        }
        for repo in pm.repositories:
            repo_dict = {
                "name": repo.name,
                "version": repo.version,
                "codepage": repo.codepage,
                "database_type": repo.database_type,
                "folders": [],
            }
            for folder in repo.folders:
                folder_dict = self._folder_to_dict(folder)
                repo_dict["folders"].append(folder_dict)
            result["repositories"].append(repo_dict)
        return result

    def _folder_to_dict(self, folder) -> dict:
        """Serialize one folder, including all contained object collections."""
        return {
            "name": folder.name,
            "owner": folder.owner,
            "description": folder.description,
            "folder_versions": [
                {"folder_name": fv.folder_name, "version_number": fv.version_number, **fv.attributes}
                for fv in folder.folder_versions
            ],
            "sources": [self._source_to_dict(s) for s in folder.sources],
            "targets": [self._target_to_dict(t) for t in folder.targets],
            "mappings": [self._mapping_to_dict(m) for m in folder.mappings],
            "mapplets": [self._mapplet_to_dict(m) for m in folder.mapplets],
            "sessions": [self._session_to_dict(s) for s in folder.sessions],
            "workflows": [self._workflow_to_dict(w) for w in folder.workflows],
            "tasks": [self._task_to_dict(t) for t in folder.tasks],
            "configs": [self._config_to_dict(c) for c in folder.configs],
            "schedulers": [self._scheduler_to_dict(s) for s in folder.schedulers],
            "shortcuts": [self._shortcut_to_dict(s) for s in folder.shortcuts],
            "transformations": [self._transformation_to_dict(tx) for tx in folder.transformations],
            "metadata_extensions": [self._meta_ext_to_dict(me) for me in folder.metadata_extensions],
        }

    def _field_to_dict(self, f):
        """Serialize a field/port; optional attributes are emitted only when set."""
        d = {
            "name": f.name,
            "datatype": f.datatype,
            "precision": f.precision,
            "scale": f.scale,
            "nullable": f.nullable,
            "keytype": f.keytype,
        }
        if f.expression:
            d["expression"] = f.expression
        if f.porttype:
            d["porttype"] = f.porttype
        if f.default_value:
            d["default_value"] = f.default_value
        if f.field_attributes:
            d["field_attributes"] = f.field_attributes
        return d

    def _meta_ext_to_dict(self, me):
        """Serialize a metadata extension (name/value/datatype triple)."""
        return {"name": me.name, "value": me.value, "datatype": me.datatype}

    def _source_to_dict(self, src):
        """Serialize a source definition, including optional flatfile/XML/ERP info."""
        d = {
            "name": src.name,
            "database_type": src.database_type,
            "db_name": src.db_name,
            "owner_name": src.owner_name,
            "fields": [self._field_to_dict(f) for f in src.fields],
            "attributes": [{"name": a.name, "value": a.value} for a in src.attributes],
            "metadata_extensions": [self._meta_ext_to_dict(me) for me in src.metadata_extensions],
        }
        if src.flatfile:
            d["flatfile"] = {"delimiter": src.flatfile.delimiter, "header_lines": src.flatfile.header_lines,
                             "is_fixed_width": src.flatfile.is_fixed_width, "code_page": src.flatfile.code_page}
        if src.xmlinfo:
            d["xmlinfo"] = {"xml_type": src.xmlinfo.xml_type, "root_element": src.xmlinfo.root_element,
                            "xml_texts": src.xmlinfo.xml_texts}
        if src.groups:
            d["groups"] = [{"name": g.name, "type": g.type, "fields": [self._field_to_dict(f) for f in g.fields]} for g in src.groups]
        if src.keywords:
            d["keywords"] = [{"name": k.name, "value": k.value} for k in src.keywords]
        if src.erp_src_info:
            d["erp_src_info"] = {"name": src.erp_src_info.name, "source_type": src.erp_src_info.source_type}
        return d

    def _target_to_dict(self, tgt):
        """Serialize a target definition, including its indexes."""
        d = {
            "name": tgt.name,
            "database_type": tgt.database_type,
            "fields": [self._field_to_dict(f) for f in tgt.fields],
            "attributes": [{"name": a.name, "value": a.value} for a in tgt.attributes],
            "indexes": [
                {"name": idx.name, "index_type": idx.index_type, "unique": idx.unique,
                 "fields": [{"name": idf.name, "expression": idf.expression, "sort_direction": idf.sort_direction} for idf in idx.fields]}
                for idx in tgt.indexes
            ],
            "metadata_extensions": [self._meta_ext_to_dict(me) for me in tgt.metadata_extensions],
        }
        if tgt.flatfile:
            d["flatfile"] = {"delimiter": tgt.flatfile.delimiter, "header_lines": tgt.flatfile.header_lines}
        if tgt.xmlinfo:
            d["xmlinfo"] = {"xml_type": tgt.xmlinfo.xml_type, "root_element": tgt.xmlinfo.root_element}
        if tgt.groups:
            d["groups"] = [{"name": g.name, "type": g.type} for g in tgt.groups]
        return d

    def _transformation_to_dict(self, tx):
        """Serialize a transformation, including optional field/SAP metadata."""
        d = {
            "name": tx.name,
            "type": tx.type,
            "description": tx.description,
            "reusable": tx.reusable,
            "fields": [self._field_to_dict(f) for f in tx.fields],
            "attributes": [{"name": a.name, "value": a.value} for a in tx.attributes],
            "metadata": tx.metadata,
        }
        if tx.field_attrs:
            d["field_attrs"] = [{"name": fa.name, "value": fa.value, "field_name": fa.field_name} for fa in tx.field_attrs]
        if tx.field_attr_defs:
            d["field_attr_defs"] = [{"name": fad.name, "datatype": fad.datatype, "default_value": fad.default_value} for fad in tx.field_attr_defs]
        if tx.init_props:
            d["init_props"] = [{"name": ip.name, "value": ip.value} for ip in tx.init_props]
        if tx.erp_info:
            d["erp_info"] = {"name": tx.erp_info.name, "erp_type": tx.erp_info.erp_type}
        if tx.groups:
            d["groups"] = [{"name": g.name, "type": g.type} for g in tx.groups]
        if tx.sap_functions:
            d["sap_functions"] = [self._sap_function_to_dict(sf) for sf in tx.sap_functions]
        return d

    def _sap_function_to_dict(self, sf):
        """Serialize an SAP function with its structures, ports and programs."""
        return {
            "name": sf.name, "function_type": sf.function_type,
            "structures": [{"name": s.name, "type": s.structure_type} for s in sf.structures],
            "output_ports": [{"name": p.name, "datatype": p.datatype} for p in sf.output_ports],
            "variables": [{"name": v.name, "datatype": v.datatype, "default_value": v.default_value} for v in sf.variables],
            "table_params": [{"name": t.name, "table_name": t.table_name, "direction": t.direction} for t in sf.table_params],
            "programs": [{"name": p.name, "program_type": p.program_type,
                          "flow_objects": [{"name": fo.name, "object_type": fo.object_type} for fo in p.flow_objects]} for p in sf.programs],
        }

    def _mapping_to_dict(self, mapping):
        """Serialize a mapping with its transformations, connectors and instances."""
        return {
            "name": mapping.name,
            "description": mapping.description,
            "is_valid": mapping.is_valid,
            "transformations": [self._transformation_to_dict(tx) for tx in mapping.transformations],
            "connectors": [
                {"from_field": c.from_field, "from_instance": c.from_instance,
                 "from_instance_type": c.from_instance_type,
                 "to_field": c.to_field, "to_instance": c.to_instance,
                 "to_instance_type": c.to_instance_type}
                for c in mapping.connectors
            ],
            "instances": [
                {"name": i.name, "type": i.type, "transformation_name": i.transformation_name,
                 "transformation_type": i.transformation_type,
                 "associated_source_instances": [
                     {"name": a.name, "source_instance": a.source_instance}
                     for a in i.associated_source_instances
                 ]}
                for i in mapping.instances
            ],
            "target_load_orders": [
                {"order": tlo.order, "target_instance": tlo.target_instance}
                for tlo in mapping.target_load_orders
            ],
            "variables": [
                {"name": v.name, "datatype": v.datatype, "default_value": v.default_value,
                 "is_persistent": v.is_persistent, "usage_type": v.usage_type}
                for v in mapping.variables
            ],
            "metadata_extensions": [self._meta_ext_to_dict(me) for me in mapping.metadata_extensions],
            "map_dependencies": [
                {"name": md.name, "from_mapping": md.from_mapping, "to_mapping": md.to_mapping}
                for md in mapping.map_dependencies
            ],
            "field_dependencies": [
                {"name": fd.name, "from_field": fd.from_field, "from_instance": fd.from_instance,
                 "to_field": fd.to_field, "to_instance": fd.to_instance, "expression": fd.expression}
                for fd in mapping.field_dependencies
            ],
        }

    def _mapplet_to_dict(self, mapplet):
        """Serialize a mapplet (reduced form of a mapping)."""
        return {
            "name": mapplet.name, "description": mapplet.description, "is_valid": mapplet.is_valid,
            "transformations": [self._transformation_to_dict(tx) for tx in mapplet.transformations],
            "connectors": [{"from_field": c.from_field, "from_instance": c.from_instance,
                            "to_field": c.to_field, "to_instance": c.to_instance} for c in mapplet.connectors],
            "instances": [{"name": i.name, "type": i.type, "transformation_name": i.transformation_name} for i in mapplet.instances],
        }

    def _session_to_dict(self, session):
        """Serialize a session, including transform instances and partitions."""
        return {
            "name": session.name, "mapping_name": session.mapping_name,
            "description": session.description, "is_valid": session.is_valid, "reusable": session.reusable,
            "transform_instances": [
                {"instance_name": sti.instance_name, "pipeline": sti.pipeline, "stage": sti.stage,
                 "transformation_name": sti.transformation_name, "transformation_type": sti.transformation_type,
                 "is_partitionable": sti.is_partitionable,
                 "attributes": [{"name": a.name, "value": a.value} for a in sti.attributes],
                 "connections": [{"connection_name": c.connection_name, "connection_type": c.connection_type} for c in sti.connections],
                 "partitions": [{"name": p.name, "partition_type": p.partition_type,
                                 "hash_keys": [{"name": hk.name, "expression": hk.expression} for hk in p.hash_keys],
                                 "key_ranges": [{"name": kr.name, "low": kr.low_value, "high": kr.high_value} for kr in p.key_ranges]}
                                for p in sti.partitions]}
                for sti in session.transform_instances
            ],
            "transform_groups": [
                {"name": stg.name, "transform_instances": [
                    {"instance_name": sti.instance_name, "transformation_name": sti.transformation_name}
                    for sti in stg.transform_instances
                ]}
                for stg in session.transform_groups
            ],
            "config_references": session.config_references,
            "components": session.components,
        }

    def _task_to_dict(self, task):
        """Serialize a task; timer details are included only when present."""
        d = {
            "name": task.name, "type": task.type, "description": task.description, "reusable": task.reusable,
            "attributes": [{"name": a.name, "value": a.value} for a in task.attributes],
            "value_pairs": [{"name": vp.name, "value": vp.value, "type": vp.type} for vp in task.value_pairs],
        }
        if task.timer:
            d["timer"] = {"name": task.timer.name, "start_type": task.timer.start_type,
                          "start_date": task.timer.start_date, "start_time": task.timer.start_time}
        return d

    def _config_to_dict(self, cfg):
        """Serialize a session config object."""
        return {
            "name": cfg.name, "description": cfg.description, "is_valid": cfg.is_valid,
            "attributes": [{"name": a.name, "value": a.value} for a in cfg.attributes],
        }

    def _scheduler_to_dict(self, sched):
        """Serialize a scheduler; optional schedule sub-objects only when set."""
        d = {
            "name": sched.name, "description": sched.description, "reusable": sched.reusable,
            "attributes": [{"name": a.name, "value": a.value} for a in sched.attributes],
        }
        if sched.schedule_info:
            d["schedule_info"] = {"schedule_type": sched.schedule_info.schedule_type, **sched.schedule_info.attributes}
        if sched.start_options:
            d["start_options"] = sched.start_options.attributes
        if sched.end_options:
            d["end_options"] = sched.end_options.attributes
        if sched.recurring:
            d["recurring"] = sched.recurring.attributes
        if sched.daily_frequency:
            d["daily_frequency"] = sched.daily_frequency.attributes
        return d

    def _shortcut_to_dict(self, sc):
        """Serialize a cross-folder/repository shortcut reference."""
        return {
            "name": sc.name, "shortcut_type": sc.shortcut_type, "reference_name": sc.reference_name,
            "folder_name": sc.folder_name, "repository_name": sc.repository_name,
            "object_type": sc.object_type, "object_subtype": sc.object_subtype, "dbdname": sc.dbdname,
        }

    def _workflow_to_dict(self, wf):
        """Serialize a workflow (or worklet) with tasks, links, variables, events."""
        return {
            "name": wf.name,
            "description": wf.description,
            "is_valid": wf.is_valid,
            "scheduler_name": wf.scheduler_name,
            "is_worklet": wf.metadata.get("is_worklet", "NO"),
            "task_instances": [
                {"name": t.name, "task_name": t.task_name, "task_type": t.task_type,
                 "fail_parent_if_instance_fails": t.fail_parent_if_instance_fails,
                 "treat_input_link_as_and": t.treat_input_link_as_and}
                for t in wf.task_instances
            ],
            "links": [
                {"from": l.from_instance, "to": l.to_instance,
                 "condition": l.condition, "link_type": l.link_type}
                for l in wf.links
            ],
            "variables": [
                {"name": v.name, "datatype": v.datatype, "default_value": v.default_value,
                 "is_persistent": v.is_persistent, "usage_type": v.usage_type}
                for v in wf.variables
            ],
            "events": [
                {"name": e.name, "event_type": e.event_type, "description": e.description}
                for e in wf.events
            ],
            "attributes": [{"name": a.name, "value": a.value} for a in wf.attributes],
            "metadata": wf.metadata,
        }