informatica-python 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,468 @@
1
+ import xml.etree.ElementTree as ET
2
+ from typing import Optional
3
+ from informatica_python.models import (
4
+ PowermartDef, RepositoryDef, FolderDef, SourceDef, TargetDef,
5
+ MappingDef, TransformationDef, ConnectorDef, InstanceDef,
6
+ FieldDef, TableAttribute, SessionDef, WorkflowDef,
7
+ TaskInstanceDef, WorkflowLink, WorkflowVariable,
8
+ MappingVariable, TargetLoadOrder, SessionTransformInst,
9
+ ConnectionRef, ConfigDef, SchedulerDef, MappletDef,
10
+ ShortcutDef, TaskDef,
11
+ )
12
+
13
+
14
class InformaticaParser:
    """Parses Informatica PowerCenter XML repository exports into the
    model objects defined in ``informatica_python.models``."""

    def __init__(self):
        # Diagnostics accumulated across parse attempts; inspect after
        # parse_file()/parse_string() to see what went wrong.
        self.errors = []
        self.warnings = []
18
+
19
    def parse_file(self, file_path: str) -> PowermartDef:
        """Parse an Informatica XML export file into a PowermartDef.

        If normal parsing fails (commonly because of an unresolvable
        DOCTYPE/DTD reference), the file is re-read as UTF-8 text, the
        DOCTYPE declaration is stripped, and parsing is retried.  All
        failures are appended to ``self.errors``; an empty PowermartDef
        is returned when neither attempt succeeds.
        """
        try:
            tree = ET.parse(file_path)
            root = tree.getroot()
            return self._parse_powermart(root)
        except ET.ParseError as e:
            # NOTE(review): this error stays in self.errors even when the
            # fallback below succeeds, so a successful parse may still
            # report one error — confirm callers expect that.
            self.errors.append(f"XML parse error: {e}")
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()
                content = self._strip_dtd(content)
                root = ET.fromstring(content)
                return self._parse_powermart(root)
            except Exception as e2:
                # Broad catch is deliberate: this is the outermost
                # best-effort boundary and must not raise to callers.
                self.errors.append(f"Fallback parse also failed: {e2}")
                return PowermartDef()
35
+
36
+ def parse_string(self, xml_string: str) -> PowermartDef:
37
+ try:
38
+ cleaned = self._strip_dtd(xml_string)
39
+ root = ET.fromstring(cleaned)
40
+ return self._parse_powermart(root)
41
+ except ET.ParseError as e:
42
+ self.errors.append(f"XML parse error: {e}")
43
+ return PowermartDef()
44
+
45
+ def _strip_dtd(self, content: str) -> str:
46
+ import re
47
+ content = re.sub(r'<!DOCTYPE[^>]*>', '', content)
48
+ return content
49
+
50
+ def _attr(self, elem, name, default=""):
51
+ val = elem.get(name, default)
52
+ return val.strip() if val else default
53
+
54
+ def _parse_powermart(self, elem) -> PowermartDef:
55
+ pm = PowermartDef(
56
+ creation_date=self._attr(elem, "CREATION_DATE"),
57
+ repository_version=self._attr(elem, "REPOSITORY_VERSION"),
58
+ )
59
+ for repo_elem in elem.findall("REPOSITORY"):
60
+ pm.repositories.append(self._parse_repository(repo_elem))
61
+ return pm
62
+
63
    def _parse_repository(self, elem) -> RepositoryDef:
        """Parse a REPOSITORY element and all of its FOLDER children."""
        repo = RepositoryDef(
            name=self._attr(elem, "NAME"),
            version=self._attr(elem, "VERSION"),
            codepage=self._attr(elem, "CODEPAGE"),
            database_type=self._attr(elem, "DATABASETYPE"),
        )
        for folder_elem in elem.findall("FOLDER"):
            repo.folders.append(self._parse_folder(folder_elem))
        return repo
73
+
74
    def _parse_folder(self, elem) -> FolderDef:
        """Parse a FOLDER element and dispatch each child object type to
        its dedicated parser.

        WORKLET children are parsed with the workflow parser and tagged
        with ``metadata["is_worklet"] = "YES"`` before being appended to
        ``folder.workflows``.
        """
        folder = FolderDef(
            name=self._attr(elem, "NAME"),
            owner=self._attr(elem, "OWNER"),
            description=self._attr(elem, "DESCRIPTION"),
            group=self._attr(elem, "GROUP"),
            shared=self._attr(elem, "SHARED", "NOTSHARED"),
            permissions=self._attr(elem, "PERMISSIONS"),
        )

        for src in elem.findall("SOURCE"):
            folder.sources.append(self._parse_source(src))

        for tgt in elem.findall("TARGET"):
            folder.targets.append(self._parse_target(tgt))

        for mapping in elem.findall("MAPPING"):
            folder.mappings.append(self._parse_mapping(mapping))

        for mapplet in elem.findall("MAPPLET"):
            folder.mapplets.append(self._parse_mapplet(mapplet))

        for session in elem.findall("SESSION"):
            folder.sessions.append(self._parse_session(session))

        for wf in elem.findall("WORKFLOW"):
            folder.workflows.append(self._parse_workflow(wf))

        for task in elem.findall("TASK"):
            folder.tasks.append(self._parse_task(task))

        for cfg in elem.findall("CONFIG"):
            folder.configs.append(self._parse_config(cfg))

        for sched in elem.findall("SCHEDULER"):
            folder.schedulers.append(self._parse_scheduler(sched))

        for sc in elem.findall("SHORTCUT"):
            folder.shortcuts.append(self._parse_shortcut(sc))

        for tx in elem.findall("TRANSFORMATION"):
            folder.transformations.append(self._parse_transformation(tx))

        # Worklets reuse the workflow model; the metadata flag is the only
        # way downstream code can tell them apart.
        for worklet in elem.findall("WORKLET"):
            wf_def = self._parse_workflow(worklet)
            wf_def.metadata["is_worklet"] = "YES"
            folder.workflows.append(wf_def)

        return folder
123
+
124
    def _parse_source(self, elem) -> SourceDef:
        """Parse a SOURCE element: header attributes, SOURCEFIELD children,
        and TABLEATTRIBUTE/FIELDATTRIBUTE/METADATAEXTENSION children —
        the latter three are all flattened into ``src.attributes``."""
        src = SourceDef(
            name=self._attr(elem, "NAME"),
            database_type=self._attr(elem, "DATABASETYPE"),
            db_name=self._attr(elem, "DBDNAME"),
            owner_name=self._attr(elem, "OWNERNAME"),
            description=self._attr(elem, "DESCRIPTION"),
            business_name=self._attr(elem, "BUSINESSNAME"),
        )
        for sf in elem.findall("SOURCEFIELD"):
            src.fields.append(self._parse_source_field(sf))
        for ta in elem.findall("TABLEATTRIBUTE"):
            src.attributes.append(self._parse_table_attribute(ta))
        for fa in elem.findall("FIELDATTRIBUTE"):
            src.attributes.append(self._parse_table_attribute(fa))
        for me in elem.findall("METADATAEXTENSION"):
            src.attributes.append(self._parse_table_attribute(me))
        return src
142
+
143
    def _parse_source_field(self, elem) -> FieldDef:
        """Convert a SOURCEFIELD element into a FieldDef.

        Numeric attributes use ``or "0"`` so that an attribute present
        with an empty value still coerces to 0 instead of raising.
        """
        return FieldDef(
            name=self._attr(elem, "NAME"),
            datatype=self._attr(elem, "DATATYPE"),
            precision=int(self._attr(elem, "PRECISION", "0") or "0"),
            scale=int(self._attr(elem, "SCALE", "0") or "0"),
            nullable=self._attr(elem, "NULLABLE", "NULL"),
            keytype=self._attr(elem, "KEYTYPE", "NOT A KEY"),
            field_number=int(self._attr(elem, "FIELDNUMBER", "0") or "0"),
            hidden=self._attr(elem, "HIDDEN", "NO"),
            business_name=self._attr(elem, "BUSINESSNAME"),
            description=self._attr(elem, "DESCRIPTION"),
        )
156
+
157
    def _parse_target(self, elem) -> TargetDef:
        """Parse a TARGET element: header attributes, TARGETFIELD children,
        TABLEATTRIBUTE children, and TARGETINDEX children (stored as
        attributes with an ``INDEX_`` name prefix)."""
        tgt = TargetDef(
            name=self._attr(elem, "NAME"),
            database_type=self._attr(elem, "DATABASETYPE"),
            description=self._attr(elem, "DESCRIPTION"),
            business_name=self._attr(elem, "BUSINESSNAME"),
            constraint=self._attr(elem, "CONSTRAINT"),
            table_options=self._attr(elem, "TABLEOPTIONS"),
        )
        for tf in elem.findall("TARGETFIELD"):
            tgt.fields.append(self._parse_target_field(tf))
        for ta in elem.findall("TABLEATTRIBUTE"):
            tgt.attributes.append(self._parse_table_attribute(ta))
        # Indexes are flattened into the generic attribute list rather
        # than modeled separately.
        for ti in elem.findall("TARGETINDEX"):
            tgt.attributes.append(TableAttribute(
                name=f"INDEX_{self._attr(ti, 'NAME')}",
                value=self._attr(ti, 'DESCRIPTION'),
            ))
        return tgt
176
+
177
    def _parse_target_field(self, elem) -> FieldDef:
        """Convert a TARGETFIELD element into a FieldDef (same numeric
        coercion convention as _parse_source_field)."""
        return FieldDef(
            name=self._attr(elem, "NAME"),
            datatype=self._attr(elem, "DATATYPE"),
            precision=int(self._attr(elem, "PRECISION", "0") or "0"),
            scale=int(self._attr(elem, "SCALE", "0") or "0"),
            nullable=self._attr(elem, "NULLABLE", "NULL"),
            keytype=self._attr(elem, "KEYTYPE", "NOT A KEY"),
            field_number=int(self._attr(elem, "FIELDNUMBER", "0") or "0"),
            description=self._attr(elem, "DESCRIPTION"),
            business_name=self._attr(elem, "BUSINESSNAME"),
        )
189
+
190
    def _parse_transformation(self, elem) -> TransformationDef:
        """Parse a TRANSFORMATION element: fields, table attributes, and a
        flattened metadata entry for each auxiliary child element."""
        tx = TransformationDef(
            name=self._attr(elem, "NAME"),
            type=self._attr(elem, "TYPE"),
            description=self._attr(elem, "DESCRIPTION"),
            reusable=self._attr(elem, "REUSABLE", "NO"),
        )
        for tf in elem.findall("TRANSFORMFIELD"):
            tx.fields.append(self._parse_transform_field(tf))
        for ta in elem.findall("TABLEATTRIBUTE"):
            tx.attributes.append(self._parse_table_attribute(ta))

        # Auxiliary children are preserved as "<TAG>_<NAME>" metadata keys;
        # when a child has no VALUE attribute the whole attribute dict is
        # stringified as the value so no information is dropped.
        for child_tag in ["TRANSFORMFIELDATTR", "TRANSFORMFIELDATTRDEF", "INITPROP", "ERPINFO"]:
            for child in elem.findall(child_tag):
                tx.metadata[f"{child_tag}_{self._attr(child, 'NAME', child_tag)}"] = self._attr(child, "VALUE", str(child.attrib))

        return tx
207
+
208
    def _parse_transform_field(self, elem) -> FieldDef:
        """Convert a TRANSFORMFIELD element into a FieldDef, including
        expression/port information specific to transformation ports."""
        return FieldDef(
            name=self._attr(elem, "NAME"),
            datatype=self._attr(elem, "DATATYPE"),
            precision=int(self._attr(elem, "PRECISION", "0") or "0"),
            scale=int(self._attr(elem, "SCALE", "0") or "0"),
            default_value=self._attr(elem, "DEFAULTVALUE"),
            expression=self._attr(elem, "EXPRESSION"),
            expression_type=self._attr(elem, "EXPRESSIONTYPE"),
            porttype=self._attr(elem, "PORTTYPE"),
            description=self._attr(elem, "DESCRIPTION"),
        )
220
+
221
+ def _parse_table_attribute(self, elem) -> TableAttribute:
222
+ return TableAttribute(
223
+ name=self._attr(elem, "NAME"),
224
+ value=self._attr(elem, "VALUE"),
225
+ )
226
+
227
    def _parse_connector(self, elem) -> ConnectorDef:
        """Convert a CONNECTOR element (a port-to-port link inside a
        mapping/mapplet) into a ConnectorDef."""
        return ConnectorDef(
            from_field=self._attr(elem, "FROMFIELD"),
            from_instance=self._attr(elem, "FROMINSTANCE"),
            from_instance_type=self._attr(elem, "FROMINSTANCETYPE"),
            to_field=self._attr(elem, "TOFIELD"),
            to_instance=self._attr(elem, "TOINSTANCE"),
            to_instance_type=self._attr(elem, "TOINSTANCETYPE"),
        )
236
+
237
    def _parse_instance(self, elem) -> InstanceDef:
        """Convert an INSTANCE element (a transformation/source/target
        usage inside a mapping) into an InstanceDef."""
        return InstanceDef(
            name=self._attr(elem, "NAME"),
            type=self._attr(elem, "TYPE"),
            transformation_name=self._attr(elem, "TRANSFORMATION_NAME"),
            transformation_type=self._attr(elem, "TRANSFORMATION_TYPE"),
            description=self._attr(elem, "DESCRIPTION"),
            reusable=self._attr(elem, "REUSABLE", "NO"),
        )
246
+
247
    def _parse_mapping(self, elem) -> MappingDef:
        """Parse a MAPPING element: embedded transformations, connectors,
        instances, target load order, mapping variables/parameters, and
        metadata extensions (stored as a name→value dict)."""
        mapping = MappingDef(
            name=self._attr(elem, "NAME"),
            description=self._attr(elem, "DESCRIPTION"),
            is_valid=self._attr(elem, "ISVALID", "YES"),
        )

        for tx in elem.findall("TRANSFORMATION"):
            mapping.transformations.append(self._parse_transformation(tx))

        for conn in elem.findall("CONNECTOR"):
            mapping.connectors.append(self._parse_connector(conn))

        for inst in elem.findall("INSTANCE"):
            mapping.instances.append(self._parse_instance(inst))

        # ORDER defaults to 1 (not 0) to match Informatica's 1-based load
        # order numbering.
        for tlo in elem.findall("TARGETLOADORDER"):
            order = TargetLoadOrder(
                order=int(self._attr(tlo, "ORDER", "1") or "1"),
                target_instance=self._attr(tlo, "TARGETINSTANCE"),
            )
            mapping.target_load_orders.append(order)

        for mv in elem.findall("MAPPINGVARIABLE"):
            mapping.variables.append(MappingVariable(
                name=self._attr(mv, "NAME"),
                datatype=self._attr(mv, "DATATYPE", "string"),
                default_value=self._attr(mv, "DEFAULTVALUE"),
                description=self._attr(mv, "DESCRIPTION"),
                is_expression_variable=self._attr(mv, "ISEXPRESSIONVARIABLE", "NO"),
                is_persistent=self._attr(mv, "ISPERSISTENT", "NO"),
                precision=int(self._attr(mv, "PRECISION", "0") or "0"),
                scale=int(self._attr(mv, "SCALE", "0") or "0"),
                usage_type=self._attr(mv, "USAGETYPE"),
            ))

        for me in elem.findall("METADATAEXTENSION"):
            mapping.metadata[self._attr(me, "NAME")] = self._attr(me, "VALUE")

        return mapping
287
+
288
+ def _parse_mapplet(self, elem) -> MappletDef:
289
+ mapplet = MappletDef(
290
+ name=self._attr(elem, "NAME"),
291
+ description=self._attr(elem, "DESCRIPTION"),
292
+ is_valid=self._attr(elem, "ISVALID", "YES"),
293
+ )
294
+ for tx in elem.findall("TRANSFORMATION"):
295
+ mapplet.transformations.append(self._parse_transformation(tx))
296
+ for conn in elem.findall("CONNECTOR"):
297
+ mapplet.connectors.append(self._parse_connector(conn))
298
+ for inst in elem.findall("INSTANCE"):
299
+ mapplet.instances.append(self._parse_instance(inst))
300
+ return mapplet
301
+
302
    def _parse_session(self, elem) -> SessionDef:
        """Parse a SESSION element: session-level attributes, config
        references, per-transformation-instance settings (with their
        connection references), and session components.

        Note the ``.//`` descendant searches: they match elements at any
        depth under SESSION.
        """
        session = SessionDef(
            name=self._attr(elem, "NAME"),
            mapping_name=self._attr(elem, "MAPPINGNAME"),
            description=self._attr(elem, "DESCRIPTION"),
            is_valid=self._attr(elem, "ISVALID", "YES"),
            reusable=self._attr(elem, "REUSABLE", "NO"),
        )

        # NOTE(review): .//ATTRIBUTE also matches ATTRIBUTE children of
        # SESSTRANSFORMATIONINST below, so those appear both here and in
        # the per-instance attribute lists — confirm that is intended.
        for ta in elem.findall(".//ATTRIBUTE"):
            session.attributes.append(self._parse_table_attribute(ta))

        for cr in elem.findall(".//CONFIGREFERENCE"):
            session.config_references.append({
                "name": self._attr(cr, "REFOBJECTNAME"),
                "type": self._attr(cr, "TYPE"),
            })

        for sti in elem.findall(".//SESSTRANSFORMATIONINST"):
            st = SessionTransformInst(
                instance_name=self._attr(sti, "SINSTANCENAME"),
                pipeline=self._attr(sti, "PIPELINE"),
                stage=self._attr(sti, "STAGE"),
                transformation_name=self._attr(sti, "TRANSFORMATIONNAME"),
                transformation_type=self._attr(sti, "TRANSFORMATIONTYPE"),
                is_partitionable=self._attr(sti, "ISREPARTITIONPOINT", "NO"),
            )
            for ta in sti.findall("ATTRIBUTE"):
                st.attributes.append(self._parse_table_attribute(ta))
            for cr in sti.findall("CONNECTIONREFERENCE"):
                st.connections.append(ConnectionRef(
                    connection_name=self._attr(cr, "CONNECTIONNAME"),
                    connection_type=self._attr(cr, "CONNECTIONTYPE"),
                    connection_subtype=self._attr(cr, "CONNECTIONSUBTYPE"),
                    variable=self._attr(cr, "VARIABLE"),
                ))
            session.transform_instances.append(st)

        # Components are kept as plain dicts (name/type/attributes) rather
        # than a dedicated model class.
        for comp in elem.findall(".//SESSIONCOMPONENT"):
            comp_data = {
                "name": self._attr(comp, "REFOBJECTNAME"),
                "type": self._attr(comp, "TYPE"),
                "attributes": [],
            }
            for ta in comp.findall("ATTRIBUTE"):
                comp_data["attributes"].append({
                    "name": self._attr(ta, "NAME"),
                    "value": self._attr(ta, "VALUE"),
                })
            session.components.append(comp_data)

        return session
354
+
355
+ def _parse_workflow(self, elem) -> WorkflowDef:
356
+ wf = WorkflowDef(
357
+ name=self._attr(elem, "NAME"),
358
+ description=self._attr(elem, "DESCRIPTION"),
359
+ is_valid=self._attr(elem, "ISVALID", "YES"),
360
+ reusable=self._attr(elem, "REUSABLE", "NO"),
361
+ scheduler_name=self._attr(elem, "SCHEDULERNAME"),
362
+ )
363
+
364
+ for ti in elem.findall("TASKINSTANCE"):
365
+ wf.task_instances.append(self._parse_task_instance(ti))
366
+
367
+ for link in elem.findall("WORKFLOWLINK"):
368
+ wf.links.append(WorkflowLink(
369
+ from_instance=self._attr(link, "FROMTASK"),
370
+ to_instance=self._attr(link, "TOTASK"),
371
+ condition=self._attr(link, "CONDITION"),
372
+ link_type=self._attr(link, "LINKTYPE"),
373
+ ))
374
+
375
+ for wv in elem.findall("WORKFLOWVARIABLE"):
376
+ wf.variables.append(WorkflowVariable(
377
+ name=self._attr(wv, "NAME"),
378
+ datatype=self._attr(wv, "DATATYPE", "string"),
379
+ default_value=self._attr(wv, "DEFAULTVALUE"),
380
+ description=self._attr(wv, "DESCRIPTION"),
381
+ is_null=self._attr(wv, "ISNULL", "NO"),
382
+ is_persistent=self._attr(wv, "ISPERSISTENT", "NO"),
383
+ is_user_defined=self._attr(wv, "ISUSERDEFINED", "YES"),
384
+ precision=int(self._attr(wv, "PRECISION", "0") or "0"),
385
+ scale=int(self._attr(wv, "SCALE", "0") or "0"),
386
+ usage_type=self._attr(wv, "USAGETYPE"),
387
+ ))
388
+
389
+ for ta in elem.findall("ATTRIBUTE"):
390
+ wf.attributes.append(self._parse_table_attribute(ta))
391
+
392
+ for me in elem.findall("METADATAEXTENSION"):
393
+ wf.metadata[self._attr(me, "NAME")] = self._attr(me, "VALUE")
394
+
395
+ for session in elem.findall("SESSION"):
396
+ pass
397
+
398
+ return wf
399
+
400
    def _parse_task_instance(self, elem) -> TaskInstanceDef:
        """Convert a TASKINSTANCE element into a TaskInstanceDef, including
        the failure-propagation and link-join flags with Informatica's
        documented-in-export defaults."""
        ti = TaskInstanceDef(
            name=self._attr(elem, "NAME"),
            task_name=self._attr(elem, "TASKNAME"),
            task_type=self._attr(elem, "TASKTYPE"),
            description=self._attr(elem, "DESCRIPTION"),
            is_valid=self._attr(elem, "ISVALID", "YES"),
            reusable=self._attr(elem, "REUSABLE", "NO"),
            fail_parent_if_instance_fails=self._attr(elem, "FAIL_PARENT_IF_INSTANCE_FAILS", "YES"),
            fail_parent_if_instance_did_not_run=self._attr(elem, "FAIL_PARENT_IF_INSTANCE_DID_NOT_RUN", "NO"),
            treat_input_link_as_and=self._attr(elem, "TREAT_INPUTLINKS_AS_AND", "YES"),
        )
        for ta in elem.findall("ATTRIBUTE"):
            ti.attributes.append(self._parse_table_attribute(ta))
        return ti
415
+
416
+ def _parse_config(self, elem) -> ConfigDef:
417
+ cfg = ConfigDef(
418
+ name=self._attr(elem, "NAME"),
419
+ description=self._attr(elem, "DESCRIPTION"),
420
+ is_valid=self._attr(elem, "ISVALID", "YES"),
421
+ )
422
+ for ta in elem.findall("ATTRIBUTE"):
423
+ cfg.attributes.append(self._parse_table_attribute(ta))
424
+ return cfg
425
+
426
    def _parse_scheduler(self, elem) -> SchedulerDef:
        """Parse a SCHEDULER element.

        Besides direct ATTRIBUTE children, the attributes of every known
        scheduling sub-element (found at any depth via ``.//``) are
        flattened into ``sched.attributes`` with ``"<TAG>_<attr>"`` names.
        """
        sched = SchedulerDef(
            name=self._attr(elem, "NAME"),
            description=self._attr(elem, "DESCRIPTION"),
            reusable=self._attr(elem, "REUSABLE", "NO"),
        )
        for ta in elem.findall("ATTRIBUTE"):
            sched.attributes.append(self._parse_table_attribute(ta))
        for child_tag in ["SCHEDULEINFO", "STARTOPTIONS", "ENDOPTIONS",
                          "SCHEDULEOPTIONS", "RECURRING", "CUSTOM",
                          "DAILYFREQUENCY", "REPEAT", "FILTER"]:
            for child in elem.findall(f".//{child_tag}"):
                # Raw attrib values are used here (no strip) — they come
                # straight from the XML attribute dict.
                for k, v in child.attrib.items():
                    sched.attributes.append(TableAttribute(
                        name=f"{child_tag}_{k}",
                        value=v,
                    ))
        return sched
444
+
445
    def _parse_shortcut(self, elem) -> ShortcutDef:
        """Convert a SHORTCUT element (a cross-folder/cross-repository
        object reference) into a ShortcutDef."""
        return ShortcutDef(
            name=self._attr(elem, "NAME"),
            shortcut_type=self._attr(elem, "OBJECTSUBTYPE"),
            reference_name=self._attr(elem, "REFOBJECTNAME"),
            folder_name=self._attr(elem, "FOLDERNAME"),
            repository_name=self._attr(elem, "REPOSITORYNAME"),
        )
453
+
454
    def _parse_task(self, elem) -> TaskDef:
        """Parse a reusable TASK element.

        Both ATTRIBUTE and VALUEPAIR children are flattened into
        ``task.attributes`` as name/value TableAttribute entries.
        """
        task = TaskDef(
            name=self._attr(elem, "NAME"),
            type=self._attr(elem, "TYPE"),
            description=self._attr(elem, "DESCRIPTION"),
            reusable=self._attr(elem, "REUSABLE", "NO"),
        )
        for ta in elem.findall("ATTRIBUTE"):
            task.attributes.append(self._parse_table_attribute(ta))
        for vp in elem.findall("VALUEPAIR"):
            task.attributes.append(TableAttribute(
                name=self._attr(vp, "NAME"),
                value=self._attr(vp, "VALUE"),
            ))
        return task
File without changes
@@ -0,0 +1,105 @@
1
# Mapping of Informatica datatype names (lower-cased, stripped) to Python
# builtin type names.  Unknown types fall back to "str" in get_python_type().
INFORMATICA_TO_PYTHON = {
    "bigint": "int",
    "integer": "int",
    "int": "int",
    "small integer": "int",
    "smallint": "int",
    "tinyint": "int",
    "numeric": "float",
    "decimal": "float",
    "float": "float",
    "double": "float",
    "real": "float",
    "money": "float",
    "smallmoney": "float",
    "string": "str",
    "nstring": "str",
    "text": "str",
    "ntext": "str",
    "varchar": "str",
    "nvarchar": "str",
    "char": "str",
    "nchar": "str",
    "binary": "bytes",
    "varbinary": "bytes",
    "image": "bytes",
    "date/time": "str",
    "datetime": "str",
    "datetime2": "str",
    "date": "str",
    "time": "str",
    "timestamp": "str",
    "bit": "bool",
    "boolean": "bool",
    "uniqueidentifier": "str",
    "xml": "str",
    "sql_variant": "str",
}

# Mapping of Informatica datatype names to PySpark type-constructor source
# strings.  The DecimalType entries are str.format templates filled in by
# get_spark_type() with the field's precision and scale.
INFORMATICA_TO_SPARK = {
    "bigint": "LongType()",
    "integer": "IntegerType()",
    "int": "IntegerType()",
    "small integer": "ShortType()",
    "smallint": "ShortType()",
    "tinyint": "ByteType()",
    "numeric": "DecimalType({precision}, {scale})",
    "decimal": "DecimalType({precision}, {scale})",
    "float": "FloatType()",
    "double": "DoubleType()",
    "real": "FloatType()",
    "money": "DecimalType(19, 4)",
    "smallmoney": "DecimalType(10, 4)",
    "string": "StringType()",
    "nstring": "StringType()",
    "text": "StringType()",
    "ntext": "StringType()",
    "varchar": "StringType()",
    "nvarchar": "StringType()",
    "char": "StringType()",
    "nchar": "StringType()",
    "binary": "BinaryType()",
    "varbinary": "BinaryType()",
    "image": "BinaryType()",
    "date/time": "TimestampType()",
    "datetime": "TimestampType()",
    "datetime2": "TimestampType()",
    "date": "DateType()",
    "time": "StringType()",
    "timestamp": "TimestampType()",
    "bit": "BooleanType()",
    "boolean": "BooleanType()",
    "uniqueidentifier": "StringType()",
    "xml": "StringType()",
    "sql_variant": "StringType()",
}

# Mapping of Informatica DATABASETYPE labels to short connector keys.
# Labels not listed fall back to their lower-cased form in get_db_type().
DB_TYPE_MAP = {
    "Microsoft SQL Server": "mssql",
    "Oracle": "oracle",
    "Sybase": "sybase",
    "Informix": "informix",
    "DB2": "db2",
    "Teradata": "teradata",
    "ODBC": "odbc",
    "Flat File": "flatfile",
    "XML": "xml",
    "SAP": "sap",
    "": "unknown",
}


def get_python_type(informatica_type):
    """Return the Python type name for an Informatica datatype.

    Lookup is case- and whitespace-insensitive.  Unknown types map to
    "str"; empty or None input now also returns "str" (previously a
    None input raised AttributeError on .lower()).
    """
    if not informatica_type:
        return "str"
    return INFORMATICA_TO_PYTHON.get(informatica_type.lower().strip(), "str")


def get_spark_type(informatica_type, precision=10, scale=0):
    """Return the PySpark type expression for an Informatica datatype.

    Decimal-like types are parameterized with *precision* and *scale*.
    Unknown types map to "StringType()"; empty or None input now also
    returns "StringType()" instead of raising AttributeError.
    """
    if not informatica_type:
        return "StringType()"
    key = informatica_type.lower().strip()
    spark_type = INFORMATICA_TO_SPARK.get(key, "StringType()")
    if "{precision}" in spark_type:
        spark_type = spark_type.format(precision=precision, scale=scale)
    return spark_type


def get_db_type(database_type):
    """Map an Informatica DATABASETYPE label to a short connector key.

    Unmapped labels fall back to their lower-cased form; empty or None
    yields "unknown".
    """
    return DB_TYPE_MAP.get(database_type, database_type.lower() if database_type else "unknown")