acryl-datahub 1.0.0rc10__py3-none-any.whl → 1.0.0rc11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic; see the advisory details in the registry listing for more information.

Files changed (28):
  1. {acryl_datahub-1.0.0rc10.dist-info → acryl_datahub-1.0.0rc11.dist-info}/METADATA +2416 -2416
  2. {acryl_datahub-1.0.0rc10.dist-info → acryl_datahub-1.0.0rc11.dist-info}/RECORD +28 -27
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/common/serialized_value.py +4 -3
  5. datahub/ingestion/source/iceberg/iceberg_common.py +40 -1
  6. datahub/ingestion/source/metadata/business_glossary.py +45 -3
  7. datahub/ingestion/source/redshift/config.py +4 -0
  8. datahub/ingestion/source/redshift/datashares.py +236 -0
  9. datahub/ingestion/source/redshift/lineage.py +6 -2
  10. datahub/ingestion/source/redshift/lineage_v2.py +7 -4
  11. datahub/ingestion/source/redshift/profile.py +1 -1
  12. datahub/ingestion/source/redshift/query.py +125 -33
  13. datahub/ingestion/source/redshift/redshift.py +41 -72
  14. datahub/ingestion/source/redshift/redshift_schema.py +166 -6
  15. datahub/ingestion/source/redshift/report.py +3 -0
  16. datahub/ingestion/source/sql/oracle.py +93 -63
  17. datahub/metadata/_schema_classes.py +5 -5
  18. datahub/metadata/schema.avsc +2 -1
  19. datahub/metadata/schemas/DomainKey.avsc +2 -1
  20. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  21. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
  22. datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
  23. datahub/metadata/schemas/MLModelKey.avsc +2 -1
  24. datahub/sql_parsing/sql_parsing_common.py +7 -0
  25. {acryl_datahub-1.0.0rc10.dist-info → acryl_datahub-1.0.0rc11.dist-info}/LICENSE +0 -0
  26. {acryl_datahub-1.0.0rc10.dist-info → acryl_datahub-1.0.0rc11.dist-info}/WHEEL +0 -0
  27. {acryl_datahub-1.0.0rc10.dist-info → acryl_datahub-1.0.0rc11.dist-info}/entry_points.txt +0 -0
  28. {acryl_datahub-1.0.0rc10.dist-info → acryl_datahub-1.0.0rc11.dist-info}/top_level.txt +0 -0
@@ -152,6 +152,7 @@ class OracleInspectorObjectWrapper:
152
152
  self.exclude_tablespaces: Tuple[str, str] = ("SYSTEM", "SYSAUX")
153
153
 
154
154
  def get_db_name(self) -> str:
155
+ db_name = None
155
156
  try:
156
157
  # Try to retrieve current DB name by executing query
157
158
  db_name = self._inspector_instance.bind.execute(
@@ -159,7 +160,12 @@ class OracleInspectorObjectWrapper:
159
160
  ).scalar()
160
161
  return str(db_name)
161
162
  except sqlalchemy.exc.DatabaseError as e:
162
- logger.error("Error fetching DB name: " + str(e))
163
+ self.report.failure(
164
+ title="Error fetching database name using sys_context.",
165
+ message="database_fetch_error",
166
+ context=db_name,
167
+ exc=e,
168
+ )
163
169
  return ""
164
170
 
165
171
  def get_schema_names(self) -> List[str]:
@@ -326,8 +332,8 @@ class OracleInspectorObjectWrapper:
326
332
  try:
327
333
  coltype = ischema_names[coltype]()
328
334
  except KeyError:
329
- logger.warning(
330
- f"Did not recognize type {coltype} of column {colname}"
335
+ logger.info(
336
+ f"Unrecognized column datatype {coltype} of column {colname}"
331
337
  )
332
338
  coltype = sqltypes.NULLTYPE
333
339
 
@@ -379,8 +385,8 @@ class OracleInspectorObjectWrapper:
379
385
  COMMENT_SQL = """
380
386
  SELECT comments
381
387
  FROM dba_tab_comments
382
- WHERE table_name = CAST(:table_name AS VARCHAR(128))
383
- AND owner = CAST(:schema_name AS VARCHAR(128))
388
+ WHERE table_name = :table_name
389
+ AND owner = :schema_name
384
390
  """
385
391
 
386
392
  c = self._inspector_instance.bind.execute(
@@ -397,79 +403,93 @@ class OracleInspectorObjectWrapper:
397
403
 
398
404
  text = (
399
405
  "SELECT"
400
- "\nac.constraint_name," # 0
401
- "\nac.constraint_type," # 1
402
- "\nloc.column_name AS local_column," # 2
403
- "\nrem.table_name AS remote_table," # 3
404
- "\nrem.column_name AS remote_column," # 4
405
- "\nrem.owner AS remote_owner," # 5
406
- "\nloc.position as loc_pos," # 6
407
- "\nrem.position as rem_pos," # 7
408
- "\nac.search_condition," # 8
409
- "\nac.delete_rule" # 9
410
- "\nFROM dba_constraints%(dblink)s ac,"
411
- "\ndba_cons_columns%(dblink)s loc,"
412
- "\ndba_cons_columns%(dblink)s rem"
413
- "\nWHERE ac.table_name = CAST(:table_name AS VARCHAR2(128))"
414
- "\nAND ac.constraint_type IN ('R','P', 'U', 'C')"
406
+ "\nac.constraint_name,"
407
+ "\nac.constraint_type,"
408
+ "\nacc.column_name AS local_column,"
409
+ "\nNULL AS remote_table,"
410
+ "\nNULL AS remote_column,"
411
+ "\nNULL AS remote_owner,"
412
+ "\nacc.position AS loc_pos,"
413
+ "\nNULL AS rem_pos,"
414
+ "\nac.search_condition,"
415
+ "\nac.delete_rule"
416
+ "\nFROM dba_constraints ac"
417
+ "\nJOIN dba_cons_columns acc"
418
+ "\nON ac.owner = acc.owner"
419
+ "\nAND ac.constraint_name = acc.constraint_name"
420
+ "\nAND ac.table_name = acc.table_name"
421
+ "\nWHERE ac.table_name = :table_name"
422
+ "\nAND ac.constraint_type IN ('P', 'U', 'C')"
415
423
  )
416
424
 
417
425
  if schema is not None:
418
426
  params["owner"] = schema
419
- text += "\nAND ac.owner = CAST(:owner AS VARCHAR2(128))"
427
+ text += "\nAND ac.owner = :owner"
420
428
 
429
+ # Splitting into queries with UNION ALL for execution efficiency
421
430
  text += (
422
- "\nAND ac.owner = loc.owner"
423
- "\nAND ac.constraint_name = loc.constraint_name"
424
- "\nAND ac.r_owner = rem.owner(+)"
425
- "\nAND ac.r_constraint_name = rem.constraint_name(+)"
426
- "\nAND (rem.position IS NULL or loc.position=rem.position)"
427
- "\nORDER BY ac.constraint_name, loc.position"
431
+ "\nUNION ALL"
432
+ "\nSELECT"
433
+ "\nac.constraint_name,"
434
+ "\nac.constraint_type,"
435
+ "\nacc.column_name AS local_column,"
436
+ "\nac.r_table_name AS remote_table,"
437
+ "\nrcc.column_name AS remote_column,"
438
+ "\nac.r_owner AS remote_owner,"
439
+ "\nacc.position AS loc_pos,"
440
+ "\nrcc.position AS rem_pos,"
441
+ "\nac.search_condition,"
442
+ "\nac.delete_rule"
443
+ "\nFROM dba_constraints ac"
444
+ "\nJOIN dba_cons_columns acc"
445
+ "\nON ac.owner = acc.owner"
446
+ "\nAND ac.constraint_name = acc.constraint_name"
447
+ "\nAND ac.table_name = acc.table_name"
448
+ "\nLEFT JOIN dba_cons_columns rcc"
449
+ "\nON ac.r_owner = rcc.owner"
450
+ "\nAND ac.r_constraint_name = rcc.constraint_name"
451
+ "\nAND acc.position = rcc.position"
452
+ "\nWHERE ac.table_name = :table_name"
453
+ "\nAND ac.constraint_type = 'R'"
428
454
  )
429
455
 
430
- text = text % {"dblink": dblink}
456
+ if schema is not None:
457
+ text += "\nAND ac.owner = :owner"
458
+
459
+ text += "\nORDER BY constraint_name, loc_pos"
460
+
431
461
  rp = self._inspector_instance.bind.execute(sql.text(text), params)
432
- constraint_data = rp.fetchall()
433
- return constraint_data
462
+ return rp.fetchall()
434
463
 
435
464
  def get_pk_constraint(
436
465
  self, table_name: str, schema: Optional[str] = None, dblink: str = ""
437
466
  ) -> Dict:
438
- denormalized_table_name = self._inspector_instance.dialect.denormalize_name(
439
- table_name
440
- )
441
- assert denormalized_table_name
442
-
443
- schema = self._inspector_instance.dialect.denormalize_name(
444
- schema or self.default_schema_name
445
- )
446
-
447
- if schema is None:
448
- schema = self._inspector_instance.dialect.default_schema_name
449
-
450
467
  pkeys = []
451
468
  constraint_name = None
452
- constraint_data = self._get_constraint_data(
453
- denormalized_table_name, schema, dblink
454
- )
455
469
 
456
- for row in constraint_data:
457
- (
458
- cons_name,
459
- cons_type,
460
- local_column,
461
- remote_table,
462
- remote_column,
463
- remote_owner,
464
- ) = row[0:2] + tuple(
465
- [self._inspector_instance.dialect.normalize_name(x) for x in row[2:6]]
470
+ try:
471
+ for row in self._get_constraint_data(table_name, schema, dblink):
472
+ if row[1] == "P": # constraint_type is 'P' for primary key
473
+ if constraint_name is None:
474
+ constraint_name = (
475
+ self._inspector_instance.dialect.normalize_name(row[0])
476
+ )
477
+ col_name = self._inspector_instance.dialect.normalize_name(
478
+ row[2]
479
+ ) # local_column
480
+ pkeys.append(col_name)
481
+ except Exception as e:
482
+ self.report.warning(
483
+ title="Failed to Process Primary Keys",
484
+ message=(
485
+ f"Unable to process primary key constraints for {schema}.{table_name}. "
486
+ "Ensure SELECT access on DBA_CONSTRAINTS and DBA_CONS_COLUMNS.",
487
+ ),
488
+ context=f"{schema}.{table_name}",
489
+ exc=e,
466
490
  )
467
- if cons_type == "P":
468
- if constraint_name is None:
469
- constraint_name = self._inspector_instance.dialect.normalize_name(
470
- cons_name
471
- )
472
- pkeys.append(local_column)
491
+ # Return empty constraint if we can't process it
492
+ return {"constrained_columns": [], "name": None}
473
493
 
474
494
  return {"constrained_columns": pkeys, "name": constraint_name}
475
495
 
@@ -527,6 +547,16 @@ class OracleInspectorObjectWrapper:
527
547
  f"dba_cons_columns{dblink} - does the user have "
528
548
  "proper rights to the table?"
529
549
  )
550
+ self.report.warning(
551
+ title="Missing Table Permissions",
552
+ message=(
553
+ f"Unable to query table_name from dba_cons_columns{dblink}. "
554
+ "This usually indicates insufficient permissions on the target table. "
555
+ f"Foreign key relationships will not be detected for {schema}.{table_name}. "
556
+ "Please ensure the user has SELECT privileges on dba_cons_columns."
557
+ ),
558
+ context=f"{schema}.{table_name}",
559
+ )
530
560
 
531
561
  rec = fkeys[cons_name]
532
562
  rec["name"] = cons_name
@@ -573,8 +603,8 @@ class OracleInspectorObjectWrapper:
573
603
  text = "SELECT text FROM dba_views WHERE view_name=:view_name"
574
604
 
575
605
  if schema is not None:
576
- text += " AND owner = :schema"
577
- params["schema"] = schema
606
+ params["owner"] = schema
607
+ text += "\nAND owner = :owner"
578
608
 
579
609
  rp = self._inspector_instance.bind.execute(sql.text(text), params).scalar()
580
610
 
@@ -15486,7 +15486,7 @@ class DomainKeyClass(_Aspect):
15486
15486
 
15487
15487
 
15488
15488
  ASPECT_NAME = 'domainKey'
15489
- ASPECT_INFO = {'keyForEntity': 'domain', 'entityCategory': 'core', 'entityAspects': ['domainProperties', 'institutionalMemory', 'ownership', 'structuredProperties', 'forms', 'testResults'], 'entityDoc': 'A data domain within an organization.'}
15489
+ ASPECT_INFO = {'keyForEntity': 'domain', 'entityCategory': 'core', 'entityAspects': ['domainProperties', 'institutionalMemory', 'ownership', 'structuredProperties', 'forms', 'testResults', 'displayProperties'], 'entityDoc': 'A data domain within an organization.'}
15490
15490
  RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.metadata.key.DomainKey")
15491
15491
 
15492
15492
  def __init__(self,
@@ -15631,7 +15631,7 @@ class GlossaryNodeKeyClass(_Aspect):
15631
15631
 
15632
15632
 
15633
15633
  ASPECT_NAME = 'glossaryNodeKey'
15634
- ASPECT_INFO = {'keyForEntity': 'glossaryNode', 'entityCategory': 'core', 'entityAspects': ['glossaryNodeInfo', 'institutionalMemory', 'ownership', 'status', 'structuredProperties', 'forms', 'testResults', 'subTypes']}
15634
+ ASPECT_INFO = {'keyForEntity': 'glossaryNode', 'entityCategory': 'core', 'entityAspects': ['glossaryNodeInfo', 'institutionalMemory', 'ownership', 'status', 'structuredProperties', 'forms', 'testResults', 'subTypes', 'displayProperties']}
15635
15635
  RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.metadata.key.GlossaryNodeKey")
15636
15636
 
15637
15637
  def __init__(self,
@@ -15831,7 +15831,7 @@ class MLModelDeploymentKeyClass(_Aspect):
15831
15831
 
15832
15832
 
15833
15833
  ASPECT_NAME = 'mlModelDeploymentKey'
15834
- ASPECT_INFO = {'keyForEntity': 'mlModelDeployment', 'entityCategory': 'core', 'entityAspects': ['mlModelDeploymentProperties', 'ownership', 'status', 'deprecation', 'globalTags', 'dataPlatformInstance', 'testResults']}
15834
+ ASPECT_INFO = {'keyForEntity': 'mlModelDeployment', 'entityCategory': 'core', 'entityAspects': ['mlModelDeploymentProperties', 'ownership', 'status', 'deprecation', 'globalTags', 'dataPlatformInstance', 'testResults', 'container']}
15835
15835
  RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.metadata.key.MLModelDeploymentKey")
15836
15836
 
15837
15837
  def __init__(self,
@@ -15886,7 +15886,7 @@ class MLModelGroupKeyClass(_Aspect):
15886
15886
 
15887
15887
 
15888
15888
  ASPECT_NAME = 'mlModelGroupKey'
15889
- ASPECT_INFO = {'keyForEntity': 'mlModelGroup', 'entityCategory': 'core', 'entityAspects': ['glossaryTerms', 'editableMlModelGroupProperties', 'domains', 'mlModelGroupProperties', 'ownership', 'status', 'deprecation', 'browsePaths', 'globalTags', 'dataPlatformInstance', 'browsePathsV2', 'structuredProperties', 'forms', 'testResults', 'subTypes']}
15889
+ ASPECT_INFO = {'keyForEntity': 'mlModelGroup', 'entityCategory': 'core', 'entityAspects': ['glossaryTerms', 'editableMlModelGroupProperties', 'domains', 'mlModelGroupProperties', 'ownership', 'status', 'deprecation', 'browsePaths', 'globalTags', 'dataPlatformInstance', 'browsePathsV2', 'structuredProperties', 'forms', 'testResults', 'subTypes', 'container']}
15890
15890
  RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.metadata.key.MLModelGroupKey")
15891
15891
 
15892
15892
  def __init__(self,
@@ -15941,7 +15941,7 @@ class MLModelKeyClass(_Aspect):
15941
15941
 
15942
15942
 
15943
15943
  ASPECT_NAME = 'mlModelKey'
15944
- ASPECT_INFO = {'keyForEntity': 'mlModel', 'entityCategory': 'core', 'entityAspects': ['glossaryTerms', 'editableMlModelProperties', 'domains', 'ownership', 'mlModelProperties', 'intendedUse', 'mlModelFactorPrompts', 'mlModelMetrics', 'mlModelEvaluationData', 'mlModelTrainingData', 'mlModelQuantitativeAnalyses', 'mlModelEthicalConsiderations', 'mlModelCaveatsAndRecommendations', 'institutionalMemory', 'sourceCode', 'status', 'cost', 'deprecation', 'browsePaths', 'globalTags', 'dataPlatformInstance', 'browsePathsV2', 'structuredProperties', 'forms', 'testResults', 'versionProperties', 'subTypes']}
15944
+ ASPECT_INFO = {'keyForEntity': 'mlModel', 'entityCategory': 'core', 'entityAspects': ['glossaryTerms', 'editableMlModelProperties', 'domains', 'ownership', 'mlModelProperties', 'intendedUse', 'mlModelFactorPrompts', 'mlModelMetrics', 'mlModelEvaluationData', 'mlModelTrainingData', 'mlModelQuantitativeAnalyses', 'mlModelEthicalConsiderations', 'mlModelCaveatsAndRecommendations', 'institutionalMemory', 'sourceCode', 'status', 'cost', 'deprecation', 'browsePaths', 'globalTags', 'dataPlatformInstance', 'browsePathsV2', 'structuredProperties', 'forms', 'testResults', 'versionProperties', 'subTypes', 'container']}
15945
15945
  RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.metadata.key.MLModelKey")
15946
15946
 
15947
15947
  def __init__(self,
@@ -15636,7 +15636,8 @@
15636
15636
  "ownership",
15637
15637
  "structuredProperties",
15638
15638
  "forms",
15639
- "testResults"
15639
+ "testResults",
15640
+ "displayProperties"
15640
15641
  ],
15641
15642
  "entityDoc": "A data domain within an organization."
15642
15643
  },
@@ -10,7 +10,8 @@
10
10
  "ownership",
11
11
  "structuredProperties",
12
12
  "forms",
13
- "testResults"
13
+ "testResults",
14
+ "displayProperties"
14
15
  ],
15
16
  "entityDoc": "A data domain within an organization."
16
17
  },
@@ -12,7 +12,8 @@
12
12
  "structuredProperties",
13
13
  "forms",
14
14
  "testResults",
15
- "subTypes"
15
+ "subTypes",
16
+ "displayProperties"
16
17
  ]
17
18
  },
18
19
  "name": "GlossaryNodeKey",
@@ -11,7 +11,8 @@
11
11
  "deprecation",
12
12
  "globalTags",
13
13
  "dataPlatformInstance",
14
- "testResults"
14
+ "testResults",
15
+ "container"
15
16
  ]
16
17
  },
17
18
  "name": "MLModelDeploymentKey",
@@ -19,7 +19,8 @@
19
19
  "structuredProperties",
20
20
  "forms",
21
21
  "testResults",
22
- "subTypes"
22
+ "subTypes",
23
+ "container"
23
24
  ]
24
25
  },
25
26
  "name": "MLModelGroupKey",
@@ -31,7 +31,8 @@
31
31
  "forms",
32
32
  "testResults",
33
33
  "versionProperties",
34
- "subTypes"
34
+ "subTypes",
35
+ "container"
35
36
  ]
36
37
  },
37
38
  "name": "MLModelKey",
@@ -24,12 +24,19 @@ DIALECTS_WITH_CASE_INSENSITIVE_COLS = {
24
24
  # For SQL server, the default collation rules mean that all identifiers (schema, table, column names)
25
25
  # are case preserving but case insensitive.
26
26
  "mssql",
27
+ # Oracle automatically converts unquoted identifiers to uppercase.
28
+ # https://docs.oracle.com/en/database/oracle/oracle-database/19/sqlrf/Database-Object-Names-and-Qualifiers.html#GUID-3C59E44A-5140-4BCA-B9E1-3039C8050C49
29
+ # In our Oracle connector, we then normalize column names to lowercase. This behavior
30
+ # actually comes from the underlying Oracle sqlalchemy dialect.
31
+ # https://github.com/sqlalchemy/sqlalchemy/blob/d9b4d8ff3aae504402d324f3ebf0b8faff78f5dc/lib/sqlalchemy/dialects/oracle/base.py#L2579
32
+ "oracle",
27
33
  }
28
34
  DIALECTS_WITH_DEFAULT_UPPERCASE_COLS = {
29
35
  # In some dialects, column identifiers are effectively case insensitive
30
36
  # because they are automatically converted to uppercase. Most other systems
31
37
  # automatically lowercase unquoted identifiers.
32
38
  "snowflake",
39
+ "oracle",
33
40
  }
34
41
  assert DIALECTS_WITH_DEFAULT_UPPERCASE_COLS.issubset(
35
42
  DIALECTS_WITH_CASE_INSENSITIVE_COLS