acryl-datahub 0.15.0.6rc2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2522 -2493
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +205 -192
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +4 -3
- datahub/api/entities/dataset/dataset.py +731 -42
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/check_cli.py +72 -19
- datahub/cli/docker_cli.py +3 -3
- datahub/cli/iceberg_cli.py +31 -7
- datahub/cli/ingest_cli.py +30 -93
- datahub/cli/lite_cli.py +4 -2
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/dataset_cli.py +128 -14
- datahub/configuration/common.py +10 -2
- datahub/configuration/git.py +1 -3
- datahub/configuration/kafka.py +1 -1
- datahub/emitter/mce_builder.py +28 -13
- datahub/emitter/mcp_builder.py +4 -1
- datahub/emitter/response_helper.py +145 -0
- datahub/emitter/rest_emitter.py +323 -10
- datahub/ingestion/api/decorators.py +1 -1
- datahub/ingestion/api/source_helpers.py +4 -0
- datahub/ingestion/fs/s3_fs.py +2 -2
- datahub/ingestion/glossary/classification_mixin.py +1 -5
- datahub/ingestion/graph/client.py +41 -22
- datahub/ingestion/graph/entity_versioning.py +3 -3
- datahub/ingestion/graph/filters.py +64 -37
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
- datahub/ingestion/run/pipeline.py +112 -148
- datahub/ingestion/run/sink_callback.py +77 -0
- datahub/ingestion/sink/datahub_rest.py +8 -0
- datahub/ingestion/source/abs/config.py +2 -4
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
- datahub/ingestion/source/cassandra/cassandra.py +152 -233
- datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
- datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
- datahub/ingestion/source/common/subtypes.py +12 -0
- datahub/ingestion/source/csv_enricher.py +3 -3
- datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
- datahub/ingestion/source/dbt/dbt_common.py +8 -5
- datahub/ingestion/source/dbt/dbt_core.py +11 -9
- datahub/ingestion/source/dbt/dbt_tests.py +4 -8
- datahub/ingestion/source/delta_lake/config.py +8 -1
- datahub/ingestion/source/delta_lake/report.py +4 -2
- datahub/ingestion/source/delta_lake/source.py +20 -5
- datahub/ingestion/source/dremio/dremio_api.py +4 -8
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
- datahub/ingestion/source/dynamodb/dynamodb.py +6 -0
- datahub/ingestion/source/elastic_search.py +26 -6
- datahub/ingestion/source/feast.py +27 -8
- datahub/ingestion/source/file.py +6 -3
- datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
- datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
- datahub/ingestion/source/ge_data_profiler.py +12 -15
- datahub/ingestion/source/iceberg/iceberg.py +46 -12
- datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
- datahub/ingestion/source/identity/okta.py +37 -7
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/kafka_connect/common.py +2 -7
- datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
- datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
- datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
- datahub/ingestion/source/looker/looker_common.py +6 -5
- datahub/ingestion/source/looker/looker_file_loader.py +2 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
- datahub/ingestion/source/looker/looker_source.py +1 -1
- datahub/ingestion/source/looker/looker_template_language.py +4 -2
- datahub/ingestion/source/looker/lookml_source.py +3 -2
- datahub/ingestion/source/metabase.py +57 -35
- datahub/ingestion/source/metadata/business_glossary.py +45 -3
- datahub/ingestion/source/metadata/lineage.py +2 -2
- datahub/ingestion/source/mlflow.py +365 -35
- datahub/ingestion/source/mode.py +18 -8
- datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
- datahub/ingestion/source/nifi.py +37 -11
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/openapi_parser.py +49 -17
- datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
- datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -3
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
- datahub/ingestion/source/preset.py +7 -4
- datahub/ingestion/source/pulsar.py +3 -2
- datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
- datahub/ingestion/source/redash.py +31 -7
- datahub/ingestion/source/redshift/config.py +4 -0
- datahub/ingestion/source/redshift/datashares.py +236 -0
- datahub/ingestion/source/redshift/lineage.py +6 -2
- datahub/ingestion/source/redshift/lineage_v2.py +24 -9
- datahub/ingestion/source/redshift/profile.py +1 -1
- datahub/ingestion/source/redshift/query.py +133 -33
- datahub/ingestion/source/redshift/redshift.py +46 -73
- datahub/ingestion/source/redshift/redshift_schema.py +186 -6
- datahub/ingestion/source/redshift/report.py +3 -0
- datahub/ingestion/source/s3/config.py +5 -5
- datahub/ingestion/source/s3/source.py +20 -41
- datahub/ingestion/source/salesforce.py +550 -275
- datahub/ingestion/source/schema_inference/object.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/slack/slack.py +31 -10
- datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
- datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
- datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
- datahub/ingestion/source/sql/athena.py +10 -16
- datahub/ingestion/source/sql/druid.py +1 -5
- datahub/ingestion/source/sql/hive.py +15 -6
- datahub/ingestion/source/sql/hive_metastore.py +3 -2
- datahub/ingestion/source/sql/mssql/job_models.py +29 -0
- datahub/ingestion/source/sql/mssql/source.py +11 -5
- datahub/ingestion/source/sql/oracle.py +127 -63
- datahub/ingestion/source/sql/sql_common.py +16 -18
- datahub/ingestion/source/sql/sql_types.py +2 -2
- datahub/ingestion/source/sql/teradata.py +19 -5
- datahub/ingestion/source/sql/trino.py +2 -2
- datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
- datahub/ingestion/source/superset.py +222 -62
- datahub/ingestion/source/tableau/tableau.py +22 -6
- datahub/ingestion/source/tableau/tableau_common.py +3 -2
- datahub/ingestion/source/unity/ge_profiler.py +2 -1
- datahub/ingestion/source/unity/source.py +11 -1
- datahub/ingestion/source/vertexai.py +697 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
- datahub/lite/duckdb_lite.py +3 -10
- datahub/lite/lite_local.py +1 -1
- datahub/lite/lite_util.py +4 -3
- datahub/metadata/_schema_classes.py +714 -417
- datahub/metadata/_urns/urn_defs.py +1673 -1649
- datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
- datahub/metadata/schema.avsc +16438 -16603
- datahub/metadata/schemas/AssertionInfo.avsc +3 -1
- datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
- datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
- datahub/metadata/schemas/ChartInfo.avsc +1 -0
- datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
- datahub/metadata/schemas/DataProcessKey.avsc +2 -1
- datahub/metadata/schemas/DataProductKey.avsc +2 -1
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
- datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
- datahub/metadata/schemas/IncidentInfo.avsc +130 -46
- datahub/metadata/schemas/InputFields.avsc +3 -1
- datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
- datahub/metadata/schemas/MLModelKey.avsc +3 -1
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
- datahub/metadata/schemas/PostKey.avsc +2 -1
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
- datahub/metadata/schemas/VersionProperties.avsc +18 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
- datahub/pydantic/__init__.py +0 -0
- datahub/pydantic/compat.py +58 -0
- datahub/sdk/__init__.py +30 -12
- datahub/sdk/_all_entities.py +1 -1
- datahub/sdk/_attribution.py +4 -0
- datahub/sdk/_shared.py +258 -16
- datahub/sdk/_utils.py +35 -0
- datahub/sdk/container.py +30 -6
- datahub/sdk/dataset.py +118 -20
- datahub/sdk/{_entity.py → entity.py} +24 -1
- datahub/sdk/entity_client.py +1 -1
- datahub/sdk/main_client.py +23 -0
- datahub/sdk/resolver_client.py +17 -29
- datahub/sdk/search_client.py +50 -0
- datahub/sdk/search_filters.py +374 -0
- datahub/specific/dataset.py +3 -4
- datahub/sql_parsing/_sqlglot_patch.py +2 -10
- datahub/sql_parsing/schema_resolver.py +1 -1
- datahub/sql_parsing/split_statements.py +220 -126
- datahub/sql_parsing/sql_parsing_common.py +7 -0
- datahub/sql_parsing/sqlglot_lineage.py +1 -1
- datahub/sql_parsing/sqlglot_utils.py +1 -4
- datahub/testing/check_sql_parser_result.py +5 -6
- datahub/testing/compare_metadata_json.py +7 -6
- datahub/testing/pytest_hooks.py +56 -0
- datahub/upgrade/upgrade.py +2 -2
- datahub/utilities/file_backed_collections.py +3 -14
- datahub/utilities/ingest_utils.py +106 -0
- datahub/utilities/mapping.py +1 -1
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/sentinels.py +22 -0
- datahub/utilities/unified_diff.py +5 -1
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/oracle.py

@@ -1,5 +1,6 @@
 import datetime
 import logging
+import platform
 import re
 
 # This import verifies that the dependencies are available.

@@ -85,6 +86,16 @@ class OracleConfig(BasicSQLAlchemyConfig):
         description="The data dictionary views mode, to extract information about schema objects "
         "('ALL' and 'DBA' views are supported). (https://docs.oracle.com/cd/E11882_01/nav/catalog_views.htm)",
     )
+    # oracledb settings to enable thick mode and client library location
+    enable_thick_mode: Optional[bool] = Field(
+        default=False,
+        description="Connection defaults to thin mode. Set to True to enable thick mode.",
+    )
+    thick_mode_lib_dir: Optional[str] = Field(
+        default=None,
+        description="If using thick mode on Windows or Mac, set thick_mode_lib_dir to the oracle client libraries path. "
+        "On Linux, this value is ignored, as ldconfig or LD_LIBRARY_PATH will define the location.",
+    )
 
     @pydantic.validator("service_name")
     def check_service_name(cls, v, values):

@@ -100,6 +111,18 @@ class OracleConfig(BasicSQLAlchemyConfig):
             raise ValueError("Specify one of data dictionary views mode: 'ALL', 'DBA'.")
         return values
 
+    @pydantic.validator("thick_mode_lib_dir", always=True)
+    def check_thick_mode_lib_dir(cls, v, values):
+        if (
+            v is None
+            and values.get("enable_thick_mode")
+            and (platform.system() == "Darwin" or platform.system() == "Windows")
+        ):
+            raise ValueError(
+                "Specify 'thick_mode_lib_dir' on Mac/Windows when enable_thick_mode is true"
+            )
+        return v
+
     def get_sql_alchemy_url(self):
         url = super().get_sql_alchemy_url()
         if self.service_name:

@@ -129,6 +152,7 @@ class OracleInspectorObjectWrapper:
         self.exclude_tablespaces: Tuple[str, str] = ("SYSTEM", "SYSAUX")
 
     def get_db_name(self) -> str:
+        db_name = None
         try:
             # Try to retrieve current DB name by executing query
             db_name = self._inspector_instance.bind.execute(

@@ -136,7 +160,12 @@
             ).scalar()
             return str(db_name)
         except sqlalchemy.exc.DatabaseError as e:
-
+            self.report.failure(
+                title="Error fetching database name using sys_context.",
+                message="database_fetch_error",
+                context=db_name,
+                exc=e,
+            )
             return ""
 
     def get_schema_names(self) -> List[str]:

@@ -303,8 +332,8 @@ class OracleInspectorObjectWrapper:
         try:
             coltype = ischema_names[coltype]()
         except KeyError:
-            logger.
-                f"
+            logger.info(
+                f"Unrecognized column datatype {coltype} of column {colname}"
             )
             coltype = sqltypes.NULLTYPE
 

@@ -356,8 +385,8 @@ class OracleInspectorObjectWrapper:
         COMMENT_SQL = """
             SELECT comments
             FROM dba_tab_comments
-            WHERE table_name =
-            AND owner =
+            WHERE table_name = :table_name
+            AND owner = :schema_name
         """
 
         c = self._inspector_instance.bind.execute(

@@ -374,79 +403,93 @@ class OracleInspectorObjectWrapper:
 
         text = (
             "SELECT"
-            "\nac.constraint_name,"
-            "\nac.constraint_type,"
-            "\
-            "\
-            "\
-            "\
-            "\
-            "\
-            "\nac.search_condition,"
-            "\nac.delete_rule"
-            "\nFROM dba_constraints
-            "\
-            "\
-            "\
-            "\nAND ac.
+            "\nac.constraint_name,"
+            "\nac.constraint_type,"
+            "\nacc.column_name AS local_column,"
+            "\nNULL AS remote_table,"
+            "\nNULL AS remote_column,"
+            "\nNULL AS remote_owner,"
+            "\nacc.position AS loc_pos,"
+            "\nNULL AS rem_pos,"
+            "\nac.search_condition,"
+            "\nac.delete_rule"
+            "\nFROM dba_constraints ac"
+            "\nJOIN dba_cons_columns acc"
+            "\nON ac.owner = acc.owner"
+            "\nAND ac.constraint_name = acc.constraint_name"
+            "\nAND ac.table_name = acc.table_name"
+            "\nWHERE ac.table_name = :table_name"
+            "\nAND ac.constraint_type IN ('P', 'U', 'C')"
         )
 
         if schema is not None:
             params["owner"] = schema
-            text += "\nAND ac.owner =
+            text += "\nAND ac.owner = :owner"
 
+        # Splitting into queries with UNION ALL for execution efficiency
        text += (
-            "\
-            "\
-            "\
-            "\
-            "\
-            "\
+            "\nUNION ALL"
+            "\nSELECT"
+            "\nac.constraint_name,"
+            "\nac.constraint_type,"
+            "\nacc.column_name AS local_column,"
+            "\nac.r_table_name AS remote_table,"
+            "\nrcc.column_name AS remote_column,"
+            "\nac.r_owner AS remote_owner,"
+            "\nacc.position AS loc_pos,"
+            "\nrcc.position AS rem_pos,"
+            "\nac.search_condition,"
+            "\nac.delete_rule"
+            "\nFROM dba_constraints ac"
+            "\nJOIN dba_cons_columns acc"
+            "\nON ac.owner = acc.owner"
+            "\nAND ac.constraint_name = acc.constraint_name"
+            "\nAND ac.table_name = acc.table_name"
+            "\nLEFT JOIN dba_cons_columns rcc"
+            "\nON ac.r_owner = rcc.owner"
+            "\nAND ac.r_constraint_name = rcc.constraint_name"
+            "\nAND acc.position = rcc.position"
+            "\nWHERE ac.table_name = :table_name"
+            "\nAND ac.constraint_type = 'R'"
        )
 
-
+        if schema is not None:
+            text += "\nAND ac.owner = :owner"
+
+        text += "\nORDER BY constraint_name, loc_pos"
+
         rp = self._inspector_instance.bind.execute(sql.text(text), params)
-
-        return constraint_data
+        return rp.fetchall()
 
     def get_pk_constraint(
         self, table_name: str, schema: Optional[str] = None, dblink: str = ""
     ) -> Dict:
-        denormalized_table_name = self._inspector_instance.dialect.denormalize_name(
-            table_name
-        )
-        assert denormalized_table_name
-
-        schema = self._inspector_instance.dialect.denormalize_name(
-            schema or self.default_schema_name
-        )
-
-        if schema is None:
-            schema = self._inspector_instance.dialect.default_schema_name
-
        pkeys = []
        constraint_name = None
-        constraint_data = self._get_constraint_data(
-            denormalized_table_name, schema, dblink
-        )
 
-
-        (
-
-
-
-
-
-
-
+        try:
+            for row in self._get_constraint_data(table_name, schema, dblink):
+                if row[1] == "P":  # constraint_type is 'P' for primary key
+                    if constraint_name is None:
+                        constraint_name = (
+                            self._inspector_instance.dialect.normalize_name(row[0])
+                        )
+                    col_name = self._inspector_instance.dialect.normalize_name(
+                        row[2]
+                    )  # local_column
+                    pkeys.append(col_name)
+        except Exception as e:
+            self.report.warning(
+                title="Failed to Process Primary Keys",
+                message=(
+                    f"Unable to process primary key constraints for {schema}.{table_name}. "
+                    "Ensure SELECT access on DBA_CONSTRAINTS and DBA_CONS_COLUMNS."
+                ),
+                context=f"{schema}.{table_name}",
+                exc=e,
             )
-        if
-
-            constraint_name = self._inspector_instance.dialect.normalize_name(
-                cons_name
-            )
-            pkeys.append(local_column)
+            # Return empty constraint if we can't process it
+            return {"constrained_columns": [], "name": None}
 
         return {"constrained_columns": pkeys, "name": constraint_name}
 

@@ -504,6 +547,16 @@ class OracleInspectorObjectWrapper:
                 f"dba_cons_columns{dblink} - does the user have "
                 "proper rights to the table?"
             )
+            self.report.warning(
+                title="Missing Table Permissions",
+                message=(
+                    f"Unable to query table_name from dba_cons_columns{dblink}. "
+                    "This usually indicates insufficient permissions on the target table. "
+                    f"Foreign key relationships will not be detected for {schema}.{table_name}. "
+                    "Please ensure the user has SELECT privileges on dba_cons_columns."
+                ),
+                context=f"{schema}.{table_name}",
+            )
 
         rec = fkeys[cons_name]
         rec["name"] = cons_name

@@ -550,8 +603,8 @@ class OracleInspectorObjectWrapper:
         text = "SELECT text FROM dba_views WHERE view_name=:view_name"
 
         if schema is not None:
-
-
+            params["owner"] = schema
+            text += "\nAND owner = :owner"
 
         rp = self._inspector_instance.bind.execute(sql.text(text), params).scalar()
 

@@ -586,6 +639,17 @@ class OracleSource(SQLAlchemySource):
     def __init__(self, config, ctx):
         super().__init__(config, ctx, "oracle")
 
+        # if connecting to oracle with enable_thick_mode, it must be initialized before calling
+        # create_engine, which is called in get_inspectors()
+        # https://python-oracledb.readthedocs.io/en/latest/user_guide/initialization.html#enabling-python-oracledb-thick-mode
+        if self.config.enable_thick_mode:
+            if platform.system() == "Darwin" or platform.system() == "Windows":
+                # windows and mac os require lib_dir to be set explicitly
+                oracledb.init_oracle_client(lib_dir=self.config.thick_mode_lib_dir)
+            else:
+                # linux requires configurating the library path with ldconfig or LD_LIBRARY_PATH
+                oracledb.init_oracle_client()
+
     @classmethod
     def create(cls, config_dict, ctx):
         config = OracleConfig.parse_obj(config_dict)
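The thick-mode settings above are plain config fields, so they can be exercised directly against `OracleConfig`. A minimal sketch of how they fit together (the host, service name, and client-library path are illustrative placeholders, not values from this diff):

```python
from datahub.ingestion.source.sql.oracle import OracleConfig

# Only enable_thick_mode / thick_mode_lib_dir come from this release;
# the connection values below are hypothetical.
config = OracleConfig.parse_obj(
    {
        "host_port": "oracle.example.com:1521",
        "service_name": "ORCLPDB1",
        "enable_thick_mode": True,
        # Required by the new validator on Mac/Windows; ignored on Linux,
        # where ldconfig or LD_LIBRARY_PATH locates the client libraries.
        "thick_mode_lib_dir": "/opt/oracle/instantclient",
    }
)
```

On Linux, `thick_mode_lib_dir` can be omitted entirely; the validator only insists on it when `platform.system()` reports Darwin or Windows.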
datahub/ingestion/source/sql/sql_common.py

@@ -204,7 +204,7 @@ def get_column_type(
     """
 
     TypeClass: Optional[Type] = None
-    for sql_type in _field_type_mapping
+    for sql_type in _field_type_mapping:
         if isinstance(column_type, sql_type):
             TypeClass = _field_type_mapping[sql_type]
             break

@@ -352,6 +352,15 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         )
         self.report.sql_aggregator = self.aggregator.report
 
+    def _add_default_options(self, sql_config: SQLCommonConfig) -> None:
+        """Add default SQLAlchemy options. Can be overridden by subclasses to add additional defaults."""
+        # Extra default SQLAlchemy option for better connection pooling and threading.
+        # https://docs.sqlalchemy.org/en/14/core/pooling.html#sqlalchemy.pool.QueuePool.params.max_overflow
+        if sql_config.is_profiling_enabled():
+            sql_config.options.setdefault(
+                "max_overflow", sql_config.profiling.max_workers
+            )
+
     @classmethod
     def test_connection(cls, config_dict: dict) -> TestConnectionReport:
         test_report = TestConnectionReport()

@@ -519,12 +528,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         # Known issue with sqlalchemy https://stackoverflow.com/questions/60804288/pycharm-duplicated-log-for-sqlalchemy-echo-true
         sqlalchemy_log._add_default_handler = lambda x: None  # type: ignore
 
-
-        # https://docs.sqlalchemy.org/en/14/core/pooling.html#sqlalchemy.pool.QueuePool.params.max_overflow
-        if sql_config.is_profiling_enabled():
-            sql_config.options.setdefault(
-                "max_overflow", sql_config.profiling.max_workers
-            )
+        self._add_default_options(sql_config)
 
         for inspector in self.get_inspectors():
             profiler = None

@@ -631,7 +635,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
 
         return None
 
-    def loop_tables(
+    def loop_tables(
         self,
         inspector: Inspector,
         schema: str,

@@ -969,7 +973,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
                     inspector=inspector,
                 )
             ),
-            description=column.get("comment"
+            description=column.get("comment"),
             nullable=column["nullable"],
             recursive=False,
             globalTags=gtc,

@@ -1027,16 +1031,10 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
     def _get_view_definition(self, inspector: Inspector, schema: str, view: str) -> str:
         try:
             view_definition = inspector.get_view_definition(view, schema)
-
-
-            else:
-                # Some dialects return a TextClause instead of a raw string,
-                # so we need to convert them to a string.
-                view_definition = str(view_definition)
+            # Some dialects return a TextClause instead of a raw string, so we need to convert them to a string.
+            return str(view_definition) if view_definition else ""
         except NotImplementedError:
-
-
-            return view_definition
+            return ""
 
     def _process_view(
         self,
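The new `_add_default_options` hook exists so dialect sources can layer their own engine defaults on top of the shared profiling default, which is exactly what the Teradata hunks further down do with `poolclass`. A minimal sketch of the override pattern (`MySource` and the `pool_pre_ping` default are hypothetical; import paths are assumed from the file layout in this diff):

```python
from datahub.ingestion.source.sql.sql_common import SQLAlchemySource, SQLCommonConfig


class MySource(SQLAlchemySource):
    def _add_default_options(self, sql_config: SQLCommonConfig) -> None:
        # Keep the base max_overflow default from the hunk above.
        super()._add_default_options(sql_config)
        # setdefault never clobbers an option the user set explicitly.
        sql_config.options.setdefault("pool_pre_ping", True)
```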
datahub/ingestion/source/sql/sql_types.py

@@ -317,10 +317,10 @@ def resolve_snowflake_modified_type(type_string: str) -> Any:
     match = re.match(r"([a-zA-Z_]+)\(\d+,\s\d+\)", type_string)
     if match:
         modified_type_base = match.group(1)  # Extract the base type
-        return SNOWFLAKE_TYPES_MAP.get(modified_type_base
+        return SNOWFLAKE_TYPES_MAP.get(modified_type_base)
 
     # Fallback for types without precision/scale
-    return SNOWFLAKE_TYPES_MAP.get(type_string
+    return SNOWFLAKE_TYPES_MAP.get(type_string)
 
 
 # see https://github.com/googleapis/python-bigquery-sqlalchemy/blob/main/sqlalchemy_bigquery/_types.py#L32
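The regex in `resolve_snowflake_modified_type` peels off a `(precision, scale)` suffix so a parameterized type resolves through its base name. A quick standalone check of the pattern from the hunk:

```python
import re

# Pattern copied from the hunk above: a base type name followed by "(p, s)".
pattern = r"([a-zA-Z_]+)\(\d+,\s\d+\)"

match = re.match(pattern, "NUMBER(38, 0)")
assert match is not None and match.group(1) == "NUMBER"  # lookup key is the base type

# A type without precision/scale skips the branch and uses the plain lookup.
assert re.match(pattern, "VARIANT") is None
```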
datahub/ingestion/source/sql/teradata.py

@@ -22,6 +22,7 @@ from sqlalchemy import create_engine, inspect
 from sqlalchemy.engine import Engine
 from sqlalchemy.engine.base import Connection
 from sqlalchemy.engine.reflection import Inspector
+from sqlalchemy.pool import QueuePool
 from sqlalchemy.sql.expression import text
 from teradatasqlalchemy.dialect import TeradataDialect
 from teradatasqlalchemy.options import configure

@@ -179,10 +180,11 @@ def optimized_get_columns(
     connection: Connection,
     table_name: str,
     schema: Optional[str] = None,
-    tables_cache: MutableMapping[str, List[TeradataTable]] =
+    tables_cache: Optional[MutableMapping[str, List[TeradataTable]]] = None,
     use_qvci: bool = False,
     **kw: Dict[str, Any],
 ) -> List[Dict]:
+    tables_cache = tables_cache or {}
     if schema is None:
         schema = self.default_schema_name
 

@@ -313,9 +315,10 @@ def optimized_get_view_definition(
     connection: Connection,
     view_name: str,
     schema: Optional[str] = None,
-    tables_cache: MutableMapping[str, List[TeradataTable]] =
+    tables_cache: Optional[MutableMapping[str, List[TeradataTable]]] = None,
     **kw: Dict[str, Any],
 ) -> Optional[str]:
+    tables_cache = tables_cache or {}
     if schema is None:
         schema = self.default_schema_name
 

@@ -648,7 +651,7 @@ ORDER by DataBaseName, TableName;
         )
 
         # Disabling the below because the cached view definition is not the view definition the column in tablesv actually holds the last statement executed against the object... not necessarily the view definition
-        # setattr(
+        # setattr(
         #     TeradataDialect,
         #     "get_view_definition",
         #     lambda self, connection, view_name, schema=None, **kw: optimized_get_view_definition(

@@ -678,6 +681,16 @@ ORDER by DataBaseName, TableName;
         if self.config.stateful_ingestion:
             self.config.stateful_ingestion.remove_stale_metadata = False
 
+    def _add_default_options(self, sql_config: SQLCommonConfig) -> None:
+        """Add Teradata-specific default options"""
+        super()._add_default_options(sql_config)
+        if sql_config.is_profiling_enabled():
+            # Sqlalchemy uses QueuePool by default however Teradata uses SingletonThreadPool.
+            # SingletonThreadPool does not support parellel connections. For using profiling, we need to use QueuePool.
+            # https://docs.sqlalchemy.org/en/20/core/pooling.html#connection-pool-configuration
+            # https://github.com/Teradata/sqlalchemy-teradata/issues/96
+            sql_config.options.setdefault("poolclass", QueuePool)
+
     @classmethod
     def create(cls, config_dict, ctx):
         config = TeradataConfig.parse_obj(config_dict)

@@ -705,6 +718,7 @@ ORDER by DataBaseName, TableName;
         # This method can be overridden in the case that you want to dynamically
         # run on multiple databases.
         url = self.config.get_sql_alchemy_url()
+
         logger.debug(f"sql_alchemy_url={url}")
         engine = create_engine(url, **self.config.options)
         with engine.connect() as conn:

@@ -734,7 +748,7 @@ ORDER by DataBaseName, TableName;
         else:
             raise Exception("Unable to get database name from Sqlalchemy inspector")
 
-    def cached_loop_tables(
+    def cached_loop_tables(
         self,
         inspector: Inspector,
         schema: str,

@@ -770,7 +784,7 @@ ORDER by DataBaseName, TableName;
             break
         return description, properties, location
 
-    def cached_loop_views(
+    def cached_loop_views(
         self,
         inspector: Inspector,
         schema: str,
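Taken together, the base hook and the Teradata override amount to building the engine with an explicit queueing pool whenever profiling is on. A rough sketch of the resulting `create_engine` call (the DSN and the overflow value stand in for the configured URL and `profiling.max_workers`; neither is taken from the diff):

```python
from sqlalchemy import create_engine
from sqlalchemy.pool import QueuePool

# Effective engine options once both setdefault calls have run.
engine = create_engine(
    "teradatasql://user:password@host",  # placeholder DSN
    poolclass=QueuePool,  # the Teradata dialect otherwise selects SingletonThreadPool
    max_overflow=10,      # supplied by the base hook from profiling.max_workers
)
```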
datahub/ingestion/source/sql/hive.py

@@ -142,7 +142,7 @@ def get_table_comment(self, connection, table_name: str, schema: str = None, **kw):
                 if col_value is not None:
                     properties[col_name] = col_value
 
-            return {"text": properties.get("comment"
+            return {"text": properties.get("comment"), "properties": properties}
         else:
             return self.get_table_comment_default(connection, table_name, schema)
     except Exception:

@@ -483,7 +483,7 @@ def _parse_struct_fields(parts):
 
 
 def _parse_basic_datatype(s):
-    for sql_type in _all_atomic_types
+    for sql_type in _all_atomic_types:
         if isinstance(s, sql_type):
             return {
                 "type": _all_atomic_types[sql_type],
datahub/ingestion/source/state/stale_entity_removal_handler.py

@@ -114,14 +114,10 @@ class StaleEntityRemovalHandler(
         self.stateful_ingestion_config: Optional[StatefulStaleMetadataRemovalConfig] = (
             config.stateful_ingestion
         )
-        self.checkpointing_enabled: bool = (
-
-
-
-                and self.stateful_ingestion_config
-                and self.stateful_ingestion_config.remove_stale_metadata
-            )
-            else False
+        self.checkpointing_enabled: bool = bool(
+            self.state_provider.is_stateful_ingestion_configured()
+            and self.stateful_ingestion_config
+            and self.stateful_ingestion_config.remove_stale_metadata
         )
         self._job_id = self._init_job_id()
         self._urns_to_skip: Set[str] = set()