acryl-datahub 0.15.0.6rc2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2522 -2493
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +205 -192
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +4 -3
- datahub/api/entities/dataset/dataset.py +731 -42
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/check_cli.py +72 -19
- datahub/cli/docker_cli.py +3 -3
- datahub/cli/iceberg_cli.py +31 -7
- datahub/cli/ingest_cli.py +30 -93
- datahub/cli/lite_cli.py +4 -2
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/dataset_cli.py +128 -14
- datahub/configuration/common.py +10 -2
- datahub/configuration/git.py +1 -3
- datahub/configuration/kafka.py +1 -1
- datahub/emitter/mce_builder.py +28 -13
- datahub/emitter/mcp_builder.py +4 -1
- datahub/emitter/response_helper.py +145 -0
- datahub/emitter/rest_emitter.py +323 -10
- datahub/ingestion/api/decorators.py +1 -1
- datahub/ingestion/api/source_helpers.py +4 -0
- datahub/ingestion/fs/s3_fs.py +2 -2
- datahub/ingestion/glossary/classification_mixin.py +1 -5
- datahub/ingestion/graph/client.py +41 -22
- datahub/ingestion/graph/entity_versioning.py +3 -3
- datahub/ingestion/graph/filters.py +64 -37
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
- datahub/ingestion/run/pipeline.py +112 -148
- datahub/ingestion/run/sink_callback.py +77 -0
- datahub/ingestion/sink/datahub_rest.py +8 -0
- datahub/ingestion/source/abs/config.py +2 -4
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
- datahub/ingestion/source/cassandra/cassandra.py +152 -233
- datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
- datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
- datahub/ingestion/source/common/subtypes.py +12 -0
- datahub/ingestion/source/csv_enricher.py +3 -3
- datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
- datahub/ingestion/source/dbt/dbt_common.py +8 -5
- datahub/ingestion/source/dbt/dbt_core.py +11 -9
- datahub/ingestion/source/dbt/dbt_tests.py +4 -8
- datahub/ingestion/source/delta_lake/config.py +8 -1
- datahub/ingestion/source/delta_lake/report.py +4 -2
- datahub/ingestion/source/delta_lake/source.py +20 -5
- datahub/ingestion/source/dremio/dremio_api.py +4 -8
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
- datahub/ingestion/source/dynamodb/dynamodb.py +6 -0
- datahub/ingestion/source/elastic_search.py +26 -6
- datahub/ingestion/source/feast.py +27 -8
- datahub/ingestion/source/file.py +6 -3
- datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
- datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
- datahub/ingestion/source/ge_data_profiler.py +12 -15
- datahub/ingestion/source/iceberg/iceberg.py +46 -12
- datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
- datahub/ingestion/source/identity/okta.py +37 -7
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/kafka_connect/common.py +2 -7
- datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
- datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
- datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
- datahub/ingestion/source/looker/looker_common.py +6 -5
- datahub/ingestion/source/looker/looker_file_loader.py +2 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
- datahub/ingestion/source/looker/looker_source.py +1 -1
- datahub/ingestion/source/looker/looker_template_language.py +4 -2
- datahub/ingestion/source/looker/lookml_source.py +3 -2
- datahub/ingestion/source/metabase.py +57 -35
- datahub/ingestion/source/metadata/business_glossary.py +45 -3
- datahub/ingestion/source/metadata/lineage.py +2 -2
- datahub/ingestion/source/mlflow.py +365 -35
- datahub/ingestion/source/mode.py +18 -8
- datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
- datahub/ingestion/source/nifi.py +37 -11
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/openapi_parser.py +49 -17
- datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
- datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -3
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
- datahub/ingestion/source/preset.py +7 -4
- datahub/ingestion/source/pulsar.py +3 -2
- datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
- datahub/ingestion/source/redash.py +31 -7
- datahub/ingestion/source/redshift/config.py +4 -0
- datahub/ingestion/source/redshift/datashares.py +236 -0
- datahub/ingestion/source/redshift/lineage.py +6 -2
- datahub/ingestion/source/redshift/lineage_v2.py +24 -9
- datahub/ingestion/source/redshift/profile.py +1 -1
- datahub/ingestion/source/redshift/query.py +133 -33
- datahub/ingestion/source/redshift/redshift.py +46 -73
- datahub/ingestion/source/redshift/redshift_schema.py +186 -6
- datahub/ingestion/source/redshift/report.py +3 -0
- datahub/ingestion/source/s3/config.py +5 -5
- datahub/ingestion/source/s3/source.py +20 -41
- datahub/ingestion/source/salesforce.py +550 -275
- datahub/ingestion/source/schema_inference/object.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/slack/slack.py +31 -10
- datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
- datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
- datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
- datahub/ingestion/source/sql/athena.py +10 -16
- datahub/ingestion/source/sql/druid.py +1 -5
- datahub/ingestion/source/sql/hive.py +15 -6
- datahub/ingestion/source/sql/hive_metastore.py +3 -2
- datahub/ingestion/source/sql/mssql/job_models.py +29 -0
- datahub/ingestion/source/sql/mssql/source.py +11 -5
- datahub/ingestion/source/sql/oracle.py +127 -63
- datahub/ingestion/source/sql/sql_common.py +16 -18
- datahub/ingestion/source/sql/sql_types.py +2 -2
- datahub/ingestion/source/sql/teradata.py +19 -5
- datahub/ingestion/source/sql/trino.py +2 -2
- datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
- datahub/ingestion/source/superset.py +222 -62
- datahub/ingestion/source/tableau/tableau.py +22 -6
- datahub/ingestion/source/tableau/tableau_common.py +3 -2
- datahub/ingestion/source/unity/ge_profiler.py +2 -1
- datahub/ingestion/source/unity/source.py +11 -1
- datahub/ingestion/source/vertexai.py +697 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
- datahub/lite/duckdb_lite.py +3 -10
- datahub/lite/lite_local.py +1 -1
- datahub/lite/lite_util.py +4 -3
- datahub/metadata/_schema_classes.py +714 -417
- datahub/metadata/_urns/urn_defs.py +1673 -1649
- datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
- datahub/metadata/schema.avsc +16438 -16603
- datahub/metadata/schemas/AssertionInfo.avsc +3 -1
- datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
- datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
- datahub/metadata/schemas/ChartInfo.avsc +1 -0
- datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
- datahub/metadata/schemas/DataProcessKey.avsc +2 -1
- datahub/metadata/schemas/DataProductKey.avsc +2 -1
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
- datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
- datahub/metadata/schemas/IncidentInfo.avsc +130 -46
- datahub/metadata/schemas/InputFields.avsc +3 -1
- datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
- datahub/metadata/schemas/MLModelKey.avsc +3 -1
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
- datahub/metadata/schemas/PostKey.avsc +2 -1
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
- datahub/metadata/schemas/VersionProperties.avsc +18 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
- datahub/pydantic/__init__.py +0 -0
- datahub/pydantic/compat.py +58 -0
- datahub/sdk/__init__.py +30 -12
- datahub/sdk/_all_entities.py +1 -1
- datahub/sdk/_attribution.py +4 -0
- datahub/sdk/_shared.py +258 -16
- datahub/sdk/_utils.py +35 -0
- datahub/sdk/container.py +30 -6
- datahub/sdk/dataset.py +118 -20
- datahub/sdk/{_entity.py → entity.py} +24 -1
- datahub/sdk/entity_client.py +1 -1
- datahub/sdk/main_client.py +23 -0
- datahub/sdk/resolver_client.py +17 -29
- datahub/sdk/search_client.py +50 -0
- datahub/sdk/search_filters.py +374 -0
- datahub/specific/dataset.py +3 -4
- datahub/sql_parsing/_sqlglot_patch.py +2 -10
- datahub/sql_parsing/schema_resolver.py +1 -1
- datahub/sql_parsing/split_statements.py +220 -126
- datahub/sql_parsing/sql_parsing_common.py +7 -0
- datahub/sql_parsing/sqlglot_lineage.py +1 -1
- datahub/sql_parsing/sqlglot_utils.py +1 -4
- datahub/testing/check_sql_parser_result.py +5 -6
- datahub/testing/compare_metadata_json.py +7 -6
- datahub/testing/pytest_hooks.py +56 -0
- datahub/upgrade/upgrade.py +2 -2
- datahub/utilities/file_backed_collections.py +3 -14
- datahub/utilities/ingest_utils.py +106 -0
- datahub/utilities/mapping.py +1 -1
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/sentinels.py +22 -0
- datahub/utilities/unified_diff.py +5 -1
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/oracle.py

@@ -1,5 +1,6 @@
 import datetime
 import logging
+import platform
 import re
 
 # This import verifies that the dependencies are available.

@@ -85,6 +86,16 @@ class OracleConfig(BasicSQLAlchemyConfig):
         description="The data dictionary views mode, to extract information about schema objects "
         "('ALL' and 'DBA' views are supported). (https://docs.oracle.com/cd/E11882_01/nav/catalog_views.htm)",
     )
+    # oracledb settings to enable thick mode and client library location
+    enable_thick_mode: Optional[bool] = Field(
+        default=False,
+        description="Connection defaults to thin mode. Set to True to enable thick mode.",
+    )
+    thick_mode_lib_dir: Optional[str] = Field(
+        default=None,
+        description="If using thick mode on Windows or Mac, set thick_mode_lib_dir to the oracle client libraries path. "
+        "On Linux, this value is ignored, as ldconfig or LD_LIBRARY_PATH will define the location.",
+    )
 
     @pydantic.validator("service_name")
     def check_service_name(cls, v, values):

@@ -100,6 +111,18 @@ class OracleConfig(BasicSQLAlchemyConfig):
             raise ValueError("Specify one of data dictionary views mode: 'ALL', 'DBA'.")
         return values
 
+    @pydantic.validator("thick_mode_lib_dir", always=True)
+    def check_thick_mode_lib_dir(cls, v, values):
+        if (
+            v is None
+            and values.get("enable_thick_mode")
+            and (platform.system() == "Darwin" or platform.system() == "Windows")
+        ):
+            raise ValueError(
+                "Specify 'thick_mode_lib_dir' on Mac/Windows when enable_thick_mode is true"
+            )
+        return v
+
     def get_sql_alchemy_url(self):
         url = super().get_sql_alchemy_url()
         if self.service_name:

@@ -129,6 +152,7 @@ class OracleInspectorObjectWrapper:
         self.exclude_tablespaces: Tuple[str, str] = ("SYSTEM", "SYSAUX")
 
     def get_db_name(self) -> str:
+        db_name = None
         try:
             # Try to retrieve current DB name by executing query
             db_name = self._inspector_instance.bind.execute(

@@ -136,7 +160,12 @@
             ).scalar()
             return str(db_name)
         except sqlalchemy.exc.DatabaseError as e:
-
+            self.report.failure(
+                title="Error fetching database name using sys_context.",
+                message="database_fetch_error",
+                context=db_name,
+                exc=e,
+            )
             return ""
 
     def get_schema_names(self) -> List[str]:

@@ -303,8 +332,8 @@ class OracleInspectorObjectWrapper:
         try:
             coltype = ischema_names[coltype]()
         except KeyError:
-            logger.
-                f"
+            logger.info(
+                f"Unrecognized column datatype {coltype} of column {colname}"
             )
             coltype = sqltypes.NULLTYPE
 

@@ -356,8 +385,8 @@ class OracleInspectorObjectWrapper:
         COMMENT_SQL = """
             SELECT comments
             FROM dba_tab_comments
-            WHERE table_name =
-            AND owner =
+            WHERE table_name = :table_name
+            AND owner = :schema_name
         """
 
         c = self._inspector_instance.bind.execute(

@@ -374,79 +403,93 @@ class OracleInspectorObjectWrapper:
 
         text = (
             "SELECT"
-            "\nac.constraint_name,"
-            "\nac.constraint_type,"
-            "\
-            "\
-            "\
-            "\
-            "\
-            "\
-            "\nac.search_condition,"
-            "\nac.delete_rule"
-            "\nFROM dba_constraints
-            "\
-            "\
-            "\
-            "\nAND ac.
+            "\nac.constraint_name,"
+            "\nac.constraint_type,"
+            "\nacc.column_name AS local_column,"
+            "\nNULL AS remote_table,"
+            "\nNULL AS remote_column,"
+            "\nNULL AS remote_owner,"
+            "\nacc.position AS loc_pos,"
+            "\nNULL AS rem_pos,"
+            "\nac.search_condition,"
+            "\nac.delete_rule"
+            "\nFROM dba_constraints ac"
+            "\nJOIN dba_cons_columns acc"
+            "\nON ac.owner = acc.owner"
+            "\nAND ac.constraint_name = acc.constraint_name"
+            "\nAND ac.table_name = acc.table_name"
+            "\nWHERE ac.table_name = :table_name"
+            "\nAND ac.constraint_type IN ('P', 'U', 'C')"
         )
 
         if schema is not None:
             params["owner"] = schema
-            text += "\nAND ac.owner =
+            text += "\nAND ac.owner = :owner"
 
+        # Splitting into queries with UNION ALL for execution efficiency
        text += (
-            "\
-            "\
-            "\
-            "\
-            "\
-            "\
+            "\nUNION ALL"
+            "\nSELECT"
+            "\nac.constraint_name,"
+            "\nac.constraint_type,"
+            "\nacc.column_name AS local_column,"
+            "\nac.r_table_name AS remote_table,"
+            "\nrcc.column_name AS remote_column,"
+            "\nac.r_owner AS remote_owner,"
+            "\nacc.position AS loc_pos,"
+            "\nrcc.position AS rem_pos,"
+            "\nac.search_condition,"
+            "\nac.delete_rule"
+            "\nFROM dba_constraints ac"
+            "\nJOIN dba_cons_columns acc"
+            "\nON ac.owner = acc.owner"
+            "\nAND ac.constraint_name = acc.constraint_name"
+            "\nAND ac.table_name = acc.table_name"
+            "\nLEFT JOIN dba_cons_columns rcc"
+            "\nON ac.r_owner = rcc.owner"
+            "\nAND ac.r_constraint_name = rcc.constraint_name"
+            "\nAND acc.position = rcc.position"
+            "\nWHERE ac.table_name = :table_name"
+            "\nAND ac.constraint_type = 'R'"
        )
 
-
+        if schema is not None:
+            text += "\nAND ac.owner = :owner"
+
+        text += "\nORDER BY constraint_name, loc_pos"
+
         rp = self._inspector_instance.bind.execute(sql.text(text), params)
-
-        return constraint_data
+        return rp.fetchall()
 
     def get_pk_constraint(
         self, table_name: str, schema: Optional[str] = None, dblink: str = ""
     ) -> Dict:
-        denormalized_table_name = self._inspector_instance.dialect.denormalize_name(
-            table_name
-        )
-        assert denormalized_table_name
-
-        schema = self._inspector_instance.dialect.denormalize_name(
-            schema or self.default_schema_name
-        )
-
-        if schema is None:
-            schema = self._inspector_instance.dialect.default_schema_name
-
        pkeys = []
        constraint_name = None
-        constraint_data = self._get_constraint_data(
-            denormalized_table_name, schema, dblink
-        )
 
-
-        (
-
-
-
-
-
-
-
+        try:
+            for row in self._get_constraint_data(table_name, schema, dblink):
+                if row[1] == "P":  # constraint_type is 'P' for primary key
+                    if constraint_name is None:
+                        constraint_name = (
+                            self._inspector_instance.dialect.normalize_name(row[0])
+                        )
+                    col_name = self._inspector_instance.dialect.normalize_name(
+                        row[2]
+                    )  # local_column
+                    pkeys.append(col_name)
+        except Exception as e:
+            self.report.warning(
+                title="Failed to Process Primary Keys",
+                message=(
+                    f"Unable to process primary key constraints for {schema}.{table_name}. "
+                    "Ensure SELECT access on DBA_CONSTRAINTS and DBA_CONS_COLUMNS."
+                ),
+                context=f"{schema}.{table_name}",
+                exc=e,
             )
-        if
-
-            constraint_name = self._inspector_instance.dialect.normalize_name(
-                cons_name
-            )
-            pkeys.append(local_column)
+            # Return empty constraint if we can't process it
+            return {"constrained_columns": [], "name": None}
 
         return {"constrained_columns": pkeys, "name": constraint_name}
 

@@ -504,6 +547,16 @@ class OracleInspectorObjectWrapper:
                 f"dba_cons_columns{dblink} - does the user have "
                 "proper rights to the table?"
             )
+            self.report.warning(
+                title="Missing Table Permissions",
+                message=(
+                    f"Unable to query table_name from dba_cons_columns{dblink}. "
+                    "This usually indicates insufficient permissions on the target table. "
+                    f"Foreign key relationships will not be detected for {schema}.{table_name}. "
+                    "Please ensure the user has SELECT privileges on dba_cons_columns."
+                ),
+                context=f"{schema}.{table_name}",
+            )
 
         rec = fkeys[cons_name]
         rec["name"] = cons_name

@@ -550,8 +603,8 @@ class OracleInspectorObjectWrapper:
         text = "SELECT text FROM dba_views WHERE view_name=:view_name"
 
         if schema is not None:
-
-
+            params["owner"] = schema
+            text += "\nAND owner = :owner"
 
         rp = self._inspector_instance.bind.execute(sql.text(text), params).scalar()
 

@@ -586,6 +639,17 @@ class OracleSource(SQLAlchemySource):
     def __init__(self, config, ctx):
         super().__init__(config, ctx, "oracle")
 
+        # if connecting to oracle with enable_thick_mode, it must be initialized before calling
+        # create_engine, which is called in get_inspectors()
+        # https://python-oracledb.readthedocs.io/en/latest/user_guide/initialization.html#enabling-python-oracledb-thick-mode
+        if self.config.enable_thick_mode:
+            if platform.system() == "Darwin" or platform.system() == "Windows":
+                # windows and mac os require lib_dir to be set explicitly
+                oracledb.init_oracle_client(lib_dir=self.config.thick_mode_lib_dir)
+            else:
+                # linux requires configurating the library path with ldconfig or LD_LIBRARY_PATH
+                oracledb.init_oracle_client()
+
     @classmethod
     def create(cls, config_dict, ctx):
         config = OracleConfig.parse_obj(config_dict)
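The thick-mode settings above are plain config fields, so they can be exercised directly against `OracleConfig`. A minimal sketch of how they fit together (the host, service name, and client-library path are illustrative placeholders, not values from this diff):

```python
from datahub.ingestion.source.sql.oracle import OracleConfig

# Only enable_thick_mode / thick_mode_lib_dir come from this release;
# the connection values below are hypothetical.
config = OracleConfig.parse_obj(
    {
        "host_port": "oracle.example.com:1521",
        "service_name": "ORCLPDB1",
        "enable_thick_mode": True,
        # Required by the new validator on Mac/Windows; ignored on Linux,
        # where ldconfig or LD_LIBRARY_PATH locates the client libraries.
        "thick_mode_lib_dir": "/opt/oracle/instantclient",
    }
)
```

On Linux, `thick_mode_lib_dir` can be omitted entirely; the validator only insists on it when `platform.system()` reports Darwin or Windows.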
datahub/ingestion/source/sql/sql_common.py

@@ -204,7 +204,7 @@ def get_column_type(
     """
 
     TypeClass: Optional[Type] = None
-    for sql_type in _field_type_mapping
+    for sql_type in _field_type_mapping:
         if isinstance(column_type, sql_type):
             TypeClass = _field_type_mapping[sql_type]
             break

@@ -352,6 +352,15 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         )
         self.report.sql_aggregator = self.aggregator.report
 
+    def _add_default_options(self, sql_config: SQLCommonConfig) -> None:
+        """Add default SQLAlchemy options. Can be overridden by subclasses to add additional defaults."""
+        # Extra default SQLAlchemy option for better connection pooling and threading.
+        # https://docs.sqlalchemy.org/en/14/core/pooling.html#sqlalchemy.pool.QueuePool.params.max_overflow
+        if sql_config.is_profiling_enabled():
+            sql_config.options.setdefault(
+                "max_overflow", sql_config.profiling.max_workers
+            )
+
     @classmethod
     def test_connection(cls, config_dict: dict) -> TestConnectionReport:
         test_report = TestConnectionReport()

@@ -519,12 +528,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         # Known issue with sqlalchemy https://stackoverflow.com/questions/60804288/pycharm-duplicated-log-for-sqlalchemy-echo-true
         sqlalchemy_log._add_default_handler = lambda x: None  # type: ignore
 
-
-        # https://docs.sqlalchemy.org/en/14/core/pooling.html#sqlalchemy.pool.QueuePool.params.max_overflow
-        if sql_config.is_profiling_enabled():
-            sql_config.options.setdefault(
-                "max_overflow", sql_config.profiling.max_workers
-            )
+        self._add_default_options(sql_config)
 
         for inspector in self.get_inspectors():
             profiler = None

@@ -631,7 +635,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
 
         return None
 
-    def loop_tables(
+    def loop_tables(
         self,
         inspector: Inspector,
         schema: str,

@@ -969,7 +973,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
                     inspector=inspector,
                 )
             ),
-            description=column.get("comment"
+            description=column.get("comment"),
             nullable=column["nullable"],
             recursive=False,
             globalTags=gtc,

@@ -1027,16 +1031,10 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
     def _get_view_definition(self, inspector: Inspector, schema: str, view: str) -> str:
         try:
             view_definition = inspector.get_view_definition(view, schema)
-
-
-            else:
-                # Some dialects return a TextClause instead of a raw string,
-                # so we need to convert them to a string.
-                view_definition = str(view_definition)
+            # Some dialects return a TextClause instead of a raw string, so we need to convert them to a string.
+            return str(view_definition) if view_definition else ""
         except NotImplementedError:
-
-
-            return view_definition
+            return ""
 
     def _process_view(
         self,
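The new `_add_default_options` hook exists so dialect sources can layer their own engine defaults on top of the shared profiling default, which is exactly what the Teradata hunks further down do with `poolclass`. A minimal sketch of the override pattern (`MySource` and the `pool_pre_ping` default are hypothetical; import paths are assumed from the file layout in this diff):

```python
from datahub.ingestion.source.sql.sql_common import SQLAlchemySource, SQLCommonConfig


class MySource(SQLAlchemySource):
    def _add_default_options(self, sql_config: SQLCommonConfig) -> None:
        # Keep the base max_overflow default from the hunk above.
        super()._add_default_options(sql_config)
        # setdefault never clobbers an option the user set explicitly.
        sql_config.options.setdefault("pool_pre_ping", True)
```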
datahub/ingestion/source/sql/sql_types.py

@@ -317,10 +317,10 @@ def resolve_snowflake_modified_type(type_string: str) -> Any:
     match = re.match(r"([a-zA-Z_]+)\(\d+,\s\d+\)", type_string)
     if match:
         modified_type_base = match.group(1)  # Extract the base type
-        return SNOWFLAKE_TYPES_MAP.get(modified_type_base
+        return SNOWFLAKE_TYPES_MAP.get(modified_type_base)
 
     # Fallback for types without precision/scale
-    return SNOWFLAKE_TYPES_MAP.get(type_string
+    return SNOWFLAKE_TYPES_MAP.get(type_string)
 
 
 # see https://github.com/googleapis/python-bigquery-sqlalchemy/blob/main/sqlalchemy_bigquery/_types.py#L32
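The regex in `resolve_snowflake_modified_type` peels off a `(precision, scale)` suffix so a parameterized type resolves through its base name. A quick standalone check of the pattern from the hunk:

```python
import re

# Pattern copied from the hunk above: a base type name followed by "(p, s)".
pattern = r"([a-zA-Z_]+)\(\d+,\s\d+\)"

match = re.match(pattern, "NUMBER(38, 0)")
assert match is not None and match.group(1) == "NUMBER"  # lookup key is the base type

# A type without precision/scale skips the branch and uses the plain lookup.
assert re.match(pattern, "VARIANT") is None
```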
datahub/ingestion/source/sql/teradata.py

@@ -22,6 +22,7 @@ from sqlalchemy import create_engine, inspect
 from sqlalchemy.engine import Engine
 from sqlalchemy.engine.base import Connection
 from sqlalchemy.engine.reflection import Inspector
+from sqlalchemy.pool import QueuePool
 from sqlalchemy.sql.expression import text
 from teradatasqlalchemy.dialect import TeradataDialect
 from teradatasqlalchemy.options import configure

@@ -179,10 +180,11 @@ def optimized_get_columns(
     connection: Connection,
     table_name: str,
     schema: Optional[str] = None,
-    tables_cache: MutableMapping[str, List[TeradataTable]] =
+    tables_cache: Optional[MutableMapping[str, List[TeradataTable]]] = None,
     use_qvci: bool = False,
     **kw: Dict[str, Any],
 ) -> List[Dict]:
+    tables_cache = tables_cache or {}
     if schema is None:
         schema = self.default_schema_name
 

@@ -313,9 +315,10 @@ def optimized_get_view_definition(
     connection: Connection,
     view_name: str,
     schema: Optional[str] = None,
-    tables_cache: MutableMapping[str, List[TeradataTable]] =
+    tables_cache: Optional[MutableMapping[str, List[TeradataTable]]] = None,
     **kw: Dict[str, Any],
 ) -> Optional[str]:
+    tables_cache = tables_cache or {}
     if schema is None:
         schema = self.default_schema_name
 

@@ -648,7 +651,7 @@ ORDER by DataBaseName, TableName;
         )
 
         # Disabling the below because the cached view definition is not the view definition the column in tablesv actually holds the last statement executed against the object... not necessarily the view definition
-        # setattr(
+        # setattr(
         #     TeradataDialect,
         #     "get_view_definition",
         #     lambda self, connection, view_name, schema=None, **kw: optimized_get_view_definition(

@@ -678,6 +681,16 @@ ORDER by DataBaseName, TableName;
         if self.config.stateful_ingestion:
             self.config.stateful_ingestion.remove_stale_metadata = False
 
+    def _add_default_options(self, sql_config: SQLCommonConfig) -> None:
+        """Add Teradata-specific default options"""
+        super()._add_default_options(sql_config)
+        if sql_config.is_profiling_enabled():
+            # Sqlalchemy uses QueuePool by default however Teradata uses SingletonThreadPool.
+            # SingletonThreadPool does not support parellel connections. For using profiling, we need to use QueuePool.
+            # https://docs.sqlalchemy.org/en/20/core/pooling.html#connection-pool-configuration
+            # https://github.com/Teradata/sqlalchemy-teradata/issues/96
+            sql_config.options.setdefault("poolclass", QueuePool)
+
     @classmethod
     def create(cls, config_dict, ctx):
         config = TeradataConfig.parse_obj(config_dict)

@@ -705,6 +718,7 @@ ORDER by DataBaseName, TableName;
         # This method can be overridden in the case that you want to dynamically
         # run on multiple databases.
         url = self.config.get_sql_alchemy_url()
+
         logger.debug(f"sql_alchemy_url={url}")
         engine = create_engine(url, **self.config.options)
         with engine.connect() as conn:

@@ -734,7 +748,7 @@ ORDER by DataBaseName, TableName;
         else:
             raise Exception("Unable to get database name from Sqlalchemy inspector")
 
-    def cached_loop_tables(
+    def cached_loop_tables(
         self,
         inspector: Inspector,
         schema: str,

@@ -770,7 +784,7 @@ ORDER by DataBaseName, TableName;
             break
         return description, properties, location
 
-    def cached_loop_views(
+    def cached_loop_views(
         self,
         inspector: Inspector,
         schema: str,
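Taken together, the base hook and the Teradata override amount to building the engine with an explicit queueing pool whenever profiling is on. A rough sketch of the resulting `create_engine` call (the DSN and the overflow value stand in for the configured URL and `profiling.max_workers`; neither is taken from the diff):

```python
from sqlalchemy import create_engine
from sqlalchemy.pool import QueuePool

# Effective engine options once both setdefault calls have run.
engine = create_engine(
    "teradatasql://user:password@host",  # placeholder DSN
    poolclass=QueuePool,  # the Teradata dialect otherwise selects SingletonThreadPool
    max_overflow=10,      # supplied by the base hook from profiling.max_workers
)
```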
datahub/ingestion/source/sql/hive.py

@@ -142,7 +142,7 @@ def get_table_comment(self, connection, table_name: str, schema: str = None, **kw):
                 if col_value is not None:
                     properties[col_name] = col_value
 
-            return {"text": properties.get("comment"
+            return {"text": properties.get("comment"), "properties": properties}
         else:
             return self.get_table_comment_default(connection, table_name, schema)
     except Exception:

@@ -483,7 +483,7 @@ def _parse_struct_fields(parts):
 
 
 def _parse_basic_datatype(s):
-    for sql_type in _all_atomic_types
+    for sql_type in _all_atomic_types:
         if isinstance(s, sql_type):
             return {
                 "type": _all_atomic_types[sql_type],
datahub/ingestion/source/state/stale_entity_removal_handler.py

@@ -114,14 +114,10 @@ class StaleEntityRemovalHandler(
         self.stateful_ingestion_config: Optional[StatefulStaleMetadataRemovalConfig] = (
             config.stateful_ingestion
         )
-        self.checkpointing_enabled: bool = (
-
-
-
-                and self.stateful_ingestion_config
-                and self.stateful_ingestion_config.remove_stale_metadata
-            )
-            else False
+        self.checkpointing_enabled: bool = bool(
+            self.state_provider.is_stateful_ingestion_configured()
+            and self.stateful_ingestion_config
+            and self.stateful_ingestion_config.remove_stale_metadata
         )
         self._job_id = self._init_job_id()
         self._urns_to_skip: Set[str] = set()