acryl-datahub 0.15.0.2rc6__py3-none-any.whl → 0.15.0.2rc8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/METADATA +2513 -2521
- {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/RECORD +168 -168
- datahub/__init__.py +1 -1
- datahub/api/entities/assertion/assertion_operator.py +3 -5
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/assertion_operator.py +3 -5
- datahub/api/entities/dataproduct/dataproduct.py +4 -4
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/cli/cli_utils.py +1 -1
- datahub/cli/docker_cli.py +6 -6
- datahub/cli/ingest_cli.py +25 -15
- datahub/cli/lite_cli.py +2 -2
- datahub/cli/migrate.py +3 -3
- datahub/cli/specific/assertions_cli.py +3 -3
- datahub/cli/timeline_cli.py +1 -1
- datahub/configuration/common.py +1 -2
- datahub/configuration/config_loader.py +73 -50
- datahub/configuration/git.py +2 -2
- datahub/configuration/time_window_config.py +10 -5
- datahub/emitter/mce_builder.py +4 -8
- datahub/emitter/mcp_patch_builder.py +1 -2
- datahub/entrypoints.py +6 -0
- datahub/ingestion/api/incremental_lineage_helper.py +2 -8
- datahub/ingestion/api/report.py +1 -2
- datahub/ingestion/api/source_helpers.py +1 -1
- datahub/ingestion/extractor/json_schema_util.py +3 -3
- datahub/ingestion/extractor/schema_util.py +3 -5
- datahub/ingestion/fs/s3_fs.py +3 -3
- datahub/ingestion/glossary/datahub_classifier.py +6 -4
- datahub/ingestion/graph/client.py +4 -6
- datahub/ingestion/run/pipeline.py +8 -7
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/abs/source.py +19 -8
- datahub/ingestion/source/aws/glue.py +11 -11
- datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +6 -6
- datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +15 -9
- datahub/ingestion/source/bigquery_v2/lineage.py +9 -9
- datahub/ingestion/source/bigquery_v2/queries.py +1 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
- datahub/ingestion/source/bigquery_v2/usage.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +0 -1
- datahub/ingestion/source/cassandra/cassandra_utils.py +4 -4
- datahub/ingestion/source/confluent_schema_registry.py +6 -6
- datahub/ingestion/source/csv_enricher.py +29 -29
- datahub/ingestion/source/datahub/config.py +4 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
- datahub/ingestion/source/dbt/dbt_common.py +9 -7
- datahub/ingestion/source/dremio/dremio_api.py +4 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
- datahub/ingestion/source/elastic_search.py +4 -4
- datahub/ingestion/source/gc/datahub_gc.py +1 -0
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +17 -5
- datahub/ingestion/source/gcs/gcs_source.py +3 -2
- datahub/ingestion/source/ge_data_profiler.py +2 -5
- datahub/ingestion/source/ge_profiling_config.py +3 -3
- datahub/ingestion/source/iceberg/iceberg.py +3 -3
- datahub/ingestion/source/identity/azure_ad.py +3 -3
- datahub/ingestion/source/identity/okta.py +3 -3
- datahub/ingestion/source/kafka/kafka.py +11 -9
- datahub/ingestion/source/kafka_connect/kafka_connect.py +2 -3
- datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
- datahub/ingestion/source/looker/looker_common.py +19 -19
- datahub/ingestion/source/looker/looker_config.py +3 -3
- datahub/ingestion/source/looker/looker_source.py +25 -25
- datahub/ingestion/source/looker/looker_template_language.py +3 -3
- datahub/ingestion/source/looker/looker_usage.py +5 -7
- datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
- datahub/ingestion/source/looker/lookml_source.py +13 -15
- datahub/ingestion/source/looker/view_upstream.py +5 -5
- datahub/ingestion/source/mlflow.py +4 -4
- datahub/ingestion/source/mode.py +5 -5
- datahub/ingestion/source/mongodb.py +6 -4
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +24 -26
- datahub/ingestion/source/openapi.py +9 -9
- datahub/ingestion/source/powerbi/config.py +12 -12
- datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
- datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
- datahub/ingestion/source/powerbi/powerbi.py +6 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
- datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
- datahub/ingestion/source/redshift/config.py +3 -3
- datahub/ingestion/source/redshift/redshift.py +12 -12
- datahub/ingestion/source/redshift/usage.py +8 -8
- datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/salesforce.py +26 -25
- datahub/ingestion/source/schema/json_schema.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +3 -3
- datahub/ingestion/source/sigma/sigma_api.py +12 -10
- datahub/ingestion/source/snowflake/snowflake_config.py +9 -7
- datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_tag.py +7 -7
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +13 -4
- datahub/ingestion/source/sql/athena.py +1 -3
- datahub/ingestion/source/sql/clickhouse.py +8 -14
- datahub/ingestion/source/sql/oracle.py +1 -3
- datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
- datahub/ingestion/source/sql/sql_types.py +0 -1
- datahub/ingestion/source/sql/teradata.py +16 -3
- datahub/ingestion/source/state/profiling_state_handler.py +3 -3
- datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
- datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/tableau/tableau.py +245 -101
- datahub/ingestion/source/tableau/tableau_common.py +5 -2
- datahub/ingestion/source/unity/config.py +3 -1
- datahub/ingestion/source/unity/proxy.py +1 -1
- datahub/ingestion/source/unity/source.py +3 -3
- datahub/ingestion/source/unity/usage.py +3 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
- datahub/ingestion/source/usage/usage_common.py +1 -1
- datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
- datahub/ingestion/transformer/add_dataset_properties.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
- datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
- datahub/ingestion/transformer/tags_to_terms.py +7 -7
- datahub/integrations/assertion/snowflake/compiler.py +10 -10
- datahub/lite/duckdb_lite.py +12 -10
- datahub/metadata/_schema_classes.py +1 -1
- datahub/metadata/schema.avsc +6 -2
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
- datahub/secret/datahub_secrets_client.py +12 -21
- datahub/secret/secret_common.py +14 -8
- datahub/specific/aspect_helpers/custom_properties.py +1 -2
- datahub/sql_parsing/schema_resolver.py +5 -10
- datahub/sql_parsing/sql_parsing_aggregator.py +18 -16
- datahub/sql_parsing/sqlglot_lineage.py +3 -3
- datahub/sql_parsing/sqlglot_utils.py +1 -1
- datahub/telemetry/stats.py +1 -2
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +10 -10
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/logging_manager.py +2 -2
- datahub/utilities/lossy_collections.py +3 -3
- datahub/utilities/mapping.py +3 -3
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/serialized_lru_cache.py +3 -1
- datahub/utilities/sqlalchemy_query_combiner.py +6 -6
- datahub/utilities/sqllineage_patch.py +1 -1
- datahub/utilities/stats_collections.py +3 -1
- datahub/utilities/urns/_urn_base.py +28 -5
- datahub/utilities/urns/urn_iter.py +2 -2
- {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/csv_enricher.py

@@ -371,11 +371,11 @@ class CSVEnricherSource(Source):
         domain: Optional[str],
         description: Optional[str],
     ) -> Iterable[MetadataWorkUnit]:
-        maybe_terms_wu: Optional[
-            MetadataWorkUnit
-        ] = self.get_resource_glossary_terms_work_unit(
-            entity_urn=entity_urn,
-            term_associations=term_associations,
+        maybe_terms_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_glossary_terms_work_unit(
+                entity_urn=entity_urn,
+                term_associations=term_associations,
+            )
         )
         if maybe_terms_wu:
             self.report.num_glossary_term_workunits_produced += 1
@@ -389,31 +389,31 @@ class CSVEnricherSource(Source):
             self.report.num_tag_workunits_produced += 1
             yield maybe_tags_wu

-        maybe_owners_wu: Optional[
-            MetadataWorkUnit
-        ] = self.get_resource_owners_work_unit(
-            entity_urn=entity_urn,
-            owners=owners,
+        maybe_owners_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_owners_work_unit(
+                entity_urn=entity_urn,
+                owners=owners,
+            )
         )
         if maybe_owners_wu:
             self.report.num_owners_workunits_produced += 1
             yield maybe_owners_wu

-        maybe_domain_wu: Optional[
-            MetadataWorkUnit
-        ] = self.get_resource_domain_work_unit(
-            entity_urn=entity_urn,
-            domain=domain,
+        maybe_domain_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_domain_work_unit(
+                entity_urn=entity_urn,
+                domain=domain,
+            )
         )
         if maybe_domain_wu:
             self.report.num_domain_workunits_produced += 1
             yield maybe_domain_wu

-        maybe_description_wu: Optional[
-            MetadataWorkUnit
-        ] = self.get_resource_description_work_unit(
-            entity_urn=entity_urn,
-            description=description,
+        maybe_description_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_description_work_unit(
+                entity_urn=entity_urn,
+                description=description,
+            )
         )
         if maybe_description_wu:
             self.report.num_description_workunits_produced += 1
@@ -426,9 +426,9 @@ class CSVEnricherSource(Source):
         needs_write: bool,
     ) -> Tuple[EditableSchemaMetadataClass, bool]:
         field_path: str = sub_resource_row.field_path
-        term_associations: List[
-            GlossaryTermAssociationClass
-        ] = sub_resource_row.term_associations
+        term_associations: List[GlossaryTermAssociationClass] = (
+            sub_resource_row.term_associations
+        )
         tag_associations: List[TagAssociationClass] = sub_resource_row.tag_associations
         description: Optional[str] = sub_resource_row.description
         has_terms: bool = len(term_associations) > 0
@@ -517,9 +517,9 @@ class CSVEnricherSource(Source):
         # Boolean field to tell whether we need to write an MCPW.
         needs_write = False

-        current_editable_schema_metadata: Optional[
-            EditableSchemaMetadataClass
-        ] = None
+        current_editable_schema_metadata: Optional[EditableSchemaMetadataClass] = (
+            None
+        )
         if self.ctx.graph and not self.should_overwrite:
             # Fetch the current editable schema metadata
             current_editable_schema_metadata = self.ctx.graph.get_aspect(
@@ -655,9 +655,9 @@ class CSVEnricherSource(Source):
             entity_urn = row["resource"]
             entity_type = Urn.from_string(row["resource"]).get_type()

-            term_associations: List[
-                GlossaryTermAssociationClass
-            ] = self.maybe_extract_glossary_terms(row)
+            term_associations: List[GlossaryTermAssociationClass] = (
+                self.maybe_extract_glossary_terms(row)
+            )
             tag_associations: List[TagAssociationClass] = self.maybe_extract_tags(row)
             owners: List[OwnerClass] = self.maybe_extract_owners(row, is_resource_row)

datahub/ingestion/source/datahub/datahub_database_reader.py

@@ -152,7 +152,9 @@ class DataHubDatabaseReader:
     ) -> Iterable[Dict[str, Any]]:
         with self.engine.connect() as conn:
             if self.engine.dialect.name in ["postgresql", "mysql", "mariadb"]:
-                with …
+                with (
+                    conn.begin()
+                ):  # Transaction required for PostgreSQL server-side cursor
                     # Note that stream_results=True is mainly supported by PostgreSQL and MySQL-based dialects.
                     # https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.Connection.execution_options.params.stream_results
                     conn = conn.execution_options(
@@ -222,7 +224,7 @@ class DataHubDatabaseReader:
                 )
             except Exception as e:
                 logger.warning(
-                    f"…"
+                    f"Failed to parse metadata for {row['urn']}: {e}", exc_info=True
                 )
                 self.report.num_database_parse_errors += 1
                 self.report.database_parse_errors.setdefault(
datahub/ingestion/source/dbt/dbt_cloud.py

@@ -194,20 +194,20 @@ _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS = """

 _DBT_FIELDS_BY_TYPE = {
     "models": f"""
-        {…}
-        {…}
-        {…}
+        {_DBT_GRAPHQL_COMMON_FIELDS}
+        {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
+        {_DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS}
         dependsOn
         materializedType
     """,
     "seeds": f"""
-        {…}
-        {…}
-        {…}
+        {_DBT_GRAPHQL_COMMON_FIELDS}
+        {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
+        {_DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS}
     """,
     "sources": f"""
-        {…}
-        {…}
+        {_DBT_GRAPHQL_COMMON_FIELDS}
+        {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
         identifier
         sourceName
         sourceDescription
@@ -218,9 +218,9 @@ _DBT_FIELDS_BY_TYPE = {
         loader
     """,
     "snapshots": f"""
-        {…}
-        {…}
-        {…}
+        {_DBT_GRAPHQL_COMMON_FIELDS}
+        {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
+        {_DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS}
         parentsSources {{
             uniqueId
         }}
@@ -229,7 +229,7 @@ _DBT_FIELDS_BY_TYPE = {
         }}
     """,
     "tests": f"""
-        {…}
+        {_DBT_GRAPHQL_COMMON_FIELDS}
         state
         columnName
         status
@@ -315,7 +315,7 @@ class DBTCloudSource(DBTSourceBase, TestableSource):
             res = response.json()
             if "errors" in res:
                 raise ValueError(
-                    f"…"
+                    f"Unable to fetch metadata from dbt Cloud: {res['errors']}"
                 )
             data = res["data"]
         except JSONDecodeError as e:
datahub/ingestion/source/dbt/dbt_common.py

@@ -506,16 +506,18 @@ class DBTNode:
     materialization: Optional[str]  # table, view, ephemeral, incremental, snapshot
     # see https://docs.getdbt.com/reference/artifacts/manifest-json
     catalog_type: Optional[str]
-    missing_from_catalog: bool  # indicates if the node was missing from the catalog.json
+    missing_from_catalog: (
+        bool  # indicates if the node was missing from the catalog.json
+    )

     owner: Optional[str]

     columns: List[DBTColumn] = field(default_factory=list)
     upstream_nodes: List[str] = field(default_factory=list)  # list of upstream dbt_name
     upstream_cll: List[DBTColumnLineageInfo] = field(default_factory=list)
-    raw_sql_parsing_result: Optional[
-        SqlParsingResult
-    ] = None  # only set for nodes that don't depend on ephemeral models
+    raw_sql_parsing_result: Optional[SqlParsingResult] = (
+        None  # only set for nodes that don't depend on ephemeral models
+    )
     cll_debug_info: Optional[SqlParsingDebugInfo] = None

     meta: Dict[str, Any] = field(default_factory=dict)
@@ -869,10 +871,10 @@ class DBTSourceBase(StatefulIngestionSourceBase):
                     "platform": DBT_PLATFORM,
                     "name": node.dbt_name,
                     "instance": self.config.platform_instance,
+                    # Ideally we'd include the env unconditionally. However, we started out
+                    # not including env in the guid, so we need to maintain backwards compatibility
+                    # with existing PROD assertions.
                     **(
-                        # Ideally we'd include the env unconditionally. However, we started out
-                        # not including env in the guid, so we need to maintain backwards compatibility
-                        # with existing PROD assertions.
                         {"env": self.config.env}
                         if self.config.env != mce_builder.DEFAULT_ENV
                         and self.config.include_env_in_assertion_guid
datahub/ingestion/source/dremio/dremio_api.py

@@ -181,7 +181,7 @@ class DremioAPIOperations:
             return

        # On-prem Dremio authentication (PAT or Basic Auth)
-        for …
+        for _ in range(1, self._retry_count + 1):
            try:
                if connection_args.authentication_method == "PAT":
                    self.session.headers.update(
@@ -191,9 +191,9 @@ class DremioAPIOperations:
                     )
                     return
                 else:
-                    assert (
-                        connection_args.username and connection_args.password
-                    ), "Username and password are required for authentication"
+                    assert connection_args.username and connection_args.password, (
+                        "Username and password are required for authentication"
+                    )
                     host = connection_args.hostname
                     port = connection_args.port
                     protocol = "https" if connection_args.tls else "http"
datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py

@@ -101,9 +101,9 @@ class DremioToDataHubSourceTypeMapping:
         Add a new source type if not in the map (e.g., Dremio ARP).
         """
         dremio_source_type = dremio_source_type.upper()
-        DremioToDataHubSourceTypeMapping.SOURCE_TYPE_MAPPING[
-            dremio_source_type
-        ] = datahub_source_type
+        DremioToDataHubSourceTypeMapping.SOURCE_TYPE_MAPPING[dremio_source_type] = (
+            datahub_source_type
+        )

         if category:
             if category.lower() == "file_object_storage":
datahub/ingestion/source/elastic_search.py

@@ -111,10 +111,10 @@ class ElasticToSchemaFieldConverter:

     @staticmethod
     def get_column_type(elastic_column_type: str) -> SchemaFieldDataType:
-        type_class: Optional[
-            Type
-        ] = ElasticToSchemaFieldConverter._field_type_to_schema_field_type.get(
-            elastic_column_type
+        type_class: Optional[Type] = (
+            ElasticToSchemaFieldConverter._field_type_to_schema_field_type.get(
+                elastic_column_type
+            )
         )
         if type_class is None:
             logger.warning(
datahub/ingestion/source/gc/datahub_gc.py

@@ -292,6 +292,7 @@ class DataHubGcSource(Source):
             tokens = list_access_tokens.get("tokens", [])
             total = list_access_tokens.get("total", 0)
             if tokens == []:
+                # Due to a server bug we cannot rely on just total
                 break
             for token in tokens:
                 self.report.expired_tokens_revoked += 1
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py

@@ -99,6 +99,7 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):

 @dataclass
 class SoftDeletedEntitiesReport(SourceReport):
+    num_calls_made: Dict[str, int] = field(default_factory=dict)
     num_entities_found: Dict[str, int] = field(default_factory=dict)
     num_soft_deleted_entity_processed: int = 0
     num_soft_deleted_retained_due_to_age: int = 0
@@ -154,9 +155,9 @@ class SoftDeletedEntitiesCleanup:
         current_count = self.report.num_hard_deleted_by_type.get(entity_type, 0)
         self.report.num_hard_deleted_by_type[entity_type] = current_count + 1
         if entity_type not in self.report.sample_hard_deleted_aspects_by_type:
-            self.report.sample_hard_deleted_aspects_by_type[
-                entity_type
-            ] = LossyList()
+            self.report.sample_hard_deleted_aspects_by_type[entity_type] = (
+                LossyList()
+            )
         self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn)

     def delete_entity(self, urn: str) -> None:
@@ -242,6 +243,11 @@ class SoftDeletedEntitiesCleanup:

         while True:
             try:
+                if entity_type not in self.report.num_calls_made:
+                    self.report.num_calls_made[entity_type] = 1
+                else:
+                    self.report.num_calls_made[entity_type] += 1
+                self._print_report()
                 result = self.ctx.graph.execute_graphql(
                     graphql_query,
                     {
@@ -270,7 +276,13 @@ class SoftDeletedEntitiesCleanup:
                 )
                 break
             scroll_across_entities = result.get("scrollAcrossEntities")
-            if not scroll_across_entities …
+            if not scroll_across_entities:
+                break
+            search_results = scroll_across_entities.get("searchResults")
+            count = scroll_across_entities.get("count")
+            if not count or not search_results:
+                # Due to a server bug we cannot rely on just count as it was returning response like this
+                # {'count': 1, 'nextScrollId': None, 'searchResults': []}
                 break
             if entity_type == "DATA_PROCESS_INSTANCE":
                 # Temp workaround. See note in beginning of the function
@@ -282,7 +294,7 @@ class SoftDeletedEntitiesCleanup:
             self.report.num_entities_found[entity_type] += scroll_across_entities.get(
                 "count"
             )
-            for query in …
+            for query in search_results:
                 yield query["entity"]["urn"]

     def _get_urns(self) -> Iterable[str]:
datahub/ingestion/source/gcs/gcs_source.py

@@ -141,8 +141,9 @@ class GCSSource(StatefulIngestionSourceBase):
         source.source_config.platform = PLATFORM_GCS

         source.is_s3_platform = lambda: True  # type: ignore
-        source.create_s3_path = lambda bucket_name, key: unquote(
-            …
+        source.create_s3_path = lambda bucket_name, key: unquote(  # type: ignore
+            f"s3://{bucket_name}/{key}"
+        )
         return source

     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
datahub/ingestion/source/ge_data_profiler.py

@@ -267,7 +267,6 @@ def _is_single_row_query_method(query: Any) -> bool:
         "get_column_max",
         "get_column_mean",
         "get_column_stdev",
-        "get_column_stdev",
         "get_column_nonnull_count",
         "get_column_unique_count",
     }
@@ -328,7 +327,7 @@ def _is_single_row_query_method(query: Any) -> bool:


 def _run_with_query_combiner(
-    method: Callable[Concatenate["_SingleDatasetProfiler", P], None]
+    method: Callable[Concatenate["_SingleDatasetProfiler", P], None],
 ) -> Callable[Concatenate["_SingleDatasetProfiler", P], None]:
     @functools.wraps(method)
     def inner(
@@ -1538,9 +1537,7 @@ def create_bigquery_temp_table(
     query_job: Optional["google.cloud.bigquery.job.query.QueryJob"] = (
         # In google-cloud-bigquery 3.15.0, the _query_job attribute was
         # made public and renamed to query_job.
-        cursor.query_job
-        if hasattr(cursor, "query_job")
-        else cursor._query_job  # type: ignore[attr-defined]
+        cursor.query_job if hasattr(cursor, "query_job") else cursor._query_job  # type: ignore[attr-defined]
     )
     assert query_job
     temp_destination_table = query_job.destination
datahub/ingestion/source/ge_profiling_config.py

@@ -220,9 +220,9 @@ class GEProfilingConfig(GEProfilingBaseConfig):
                 )
                 values[field_level_metric] = False

-            assert (
-                max_num_fields_to_profile is None
-            ), f"{max_num_fields_to_profile_key} should be set to None"
+            assert max_num_fields_to_profile is None, (
+                f"{max_num_fields_to_profile_key} should be set to None"
+            )

         # Disable expensive queries.
         if values.get("turn_off_expensive_profiling_metrics"):
datahub/ingestion/source/iceberg/iceberg.py

@@ -296,9 +296,9 @@ class IcebergSource(StatefulIngestionSourceBase):
             custom_properties["snapshot-id"] = str(
                 table.current_snapshot().snapshot_id
             )
-            custom_properties[
-                "manifest-list"
-            ] = table.current_snapshot().manifest_list
+            custom_properties["manifest-list"] = (
+                table.current_snapshot().manifest_list
+            )
         dataset_properties = DatasetPropertiesClass(
             name=table.name()[-1],
             description=table.metadata.properties.get("comment", None),
datahub/ingestion/source/identity/azure_ad.py

@@ -354,9 +354,9 @@ class AzureADSource(StatefulIngestionSourceBase):
                 yield MetadataWorkUnit(id=group_status_wu_id, mcp=group_status_mcp)

         # Populate GroupMembership Aspects for CorpUsers
-        datahub_corp_user_urn_to_group_membership: Dict[
-            str, GroupMembershipClass
-        ] = defaultdict(lambda: GroupMembershipClass(groups=[]))
+        datahub_corp_user_urn_to_group_membership: Dict[str, GroupMembershipClass] = (
+            defaultdict(lambda: GroupMembershipClass(groups=[]))
+        )
         if (
             self.config.ingest_group_membership
             and len(self.selected_azure_ad_groups) > 0
datahub/ingestion/source/identity/okta.py

@@ -344,9 +344,9 @@ class OktaSource(StatefulIngestionSourceBase):
                 ).as_workunit()

         # Step 2: Populate GroupMembership Aspects for CorpUsers
-        datahub_corp_user_urn_to_group_membership: Dict[
-            str, GroupMembershipClass
-        ] = defaultdict(lambda: GroupMembershipClass(groups=[]))
+        datahub_corp_user_urn_to_group_membership: Dict[str, GroupMembershipClass] = (
+            defaultdict(lambda: GroupMembershipClass(groups=[]))
+        )
         if self.config.ingest_group_membership and okta_groups is not None:
             # Fetch membership for each group.
             for okta_group in okta_groups:
datahub/ingestion/source/kafka/kafka.py

@@ -419,10 +419,10 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
         custom_props = self.build_custom_properties(
             topic, topic_detail, extra_topic_config
         )
-        schema_name: Optional[
-            str
-        ] = self.schema_registry_client._get_subject_for_topic(
-            topic, is_key_schema=False
+        schema_name: Optional[str] = (
+            self.schema_registry_client._get_subject_for_topic(
+                topic, is_key_schema=False
+            )
         )
         if schema_name is not None:
             custom_props["Schema Name"] = schema_name
@@ -610,11 +610,13 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):

     def fetch_topic_configurations(self, topics: List[str]) -> Dict[str, dict]:
         logger.info("Fetching config details for all topics")
-        configs: Dict[
-            ConfigResource, concurrent.futures.Future
-        ] = self.admin_client.describe_configs(
-            resources=[ConfigResource(ConfigResource.Type.TOPIC, t) for t in topics],
-            request_timeout=self.source_config.connection.client_timeout_seconds,
+        configs: Dict[ConfigResource, concurrent.futures.Future] = (
+            self.admin_client.describe_configs(
+                resources=[
+                    ConfigResource(ConfigResource.Type.TOPIC, t) for t in topics
+                ],
+                request_timeout=self.source_config.connection.client_timeout_seconds,
+            )
         )
         logger.debug("Waiting for config details futures to complete")
         concurrent.futures.wait(configs.values())
datahub/ingestion/source/kafka_connect/kafka_connect.py

@@ -110,9 +110,8 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
             connector_manifest = self._get_connector_manifest(
                 connector_name, connector_url
             )
-            if (
-                connector_manifest …
-                or not self.config.connector_patterns.allowed(connector_manifest.name)
+            if connector_manifest is None or not self.config.connector_patterns.allowed(
+                connector_manifest.name
             ):
                 self.report.report_dropped(connector_name)
                 continue
datahub/ingestion/source/kafka_connect/sink_connectors.py

@@ -199,9 +199,9 @@ class BigQuerySinkConnector(BaseConnector):
             transforms.append(transform)
             for key in self.connector_manifest.config.keys():
                 if key.startswith(f"transforms.{name}."):
-                    transform[
-                        key.replace(f"transforms.{name}.", "")
-                    ] = self.connector_manifest.config[key]
+                    transform[key.replace(f"transforms.{name}.", "")] = (
+                        self.connector_manifest.config[key]
+                    )

         if "defaultDataset" in connector_manifest.config:
             defaultDataset = connector_manifest.config["defaultDataset"]
datahub/ingestion/source/kafka_connect/source_connectors.py

@@ -123,9 +123,9 @@ class ConfluentJDBCSourceConnector(BaseConnector):
             transforms.append(transform)
             for key in self.connector_manifest.config.keys():
                 if key.startswith(f"transforms.{name}."):
-                    transform[
-                        key.replace(f"transforms.{name}.", "")
-                    ] = self.connector_manifest.config[key]
+                    transform[key.replace(f"transforms.{name}.", "")] = (
+                        self.connector_manifest.config[key]
+                    )

         return self.JdbcParser(
             db_connection_url,
datahub/ingestion/source/looker/looker_common.py

@@ -596,9 +596,9 @@ class LookerUtil:

     @staticmethod
     def _extract_view_from_field(field: str) -> str:
-        assert (
-            field.count(".") == 1
-        ), f"Error: A field must be prefixed by a view name, field is: {field}"
+        assert field.count(".") == 1, (
+            f"Error: A field must be prefixed by a view name, field is: {field}"
+        )
         return field.split(".")[0]

     @staticmethod
@@ -815,9 +815,9 @@ class LookerExplore:
     project_name: Optional[str] = None
     label: Optional[str] = None
     description: Optional[str] = None
-    upstream_views: Optional[
-        List[ProjectInclude]
-    ] = None  # captures the view name(s) this explore is derived from
+    upstream_views: Optional[List[ProjectInclude]] = (
+        None  # captures the view name(s) this explore is derived from
+    )
     upstream_views_file_path: Dict[str, Optional[str]] = dataclasses_field(
         default_factory=dict
     )  # view_name is key and file_path is value. A single file may contains multiple views
@@ -889,7 +889,7 @@ class LookerExplore:
                     upstream_views.extend(parsed_explore.upstream_views or [])
                 else:
                     logger.warning(
-                        f"…"
+                        f"Could not find extended explore {extended_explore} for explore {dict['name']} in model {model_name}"
                     )
             else:
                 # we only fallback to the view_names list if this is not an extended explore
@@ -903,7 +903,7 @@ class LookerExplore:
                     )
                     if not info:
                         logger.warning(
-                            f"…"
+                            f"Could not resolve view {view_name} for explore {dict['name']} in model {model_name}"
                         )
                     else:
                         upstream_views.append(
@@ -935,9 +935,9 @@ class LookerExplore:
         try:
             explore = client.lookml_model_explore(model, explore_name)
             views: Set[str] = set()
-            lkml_fields: List[
-                LookmlModelExploreField
-            ] = explore_field_set_to_lkml_fields(explore)
+            lkml_fields: List[LookmlModelExploreField] = (
+                explore_field_set_to_lkml_fields(explore)
+            )

             if explore.view_name is not None and explore.view_name != explore.name:
                 # explore is not named after a view and is instead using a from field, which is modeled as view_name.
@@ -1034,9 +1034,9 @@ class LookerExplore:
                 if measure_field.name is None:
                     continue
                 else:
-                    field_name_vs_raw_explore_field[
-                        measure_field.name
-                    ] = measure_field
+                    field_name_vs_raw_explore_field[measure_field.name] = (
+                        measure_field
+                    )

                 view_fields.append(
                     ViewField(
@@ -1072,11 +1072,11 @@ class LookerExplore:
             if view_project_map:
                 logger.debug(f"views and their projects: {view_project_map}")

-            upstream_views_file_path: Dict[
-                str, Optional[str]
-            ] = create_upstream_views_file_path_map(
-                lkml_fields=lkml_fields,
-                view_names=views,
+            upstream_views_file_path: Dict[str, Optional[str]] = (
+                create_upstream_views_file_path_map(
+                    lkml_fields=lkml_fields,
+                    view_names=views,
+                )
             )
             if upstream_views_file_path:
                 logger.debug(f"views and their file-paths: {upstream_views_file_path}")
datahub/ingestion/source/looker/looker_config.py

@@ -166,9 +166,9 @@ def _get_generic_definition(
     # e.g. spark1 or hive2 or druid_18
     platform = re.sub(r"[0-9]+", "", dialect_name.split("_")[0])

-    assert (
-        platform is not None
-    ), f"Failed to extract a valid platform from connection {looker_connection}"
+    assert platform is not None, (
+        f"Failed to extract a valid platform from connection {looker_connection}"
+    )
     db = looker_connection.database
     schema = looker_connection.schema  # ok for this to be None
     return platform, db, schema