acryl-datahub 0.15.0.2rc7__py3-none-any.whl → 0.15.0.2rc8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub may be problematic.
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/METADATA +2335 -2337
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/RECORD +157 -157
- datahub/__init__.py +1 -1
- datahub/api/entities/assertion/assertion_operator.py +3 -5
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/assertion_operator.py +3 -5
- datahub/api/entities/dataproduct/dataproduct.py +4 -4
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/cli/cli_utils.py +1 -1
- datahub/cli/docker_cli.py +6 -6
- datahub/cli/lite_cli.py +2 -2
- datahub/cli/migrate.py +3 -3
- datahub/cli/specific/assertions_cli.py +3 -3
- datahub/cli/timeline_cli.py +1 -1
- datahub/configuration/common.py +1 -2
- datahub/configuration/config_loader.py +73 -50
- datahub/configuration/git.py +2 -2
- datahub/configuration/time_window_config.py +10 -5
- datahub/emitter/mce_builder.py +4 -8
- datahub/emitter/mcp_patch_builder.py +1 -2
- datahub/ingestion/api/incremental_lineage_helper.py +2 -8
- datahub/ingestion/api/report.py +1 -2
- datahub/ingestion/api/source_helpers.py +1 -1
- datahub/ingestion/extractor/json_schema_util.py +3 -3
- datahub/ingestion/extractor/schema_util.py +3 -5
- datahub/ingestion/fs/s3_fs.py +3 -3
- datahub/ingestion/glossary/datahub_classifier.py +6 -4
- datahub/ingestion/graph/client.py +4 -6
- datahub/ingestion/run/pipeline.py +8 -7
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/abs/source.py +19 -8
- datahub/ingestion/source/aws/glue.py +11 -11
- datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +6 -6
- datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +15 -9
- datahub/ingestion/source/bigquery_v2/lineage.py +9 -9
- datahub/ingestion/source/bigquery_v2/queries.py +1 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
- datahub/ingestion/source/bigquery_v2/usage.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +0 -1
- datahub/ingestion/source/cassandra/cassandra_utils.py +4 -4
- datahub/ingestion/source/confluent_schema_registry.py +6 -6
- datahub/ingestion/source/csv_enricher.py +29 -29
- datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
- datahub/ingestion/source/dbt/dbt_common.py +9 -7
- datahub/ingestion/source/dremio/dremio_api.py +4 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
- datahub/ingestion/source/elastic_search.py +4 -4
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +3 -3
- datahub/ingestion/source/gcs/gcs_source.py +3 -2
- datahub/ingestion/source/ge_data_profiler.py +4 -5
- datahub/ingestion/source/ge_profiling_config.py +3 -3
- datahub/ingestion/source/iceberg/iceberg.py +3 -3
- datahub/ingestion/source/identity/azure_ad.py +3 -3
- datahub/ingestion/source/identity/okta.py +3 -3
- datahub/ingestion/source/kafka/kafka.py +11 -9
- datahub/ingestion/source/kafka_connect/kafka_connect.py +2 -3
- datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
- datahub/ingestion/source/looker/looker_common.py +19 -19
- datahub/ingestion/source/looker/looker_config.py +3 -3
- datahub/ingestion/source/looker/looker_source.py +25 -25
- datahub/ingestion/source/looker/looker_template_language.py +3 -3
- datahub/ingestion/source/looker/looker_usage.py +5 -7
- datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
- datahub/ingestion/source/looker/lookml_source.py +13 -15
- datahub/ingestion/source/looker/view_upstream.py +5 -5
- datahub/ingestion/source/mlflow.py +4 -4
- datahub/ingestion/source/mongodb.py +6 -4
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +24 -26
- datahub/ingestion/source/openapi.py +9 -9
- datahub/ingestion/source/powerbi/config.py +12 -12
- datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
- datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
- datahub/ingestion/source/powerbi/powerbi.py +6 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
- datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
- datahub/ingestion/source/redshift/config.py +3 -3
- datahub/ingestion/source/redshift/redshift.py +12 -12
- datahub/ingestion/source/redshift/usage.py +8 -8
- datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/salesforce.py +26 -25
- datahub/ingestion/source/schema/json_schema.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +3 -3
- datahub/ingestion/source/sigma/sigma_api.py +12 -10
- datahub/ingestion/source/snowflake/snowflake_config.py +9 -7
- datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_tag.py +7 -7
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +13 -4
- datahub/ingestion/source/sql/athena.py +1 -3
- datahub/ingestion/source/sql/clickhouse.py +8 -14
- datahub/ingestion/source/sql/oracle.py +1 -3
- datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
- datahub/ingestion/source/sql/teradata.py +16 -3
- datahub/ingestion/source/state/profiling_state_handler.py +3 -3
- datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
- datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/tableau/tableau.py +48 -49
- datahub/ingestion/source/unity/config.py +3 -1
- datahub/ingestion/source/unity/proxy.py +1 -1
- datahub/ingestion/source/unity/source.py +3 -3
- datahub/ingestion/source/unity/usage.py +3 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
- datahub/ingestion/source/usage/usage_common.py +1 -1
- datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
- datahub/ingestion/transformer/add_dataset_properties.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
- datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
- datahub/ingestion/transformer/tags_to_terms.py +7 -7
- datahub/integrations/assertion/snowflake/compiler.py +10 -10
- datahub/lite/duckdb_lite.py +12 -10
- datahub/metadata/_schema_classes.py +1 -1
- datahub/metadata/schema.avsc +6 -2
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
- datahub/secret/secret_common.py +14 -8
- datahub/specific/aspect_helpers/custom_properties.py +1 -2
- datahub/sql_parsing/schema_resolver.py +5 -10
- datahub/sql_parsing/sql_parsing_aggregator.py +16 -16
- datahub/sql_parsing/sqlglot_lineage.py +5 -4
- datahub/sql_parsing/sqlglot_utils.py +3 -2
- datahub/telemetry/stats.py +1 -2
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +10 -10
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/logging_manager.py +2 -2
- datahub/utilities/lossy_collections.py +3 -3
- datahub/utilities/mapping.py +3 -3
- datahub/utilities/serialized_lru_cache.py +3 -1
- datahub/utilities/sqlalchemy_query_combiner.py +6 -6
- datahub/utilities/sqllineage_patch.py +1 -1
- datahub/utilities/stats_collections.py +3 -1
- datahub/utilities/urns/urn_iter.py +2 -2
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/top_level.txt +0 -0
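Most of the hunks excerpted below appear to be formatter-driven reflows rather than behavior changes: long assignments are rewrapped by parenthesizing the right-hand side, and assert messages are wrapped in parentheses on the assert statement. A minimal sketch of the two recurring patterns, using hypothetical names rather than code from the package:

```python
# Illustrative sketch only -- hypothetical names, not code from acryl-datahub.
lookup: dict = {}
source_type = "example"
mapped_type = "mapped-example"

# Pattern 1: long assignments are wrapped by parenthesizing the right-hand
# side instead of breaking inside the subscript or type annotation.
lookup[source_type] = (
    mapped_type
)

# Pattern 2: assert messages stay on the assert statement and are wrapped
# in parentheses.
assert source_type in lookup, (
    f"Expected {source_type} to be registered in the mapping"
)
```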
@@ -506,16 +506,18 @@ class DBTNode:
     materialization: Optional[str]  # table, view, ephemeral, incremental, snapshot
     # see https://docs.getdbt.com/reference/artifacts/manifest-json
     catalog_type: Optional[str]
-    missing_from_catalog:
+    missing_from_catalog: (
+        bool  # indicates if the node was missing from the catalog.json
+    )

     owner: Optional[str]

     columns: List[DBTColumn] = field(default_factory=list)
     upstream_nodes: List[str] = field(default_factory=list)  # list of upstream dbt_name
     upstream_cll: List[DBTColumnLineageInfo] = field(default_factory=list)
-    raw_sql_parsing_result: Optional[
-
-
+    raw_sql_parsing_result: Optional[SqlParsingResult] = (
+        None  # only set for nodes that don't depend on ephemeral models
+    )
     cll_debug_info: Optional[SqlParsingDebugInfo] = None

     meta: Dict[str, Any] = field(default_factory=dict)

@@ -869,10 +871,10 @@ class DBTSourceBase(StatefulIngestionSourceBase):
                 "platform": DBT_PLATFORM,
                 "name": node.dbt_name,
                 "instance": self.config.platform_instance,
+                # Ideally we'd include the env unconditionally. However, we started out
+                # not including env in the guid, so we need to maintain backwards compatibility
+                # with existing PROD assertions.
                 **(
-                    # Ideally we'd include the env unconditionally. However, we started out
-                    # not including env in the guid, so we need to maintain backwards compatibility
-                    # with existing PROD assertions.
                     {"env": self.config.env}
                     if self.config.env != mce_builder.DEFAULT_ENV
                     and self.config.include_env_in_assertion_guid
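The relocated comment in the hunk above explains why `env` is folded into the assertion guid only when it differs from the default. A hedged sketch of that conditional dict-unpacking idiom, with a hypothetical helper and field values rather than the package's own code:

```python
# Hedged sketch of the conditional-guid-field idiom shown above; the helper
# and its field values are hypothetical stand-ins, not the dbt source's code.
DEFAULT_ENV = "PROD"


def assertion_guid_fields(env: str, include_env_in_guid: bool) -> dict:
    return {
        "platform": "dbt",
        "name": "some_assertion",
        # "env" is only added when it would actually change the guid; leaving
        # it out for the default env keeps previously generated PROD guids stable.
        **({"env": env} if env != DEFAULT_ENV and include_env_in_guid else {}),
    }


print(assertion_guid_fields("PROD", True))  # no "env" key -> guid unchanged
print(assertion_guid_fields("DEV", True))   # adds "env": "DEV"
```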
@@ -181,7 +181,7 @@ class DremioAPIOperations:
             return

         # On-prem Dremio authentication (PAT or Basic Auth)
-        for
+        for _ in range(1, self._retry_count + 1):
             try:
                 if connection_args.authentication_method == "PAT":
                     self.session.headers.update(
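The hunk above introduces a bounded retry loop over `self._retry_count` around the on-prem authentication attempt. A hedged, self-contained sketch of that shape, using hypothetical names (`do_login`, `TransientError`) rather than the Dremio source's helpers:

```python
# Illustrative sketch only; do_login, TransientError, and retry_count are
# hypothetical stand-ins, not names from the Dremio source.
from typing import Optional


class TransientError(Exception):
    pass


def do_login(attempt: int, succeed_on: int) -> None:
    # Simulate a login that fails transiently until the final attempt.
    if attempt < succeed_on:
        raise TransientError(f"attempt {attempt} failed")


def login_with_retries(retry_count: int) -> None:
    last_error: Optional[Exception] = None
    for attempt in range(1, retry_count + 1):
        try:
            do_login(attempt, succeed_on=retry_count)
            return  # success: stop retrying
        except TransientError as e:
            last_error = e
    raise RuntimeError("authentication failed after retries") from last_error


login_with_retries(3)
```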
@@ -191,9 +191,9 @@ class DremioAPIOperations:
                     )
                     return
                 else:
-                    assert (
-
-                    )
+                    assert connection_args.username and connection_args.password, (
+                        "Username and password are required for authentication"
+                    )
                     host = connection_args.hostname
                     port = connection_args.port
                     protocol = "https" if connection_args.tls else "http"

@@ -101,9 +101,9 @@ class DremioToDataHubSourceTypeMapping:
         Add a new source type if not in the map (e.g., Dremio ARP).
         """
         dremio_source_type = dremio_source_type.upper()
-        DremioToDataHubSourceTypeMapping.SOURCE_TYPE_MAPPING[
-
-
+        DremioToDataHubSourceTypeMapping.SOURCE_TYPE_MAPPING[dremio_source_type] = (
+            datahub_source_type
+        )

         if category:
             if category.lower() == "file_object_storage":

@@ -111,10 +111,10 @@ class ElasticToSchemaFieldConverter:

     @staticmethod
     def get_column_type(elastic_column_type: str) -> SchemaFieldDataType:
-        type_class: Optional[
-
-
-
+        type_class: Optional[Type] = (
+            ElasticToSchemaFieldConverter._field_type_to_schema_field_type.get(
+                elastic_column_type
+            )
         )
         if type_class is None:
             logger.warning(

@@ -155,9 +155,9 @@ class SoftDeletedEntitiesCleanup:
         current_count = self.report.num_hard_deleted_by_type.get(entity_type, 0)
         self.report.num_hard_deleted_by_type[entity_type] = current_count + 1
         if entity_type not in self.report.sample_hard_deleted_aspects_by_type:
-            self.report.sample_hard_deleted_aspects_by_type[
-
-
+            self.report.sample_hard_deleted_aspects_by_type[entity_type] = (
+                LossyList()
+            )
         self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn)

     def delete_entity(self, urn: str) -> None:

@@ -141,8 +141,9 @@ class GCSSource(StatefulIngestionSourceBase):
         source.source_config.platform = PLATFORM_GCS

         source.is_s3_platform = lambda: True  # type: ignore
-        source.create_s3_path = lambda bucket_name, key: unquote(
-
+        source.create_s3_path = lambda bucket_name, key: unquote(  # type: ignore
+            f"s3://{bucket_name}/{key}"
+        )
         return source

     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:

@@ -1,3 +1,5 @@
+from datahub.utilities._markupsafe_compat import MARKUPSAFE_PATCHED
+
 import collections
 import concurrent.futures
 import contextlib

@@ -10,7 +12,6 @@ import threading
 import traceback
 import unittest.mock
 import uuid
-from datahub.utilities._markupsafe_compat import MARKUPSAFE_PATCHED
 from functools import lru_cache
 from typing import (
     TYPE_CHECKING,

@@ -326,7 +327,7 @@ def _is_single_row_query_method(query: Any) -> bool:


 def _run_with_query_combiner(
-    method: Callable[Concatenate["_SingleDatasetProfiler", P], None]
+    method: Callable[Concatenate["_SingleDatasetProfiler", P], None],
 ) -> Callable[Concatenate["_SingleDatasetProfiler", P], None]:
     @functools.wraps(method)
     def inner(

@@ -1536,9 +1537,7 @@ def create_bigquery_temp_table(
     query_job: Optional["google.cloud.bigquery.job.query.QueryJob"] = (
         # In google-cloud-bigquery 3.15.0, the _query_job attribute was
        # made public and renamed to query_job.
-        cursor.query_job
-        if hasattr(cursor, "query_job")
-        else cursor._query_job  # type: ignore[attr-defined]
+        cursor.query_job if hasattr(cursor, "query_job") else cursor._query_job  # type: ignore[attr-defined]
     )
     assert query_job
     temp_destination_table = query_job.destination

@@ -220,9 +220,9 @@ class GEProfilingConfig(GEProfilingBaseConfig):
                 )
                 values[field_level_metric] = False

-            assert (
-
-            )
+            assert max_num_fields_to_profile is None, (
+                f"{max_num_fields_to_profile_key} should be set to None"
+            )

         # Disable expensive queries.
         if values.get("turn_off_expensive_profiling_metrics"):

@@ -296,9 +296,9 @@ class IcebergSource(StatefulIngestionSourceBase):
             custom_properties["snapshot-id"] = str(
                 table.current_snapshot().snapshot_id
             )
-            custom_properties[
-
-
+            custom_properties["manifest-list"] = (
+                table.current_snapshot().manifest_list
+            )
         dataset_properties = DatasetPropertiesClass(
             name=table.name()[-1],
             description=table.metadata.properties.get("comment", None),

@@ -354,9 +354,9 @@ class AzureADSource(StatefulIngestionSourceBase):
             yield MetadataWorkUnit(id=group_status_wu_id, mcp=group_status_mcp)

         # Populate GroupMembership Aspects for CorpUsers
-        datahub_corp_user_urn_to_group_membership: Dict[
-
-
+        datahub_corp_user_urn_to_group_membership: Dict[str, GroupMembershipClass] = (
+            defaultdict(lambda: GroupMembershipClass(groups=[]))
+        )
         if (
             self.config.ingest_group_membership
             and len(self.selected_azure_ad_groups) > 0

@@ -344,9 +344,9 @@ class OktaSource(StatefulIngestionSourceBase):
             ).as_workunit()

         # Step 2: Populate GroupMembership Aspects for CorpUsers
-        datahub_corp_user_urn_to_group_membership: Dict[
-
-
+        datahub_corp_user_urn_to_group_membership: Dict[str, GroupMembershipClass] = (
+            defaultdict(lambda: GroupMembershipClass(groups=[]))
+        )
         if self.config.ingest_group_membership and okta_groups is not None:
             # Fetch membership for each group.
             for okta_group in okta_groups:

@@ -419,10 +419,10 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
         custom_props = self.build_custom_properties(
             topic, topic_detail, extra_topic_config
         )
-        schema_name: Optional[
-
-
-
+        schema_name: Optional[str] = (
+            self.schema_registry_client._get_subject_for_topic(
+                topic, is_key_schema=False
+            )
         )
         if schema_name is not None:
             custom_props["Schema Name"] = schema_name

@@ -610,11 +610,13 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):

     def fetch_topic_configurations(self, topics: List[str]) -> Dict[str, dict]:
         logger.info("Fetching config details for all topics")
-        configs: Dict[
-
-
-
-
+        configs: Dict[ConfigResource, concurrent.futures.Future] = (
+            self.admin_client.describe_configs(
+                resources=[
+                    ConfigResource(ConfigResource.Type.TOPIC, t) for t in topics
+                ],
+                request_timeout=self.source_config.connection.client_timeout_seconds,
+            )
         )
         logger.debug("Waiting for config details futures to complete")
         concurrent.futures.wait(configs.values())

@@ -110,9 +110,8 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
             connector_manifest = self._get_connector_manifest(
                 connector_name, connector_url
             )
-            if (
-                connector_manifest
-                or not self.config.connector_patterns.allowed(connector_manifest.name)
+            if connector_manifest is None or not self.config.connector_patterns.allowed(
+                connector_manifest.name
             ):
                 self.report.report_dropped(connector_name)
                 continue
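The hunk above tightens the skip condition to an explicit `is None` check, so a connector is only dropped when its manifest could not be fetched or its name is filtered out. A hedged sketch of that guard, with hypothetical stand-ins for the source's helpers:

```python
# Illustrative sketch only; the manifest value and allowed flag stand in for
# the connector manifest fetch and the connector_patterns filter.
from typing import Optional


def should_skip(manifest: Optional[dict], allowed: bool) -> bool:
    # Skip only when nothing could be fetched, or when the connector name is
    # filtered out; the explicit None check keeps valid manifests from being
    # treated as a reason to skip.
    return manifest is None or not allowed


print(should_skip(None, True))             # True: fetch failed
print(should_skip({"name": "c1"}, False))  # True: filtered out by pattern
print(should_skip({"name": "c1"}, True))   # False: process the connector
```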
@@ -199,9 +199,9 @@ class BigQuerySinkConnector(BaseConnector):
             transforms.append(transform)
             for key in self.connector_manifest.config.keys():
                 if key.startswith(f"transforms.{name}."):
-                    transform[
-
-
+                    transform[key.replace(f"transforms.{name}.", "")] = (
+                        self.connector_manifest.config[key]
+                    )

         if "defaultDataset" in connector_manifest.config:
             defaultDataset = connector_manifest.config["defaultDataset"]

@@ -123,9 +123,9 @@ class ConfluentJDBCSourceConnector(BaseConnector):
             transforms.append(transform)
             for key in self.connector_manifest.config.keys():
                 if key.startswith(f"transforms.{name}."):
-                    transform[
-
-
+                    transform[key.replace(f"transforms.{name}.", "")] = (
+                        self.connector_manifest.config[key]
+                    )

         return self.JdbcParser(
             db_connection_url,

@@ -596,9 +596,9 @@ class LookerUtil:

     @staticmethod
     def _extract_view_from_field(field: str) -> str:
-        assert (
-
-        )
+        assert field.count(".") == 1, (
+            f"Error: A field must be prefixed by a view name, field is: {field}"
+        )
         return field.split(".")[0]

     @staticmethod

@@ -815,9 +815,9 @@ class LookerExplore:
     project_name: Optional[str] = None
     label: Optional[str] = None
     description: Optional[str] = None
-    upstream_views: Optional[
-
-
+    upstream_views: Optional[List[ProjectInclude]] = (
+        None  # captures the view name(s) this explore is derived from
+    )
     upstream_views_file_path: Dict[str, Optional[str]] = dataclasses_field(
         default_factory=dict
     )  # view_name is key and file_path is value. A single file may contains multiple views

@@ -889,7 +889,7 @@ class LookerExplore:
                     upstream_views.extend(parsed_explore.upstream_views or [])
                 else:
                     logger.warning(
-                        f
+                        f"Could not find extended explore {extended_explore} for explore {dict['name']} in model {model_name}"
                     )
         else:
             # we only fallback to the view_names list if this is not an extended explore

@@ -903,7 +903,7 @@ class LookerExplore:
                 )
                 if not info:
                     logger.warning(
-                        f
+                        f"Could not resolve view {view_name} for explore {dict['name']} in model {model_name}"
                     )
                 else:
                     upstream_views.append(

@@ -935,9 +935,9 @@ class LookerExplore:
         try:
             explore = client.lookml_model_explore(model, explore_name)
             views: Set[str] = set()
-            lkml_fields: List[
-
-
+            lkml_fields: List[LookmlModelExploreField] = (
+                explore_field_set_to_lkml_fields(explore)
+            )

             if explore.view_name is not None and explore.view_name != explore.name:
                 # explore is not named after a view and is instead using a from field, which is modeled as view_name.

@@ -1034,9 +1034,9 @@ class LookerExplore:
                 if measure_field.name is None:
                     continue
                 else:
-                    field_name_vs_raw_explore_field[
-                        measure_field
-
+                    field_name_vs_raw_explore_field[measure_field.name] = (
+                        measure_field
+                    )

             view_fields.append(
                 ViewField(

@@ -1072,11 +1072,11 @@ class LookerExplore:
         if view_project_map:
             logger.debug(f"views and their projects: {view_project_map}")

-        upstream_views_file_path: Dict[
-
-
-
-
+        upstream_views_file_path: Dict[str, Optional[str]] = (
+            create_upstream_views_file_path_map(
+                lkml_fields=lkml_fields,
+                view_names=views,
+            )
         )
         if upstream_views_file_path:
             logger.debug(f"views and their file-paths: {upstream_views_file_path}")

@@ -166,9 +166,9 @@ def _get_generic_definition(
     # e.g. spark1 or hive2 or druid_18
     platform = re.sub(r"[0-9]+", "", dialect_name.split("_")[0])

-    assert (
-        platform
-    )
+    assert platform is not None, (
+        f"Failed to extract a valid platform from connection {looker_connection}"
+    )
     db = looker_connection.database
     schema = looker_connection.schema  # ok for this to be None
     return platform, db, schema

@@ -250,9 +250,9 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):

     @staticmethod
     def _extract_view_from_field(field: str) -> str:
-        assert (
-
-        )
+        assert field.count(".") == 1, (
+            f"Error: A field must be prefixed by a view name, field is: {field}"
+        )
         return field.split(".")[0]

     def _get_views_from_fields(self, fields: List[str]) -> List[str]:

@@ -610,12 +610,12 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
     def _create_platform_instance_aspect(
         self,
     ) -> DataPlatformInstance:
-        assert (
-
-        )
-        assert (
-
-        )
+        assert self.source_config.platform_name, (
+            "Platform name is not set in the configuration."
+        )
+        assert self.source_config.platform_instance, (
+            "Platform instance is not set in the configuration."
+        )

         return DataPlatformInstance(
             platform=builder.make_data_platform_urn(self.source_config.platform_name),

@@ -1016,9 +1016,9 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         yield from chart_events

         # Step 2: Emit metadata events for the Dashboard itself.
-        chart_urns: Set[
-
-
+        chart_urns: Set[str] = (
+            set()
+        )  # Collect the unique child chart urns for dashboard input lineage.
         for chart_event in chart_events:
             chart_event_urn = self._extract_event_urn(chart_event)
             if chart_event_urn:

@@ -1538,20 +1538,20 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
             }
         )

-        dashboard_element: Optional[
-
-
-
-
-
-
-
-
-
-
-
+        dashboard_element: Optional[LookerDashboardElement] = (
+            self._get_looker_dashboard_element(
+                DashboardElement(
+                    id=f"looks_{look.id}",  # to avoid conflict with non-standalone looks (element.id prefixes),
+                    # we add the "looks_" prefix to look.id.
+                    title=look.title,
+                    subtitle_text=look.description,
+                    look_id=look.id,
+                    dashboard_id=None,  # As this is an independent look
+                    look=LookWithQuery(
+                        query=query, folder=look.folder, user_id=look.user_id
+                    ),
                 ),
-        )
+            )
         )

         if dashboard_element is not None:

@@ -33,9 +33,9 @@ logger = logging.getLogger(__name__)


 class SpecialVariable:
-    SPECIAL_VARIABLE_PATTERN: ClassVar[
-
-
+    SPECIAL_VARIABLE_PATTERN: ClassVar[str] = (
+        r"\b\w+(\.\w+)*\._(is_selected|in_query|is_filtered)\b"
+    )
     liquid_variable: dict

     def __init__(self, liquid_variable):

@@ -257,9 +257,9 @@ class BaseStatGenerator(ABC):

         for row in rows:
             logger.debug(row)
-            entity_stat_aspect[
-                self.
-
+            entity_stat_aspect[self.get_entity_stat_key(row)] = (
+                self.to_entity_timeseries_stat_aspect(row)
+            )

         return entity_stat_aspect

@@ -385,10 +385,8 @@ class BaseStatGenerator(ABC):
         entity_rows: List[Dict] = self._execute_query(
             entity_query_with_filters, "entity_query"
         )
-        entity_usage_stat: Dict[
-
-        ] = self._process_entity_timeseries_rows(
-            entity_rows
+        entity_usage_stat: Dict[Tuple[str, str], Any] = (
+            self._process_entity_timeseries_rows(entity_rows)
         )  # Any type to pass mypy unbound Aspect type error

         user_wise_query_with_filters: LookerQuery = self._append_filters(

@@ -38,16 +38,16 @@ def merge_parent_and_child_fields(
     # Create a map field-name vs field
     child_field_map: dict = {}
     for field in child_fields:
-        assert (
-
-        )
+        assert NAME in field, (
+            "A lookml view must have a name field"
+        )  # name is required field of lookml field array

         child_field_map[field[NAME]] = field

     for field in parent_fields:
-        assert (
-
-        )
+        assert NAME in field, (
+            "A lookml view must have a name field"
+        )  # name is required field of lookml field array

         if field[NAME] in child_field_map:
             # Fields defined in the child view take higher precedence.

@@ -482,14 +482,14 @@ class LookMLSource(StatefulIngestionSourceBase):
         if self.source_config.project_name is not None:
             return self.source_config.project_name

-        assert (
-
-        )
+        assert self.looker_client is not None, (
+            "Failed to find a configured Looker API client"
+        )
         try:
             model = self.looker_client.lookml_model(model_name, fields="project_name")
-            assert (
-
-            )
+            assert model.project_name is not None, (
+                f"Failed to find a project name for model {model_name}"
+            )
             return model.project_name
         except SDKError:
             raise ValueError(

@@ -541,9 +541,9 @@ class LookMLSource(StatefulIngestionSourceBase):
             self.reporter.git_clone_latency = datetime.now() - start_time
             self.source_config.base_folder = checkout_dir.resolve()

-        self.base_projects_folder[
-
-
+        self.base_projects_folder[BASE_PROJECT_NAME] = (
+            self.source_config.base_folder
+        )

         visited_projects: Set[str] = set()

@@ -641,9 +641,9 @@ class LookMLSource(StatefulIngestionSourceBase):
                     repo_url=remote_project.url,
                 )

-                self.base_projects_folder[
-
-
+                self.base_projects_folder[remote_project.name] = (
+                    p_checkout_dir.resolve()
+                )
                 repo = p_cloner.get_last_repo_cloned()
                 assert repo
                 remote_git_info = GitInfo(

@@ -930,9 +930,7 @@ class LookMLSource(StatefulIngestionSourceBase):
                     logger.warning(
                         f"view {maybe_looker_view.id.view_name} from model {model_name}, connection {model.connection} was previously processed via model {prev_model_name}, connection {prev_model_connection} and will likely lead to incorrect lineage to the underlying tables"
                     )
-                    if
-                        not self.source_config.emit_reachable_views_only
-                    ):
+                    if not self.source_config.emit_reachable_views_only:
                         logger.warning(
                             "Consider enabling the `emit_reachable_views_only` flag to handle this case."
                         )

@@ -484,11 +484,11 @@ class NativeDerivedViewUpstream(AbstractViewUpstream):
         )

     def __get_upstream_dataset_urn(self) -> List[str]:
-        current_view_id: Optional[
-
-
-
-
+        current_view_id: Optional[LookerViewId] = (
+            self.looker_view_id_cache.get_looker_view_id(
+                view_name=self.view_context.name(),
+                base_folder_path=self.view_context.base_folder_path,
+            )
         )

         # Current view will always be present in cache. assert will silence the lint

@@ -172,10 +172,10 @@ class MLflowSource(Source):
         """
         Get all Registered Models in MLflow Model Registry.
         """
-        registered_models: Iterable[
-
-
-
+        registered_models: Iterable[RegisteredModel] = (
+            self._traverse_mlflow_search_func(
+                search_func=self.client.search_registered_models,
+            )
         )
         return registered_models