acryl-datahub 0.15.0.1rc17__py3-none-any.whl → 0.15.0.2__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in the public registry, and is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2440 -2438
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +211 -207
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/assertion/assertion_operator.py +3 -5
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/assertion_operator.py +3 -5
- datahub/api/entities/dataproduct/dataproduct.py +4 -4
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
- datahub/cli/cli_utils.py +13 -2
- datahub/cli/delete_cli.py +3 -3
- datahub/cli/docker_cli.py +6 -6
- datahub/cli/ingest_cli.py +25 -15
- datahub/cli/lite_cli.py +2 -2
- datahub/cli/migrate.py +5 -5
- datahub/cli/specific/assertions_cli.py +3 -3
- datahub/cli/specific/structuredproperties_cli.py +84 -0
- datahub/cli/timeline_cli.py +1 -1
- datahub/configuration/common.py +1 -2
- datahub/configuration/config_loader.py +73 -50
- datahub/configuration/git.py +2 -2
- datahub/configuration/time_window_config.py +10 -5
- datahub/emitter/mce_builder.py +4 -8
- datahub/emitter/mcp_builder.py +27 -0
- datahub/emitter/mcp_patch_builder.py +1 -2
- datahub/emitter/rest_emitter.py +126 -85
- datahub/entrypoints.py +6 -0
- datahub/ingestion/api/incremental_lineage_helper.py +2 -8
- datahub/ingestion/api/report.py +1 -2
- datahub/ingestion/api/source.py +4 -2
- datahub/ingestion/api/source_helpers.py +1 -1
- datahub/ingestion/extractor/json_schema_util.py +3 -3
- datahub/ingestion/extractor/schema_util.py +3 -5
- datahub/ingestion/fs/s3_fs.py +3 -3
- datahub/ingestion/glossary/datahub_classifier.py +6 -4
- datahub/ingestion/graph/client.py +22 -19
- datahub/ingestion/graph/config.py +1 -1
- datahub/ingestion/run/pipeline.py +8 -7
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/abs/source.py +19 -8
- datahub/ingestion/source/aws/glue.py +77 -47
- datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
- datahub/ingestion/source/aws/s3_util.py +24 -1
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
- datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
- datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
- datahub/ingestion/source/bigquery_v2/queries.py +1 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
- datahub/ingestion/source/bigquery_v2/usage.py +60 -60
- datahub/ingestion/source/cassandra/cassandra.py +0 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
- datahub/ingestion/source/confluent_schema_registry.py +6 -6
- datahub/ingestion/source/csv_enricher.py +29 -29
- datahub/ingestion/source/datahub/config.py +10 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
- datahub/ingestion/source/datahub/datahub_source.py +12 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
- datahub/ingestion/source/dbt/dbt_common.py +9 -7
- datahub/ingestion/source/delta_lake/source.py +0 -5
- datahub/ingestion/source/demo_data.py +1 -1
- datahub/ingestion/source/dremio/dremio_api.py +4 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
- datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
- datahub/ingestion/source/dremio/dremio_source.py +2 -2
- datahub/ingestion/source/elastic_search.py +4 -4
- datahub/ingestion/source/fivetran/fivetran.py +1 -6
- datahub/ingestion/source/gc/datahub_gc.py +11 -14
- datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
- datahub/ingestion/source/gcs/gcs_source.py +3 -2
- datahub/ingestion/source/ge_data_profiler.py +2 -5
- datahub/ingestion/source/ge_profiling_config.py +3 -3
- datahub/ingestion/source/iceberg/iceberg.py +13 -6
- datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
- datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +3 -3
- datahub/ingestion/source/identity/okta.py +3 -3
- datahub/ingestion/source/kafka/kafka.py +11 -9
- datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
- datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
- datahub/ingestion/source/looker/looker_common.py +19 -19
- datahub/ingestion/source/looker/looker_config.py +11 -6
- datahub/ingestion/source/looker/looker_source.py +25 -25
- datahub/ingestion/source/looker/looker_template_language.py +3 -3
- datahub/ingestion/source/looker/looker_usage.py +5 -7
- datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
- datahub/ingestion/source/looker/lookml_source.py +13 -15
- datahub/ingestion/source/looker/view_upstream.py +5 -5
- datahub/ingestion/source/metabase.py +1 -6
- datahub/ingestion/source/mlflow.py +4 -9
- datahub/ingestion/source/mode.py +5 -5
- datahub/ingestion/source/mongodb.py +6 -4
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +24 -31
- datahub/ingestion/source/openapi.py +9 -9
- datahub/ingestion/source/powerbi/config.py +12 -12
- datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
- datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
- datahub/ingestion/source/powerbi/powerbi.py +6 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
- datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
- datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
- datahub/ingestion/source/redash.py +0 -5
- datahub/ingestion/source/redshift/config.py +3 -3
- datahub/ingestion/source/redshift/redshift.py +45 -46
- datahub/ingestion/source/redshift/usage.py +33 -33
- datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/s3/source.py +11 -15
- datahub/ingestion/source/salesforce.py +26 -25
- datahub/ingestion/source/schema/json_schema.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +3 -3
- datahub/ingestion/source/sigma/sigma_api.py +12 -10
- datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
- datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
- datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
- datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
- datahub/ingestion/source/sql/athena.py +1 -3
- datahub/ingestion/source/sql/clickhouse.py +8 -14
- datahub/ingestion/source/sql/oracle.py +1 -3
- datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
- datahub/ingestion/source/sql/sql_types.py +1 -2
- datahub/ingestion/source/sql/sql_utils.py +5 -0
- datahub/ingestion/source/sql/teradata.py +18 -5
- datahub/ingestion/source/state/profiling_state_handler.py +3 -3
- datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
- datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/superset.py +1 -6
- datahub/ingestion/source/tableau/tableau.py +343 -117
- datahub/ingestion/source/tableau/tableau_common.py +5 -2
- datahub/ingestion/source/unity/config.py +3 -1
- datahub/ingestion/source/unity/proxy.py +1 -1
- datahub/ingestion/source/unity/source.py +74 -74
- datahub/ingestion/source/unity/usage.py +3 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
- datahub/ingestion/source/usage/usage_common.py +1 -1
- datahub/ingestion/source_report/ingestion_stage.py +24 -20
- datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
- datahub/ingestion/transformer/add_dataset_properties.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
- datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
- datahub/ingestion/transformer/tags_to_terms.py +7 -7
- datahub/integrations/assertion/snowflake/compiler.py +10 -10
- datahub/lite/duckdb_lite.py +12 -10
- datahub/metadata/_schema_classes.py +317 -44
- datahub/metadata/_urns/urn_defs.py +69 -15
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
- datahub/metadata/schema.avsc +302 -89
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
- datahub/metadata/schemas/DatasetKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
- datahub/metadata/schemas/MLModelKey.avsc +2 -1
- datahub/metadata/schemas/MLModelProperties.avsc +96 -48
- datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
- datahub/metadata/schemas/VersionProperties.avsc +216 -0
- datahub/metadata/schemas/VersionSetKey.avsc +26 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
- datahub/secret/datahub_secrets_client.py +12 -21
- datahub/secret/secret_common.py +14 -8
- datahub/specific/aspect_helpers/custom_properties.py +1 -2
- datahub/sql_parsing/schema_resolver.py +5 -10
- datahub/sql_parsing/sql_parsing_aggregator.py +18 -16
- datahub/sql_parsing/sqlglot_lineage.py +3 -3
- datahub/sql_parsing/sqlglot_utils.py +1 -1
- datahub/telemetry/stats.py +1 -2
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +11 -11
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/logging_manager.py +2 -2
- datahub/utilities/lossy_collections.py +3 -3
- datahub/utilities/mapping.py +3 -3
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/perf_timer.py +11 -6
- datahub/utilities/serialized_lru_cache.py +3 -1
- datahub/utilities/sqlalchemy_query_combiner.py +6 -6
- datahub/utilities/sqllineage_patch.py +1 -1
- datahub/utilities/stats_collections.py +3 -1
- datahub/utilities/urns/_urn_base.py +28 -5
- datahub/utilities/urns/urn_iter.py +2 -2
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/dbt/dbt_cloud.py

@@ -194,20 +194,20 @@ _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS = """
 
 _DBT_FIELDS_BY_TYPE = {
     "models": f"""
-        {
-        {
-        {
+        {_DBT_GRAPHQL_COMMON_FIELDS}
+        {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
+        {_DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS}
         dependsOn
         materializedType
     """,
     "seeds": f"""
-        {
-        {
-        {
+        {_DBT_GRAPHQL_COMMON_FIELDS}
+        {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
+        {_DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS}
     """,
     "sources": f"""
-        {
-        {
+        {_DBT_GRAPHQL_COMMON_FIELDS}
+        {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
         identifier
         sourceName
         sourceDescription
@@ -218,9 +218,9 @@ _DBT_FIELDS_BY_TYPE = {
         loader
     """,
     "snapshots": f"""
-        {
-        {
-        {
+        {_DBT_GRAPHQL_COMMON_FIELDS}
+        {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
+        {_DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS}
         parentsSources {{
           uniqueId
         }}
@@ -229,7 +229,7 @@ _DBT_FIELDS_BY_TYPE = {
         }}
     """,
     "tests": f"""
-        {
+        {_DBT_GRAPHQL_COMMON_FIELDS}
         state
         columnName
         status
@@ -315,7 +315,7 @@ class DBTCloudSource(DBTSourceBase, TestableSource):
             res = response.json()
             if "errors" in res:
                 raise ValueError(
-                    f
+                    f"Unable to fetch metadata from dbt Cloud: {res['errors']}"
                 )
             data = res["data"]
         except JSONDecodeError as e:
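The hunks above only change how the shared GraphQL fragments are interpolated; the composition pattern itself is unchanged. As a rough illustration of that pattern (the fragment bodies below are invented placeholders, not the real contents of _DBT_GRAPHQL_COMMON_FIELDS and friends), the per-type field selections are built by embedding shared fragment strings into type-specific f-strings:

# Minimal sketch of the fragment-composition pattern used in dbt_cloud.py.
# The fragment contents are placeholders, not the actual DataHub definitions.
_DBT_GRAPHQL_COMMON_FIELDS = """
  uniqueId
  name
  description
"""

_DBT_GRAPHQL_NODE_COMMON_FIELDS = """
  database
  schema
"""

_DBT_FIELDS_BY_TYPE = {
    "models": f"""
        {_DBT_GRAPHQL_COMMON_FIELDS}
        {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
        dependsOn
        materializedType
    """,
    "tests": f"""
        {_DBT_GRAPHQL_COMMON_FIELDS}
        state
        columnName
        status
    """,
}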
datahub/ingestion/source/dbt/dbt_common.py

@@ -506,16 +506,18 @@ class DBTNode:
     materialization: Optional[str]  # table, view, ephemeral, incremental, snapshot
     # see https://docs.getdbt.com/reference/artifacts/manifest-json
     catalog_type: Optional[str]
-    missing_from_catalog:
+    missing_from_catalog: (
+        bool  # indicates if the node was missing from the catalog.json
+    )
 
     owner: Optional[str]
 
     columns: List[DBTColumn] = field(default_factory=list)
     upstream_nodes: List[str] = field(default_factory=list)  # list of upstream dbt_name
     upstream_cll: List[DBTColumnLineageInfo] = field(default_factory=list)
-    raw_sql_parsing_result: Optional[
-
-
+    raw_sql_parsing_result: Optional[SqlParsingResult] = (
+        None  # only set for nodes that don't depend on ephemeral models
+    )
     cll_debug_info: Optional[SqlParsingDebugInfo] = None
 
     meta: Dict[str, Any] = field(default_factory=dict)
@@ -869,10 +871,10 @@ class DBTSourceBase(StatefulIngestionSourceBase):
                 "platform": DBT_PLATFORM,
                 "name": node.dbt_name,
                 "instance": self.config.platform_instance,
+                # Ideally we'd include the env unconditionally. However, we started out
+                # not including env in the guid, so we need to maintain backwards compatibility
+                # with existing PROD assertions.
                 **(
-                    # Ideally we'd include the env unconditionally. However, we started out
-                    # not including env in the guid, so we need to maintain backwards compatibility
-                    # with existing PROD assertions.
                     {"env": self.config.env}
                     if self.config.env != mce_builder.DEFAULT_ENV
                     and self.config.include_env_in_assertion_guid
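The comment block that moves in the last hunk explains why env is only folded into the assertion guid for non-default environments: guids minted before the env field existed were all implicitly PROD, so adding env for PROD would change them. A minimal sketch of that effect (the stable_guid helper below is a hypothetical stand-in; DataHub's actual guid generation lives in datahub.emitter.mce_builder):

import hashlib
import json
from typing import Any, Dict

DEFAULT_ENV = "PROD"  # mirrors mce_builder.DEFAULT_ENV

def stable_guid(obj: Dict[str, Any]) -> str:
    # Hypothetical stand-in: hash a canonical JSON form of the guid inputs.
    return hashlib.md5(json.dumps(obj, sort_keys=True).encode()).hexdigest()

def assertion_guid_inputs(env: str, include_env: bool) -> Dict[str, Any]:
    inputs: Dict[str, Any] = {"platform": "dbt", "name": "my_test", "instance": None}
    # Only add env for non-PROD environments, so pre-existing PROD guids stay stable.
    if env != DEFAULT_ENV and include_env:
        inputs["env"] = env
    return inputs

# PROD assertions hash the same whether or not env inclusion is enabled.
assert stable_guid(assertion_guid_inputs("PROD", True)) == stable_guid(
    assertion_guid_inputs("PROD", False)
)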
datahub/ingestion/source/delta_lake/source.py

@@ -122,11 +122,6 @@ class DeltaLakeSource(Source):
             config_report,
         )
 
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> "Source":
-        config = DeltaLakeSourceConfig.parse_obj(config_dict)
-        return cls(config, ctx)
-
     def _parse_datatype(self, raw_field_json_str: str) -> List[SchemaFieldClass]:
         raw_field_json = json.loads(raw_field_json_str)
 
datahub/ingestion/source/demo_data.py

@@ -29,7 +29,7 @@ class DemoDataSource(Source):
 
     def __init__(self, ctx: PipelineContext, config: DemoDataConfig):
         file_config = FileSourceConfig(path=str(download_sample_data()))
-        self.file_source = GenericFileSource(ctx, file_config)
+        self.file_source: GenericFileSource = GenericFileSource(ctx, file_config)
 
     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
         yield from self.file_source.get_workunits()
datahub/ingestion/source/dremio/dremio_api.py

@@ -181,7 +181,7 @@ class DremioAPIOperations:
             return
 
         # On-prem Dremio authentication (PAT or Basic Auth)
-        for
+        for _ in range(1, self._retry_count + 1):
             try:
                 if connection_args.authentication_method == "PAT":
                     self.session.headers.update(
@@ -191,9 +191,9 @@ class DremioAPIOperations:
                     )
                     return
                 else:
-                    assert (
-
-                    )
+                    assert connection_args.username and connection_args.password, (
+                        "Username and password are required for authentication"
+                    )
                     host = connection_args.hostname
                     port = connection_args.port
                     protocol = "https" if connection_args.tls else "http"
datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py

@@ -101,9 +101,9 @@ class DremioToDataHubSourceTypeMapping:
         Add a new source type if not in the map (e.g., Dremio ARP).
         """
         dremio_source_type = dremio_source_type.upper()
-        DremioToDataHubSourceTypeMapping.SOURCE_TYPE_MAPPING[
-
-
+        DremioToDataHubSourceTypeMapping.SOURCE_TYPE_MAPPING[dremio_source_type] = (
+            datahub_source_type
+        )
 
         if category:
             if category.lower() == "file_object_storage":
datahub/ingestion/source/dremio/dremio_source.py

@@ -472,8 +472,8 @@ class DremioSource(StatefulIngestionSourceBase):
                 env=self.config.env,
                 platform_instance=self.config.platform_instance,
             )
-            self.report.
-
+            with self.report.new_stage(f"{dataset_info.resource_name}: {PROFILING}"):
+                yield from self.profiler.get_workunits(dataset_info, dataset_urn)
 
     def generate_view_lineage(
         self, dataset_urn: str, parents: List[str]
datahub/ingestion/source/elastic_search.py

@@ -111,10 +111,10 @@ class ElasticToSchemaFieldConverter:
 
     @staticmethod
    def get_column_type(elastic_column_type: str) -> SchemaFieldDataType:
-        type_class: Optional[
-
-
-
+        type_class: Optional[Type] = (
+            ElasticToSchemaFieldConverter._field_type_to_schema_field_type.get(
+                elastic_column_type
+            )
         )
         if type_class is None:
             logger.warning(
datahub/ingestion/source/fivetran/fivetran.py

@@ -16,7 +16,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor,
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.fivetran.config import (
     KNOWN_DATA_PLATFORM_MAPPING,
@@ -291,11 +291,6 @@ class FivetranSource(StatefulIngestionSourceBase):
             dpi = self._generate_dpi_from_job(job, datajob)
             yield from self._get_dpi_workunits(job, dpi)
 
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
-        config = FivetranSourceConfig.parse_obj(config_dict)
-        return cls(config, ctx)
-
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [
             *super().get_workunit_processors(),
datahub/ingestion/source/gc/datahub_gc.py

@@ -141,40 +141,36 @@ class DataHubGcSource(Source):
     ) -> Iterable[MetadataWorkUnit]:
         if self.config.cleanup_expired_tokens:
             try:
-                self.report.
-
+                with self.report.new_stage("Expired Token Cleanup"):
+                    self.revoke_expired_tokens()
             except Exception as e:
                 self.report.failure("While trying to cleanup expired token ", exc=e)
         if self.config.truncate_indices:
             try:
-                self.report.
-
+                with self.report.new_stage("Truncate Indices"):
+                    self.truncate_indices()
             except Exception as e:
                 self.report.failure("While trying to truncate indices ", exc=e)
         if self.config.soft_deleted_entities_cleanup.enabled:
             try:
-                self.report.
-
-                )
-                self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
+                with self.report.new_stage("Soft Deleted Entities Cleanup"):
+                    self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
             except Exception as e:
                 self.report.failure(
                     "While trying to cleanup soft deleted entities ", exc=e
                 )
         if self.config.dataprocess_cleanup.enabled:
             try:
-                self.report.
-
+                with self.report.new_stage("Data Process Cleanup"):
+                    yield from self.dataprocess_cleanup.get_workunits_internal()
             except Exception as e:
                 self.report.failure("While trying to cleanup data process ", exc=e)
         if self.config.execution_request_cleanup.enabled:
             try:
-                self.report.
-
+                with self.report.new_stage("Execution request Cleanup"):
+                    self.execution_request_cleanup.run()
             except Exception as e:
                 self.report.failure("While trying to cleanup execution request ", exc=e)
-        # Otherwise last stage's duration does not get calculated.
-        self.report.report_ingestion_stage_start("End")
         yield from []
 
     def truncate_indices(self) -> None:
@@ -296,6 +292,7 @@ class DataHubGcSource(Source):
             tokens = list_access_tokens.get("tokens", [])
             total = list_access_tokens.get("total", 0)
             if tokens == []:
+                # Due to a server bug we cannot rely on just total
                 break
             for token in tokens:
                 self.report.expired_tokens_revoked += 1
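The datahub_gc.py hunks replace bare report_ingestion_stage_start(...) calls with "with self.report.new_stage(...):" blocks, so each stage's duration is closed out automatically and the old trailing "End" stage becomes unnecessary. The actual context manager lives in datahub/ingestion/source_report/ingestion_stage.py (also changed in this release); the following is only a minimal sketch of what such a stage context manager can look like, not the exact DataHub implementation:

import contextlib
import logging
import time
from dataclasses import dataclass, field
from typing import Dict, Iterator

logger = logging.getLogger(__name__)

@dataclass
class StageTimingReport:
    stage_durations_seconds: Dict[str, float] = field(default_factory=dict)

    @contextlib.contextmanager
    def new_stage(self, stage: str) -> Iterator[None]:
        # Record the stage duration even if the body raises, which is what
        # removes the need for an explicit "End" stage.
        start = time.perf_counter()
        try:
            yield
        finally:
            elapsed = time.perf_counter() - start
            self.stage_durations_seconds[stage] = round(elapsed, 3)
            logger.info(f"Stage '{stage}' finished in {elapsed:.1f}s")

# Usage mirroring the pattern in the diff:
report = StageTimingReport()
with report.new_stage("Expired Token Cleanup"):
    pass  # revoke_expired_tokens() would run here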
datahub/ingestion/source/gc/execution_request_cleanup.py

@@ -29,7 +29,7 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
     )
 
     keep_history_max_days: int = Field(
-
+        90,
         description="Maximum number of days to keep execution requests for, per ingestion source",
     )
 
@@ -48,6 +48,10 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
         description="Maximum runtime in seconds for the cleanup task",
     )
 
+    limit_entities_delete: Optional[int] = Field(
+        10000, description="Max number of execution requests to hard delete."
+    )
+
     max_read_errors: int = Field(
         default=10,
         description="Maximum number of read errors before aborting",
@@ -65,6 +69,8 @@ class DatahubExecutionRequestCleanupReport(SourceReport):
     ergc_delete_errors: int = 0
     ergc_start_time: Optional[datetime.datetime] = None
     ergc_end_time: Optional[datetime.datetime] = None
+    ergc_delete_limit_reached: bool = False
+    ergc_runtime_limit_reached: bool = False
 
 
 class CleanupRecord(BaseModel):
@@ -85,12 +91,20 @@ class DatahubExecutionRequestCleanup:
         self.graph = graph
         self.report = report
         self.instance_id = int(time.time())
+        self.last_print_time = 0.0
 
         if config is not None:
             self.config = config
         else:
             self.config = DatahubExecutionRequestCleanupConfig()
 
+    def _print_report(self) -> None:
+        time_taken = round(time.time() - self.last_print_time, 1)
+        # Print report every 2 minutes
+        if time_taken > 120:
+            self.last_print_time = time.time()
+            logger.info(f"\n{self.report.as_string()}")
+
     def _to_cleanup_record(self, entry: Dict) -> CleanupRecord:
         input_aspect = (
             entry.get("aspects", {})
@@ -175,6 +189,7 @@ class DatahubExecutionRequestCleanup:
         running_guard_timeout = now_ms - 30 * 24 * 3600 * 1000
 
         for entry in self._scroll_execution_requests():
+            self._print_report()
             self.report.ergc_records_read += 1
             key = entry.ingestion_source
 
@@ -225,15 +240,12 @@ class DatahubExecutionRequestCleanup:
                     f"record timestamp: {entry.requested_at}."
                 )
             )
-            self.report.ergc_records_deleted += 1
             yield entry
 
     def _delete_entry(self, entry: CleanupRecord) -> None:
         try:
-            logger.info(
-                f"ergc({self.instance_id}): going to delete ExecutionRequest {entry.request_id}"
-            )
             self.graph.delete_entity(entry.urn, True)
+            self.report.ergc_records_deleted += 1
         except Exception as e:
             self.report.ergc_delete_errors += 1
             self.report.failure(
@@ -252,10 +264,23 @@ class DatahubExecutionRequestCleanup:
                 >= datetime.timedelta(seconds=self.config.runtime_limit_seconds)
             )
         ):
+            self.report.ergc_runtime_limit_reached = True
             logger.info(f"ergc({self.instance_id}): max runtime reached.")
             return True
         return False
 
+    def _reached_delete_limit(self) -> bool:
+        if (
+            self.config.limit_entities_delete
+            and self.report.ergc_records_deleted >= self.config.limit_entities_delete
+        ):
+            logger.info(
+                f"ergc({self.instance_id}): max delete limit reached: {self.config.limit_entities_delete}."
+            )
+            self.report.ergc_delete_limit_reached = True
+            return True
+        return False
+
     def run(self) -> None:
         if not self.config.enabled:
             logger.info(
@@ -274,7 +299,7 @@ class DatahubExecutionRequestCleanup:
         )
 
         for entry in self._scroll_garbage_records():
-            if self._reached_runtime_limit():
+            if self._reached_runtime_limit() or self._reached_delete_limit():
                 break
             self._delete_entry(entry)
 
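The new limit_entities_delete setting caps how many execution requests a single cleanup run will hard delete, alongside the existing runtime limit; records are now counted as deleted only after the delete call succeeds. A small configuration sketch (the values are illustrative; defaults are 90 days of history and 10000 deletes per run):

from datahub.ingestion.source.gc.execution_request_cleanup import (
    DatahubExecutionRequestCleanupConfig,
)

# Illustrative values, not recommendations.
config = DatahubExecutionRequestCleanupConfig(
    keep_history_max_days=30,
    runtime_limit_seconds=3600,
    limit_entities_delete=5000,
)

# The run loop stops as soon as either guard trips:
#   if self._reached_runtime_limit() or self._reached_delete_limit():
#       break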
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py

@@ -19,8 +19,8 @@ from datahub.utilities.urns._urn_base import Urn
 
 logger = logging.getLogger(__name__)
 
-
-query
+QUERY_ENTITIES = """
+query listEntities($input: ScrollAcrossEntitiesInput!) {
   scrollAcrossEntities(input: $input) {
     nextScrollId
     count
@@ -29,6 +29,9 @@ query listQueries($input: ScrollAcrossEntitiesInput!) {
       ... on QueryEntity {
         urn
       }
+      ... on DataProcessInstance {
+        urn
+      }
     }
   }
 }
@@ -96,7 +99,8 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
 
 @dataclass
 class SoftDeletedEntitiesReport(SourceReport):
-
+    num_calls_made: Dict[str, int] = field(default_factory=dict)
+    num_entities_found: Dict[str, int] = field(default_factory=dict)
     num_soft_deleted_entity_processed: int = 0
     num_soft_deleted_retained_due_to_age: int = 0
     num_soft_deleted_entity_removal_started: int = 0
@@ -151,9 +155,9 @@ class SoftDeletedEntitiesCleanup:
         current_count = self.report.num_hard_deleted_by_type.get(entity_type, 0)
         self.report.num_hard_deleted_by_type[entity_type] = current_count + 1
         if entity_type not in self.report.sample_hard_deleted_aspects_by_type:
-            self.report.sample_hard_deleted_aspects_by_type[
-
-
+            self.report.sample_hard_deleted_aspects_by_type[entity_type] = (
+                LossyList()
+            )
         self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn)
 
     def delete_entity(self, urn: str) -> None:
@@ -225,19 +229,33 @@ class SoftDeletedEntitiesCleanup:
                 time.sleep(self.config.delay)
         return futures
 
-    def
+    def _get_soft_deleted(self, graphql_query: str, entity_type: str) -> Iterable[str]:
         assert self.ctx.graph
         scroll_id: Optional[str] = None
+
+        batch_size = self.config.batch_size
+        if entity_type == "DATA_PROCESS_INSTANCE":
+            # Due to a bug in Data process instance querying this is a temp workaround
+            # to avoid a giant stacktrace by having a smaller batch size in first call
+            # This will be remove in future version after server with fix has been
+            # around for a while
+            batch_size = 10
+
         while True:
             try:
+                if entity_type not in self.report.num_calls_made:
+                    self.report.num_calls_made[entity_type] = 1
+                else:
+                    self.report.num_calls_made[entity_type] += 1
+                self._print_report()
                 result = self.ctx.graph.execute_graphql(
-
+                    graphql_query,
                     {
                         "input": {
-                            "types": [
+                            "types": [entity_type],
                             "query": "*",
                             "scrollId": scroll_id if scroll_id else None,
-                            "count":
+                            "count": batch_size,
                             "orFilters": [
                                 {
                                     "and": [
@@ -254,15 +272,29 @@ class SoftDeletedEntitiesCleanup:
                 )
             except Exception as e:
                 self.report.failure(
-                    f"While trying to get
+                    f"While trying to get {entity_type} with {scroll_id}", exc=e
                 )
                 break
             scroll_across_entities = result.get("scrollAcrossEntities")
-            if not scroll_across_entities
+            if not scroll_across_entities:
                 break
+            search_results = scroll_across_entities.get("searchResults")
+            count = scroll_across_entities.get("count")
+            if not count or not search_results:
+                # Due to a server bug we cannot rely on just count as it was returning response like this
+                # {'count': 1, 'nextScrollId': None, 'searchResults': []}
+                break
+            if entity_type == "DATA_PROCESS_INSTANCE":
+                # Temp workaround. See note in beginning of the function
+                # We make the batch size = config after call has succeeded once
+                batch_size = self.config.batch_size
             scroll_id = scroll_across_entities.get("nextScrollId")
-            self.report.
-
+            if entity_type not in self.report.num_entities_found:
+                self.report.num_entities_found[entity_type] = 0
+            self.report.num_entities_found[entity_type] += scroll_across_entities.get(
+                "count"
+            )
+            for query in search_results:
                 yield query["entity"]["urn"]
 
     def _get_urns(self) -> Iterable[str]:
@@ -275,7 +307,8 @@ class SoftDeletedEntitiesCleanup:
             status=RemovedStatusFilter.ONLY_SOFT_DELETED,
             batch_size=self.config.batch_size,
         )
-        yield from self.
+        yield from self._get_soft_deleted(QUERY_ENTITIES, "QUERY")
+        yield from self._get_soft_deleted(QUERY_ENTITIES, "DATA_PROCESS_INSTANCE")
 
     def _times_up(self) -> bool:
         if (
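The new _get_soft_deleted helper pages through scrollAcrossEntities results for a single entity type and treats an empty searchResults list as the stop condition, because the reported count cannot be trusted. A standalone sketch of that pagination loop (the run_query callable stands in for DataHubGraph.execute_graphql; filters and the DATA_PROCESS_INSTANCE batch-size workaround are omitted):

from typing import Any, Callable, Dict, Iterable, Optional

def scroll_soft_deleted_urns(
    run_query: Callable[[Dict[str, Any]], Dict[str, Any]],
    entity_type: str,
    batch_size: int,
) -> Iterable[str]:
    """Sketch of the scrollAcrossEntities pagination used by the cleanup source."""
    scroll_id: Optional[str] = None
    while True:
        variables = {
            "input": {
                "types": [entity_type],
                "query": "*",
                "scrollId": scroll_id,
                "count": batch_size,
            }
        }
        result = run_query(variables)
        scroll = result.get("scrollAcrossEntities")
        if not scroll:
            break
        search_results = scroll.get("searchResults")
        if not scroll.get("count") or not search_results:
            # The server may report a non-zero count with empty searchResults,
            # so the empty result list is the real termination signal.
            break
        for item in search_results:
            yield item["entity"]["urn"]
        scroll_id = scroll.get("nextScrollId")
        if not scroll_id:
            break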
datahub/ingestion/source/gcs/gcs_source.py

@@ -141,8 +141,9 @@ class GCSSource(StatefulIngestionSourceBase):
         source.source_config.platform = PLATFORM_GCS
 
         source.is_s3_platform = lambda: True  # type: ignore
-        source.create_s3_path = lambda bucket_name, key: unquote(
-
+        source.create_s3_path = lambda bucket_name, key: unquote(  # type: ignore
+            f"s3://{bucket_name}/{key}"
+        )
         return source
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
datahub/ingestion/source/ge_data_profiler.py

@@ -267,7 +267,6 @@ def _is_single_row_query_method(query: Any) -> bool:
         "get_column_max",
         "get_column_mean",
         "get_column_stdev",
-        "get_column_stdev",
         "get_column_nonnull_count",
         "get_column_unique_count",
     }
@@ -328,7 +327,7 @@ def _is_single_row_query_method(query: Any) -> bool:
 
 
 def _run_with_query_combiner(
-    method: Callable[Concatenate["_SingleDatasetProfiler", P], None]
+    method: Callable[Concatenate["_SingleDatasetProfiler", P], None],
 ) -> Callable[Concatenate["_SingleDatasetProfiler", P], None]:
     @functools.wraps(method)
     def inner(
@@ -1538,9 +1537,7 @@ def create_bigquery_temp_table(
         query_job: Optional["google.cloud.bigquery.job.query.QueryJob"] = (
             # In google-cloud-bigquery 3.15.0, the _query_job attribute was
             # made public and renamed to query_job.
-            cursor.query_job
-            if hasattr(cursor, "query_job")
-            else cursor._query_job  # type: ignore[attr-defined]
+            cursor.query_job if hasattr(cursor, "query_job") else cursor._query_job  # type: ignore[attr-defined]
         )
         assert query_job
         temp_destination_table = query_job.destination
datahub/ingestion/source/ge_profiling_config.py

@@ -220,9 +220,9 @@ class GEProfilingConfig(GEProfilingBaseConfig):
                 )
                 values[field_level_metric] = False
 
-            assert (
-
-            )
+            assert max_num_fields_to_profile is None, (
+                f"{max_num_fields_to_profile_key} should be set to None"
+            )
 
         # Disable expensive queries.
         if values.get("turn_off_expensive_profiling_metrics"):
datahub/ingestion/source/iceberg/iceberg.py

@@ -203,7 +203,9 @@ class IcebergSource(StatefulIngestionSourceBase):
             with PerfTimer() as timer:
                 table = thread_local.local_catalog.load_table(dataset_path)
                 time_taken = timer.elapsed_seconds()
-                self.report.report_table_load_time(
+                self.report.report_table_load_time(
+                    time_taken, dataset_name, table.metadata_location
+                )
             LOGGER.debug(f"Loaded table: {table.name()}, time taken: {time_taken}")
             yield from self._create_iceberg_workunit(dataset_name, table)
         except NoSuchPropertyException as e:
@@ -247,7 +249,10 @@ class IcebergSource(StatefulIngestionSourceBase):
                 f"Iceberg Rest Catalog server error (500 status) encountered when processing table {dataset_path}, skipping it."
             )
         except Exception as e:
-            self.report.report_failure(
+            self.report.report_failure(
+                "general",
+                f"Failed to create workunit for dataset {dataset_name}: {e}",
+            )
             LOGGER.exception(
                 f"Exception while processing table {dataset_path}, skipping it.",
             )
@@ -291,9 +296,9 @@ class IcebergSource(StatefulIngestionSourceBase):
             custom_properties["snapshot-id"] = str(
                 table.current_snapshot().snapshot_id
             )
-            custom_properties[
-
-
+            custom_properties["manifest-list"] = (
+                table.current_snapshot().manifest_list
+            )
         dataset_properties = DatasetPropertiesClass(
             name=table.name()[-1],
             description=table.metadata.properties.get("comment", None),
@@ -312,7 +317,9 @@ class IcebergSource(StatefulIngestionSourceBase):
         dataset_snapshot.aspects.append(schema_metadata)
 
         mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
-        self.report.report_table_processing_time(
+        self.report.report_table_processing_time(
+            timer.elapsed_seconds(), dataset_name, table.metadata_location
+        )
         yield MetadataWorkUnit(id=dataset_name, mce=mce)
 
         dpi_aspect = self._get_dataplatform_instance_aspect(dataset_urn=dataset_urn)