acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0.1__py3-none-any.whl
This diff compares publicly available package versions as they were released to their public registries. It is provided for informational purposes only.
Note: this release of acryl-datahub has been flagged as potentially problematic.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/METADATA +2511 -2484
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/RECORD +223 -189
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/entry_points.txt +2 -0
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -1
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +239 -0
- datahub/api/entities/external/external_tag.py +145 -0
- datahub/api/entities/external/lake_formation_external_entites.py +161 -0
- datahub/api/entities/external/restricted_text.py +247 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +124 -27
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +149 -227
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +50 -7
- datahub/cli/specific/assertions_cli.py +0 -4
- datahub/cli/specific/datacontract_cli.py +0 -3
- datahub/cli/specific/dataproduct_cli.py +0 -11
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +0 -2
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/pydantic_migration_helpers.py +7 -5
- datahub/emitter/rest_emitter.py +70 -12
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +332 -3
- datahub/ingestion/api/sink.py +3 -0
- datahub/ingestion/api/source.py +48 -44
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3449 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +100 -15
- datahub/ingestion/graph/config.py +1 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
- datahub/ingestion/run/pipeline.py +54 -2
- datahub/ingestion/sink/datahub_rest.py +13 -0
- datahub/ingestion/source/abs/source.py +1 -1
- datahub/ingestion/source/aws/aws_common.py +4 -0
- datahub/ingestion/source/aws/glue.py +489 -244
- datahub/ingestion/source/aws/tag_entities.py +292 -0
- datahub/ingestion/source/azure/azure_common.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +45 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
- datahub/ingestion/source/dbt/dbt_common.py +6 -2
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_config.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
- datahub/ingestion/source/dremio/dremio_source.py +94 -81
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +76 -28
- datahub/ingestion/source/ge_profiling_config.py +11 -0
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/iceberg/iceberg.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/looker/looker_source.py +1 -0
- datahub/ingestion/source/mlflow.py +11 -1
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +507 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/preset.py +2 -2
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +21 -1
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
- datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
- datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
- datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +119 -11
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/sql/clickhouse.py +3 -1
- datahub/ingestion/source/sql/cockroachdb.py +0 -1
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive_metastore.py +3 -11
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/source.py +239 -34
- datahub/ingestion/source/sql/mysql.py +0 -1
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/postgres.py +0 -1
- datahub/ingestion/source/sql/sql_common.py +121 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +997 -235
- datahub/ingestion/source/sql/vertica.py +10 -6
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
- datahub/ingestion/source/superset.py +58 -3
- datahub/ingestion/source/tableau/tableau.py +58 -37
- datahub/ingestion/source/tableau/tableau_common.py +4 -2
- datahub/ingestion/source/tableau/tableau_constant.py +0 -4
- datahub/ingestion/source/unity/config.py +5 -0
- datahub/ingestion/source/unity/proxy.py +118 -0
- datahub/ingestion/source/unity/source.py +195 -17
- datahub/ingestion/source/unity/tag_entities.py +295 -0
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +1522 -569
- datahub/metadata/_urns/urn_defs.py +1826 -1658
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +29 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +17758 -17097
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +237 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +1 -0
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
- datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/LogicalParent.avsc +140 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +116 -0
- datahub/sdk/chart.py +315 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +432 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +8 -2
- datahub/sdk/entity_client.py +82 -2
- datahub/sdk/lineage_client.py +683 -82
- datahub/sdk/main_client.py +46 -16
- datahub/sdk/mlmodel.py +101 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +4 -3
- datahub/sdk/search_filters.py +95 -27
- datahub/specific/chart.py +1 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
- datahub/sql_parsing/sqlglot_lineage.py +62 -13
- datahub/telemetry/telemetry.py +17 -11
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +56 -14
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/top_level.txt +0 -0
datahub/emitter/rest_emitter.py
CHANGED
@@ -4,6 +4,7 @@ import functools
 import json
 import logging
 import os
+import re
 import time
 from collections import defaultdict
 from dataclasses import dataclass
@@ -60,6 +61,10 @@ from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
     MetadataChangeProposal,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.usage import UsageAggregation
+from datahub.metadata.schema_classes import (
+    KEY_ASPECT_NAMES,
+    ChangeTypeClass,
+)
 from datahub.utilities.server_config_util import RestServiceConfig, ServiceFeature

 if TYPE_CHECKING:
@@ -104,6 +109,22 @@ BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
 )


+def preserve_unicode_escapes(obj: Any) -> Any:
+    """Recursively convert unicode characters back to escape sequences"""
+    if isinstance(obj, dict):
+        return {k: preserve_unicode_escapes(v) for k, v in obj.items()}
+    elif isinstance(obj, list):
+        return [preserve_unicode_escapes(item) for item in obj]
+    elif isinstance(obj, str):
+        # Convert non-ASCII characters back to \u escapes
+        def escape_unicode(match: Any) -> Any:
+            return f"\\u{ord(match.group(0)):04x}"
+
+        return re.sub(r"[^\x00-\x7F]", escape_unicode, obj)
+    else:
+        return obj
+
+
 class EmitMode(ConfigEnum):
     # Fully synchronous processing that updates both primary storage (SQL) and search storage (Elasticsearch) before returning.
     # Provides the strongest consistency guarantee but with the highest cost. Best for critical operations where immediate
@@ -314,6 +335,7 @@ class DataHubRestEmitter(Closeable, Emitter):
         openapi_ingestion: Optional[bool] = None,
         client_mode: Optional[ClientMode] = None,
         datahub_component: Optional[str] = None,
+        server_config_refresh_interval: Optional[int] = None,
     ):
         if not gms_server:
             raise ConfigurationError("gms server is required")
@@ -329,6 +351,8 @@ class DataHubRestEmitter(Closeable, Emitter):
         self._openapi_ingestion = (
             openapi_ingestion  # Re-evaluated after test connection
         )
+        self._server_config_refresh_interval = server_config_refresh_interval
+        self._config_fetch_time: Optional[float] = None

         headers = {
             "X-RestLi-Protocol-Version": "2.0.0",
@@ -398,7 +422,17 @@ class DataHubRestEmitter(Closeable, Emitter):
         Raises:
             ConfigurationError: If there's an error fetching or validating the configuration
         """
-
+        if (
+            not hasattr(self, "_server_config")
+            or self._server_config is None
+            or (
+                self._server_config_refresh_interval is not None
+                and self._config_fetch_time is not None
+                and (time.time() - self._config_fetch_time)
+                > self._server_config_refresh_interval
+            )
+        ):
             if self._session is None or self._gms_server is None:
                 raise ConfigurationError(
                     "Session and URL are required to load configuration"
@@ -419,6 +453,7 @@ class DataHubRestEmitter(Closeable, Emitter):
             )

             self._server_config = RestServiceConfig(raw_config=raw_config)
+            self._config_fetch_time = time.time()
             self._post_fetch_server_config()

         else:
@@ -453,6 +488,8 @@ class DataHubRestEmitter(Closeable, Emitter):
             DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI
         )

+    def test_connection(self) -> None:
+        self.fetch_server_config()
         logger.debug(
             f"Using {'OpenAPI' if self._openapi_ingestion else 'Restli'} for ingestion."
         )
@@ -460,12 +497,21 @@ class DataHubRestEmitter(Closeable, Emitter):
             f"{EmitMode.ASYNC_WAIT} {'IS' if self._should_trace(emit_mode=EmitMode.ASYNC_WAIT, warn=False) else 'IS NOT'} supported."
         )

-    def test_connection(self) -> None:
-        self.fetch_server_config()
-
     def get_server_config(self) -> dict:
         return self.server_config.raw_config

+    def invalidate_config_cache(self) -> None:
+        """Manually invalidate the configuration cache."""
+        if (
+            hasattr(self, "_server_config")
+            and self._server_config is not None
+            and self._server_config_refresh_interval is not None
+        ):
+            # Set fetch time to beyond TTL in the past to force refresh on next access
+            self._config_fetch_time = (
+                time.time() - self._server_config_refresh_interval - 1
+            )
+
     def to_graph(self) -> "DataHubGraph":
         from datahub.ingestion.graph.client import DataHubGraph

@@ -584,15 +630,27 @@ class DataHubRestEmitter(Closeable, Emitter):
             trace_data = extract_trace_data(response) if response else None

         else:
-
+            if mcp.changeType == ChangeTypeClass.DELETE:
+                if mcp.aspectName not in KEY_ASPECT_NAMES:
+                    raise OperationalError(
+                        f"Delete not supported for non key aspect: {mcp.aspectName} for urn: "
+                        f"{mcp.entityUrn}"
+                    )

-
-
-
-
-
-
-
+                url = f"{self._gms_server}/entities?action=delete"
+                payload_dict = {
+                    "urn": mcp.entityUrn,
+                }
+            else:
+                url = f"{self._gms_server}/aspects?action=ingestProposal"
+
+                mcp_obj = preserve_unicode_escapes(pre_json_transform(mcp.to_obj()))
+                payload_dict = {
+                    "proposal": mcp_obj,
+                    "async": "true"
+                    if emit_mode in (EmitMode.ASYNC, EmitMode.ASYNC_WAIT)
+                    else "false",
+                }

             payload = json.dumps(payload_dict)
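The notable change here is that fetch_server_config() now caches the /config response and only re-fetches it once the optional server_config_refresh_interval (in seconds) has elapsed, with invalidate_config_cache() available to force a refresh. A minimal usage sketch, based only on the constructor keyword and methods visible in the hunks above; the GMS URL and token are placeholders:

from datahub.emitter.rest_emitter import DataHubRestEmitter

# Placeholder endpoint and token; the refresh interval is in seconds.
emitter = DataHubRestEmitter(
    gms_server="http://localhost:8080",
    token="<access-token>",
    server_config_refresh_interval=300,  # re-fetch /config at most every 5 minutes
)

emitter.test_connection()             # fetches and caches the server config
config = emitter.get_server_config()  # served from the cache within the TTL

emitter.invalidate_config_cache()     # next fetch_server_config() call refreshes

On the Restli emit path, DELETE change types for key aspects are now routed to /entities?action=delete instead of the ingestProposal endpoint, and proposals pass through preserve_unicode_escapes() so non-ASCII characters are serialized as \u escapes.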
datahub/entrypoints.py
CHANGED
@@ -10,6 +10,7 @@ import click
 import datahub._version as datahub_version
 from datahub.cli.check_cli import check
 from datahub.cli.cli_utils import (
+    enable_auto_decorators,
     fixup_gms_url,
     generate_access_token,
     make_shim_command,
@@ -38,7 +39,6 @@ from datahub.cli.timeline_cli import timeline
 from datahub.configuration.common import should_show_stack_trace
 from datahub.ingestion.graph.client import get_default_graph
 from datahub.ingestion.graph.config import ClientMode
-from datahub.telemetry import telemetry
 from datahub.utilities._custom_package_loader import model_version_name
 from datahub.utilities.logging_manager import configure_logging
 from datahub.utilities.server_config_util import get_gms_config
@@ -111,7 +111,6 @@ def datahub(
     default=False,
     help="If passed will show server config. Assumes datahub init has happened.",
 )
-@telemetry.with_telemetry()
 def version(include_server: bool = False) -> None:
     """Print version number and exit."""

@@ -131,7 +130,6 @@ def version(include_server: bool = False) -> None:
     default=False,
     help="If passed then uses password to initialise token.",
 )
-@telemetry.with_telemetry()
 def init(use_password: bool = False) -> None:
     """Configure which datahub instance to connect to"""

@@ -218,6 +216,9 @@ except ImportError as e:
     make_shim_command("actions", "run `pip install acryl-datahub-actions`")
 )

+# Adding telemetry and upgrade decorators to all commands
+enable_auto_decorators(datahub)
+

 def main(**kwargs):
     # We use threads in a variety of places within our CLI. The multiprocessing
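Rather than decorating version and init individually with @telemetry.with_telemetry(), the CLI now calls enable_auto_decorators(datahub) once after all commands are registered. The helper itself lives in datahub/cli/cli_utils.py and is not shown in this diff; the sketch below only illustrates the general pattern of walking a click command tree and wrapping each callback in bulk (auto_decorate and with_logging are made-up stand-ins, not DataHub's telemetry/upgrade decorators):

# Illustrative sketch only: not the actual enable_auto_decorators implementation.
import functools

import click


def with_logging(func):
    # Stand-in for the telemetry/upgrade decorators applied in bulk.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        click.echo(f"running {func.__name__}")
        return func(*args, **kwargs)

    return wrapper


def auto_decorate(group: click.Group) -> None:
    # Walk the command tree and wrap every callback once,
    # instead of repeating a decorator on each command definition.
    for cmd in group.commands.values():
        if isinstance(cmd, click.Group):
            auto_decorate(cmd)
        elif cmd.callback is not None:
            cmd.callback = with_logging(cmd.callback)


@click.group()
def cli() -> None:
    """Example CLI."""


@cli.command()
def version() -> None:
    click.echo("1.2.0.1")


auto_decorate(cli)

if __name__ == "__main__":
    cli()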
datahub/ingestion/api/decorators.py
CHANGED

@@ -1,12 +1,16 @@
+# So that SourceCapabilityModifier can be resolved at runtime
+from __future__ import annotations
+
 from dataclasses import dataclass
 from enum import Enum, auto
-from typing import Callable, Dict, Optional, Type
+from typing import Callable, Dict, List, Optional, Type

 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.source import (
     Source,
     SourceCapability as SourceCapability,
 )
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier


 def config_class(config_cls: Type) -> Callable[[Type], Type]:
@@ -88,10 +92,14 @@ class CapabilitySetting:
     capability: SourceCapability
     description: str
     supported: bool
+    subtype_modifier: Optional[List[SourceCapabilityModifier]] = None


 def capability(
-    capability_name: SourceCapability,
+    capability_name: SourceCapability,
+    description: str,
+    supported: bool = True,
+    subtype_modifier: Optional[List[SourceCapabilityModifier]] = None,
 ) -> Callable[[Type], Type]:
     """
     A decorator to mark a source as having a certain capability
@@ -104,6 +112,7 @@ def capability(
             for base in cls.__bases__
         ):
             cls.__capabilities = {}
+
         cls.get_capabilities = lambda: cls.__capabilities.values()

         # If the superclasses have capability annotations, copy those over.
@@ -113,7 +122,10 @@ def capability(
         cls.__capabilities.update(base_caps)

         cls.__capabilities[capability_name] = CapabilitySetting(
-            capability=capability_name,
+            capability=capability_name,
+            description=description,
+            supported=supported,
+            subtype_modifier=subtype_modifier,
         )
         return cls
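With the new signature, @capability records an optional subtype_modifier alongside the description and supported flag, which presumably feeds the autogenerated capability_summary.json added in this release. A hedged sketch of how a source might declare a capability under the new signature; the SourceCapabilityModifier member used here is an assumption, since the enum's values live in datahub/ingestion/source/common/subtypes.py and are not shown in this diff:

from datahub.ingestion.api.decorators import capability
from datahub.ingestion.api.source import Source, SourceCapability
from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier


@capability(
    SourceCapability.CONTAINERS,
    "Extract containers for tables and schemas",
    supported=True,
    # Assumed member name for illustration; check SourceCapabilityModifier for real values.
    subtype_modifier=[SourceCapabilityModifier.TABLE],
)
class MySource(Source):
    ...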
datahub/ingestion/api/report.py
CHANGED
@@ -2,17 +2,31 @@ import dataclasses
 import json
 import logging
 import pprint
-from
+from collections import defaultdict
+from dataclasses import dataclass, field
 from datetime import datetime, timedelta
 from enum import Enum
-from typing import Any, Optional, runtime_checkable
+from typing import Any, Dict, List, Optional, Set, Union, cast, runtime_checkable

 import humanfriendly
 import pydantic
 from pydantic import BaseModel
+from tabulate import tabulate
 from typing_extensions import Literal, Protocol

+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.mcp_builder import mcps_from_mce
+from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.report_helpers import format_datetime_relative
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.autogenerated.lineage_helper import is_lineage_aspect
+from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
+from datahub.metadata.schema_classes import (
+    MetadataChangeProposalClass,
+    SubTypesClass,
+    UpstreamLineageClass,
+)
+from datahub.utilities.file_backed_collections import FileBackedDict
 from datahub.utilities.lossy_collections import LossyList

 logger = logging.getLogger(__name__)
@@ -82,7 +96,58 @@ class Report(SupportsAsObj):
     }

     def as_string(self) -> str:
-
+        self_obj = self.as_obj()
+        _aspects_by_subtypes = self_obj.pop("aspects_by_subtypes", None)
+
+        # Format the main report data
+        result = pprint.pformat(self_obj, width=150, sort_dicts=False)
+
+        # Add aspects_by_subtypes table if it exists
+        if _aspects_by_subtypes:
+            result += "\n\nAspects by Subtypes:\n"
+            result += self._format_aspects_by_subtypes_table(_aspects_by_subtypes)
+
+        return result
+
+    def _format_aspects_by_subtypes_table(
+        self, aspects_by_subtypes: Dict[str, Dict[str, Dict[str, int]]]
+    ) -> str:
+        """Format aspects_by_subtypes data as a table with aspects as rows and entity/subtype as columns."""
+        if not aspects_by_subtypes:
+            return "No aspects by subtypes data available."
+
+        all_aspects: set[str] = {
+            aspect
+            for subtypes in aspects_by_subtypes.values()
+            for aspects in subtypes.values()
+            for aspect in aspects
+        }
+
+        aspect_rows = sorted(all_aspects)
+
+        entity_subtype_columns = []
+        for entity_type, subtypes in aspects_by_subtypes.items():
+            for subtype in subtypes:
+                entity_subtype_columns.append(f"{entity_type} ({subtype})")
+
+        entity_subtype_columns.sort()
+
+        headers = ["Aspect"] + entity_subtype_columns
+
+        table_data = [
+            [aspect]
+            + [
+                aspects.get(aspect, 0)
+                for subtypes in aspects_by_subtypes.values()
+                for aspects in subtypes.values()
+            ]
+            for aspect in aspect_rows
+        ]
+
+        if table_data:
+            return tabulate(table_data, headers=headers, tablefmt="grid")
+        else:
+            return "No aspects by subtypes data available."

     def as_json(self) -> str:
         return json.dumps(self.as_obj())
@@ -90,6 +155,14 @@
     # TODO add helper method for warning / failure status + counts?


+@dataclass
+class SourceReportSubtypes:
+    urn: str
+    entity_type: str
+    subType: str = field(default="unknown")
+    aspects: Dict[str, int] = field(default_factory=dict)
+
+
 class ReportAttribute(BaseModel):
     severity: LogLevel = "DEBUG"
     help: Optional[str] = None
@@ -108,6 +181,262 @@ class ReportAttribute(BaseModel):
         logger.log(level=self.logger_sev, msg=msg, stacklevel=3)


+@dataclass
+class ExamplesReport(Report, Closeable):
+    aspects: Dict[str, Dict[str, int]] = field(
+        default_factory=lambda: defaultdict(lambda: defaultdict(int))
+    )
+    aspects_by_subtypes: Dict[str, Dict[str, Dict[str, int]]] = field(
+        default_factory=lambda: defaultdict(
+            lambda: defaultdict(lambda: defaultdict(int))
+        )
+    )
+    samples: Dict[str, Dict[str, List[str]]] = field(
+        default_factory=lambda: defaultdict(lambda: defaultdict(list))
+    )
+    _file_based_dict: Optional[FileBackedDict[SourceReportSubtypes]] = None
+
+    # We are adding this to make querying easier for fine-grained lineage
+    _fine_grained_lineage_special_case_name = "fineGrainedLineages"
+    _samples_to_add: int = 20
+    _lineage_aspects_seen: Set[str] = field(default_factory=set)
+
+    def __post_init__(self) -> None:
+        self._file_based_dict = FileBackedDict(
+            tablename="urn_aspects",
+            extra_columns={
+                "urn": lambda val: val.urn,
+                "entityType": lambda val: val.entity_type,
+                "subTypes": lambda val: val.subType,
+                "aspects": lambda val: json.dumps(val.aspects),
+            },
+        )
+
+    def close(self) -> None:
+        self.compute_stats()
+        if self._file_based_dict is not None:
+            self._file_based_dict.close()
+            self._file_based_dict = None
+
+    def _build_aspects_where_clause(self, aspects: List[str]) -> str:
+        """Build WHERE clause for matching any of the given aspects."""
+        if not aspects:
+            return ""
+
+        conditions = []
+        for aspect in aspects:
+            conditions.append(f"aspects LIKE '%{aspect}%'")
+
+        return " OR ".join(conditions)
+
+    def _collect_samples_by_subtype(self, where_clause: str, sample_key: str) -> None:
+        """Helper method to collect samples organized by subtype for a given where clause."""
+
+        subtype_query = f"""
+            SELECT DISTINCT subTypes
+            FROM urn_aspects
+            WHERE {where_clause}
+        """
+        assert self._file_based_dict is not None
+        subtypes = set()
+        for row in self._file_based_dict.sql_query(subtype_query):
+            sub_type = row["subTypes"] or "unknown"
+            subtypes.add(sub_type)
+
+        for sub_type in subtypes:
+            query = f"""
+                SELECT urn
+                FROM urn_aspects
+                WHERE {where_clause} AND subTypes = ?
+                limit {self._samples_to_add}
+            """
+
+            for row in self._file_based_dict.sql_query(query, (sub_type,)):
+                self.samples[sample_key][sub_type].append(row["urn"])
+
+    def _collect_samples_by_aspects(self, aspects: List[str], sample_key: str) -> None:
+        """Helper method to collect samples for entities that have any of the given aspects."""
+        if not aspects:
+            return
+
+        where_clause = self._build_aspects_where_clause(aspects)
+        self._collect_samples_by_subtype(where_clause, sample_key)
+
+    def _collect_samples_by_lineage_aspects(
+        self, aspects: List[str], sample_key: str
+    ) -> None:
+        """Helper method to collect samples for entities that have any of the given lineage aspects.
+
+        Lineage aspects are stored in JSON format and require quote escaping in LIKE clauses.
+        """
+        if not aspects:
+            return
+
+        lineage_conditions = []
+        for aspect in aspects:
+            lineage_conditions.append(f"aspects LIKE '%\"{aspect}\"%'")
+
+        where_clause = " OR ".join(lineage_conditions)
+        self._collect_samples_by_subtype(where_clause, sample_key)
+
+    def _collect_samples_with_all_conditions(self, sample_key: str) -> None:
+        """
+        Collect samples for entities that have lineage, profiling, and usage aspects.
+        These specific 3 cases are added here as these URNs will be shown in the UI. Subject to change in future.
+        """
+        if not self._lineage_aspects_seen:
+            return
+        assert self._file_based_dict is not None
+
+        # Build lineage conditions using the same logic as _collect_samples_by_lineage_aspects
+        lineage_conditions = []
+        for aspect in self._lineage_aspects_seen:
+            lineage_conditions.append(f"aspects LIKE '%\"{aspect}\"%'")
+        lineage_where_clause = " OR ".join(lineage_conditions)
+
+        # Build profiling conditions using the same logic as _collect_samples_by_aspects
+        profiling_where_clause = self._build_aspects_where_clause(["datasetProfile"])
+
+        # Build usage conditions using the same logic as _collect_samples_by_aspects
+        usage_where_clause = self._build_aspects_where_clause(
+            [
+                "datasetUsageStatistics",
+                "chartUsageStatistics",
+                "dashboardUsageStatistics",
+            ]
+        )
+
+        query = f"""
+            SELECT urn, subTypes
+            FROM urn_aspects
+            WHERE ({lineage_where_clause})
+            AND ({profiling_where_clause})
+            AND ({usage_where_clause})
+            limit {self._samples_to_add}
+        """
+
+        for row in self._file_based_dict.sql_query(query):
+            sub_type = row["subTypes"] or "unknown"
+            self.samples[sample_key][sub_type].append(row["urn"])
+
+    def _has_fine_grained_lineage(
+        self, mcp: Union[MetadataChangeProposalClass, MetadataChangeProposalWrapper]
+    ) -> bool:
+        if isinstance(mcp.aspect, UpstreamLineageClass):
+            upstream_lineage = cast(UpstreamLineageClass, mcp.aspect)
+            if upstream_lineage.fineGrainedLineages:
+                return True
+        return False
+
+    def _update_file_based_dict(
+        self,
+        urn: str,
+        entityType: str,
+        aspectName: str,
+        mcp: Union[MetadataChangeProposalClass, MetadataChangeProposalWrapper],
+    ) -> None:
+        if is_lineage_aspect(entityType, aspectName):
+            self._lineage_aspects_seen.add(aspectName)
+        has_fine_grained_lineage = self._has_fine_grained_lineage(mcp)
+
+        sub_type = "unknown"
+        if isinstance(mcp.aspect, SubTypesClass):
+            sub_type = mcp.aspect.typeNames[0]
+
+        assert self._file_based_dict is not None
+        if urn in self._file_based_dict:
+            if sub_type != "unknown":
+                self._file_based_dict[urn].subType = sub_type
+            aspects_dict = self._file_based_dict[urn].aspects
+            if aspectName in aspects_dict:
+                aspects_dict[aspectName] += 1
+            else:
+                aspects_dict[aspectName] = 1
+            if has_fine_grained_lineage:
+                if self._fine_grained_lineage_special_case_name in aspects_dict:
+                    aspects_dict[self._fine_grained_lineage_special_case_name] += 1
+                else:
+                    aspects_dict[self._fine_grained_lineage_special_case_name] = 1
+            self._file_based_dict.mark_dirty(urn)
+        else:
+            aspects_dict = {aspectName: 1}
+            if has_fine_grained_lineage:
+                aspects_dict[self._fine_grained_lineage_special_case_name] = 1
+            self._file_based_dict[urn] = SourceReportSubtypes(
+                urn=urn,
+                entity_type=entityType,
+                subType=sub_type,
+                aspects=aspects_dict,
+            )
+
+    def _store_workunit_data(self, wu: MetadataWorkUnit) -> None:
+        urn = wu.get_urn()
+
+        if not isinstance(wu.metadata, MetadataChangeEvent):
+            mcps = [wu.metadata]
+        else:
+            mcps = list(mcps_from_mce(wu.metadata))
+
+        for mcp in mcps:
+            entityType = mcp.entityType
+            aspectName = mcp.aspectName
+
+            if aspectName is None:
+                continue
+
+            self._update_file_based_dict(urn, entityType, aspectName, mcp)
+
+    def compute_stats(self) -> None:
+        if self._file_based_dict is None:
+            return
+
+        query = """
+            SELECT entityType, subTypes, aspects, count(*) as count
+            FROM urn_aspects
+            group by entityType, subTypes, aspects
+        """
+
+        entity_subtype_aspect_counts: Dict[str, Dict[str, Dict[str, int]]] = (
+            defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
+        )
+        for row in self._file_based_dict.sql_query(query):
+            entity_type = row["entityType"]
+            sub_type = row["subTypes"]
+            count = row["count"]
+            aspects_raw = row["aspects"] or "[]"
+
+            aspects = json.loads(aspects_raw)
+            for aspect, aspect_count in aspects.items():
+                entity_subtype_aspect_counts[entity_type][sub_type][aspect] += (
+                    aspect_count * count
+                )
+
+        self.aspects.clear()
+        self.aspects_by_subtypes.clear()
+        _aspects_seen: Set[str] = set()
+        for entity_type, subtype_counts in entity_subtype_aspect_counts.items():
+            for sub_type, aspect_counts in subtype_counts.items():
+                for aspect, count in aspect_counts.items():
+                    self.aspects[entity_type][aspect] += count
+                    _aspects_seen.add(aspect)
+                self.aspects_by_subtypes[entity_type][sub_type] = dict(aspect_counts)
+
+        self.samples.clear()
+        self._collect_samples_by_aspects(["datasetProfile"], "profiling")
+        self._collect_samples_by_aspects(
+            [
+                "datasetUsageStatistics",
+                "chartUsageStatistics",
+                "dashboardUsageStatistics",
+            ],
+            "usage",
+        )
+        self._collect_samples_by_lineage_aspects(
+            list(self._lineage_aspects_seen), "lineage"
+        )
+        self._collect_samples_with_all_conditions("all_3")
+
+
 class EntityFilterReport(ReportAttribute):
     type: str
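The new ExamplesReport accumulates per-URN aspect counts in a FileBackedDict (a SQLite-backed mapping) and, when the report is rendered, Report.as_string() appends an "Aspects by Subtypes" grid built with tabulate. A small standalone sketch of roughly what that table looks like, using made-up counts in the same nested {entity_type: {subtype: {aspect: count}}} shape:

from tabulate import tabulate

# Made-up counts, shaped like the aspects_by_subtypes field above.
aspects_by_subtypes = {
    "dataset": {"Table": {"status": 12, "schemaMetadata": 12, "datasetProfile": 4}},
    "container": {"Schema": {"status": 3, "containerProperties": 3}},
}

headers = ["Aspect"]
columns = []
for entity_type, subtypes in aspects_by_subtypes.items():
    for subtype, counts in subtypes.items():
        headers.append(f"{entity_type} ({subtype})")
        columns.append(counts)

aspects = sorted({aspect for counts in columns for aspect in counts})
rows = [[aspect] + [counts.get(aspect, 0) for counts in columns] for aspect in aspects]

print(tabulate(rows, headers=headers, tablefmt="grid"))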
datahub/ingestion/api/sink.py
CHANGED
@@ -147,6 +147,9 @@ class Sink(Generic[SinkConfig, SinkReportType], Closeable, metaclass=ABCMeta):
     def close(self) -> None:
         pass

+    def flush(self) -> None:
+        pass
+
     def configured(self) -> str:
         """Override this method to output a human-readable and scrubbed version of the configured sink"""
         return ""