acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff compares the contents of two package versions as publicly released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in that registry.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/METADATA +2558 -2531
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/RECORD +221 -187
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/entry_points.txt +2 -0
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -1
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +239 -0
- datahub/api/entities/external/external_tag.py +145 -0
- datahub/api/entities/external/lake_formation_external_entites.py +161 -0
- datahub/api/entities/external/restricted_text.py +247 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +124 -27
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +149 -227
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +50 -7
- datahub/cli/specific/assertions_cli.py +0 -4
- datahub/cli/specific/datacontract_cli.py +0 -3
- datahub/cli/specific/dataproduct_cli.py +0 -11
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +0 -2
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/emitter/rest_emitter.py +70 -12
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +332 -3
- datahub/ingestion/api/sink.py +3 -0
- datahub/ingestion/api/source.py +48 -44
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3449 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +100 -15
- datahub/ingestion/graph/config.py +1 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
- datahub/ingestion/run/pipeline.py +54 -2
- datahub/ingestion/sink/datahub_rest.py +13 -0
- datahub/ingestion/source/abs/source.py +1 -1
- datahub/ingestion/source/aws/aws_common.py +4 -0
- datahub/ingestion/source/aws/glue.py +489 -244
- datahub/ingestion/source/aws/tag_entities.py +292 -0
- datahub/ingestion/source/azure/azure_common.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +45 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
- datahub/ingestion/source/dbt/dbt_common.py +6 -2
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_config.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
- datahub/ingestion/source/dremio/dremio_source.py +94 -81
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +76 -28
- datahub/ingestion/source/ge_profiling_config.py +11 -0
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/iceberg/iceberg.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/looker/looker_source.py +1 -0
- datahub/ingestion/source/mlflow.py +11 -1
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/preset.py +2 -2
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +21 -1
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
- datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
- datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
- datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +119 -11
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/sql/clickhouse.py +3 -1
- datahub/ingestion/source/sql/cockroachdb.py +0 -1
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive_metastore.py +3 -11
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/source.py +239 -34
- datahub/ingestion/source/sql/mysql.py +0 -1
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/postgres.py +0 -1
- datahub/ingestion/source/sql/sql_common.py +121 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +997 -235
- datahub/ingestion/source/sql/vertica.py +10 -6
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
- datahub/ingestion/source/superset.py +58 -3
- datahub/ingestion/source/tableau/tableau.py +58 -37
- datahub/ingestion/source/tableau/tableau_common.py +4 -2
- datahub/ingestion/source/tableau/tableau_constant.py +0 -4
- datahub/ingestion/source/unity/config.py +5 -0
- datahub/ingestion/source/unity/proxy.py +118 -0
- datahub/ingestion/source/unity/source.py +195 -17
- datahub/ingestion/source/unity/tag_entities.py +295 -0
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +1433 -546
- datahub/metadata/_urns/urn_defs.py +1826 -1658
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +17736 -17112
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +1 -0
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
- datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/LogicalParent.avsc +140 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +116 -0
- datahub/sdk/chart.py +315 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +432 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +8 -2
- datahub/sdk/entity_client.py +82 -2
- datahub/sdk/lineage_client.py +683 -82
- datahub/sdk/main_client.py +46 -16
- datahub/sdk/mlmodel.py +101 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +4 -3
- datahub/specific/chart.py +1 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
- datahub/sql_parsing/sqlglot_lineage.py +62 -13
- datahub/telemetry/telemetry.py +17 -11
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +46 -13
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/top_level.txt +0 -0
datahub/entrypoints.py
CHANGED

@@ -10,6 +10,7 @@ import click
 import datahub._version as datahub_version
 from datahub.cli.check_cli import check
 from datahub.cli.cli_utils import (
+    enable_auto_decorators,
     fixup_gms_url,
     generate_access_token,
     make_shim_command,
@@ -38,7 +39,6 @@ from datahub.cli.timeline_cli import timeline
 from datahub.configuration.common import should_show_stack_trace
 from datahub.ingestion.graph.client import get_default_graph
 from datahub.ingestion.graph.config import ClientMode
-from datahub.telemetry import telemetry
 from datahub.utilities._custom_package_loader import model_version_name
 from datahub.utilities.logging_manager import configure_logging
 from datahub.utilities.server_config_util import get_gms_config
@@ -111,7 +111,6 @@ def datahub(
     default=False,
     help="If passed will show server config. Assumes datahub init has happened.",
 )
-@telemetry.with_telemetry()
 def version(include_server: bool = False) -> None:
     """Print version number and exit."""

@@ -131,7 +130,6 @@ def version(include_server: bool = False) -> None:
     default=False,
     help="If passed then uses password to initialise token.",
 )
-@telemetry.with_telemetry()
 def init(use_password: bool = False) -> None:
     """Configure which datahub instance to connect to"""

@@ -218,6 +216,9 @@ except ImportError as e:
     make_shim_command("actions", "run `pip install acryl-datahub-actions`")
 )

+# Adding telemetry and upgrade decorators to all commands
+enable_auto_decorators(datahub)
+

 def main(**kwargs):
     # We use threads in a variety of places within our CLI. The multiprocessing
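Net effect: the hand-applied `@telemetry.with_telemetry()` decorators on `version` and `init` go away, and a single `enable_auto_decorators(datahub)` call instruments every registered command instead. The implementation lives in `datahub/cli/cli_utils.py` (+63 lines, not shown in this diff); a minimal sketch of the general pattern, assuming a recursive walk of the click command tree (helper name hypothetical):

import click

def wrap_all_commands(group: click.Group, decorator) -> None:
    """Recursively wrap the callback of every command under `group`."""
    for cmd in group.commands.values():
        if isinstance(cmd, click.Group):
            wrap_all_commands(cmd, decorator)
        elif cmd.callback is not None:
            # Swap in a decorated version of the command's callback.
            cmd.callback = decorator(cmd.callback)

# e.g. wrap_all_commands(datahub, telemetry.with_telemetry())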
datahub/ingestion/api/decorators.py
CHANGED

@@ -1,12 +1,16 @@
+# So that SourceCapabilityModifier can be resolved at runtime
+from __future__ import annotations
+
 from dataclasses import dataclass
 from enum import Enum, auto
-from typing import Callable, Dict, Optional, Type
+from typing import Callable, Dict, List, Optional, Type

 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.source import (
     Source,
     SourceCapability as SourceCapability,
 )
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier


 def config_class(config_cls: Type) -> Callable[[Type], Type]:
@@ -88,10 +92,14 @@ class CapabilitySetting:
     capability: SourceCapability
     description: str
     supported: bool
+    subtype_modifier: Optional[List[SourceCapabilityModifier]] = None


 def capability(
-    capability_name: SourceCapability,
+    capability_name: SourceCapability,
+    description: str,
+    supported: bool = True,
+    subtype_modifier: Optional[List[SourceCapabilityModifier]] = None,
 ) -> Callable[[Type], Type]:
     """
     A decorator to mark a source as having a certain capability
@@ -104,6 +112,7 @@ def capability(
             for base in cls.__bases__
         ):
             cls.__capabilities = {}
+
             cls.get_capabilities = lambda: cls.__capabilities.values()

         # If the superclasses have capability annotations, copy those over.
@@ -113,7 +122,10 @@ def capability(
                 cls.__capabilities.update(base_caps)

         cls.__capabilities[capability_name] = CapabilitySetting(
-            capability=capability_name,
+            capability=capability_name,
+            description=description,
+            supported=supported,
+            subtype_modifier=subtype_modifier,
         )
         return cls

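Illustrative use of the widened decorator. The `@capability` signature is taken from this diff; the `SourceCapabilityModifier.DATABASE` member is a placeholder (the real members live in `datahub/ingestion/source/common/subtypes.py`, extended by +45 lines in this release):

from datahub.ingestion.api.decorators import capability
from datahub.ingestion.api.source import Source, SourceCapability
from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier

@capability(
    SourceCapability.CONTAINERS,
    "Extracts database and schema containers",
    supported=True,
    subtype_modifier=[SourceCapabilityModifier.DATABASE],  # placeholder member
)
class MySource(Source):
    ...

The recorded `CapabilitySetting` now carries the subtype list, presumably the data aggregated into the new autogenerated `capability_summary.json`.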
datahub/ingestion/api/report.py
CHANGED

@@ -2,17 +2,31 @@ import dataclasses
 import json
 import logging
 import pprint
-from dataclasses import dataclass
+from collections import defaultdict
+from dataclasses import dataclass, field
 from datetime import datetime, timedelta
 from enum import Enum
-from typing import Any, Optional, runtime_checkable
+from typing import Any, Dict, List, Optional, Set, Union, cast, runtime_checkable

 import humanfriendly
 import pydantic
 from pydantic import BaseModel
+from tabulate import tabulate
 from typing_extensions import Literal, Protocol

+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.mcp_builder import mcps_from_mce
+from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.report_helpers import format_datetime_relative
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.autogenerated.lineage_helper import is_lineage_aspect
+from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
+from datahub.metadata.schema_classes import (
+    MetadataChangeProposalClass,
+    SubTypesClass,
+    UpstreamLineageClass,
+)
+from datahub.utilities.file_backed_collections import FileBackedDict
 from datahub.utilities.lossy_collections import LossyList

 logger = logging.getLogger(__name__)
@@ -82,7 +96,58 @@ class Report(SupportsAsObj):
         }

     def as_string(self) -> str:
-        return pprint.pformat(self.as_obj(), width=150, sort_dicts=False)
+        self_obj = self.as_obj()
+        _aspects_by_subtypes = self_obj.pop("aspects_by_subtypes", None)
+
+        # Format the main report data
+        result = pprint.pformat(self_obj, width=150, sort_dicts=False)
+
+        # Add aspects_by_subtypes table if it exists
+        if _aspects_by_subtypes:
+            result += "\n\nAspects by Subtypes:\n"
+            result += self._format_aspects_by_subtypes_table(_aspects_by_subtypes)
+
+        return result
+
+    def _format_aspects_by_subtypes_table(
+        self, aspects_by_subtypes: Dict[str, Dict[str, Dict[str, int]]]
+    ) -> str:
+        """Format aspects_by_subtypes data as a table with aspects as rows and entity/subtype as columns."""
+        if not aspects_by_subtypes:
+            return "No aspects by subtypes data available."
+
+        all_aspects: set[str] = {
+            aspect
+            for subtypes in aspects_by_subtypes.values()
+            for aspects in subtypes.values()
+            for aspect in aspects
+        }
+
+        aspect_rows = sorted(all_aspects)
+
+        entity_subtype_columns = []
+        for entity_type, subtypes in aspects_by_subtypes.items():
+            for subtype in subtypes:
+                entity_subtype_columns.append(f"{entity_type} ({subtype})")
+
+        entity_subtype_columns.sort()
+
+        headers = ["Aspect"] + entity_subtype_columns
+
+        table_data = [
+            [aspect]
+            + [
+                aspects.get(aspect, 0)
+                for subtypes in aspects_by_subtypes.values()
+                for aspects in subtypes.values()
+            ]
+            for aspect in aspect_rows
+        ]
+
+        if table_data:
+            return tabulate(table_data, headers=headers, tablefmt="grid")
+        else:
+            return "No aspects by subtypes data available."

     def as_json(self) -> str:
         return json.dumps(self.as_obj())
@@ -90,6 +155,14 @@ class Report(SupportsAsObj):
     # TODO add helper method for warning / failure status + counts?


+@dataclass
+class SourceReportSubtypes:
+    urn: str
+    entity_type: str
+    subType: str = field(default="unknown")
+    aspects: Dict[str, int] = field(default_factory=dict)
+
+
 class ReportAttribute(BaseModel):
     severity: LogLevel = "DEBUG"
     help: Optional[str] = None
@@ -108,6 +181,262 @@ class ReportAttribute(BaseModel):
         logger.log(level=self.logger_sev, msg=msg, stacklevel=3)


+@dataclass
+class ExamplesReport(Report, Closeable):
+    aspects: Dict[str, Dict[str, int]] = field(
+        default_factory=lambda: defaultdict(lambda: defaultdict(int))
+    )
+    aspects_by_subtypes: Dict[str, Dict[str, Dict[str, int]]] = field(
+        default_factory=lambda: defaultdict(
+            lambda: defaultdict(lambda: defaultdict(int))
+        )
+    )
+    samples: Dict[str, Dict[str, List[str]]] = field(
+        default_factory=lambda: defaultdict(lambda: defaultdict(list))
+    )
+    _file_based_dict: Optional[FileBackedDict[SourceReportSubtypes]] = None
+
+    # We are adding this to make querying easier for fine-grained lineage
+    _fine_grained_lineage_special_case_name = "fineGrainedLineages"
+    _samples_to_add: int = 20
+    _lineage_aspects_seen: Set[str] = field(default_factory=set)
+
+    def __post_init__(self) -> None:
+        self._file_based_dict = FileBackedDict(
+            tablename="urn_aspects",
+            extra_columns={
+                "urn": lambda val: val.urn,
+                "entityType": lambda val: val.entity_type,
+                "subTypes": lambda val: val.subType,
+                "aspects": lambda val: json.dumps(val.aspects),
+            },
+        )
+
+    def close(self) -> None:
+        self.compute_stats()
+        if self._file_based_dict is not None:
+            self._file_based_dict.close()
+            self._file_based_dict = None
+
+    def _build_aspects_where_clause(self, aspects: List[str]) -> str:
+        """Build WHERE clause for matching any of the given aspects."""
+        if not aspects:
+            return ""
+
+        conditions = []
+        for aspect in aspects:
+            conditions.append(f"aspects LIKE '%{aspect}%'")
+
+        return " OR ".join(conditions)
+
+    def _collect_samples_by_subtype(self, where_clause: str, sample_key: str) -> None:
+        """Helper method to collect samples organized by subtype for a given where clause."""
+
+        subtype_query = f"""
+            SELECT DISTINCT subTypes
+            FROM urn_aspects
+            WHERE {where_clause}
+        """
+        assert self._file_based_dict is not None
+        subtypes = set()
+        for row in self._file_based_dict.sql_query(subtype_query):
+            sub_type = row["subTypes"] or "unknown"
+            subtypes.add(sub_type)
+
+        for sub_type in subtypes:
+            query = f"""
+                SELECT urn
+                FROM urn_aspects
+                WHERE {where_clause} AND subTypes = ?
+                limit {self._samples_to_add}
+            """
+
+            for row in self._file_based_dict.sql_query(query, (sub_type,)):
+                self.samples[sample_key][sub_type].append(row["urn"])
+
+    def _collect_samples_by_aspects(self, aspects: List[str], sample_key: str) -> None:
+        """Helper method to collect samples for entities that have any of the given aspects."""
+        if not aspects:
+            return
+
+        where_clause = self._build_aspects_where_clause(aspects)
+        self._collect_samples_by_subtype(where_clause, sample_key)
+
+    def _collect_samples_by_lineage_aspects(
+        self, aspects: List[str], sample_key: str
+    ) -> None:
+        """Helper method to collect samples for entities that have any of the given lineage aspects.
+
+        Lineage aspects are stored in JSON format and require quote escaping in LIKE clauses.
+        """
+        if not aspects:
+            return
+
+        lineage_conditions = []
+        for aspect in aspects:
+            lineage_conditions.append(f"aspects LIKE '%\"{aspect}\"%'")
+
+        where_clause = " OR ".join(lineage_conditions)
+        self._collect_samples_by_subtype(where_clause, sample_key)
+
+    def _collect_samples_with_all_conditions(self, sample_key: str) -> None:
+        """
+        Collect samples for entities that have lineage, profiling, and usage aspects.
+        These specific 3 cases are added here as these URNs will be shown in the UI. Subject to change in future.
+        """
+        if not self._lineage_aspects_seen:
+            return
+        assert self._file_based_dict is not None
+
+        # Build lineage conditions using the same logic as _collect_samples_by_lineage_aspects
+        lineage_conditions = []
+        for aspect in self._lineage_aspects_seen:
+            lineage_conditions.append(f"aspects LIKE '%\"{aspect}\"%'")
+        lineage_where_clause = " OR ".join(lineage_conditions)
+
+        # Build profiling conditions using the same logic as _collect_samples_by_aspects
+        profiling_where_clause = self._build_aspects_where_clause(["datasetProfile"])
+
+        # Build usage conditions using the same logic as _collect_samples_by_aspects
+        usage_where_clause = self._build_aspects_where_clause(
+            [
+                "datasetUsageStatistics",
+                "chartUsageStatistics",
+                "dashboardUsageStatistics",
+            ]
+        )
+
+        query = f"""
+            SELECT urn, subTypes
+            FROM urn_aspects
+            WHERE ({lineage_where_clause})
+            AND ({profiling_where_clause})
+            AND ({usage_where_clause})
+            limit {self._samples_to_add}
+        """
+
+        for row in self._file_based_dict.sql_query(query):
+            sub_type = row["subTypes"] or "unknown"
+            self.samples[sample_key][sub_type].append(row["urn"])
+
+    def _has_fine_grained_lineage(
+        self, mcp: Union[MetadataChangeProposalClass, MetadataChangeProposalWrapper]
+    ) -> bool:
+        if isinstance(mcp.aspect, UpstreamLineageClass):
+            upstream_lineage = cast(UpstreamLineageClass, mcp.aspect)
+            if upstream_lineage.fineGrainedLineages:
+                return True
+        return False
+
+    def _update_file_based_dict(
+        self,
+        urn: str,
+        entityType: str,
+        aspectName: str,
+        mcp: Union[MetadataChangeProposalClass, MetadataChangeProposalWrapper],
+    ) -> None:
+        if is_lineage_aspect(entityType, aspectName):
+            self._lineage_aspects_seen.add(aspectName)
+        has_fine_grained_lineage = self._has_fine_grained_lineage(mcp)
+
+        sub_type = "unknown"
+        if isinstance(mcp.aspect, SubTypesClass):
+            sub_type = mcp.aspect.typeNames[0]
+
+        assert self._file_based_dict is not None
+        if urn in self._file_based_dict:
+            if sub_type != "unknown":
+                self._file_based_dict[urn].subType = sub_type
+            aspects_dict = self._file_based_dict[urn].aspects
+            if aspectName in aspects_dict:
+                aspects_dict[aspectName] += 1
+            else:
+                aspects_dict[aspectName] = 1
+            if has_fine_grained_lineage:
+                if self._fine_grained_lineage_special_case_name in aspects_dict:
+                    aspects_dict[self._fine_grained_lineage_special_case_name] += 1
+                else:
+                    aspects_dict[self._fine_grained_lineage_special_case_name] = 1
+            self._file_based_dict.mark_dirty(urn)
+        else:
+            aspects_dict = {aspectName: 1}
+            if has_fine_grained_lineage:
+                aspects_dict[self._fine_grained_lineage_special_case_name] = 1
+            self._file_based_dict[urn] = SourceReportSubtypes(
+                urn=urn,
+                entity_type=entityType,
+                subType=sub_type,
+                aspects=aspects_dict,
+            )
+
+    def _store_workunit_data(self, wu: MetadataWorkUnit) -> None:
+        urn = wu.get_urn()
+
+        if not isinstance(wu.metadata, MetadataChangeEvent):
+            mcps = [wu.metadata]
+        else:
+            mcps = list(mcps_from_mce(wu.metadata))
+
+        for mcp in mcps:
+            entityType = mcp.entityType
+            aspectName = mcp.aspectName
+
+            if aspectName is None:
+                continue
+
+            self._update_file_based_dict(urn, entityType, aspectName, mcp)
+
+    def compute_stats(self) -> None:
+        if self._file_based_dict is None:
+            return
+
+        query = """
+            SELECT entityType, subTypes, aspects, count(*) as count
+            FROM urn_aspects
+            group by entityType, subTypes, aspects
+        """
+
+        entity_subtype_aspect_counts: Dict[str, Dict[str, Dict[str, int]]] = (
+            defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
+        )
+        for row in self._file_based_dict.sql_query(query):
+            entity_type = row["entityType"]
+            sub_type = row["subTypes"]
+            count = row["count"]
+            aspects_raw = row["aspects"] or "[]"
+
+            aspects = json.loads(aspects_raw)
+            for aspect, aspect_count in aspects.items():
+                entity_subtype_aspect_counts[entity_type][sub_type][aspect] += (
+                    aspect_count * count
+                )
+
+        self.aspects.clear()
+        self.aspects_by_subtypes.clear()
+        _aspects_seen: Set[str] = set()
+        for entity_type, subtype_counts in entity_subtype_aspect_counts.items():
+            for sub_type, aspect_counts in subtype_counts.items():
+                for aspect, count in aspect_counts.items():
+                    self.aspects[entity_type][aspect] += count
+                    _aspects_seen.add(aspect)
+                self.aspects_by_subtypes[entity_type][sub_type] = dict(aspect_counts)
+
+        self.samples.clear()
+        self._collect_samples_by_aspects(["datasetProfile"], "profiling")
+        self._collect_samples_by_aspects(
+            [
+                "datasetUsageStatistics",
+                "chartUsageStatistics",
+                "dashboardUsageStatistics",
+            ],
+            "usage",
+        )
+        self._collect_samples_by_lineage_aspects(
+            list(self._lineage_aspects_seen), "lineage"
+        )
+        self._collect_samples_with_all_conditions("all_3")
+
+
 class EntityFilterReport(ReportAttribute):
     type: str

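`as_string()` now pops `aspects_by_subtypes` out of the report object and renders it as a grid via `tabulate`, one column per `entity_type (subtype)` pair. A standalone sketch of the layout, with made-up counts:

from tabulate import tabulate

headers = ["Aspect", "container (Database)", "dataset (Table)"]
rows = [["schemaMetadata", 0, 42], ["status", 3, 42]]
print(tabulate(rows, headers=headers, tablefmt="grid"))
# Output, approximately:
# +----------------+------------------------+-------------------+
# | Aspect         |   container (Database) |   dataset (Table) |
# +================+========================+===================+
# | schemaMetadata |                      0 |                42 |
# +----------------+------------------------+-------------------+
# | status         |                      3 |                42 |
# +----------------+------------------------+-------------------+

Note that the sample queries match aspect names with `LIKE '%aspect%'`, which can over-match when one name is a substring of another (e.g. `container` vs `containerProperties`); that is why the lineage queries quote the name (`LIKE '%"aspect"%'`) against the JSON-encoded aspects column.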
datahub/ingestion/api/sink.py
CHANGED

@@ -147,6 +147,9 @@ class Sink(Generic[SinkConfig, SinkReportType], Closeable, metaclass=ABCMeta):
     def close(self) -> None:
         pass

+    def flush(self) -> None:
+        pass
+
     def configured(self) -> str:
         """Override this method to output a human-readable and scrubbed version of the configured sink"""
         return ""
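`flush()` is a new optional hook on the base `Sink`: a no-op by default, so the pipeline can ask any sink to drain buffered work without knowing whether it batches (the REST sink, touched by +13 lines above, is presumably the main overrider). A hypothetical override, only to illustrate the contract:

from typing import List

class BufferingSink:
    """Sketch of a sink that batches records and drains them in flush()."""

    def __init__(self) -> None:
        self._buffer: List[object] = []

    def write_record(self, record: object) -> None:
        self._buffer.append(record)
        if len(self._buffer) >= 100:
            self.flush()

    def flush(self) -> None:
        # Push everything buffered so far downstream, then start a new batch.
        pending, self._buffer = self._buffer, []
        for record in pending:
            self._send(record)  # stand-in for the real downstream write

    def _send(self, record: object) -> None:
        ...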
datahub/ingestion/api/source.py
CHANGED

@@ -2,7 +2,6 @@ import contextlib
 import datetime
 import logging
 from abc import ABCMeta, abstractmethod
-from collections import defaultdict
 from dataclasses import dataclass, field
 from enum import Enum
 from functools import partial
@@ -15,7 +14,6 @@ from typing import (
     List,
     Optional,
     Sequence,
-    Set,
     Type,
     TypeVar,
     Union,
@@ -28,7 +26,6 @@ from typing_extensions import LiteralString, Self
 from datahub.configuration.common import ConfigModel
 from datahub.configuration.source_common import PlatformInstanceConfigMixin
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.mcp_builder import mcps_from_mce
 from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
     auto_patch_last_modified,
 )
@@ -37,7 +34,7 @@ from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
 )
 from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
-from datahub.ingestion.api.report import Report
+from datahub.ingestion.api.report import ExamplesReport, Report
 from datahub.ingestion.api.source_helpers import (
     AutoSystemMetadata,
     auto_browse_path_v2,
@@ -50,9 +47,8 @@ from datahub.ingestion.api.source_helpers import (
     auto_workunit_reporter,
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
-from datahub.metadata.schema_classes import UpstreamLineageClass
 from datahub.sdk.entity import Entity
+from datahub.telemetry import stats
 from datahub.utilities.lossy_collections import LossyDict, LossyList
 from datahub.utilities.type_annotations import get_class_from_annotation

@@ -76,6 +72,7 @@ class SourceCapability(Enum):
     SCHEMA_METADATA = "Schema Metadata"
     CONTAINERS = "Asset Containers"
     CLASSIFICATION = "Classification"
+    TEST_CONNECTION = "Test Connection"


 class StructuredLogLevel(Enum):
@@ -190,20 +187,11 @@ class StructuredLogs(Report):


 @dataclass
-class SourceReport(Report):
+class SourceReport(ExamplesReport):
     event_not_produced_warn: bool = True
     events_produced: int = 0
     events_produced_per_sec: int = 0

-    _urns_seen: Set[str] = field(default_factory=set)
-    entities: Dict[str, list] = field(default_factory=lambda: defaultdict(LossyList))
-    aspects: Dict[str, Dict[str, int]] = field(
-        default_factory=lambda: defaultdict(lambda: defaultdict(int))
-    )
-    aspect_urn_samples: Dict[str, Dict[str, LossyList[str]]] = field(
-        default_factory=lambda: defaultdict(lambda: defaultdict(LossyList))
-    )
-
     _structured_logs: StructuredLogs = field(default_factory=StructuredLogs)

     @property
@@ -220,33 +208,10 @@ class SourceReport(Report):

     def report_workunit(self, wu: WorkUnit) -> None:
         self.events_produced += 1
+        if not isinstance(wu, MetadataWorkUnit):
+            return

-
-        urn = wu.get_urn()
-
-        # Specialized entity reporting.
-        if not isinstance(wu.metadata, MetadataChangeEvent):
-            mcps = [wu.metadata]
-        else:
-            mcps = list(mcps_from_mce(wu.metadata))
-
-        for mcp in mcps:
-            entityType = mcp.entityType
-            aspectName = mcp.aspectName
-
-            if urn not in self._urns_seen:
-                self._urns_seen.add(urn)
-                self.entities[entityType].append(urn)
-
-            if aspectName is not None:  # usually true
-                self.aspects[entityType][aspectName] += 1
-                self.aspect_urn_samples[entityType][aspectName].append(urn)
-                if isinstance(mcp.aspect, UpstreamLineageClass):
-                    upstream_lineage = cast(UpstreamLineageClass, mcp.aspect)
-                    if upstream_lineage.fineGrainedLineages:
-                        self.aspect_urn_samples[entityType][
-                            "fineGrainedLineages"
-                        ].append(urn)
+        super()._store_workunit_data(wu)

     def report_warning(
         self,
@@ -265,9 +230,10 @@ class SourceReport(Report):
         context: Optional[str] = None,
         title: Optional[LiteralString] = None,
         exc: Optional[BaseException] = None,
+        log: bool = True,
     ) -> None:
         self._structured_logs.report_log(
-            StructuredLogLevel.WARN, message, title, context, exc, log=False
+            StructuredLogLevel.WARN, message, title, context, exc, log=log
         )

     def report_failure(
@@ -325,6 +291,7 @@ class SourceReport(Report):
         )

     def __post_init__(self) -> None:
+        super().__post_init__()
         self.start_time = datetime.datetime.now()
         self.running_time: datetime.timedelta = datetime.timedelta(seconds=0)

@@ -337,6 +304,43 @@ class SourceReport(Report):
             "infos": Report.to_pure_python_obj(self.infos),
         }

+    @staticmethod
+    def _discretize_dict_values(
+        nested_dict: Dict[str, Dict[str, int]],
+    ) -> Dict[str, Dict[str, int]]:
+        """Helper method to discretize values in a nested dictionary structure."""
+        result = {}
+        for outer_key, inner_dict in nested_dict.items():
+            discretized_dict: Dict[str, int] = {}
+            for inner_key, count in inner_dict.items():
+                discretized_dict[inner_key] = stats.discretize(count)
+            result[outer_key] = discretized_dict
+        return result
+
+    def get_aspects_dict(self) -> Dict[str, Dict[str, int]]:
+        """Convert the nested defaultdict aspects to a regular dict for serialization."""
+        return self._discretize_dict_values(self.aspects)
+
+    def get_aspects_by_subtypes_dict(self) -> Dict[str, Dict[str, Dict[str, int]]]:
+        """Get aspect counts grouped by entity type and subtype."""
+        return self._discretize_dict_values_nested(self.aspects_by_subtypes)
+
+    @staticmethod
+    def _discretize_dict_values_nested(
+        nested_dict: Dict[str, Dict[str, Dict[str, int]]],
+    ) -> Dict[str, Dict[str, Dict[str, int]]]:
+        """Helper method to discretize values in a nested dictionary structure with three levels."""
+        result = {}
+        for outer_key, middle_dict in nested_dict.items():
+            discretized_middle_dict: Dict[str, Dict[str, int]] = {}
+            for middle_key, inner_dict in middle_dict.items():
+                discretized_inner_dict: Dict[str, int] = {}
+                for inner_key, count in inner_dict.items():
+                    discretized_inner_dict[inner_key] = stats.discretize(count)
+                discretized_middle_dict[middle_key] = discretized_inner_dict
+            result[outer_key] = discretized_middle_dict
+        return result
+
     def compute_stats(self) -> None:
         super().compute_stats()

@@ -503,7 +507,7 @@ class Source(Closeable, metaclass=ABCMeta):
         pass

     def close(self) -> None:
-        pass
+        self.get_report().close()

     def _infer_platform(self) -> Optional[str]:
         config = self.get_config()
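`get_aspects_dict()` and `get_aspects_by_subtypes_dict()` run every count through `stats.discretize` before the report leaves the process, so telemetry only ever sees coarse magnitudes rather than exact counts. The bucketing used by `datahub.telemetry.stats.discretize` is not shown in this diff; a stand-in with the same intent, assuming rounding down to one significant digit:

def discretize_example(value: int) -> int:
    """Collapse a count to a coarse bucket (one significant digit)."""
    if value <= 0:
        return value
    magnitude = 10 ** (len(str(value)) - 1)
    return (value // magnitude) * magnitude

assert discretize_example(7) == 7
assert discretize_example(987) == 900
assert discretize_example(1234) == 1000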