acryl-datahub 0.15.0.6rc2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2522 -2493
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +205 -192
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +4 -3
- datahub/api/entities/dataset/dataset.py +731 -42
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/check_cli.py +72 -19
- datahub/cli/docker_cli.py +3 -3
- datahub/cli/iceberg_cli.py +31 -7
- datahub/cli/ingest_cli.py +30 -93
- datahub/cli/lite_cli.py +4 -2
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/dataset_cli.py +128 -14
- datahub/configuration/common.py +10 -2
- datahub/configuration/git.py +1 -3
- datahub/configuration/kafka.py +1 -1
- datahub/emitter/mce_builder.py +28 -13
- datahub/emitter/mcp_builder.py +4 -1
- datahub/emitter/response_helper.py +145 -0
- datahub/emitter/rest_emitter.py +323 -10
- datahub/ingestion/api/decorators.py +1 -1
- datahub/ingestion/api/source_helpers.py +4 -0
- datahub/ingestion/fs/s3_fs.py +2 -2
- datahub/ingestion/glossary/classification_mixin.py +1 -5
- datahub/ingestion/graph/client.py +41 -22
- datahub/ingestion/graph/entity_versioning.py +3 -3
- datahub/ingestion/graph/filters.py +64 -37
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
- datahub/ingestion/run/pipeline.py +112 -148
- datahub/ingestion/run/sink_callback.py +77 -0
- datahub/ingestion/sink/datahub_rest.py +8 -0
- datahub/ingestion/source/abs/config.py +2 -4
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
- datahub/ingestion/source/cassandra/cassandra.py +152 -233
- datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
- datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
- datahub/ingestion/source/common/subtypes.py +12 -0
- datahub/ingestion/source/csv_enricher.py +3 -3
- datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
- datahub/ingestion/source/dbt/dbt_common.py +8 -5
- datahub/ingestion/source/dbt/dbt_core.py +11 -9
- datahub/ingestion/source/dbt/dbt_tests.py +4 -8
- datahub/ingestion/source/delta_lake/config.py +8 -1
- datahub/ingestion/source/delta_lake/report.py +4 -2
- datahub/ingestion/source/delta_lake/source.py +20 -5
- datahub/ingestion/source/dremio/dremio_api.py +4 -8
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
- datahub/ingestion/source/dynamodb/dynamodb.py +6 -0
- datahub/ingestion/source/elastic_search.py +26 -6
- datahub/ingestion/source/feast.py +27 -8
- datahub/ingestion/source/file.py +6 -3
- datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
- datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
- datahub/ingestion/source/ge_data_profiler.py +12 -15
- datahub/ingestion/source/iceberg/iceberg.py +46 -12
- datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
- datahub/ingestion/source/identity/okta.py +37 -7
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/kafka_connect/common.py +2 -7
- datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
- datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
- datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
- datahub/ingestion/source/looker/looker_common.py +6 -5
- datahub/ingestion/source/looker/looker_file_loader.py +2 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
- datahub/ingestion/source/looker/looker_source.py +1 -1
- datahub/ingestion/source/looker/looker_template_language.py +4 -2
- datahub/ingestion/source/looker/lookml_source.py +3 -2
- datahub/ingestion/source/metabase.py +57 -35
- datahub/ingestion/source/metadata/business_glossary.py +45 -3
- datahub/ingestion/source/metadata/lineage.py +2 -2
- datahub/ingestion/source/mlflow.py +365 -35
- datahub/ingestion/source/mode.py +18 -8
- datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
- datahub/ingestion/source/nifi.py +37 -11
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/openapi_parser.py +49 -17
- datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
- datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -3
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
- datahub/ingestion/source/preset.py +7 -4
- datahub/ingestion/source/pulsar.py +3 -2
- datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
- datahub/ingestion/source/redash.py +31 -7
- datahub/ingestion/source/redshift/config.py +4 -0
- datahub/ingestion/source/redshift/datashares.py +236 -0
- datahub/ingestion/source/redshift/lineage.py +6 -2
- datahub/ingestion/source/redshift/lineage_v2.py +24 -9
- datahub/ingestion/source/redshift/profile.py +1 -1
- datahub/ingestion/source/redshift/query.py +133 -33
- datahub/ingestion/source/redshift/redshift.py +46 -73
- datahub/ingestion/source/redshift/redshift_schema.py +186 -6
- datahub/ingestion/source/redshift/report.py +3 -0
- datahub/ingestion/source/s3/config.py +5 -5
- datahub/ingestion/source/s3/source.py +20 -41
- datahub/ingestion/source/salesforce.py +550 -275
- datahub/ingestion/source/schema_inference/object.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/slack/slack.py +31 -10
- datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
- datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
- datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
- datahub/ingestion/source/sql/athena.py +10 -16
- datahub/ingestion/source/sql/druid.py +1 -5
- datahub/ingestion/source/sql/hive.py +15 -6
- datahub/ingestion/source/sql/hive_metastore.py +3 -2
- datahub/ingestion/source/sql/mssql/job_models.py +29 -0
- datahub/ingestion/source/sql/mssql/source.py +11 -5
- datahub/ingestion/source/sql/oracle.py +127 -63
- datahub/ingestion/source/sql/sql_common.py +16 -18
- datahub/ingestion/source/sql/sql_types.py +2 -2
- datahub/ingestion/source/sql/teradata.py +19 -5
- datahub/ingestion/source/sql/trino.py +2 -2
- datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
- datahub/ingestion/source/superset.py +222 -62
- datahub/ingestion/source/tableau/tableau.py +22 -6
- datahub/ingestion/source/tableau/tableau_common.py +3 -2
- datahub/ingestion/source/unity/ge_profiler.py +2 -1
- datahub/ingestion/source/unity/source.py +11 -1
- datahub/ingestion/source/vertexai.py +697 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
- datahub/lite/duckdb_lite.py +3 -10
- datahub/lite/lite_local.py +1 -1
- datahub/lite/lite_util.py +4 -3
- datahub/metadata/_schema_classes.py +714 -417
- datahub/metadata/_urns/urn_defs.py +1673 -1649
- datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
- datahub/metadata/schema.avsc +16438 -16603
- datahub/metadata/schemas/AssertionInfo.avsc +3 -1
- datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
- datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
- datahub/metadata/schemas/ChartInfo.avsc +1 -0
- datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
- datahub/metadata/schemas/DataProcessKey.avsc +2 -1
- datahub/metadata/schemas/DataProductKey.avsc +2 -1
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
- datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
- datahub/metadata/schemas/IncidentInfo.avsc +130 -46
- datahub/metadata/schemas/InputFields.avsc +3 -1
- datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
- datahub/metadata/schemas/MLModelKey.avsc +3 -1
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
- datahub/metadata/schemas/PostKey.avsc +2 -1
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
- datahub/metadata/schemas/VersionProperties.avsc +18 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
- datahub/pydantic/__init__.py +0 -0
- datahub/pydantic/compat.py +58 -0
- datahub/sdk/__init__.py +30 -12
- datahub/sdk/_all_entities.py +1 -1
- datahub/sdk/_attribution.py +4 -0
- datahub/sdk/_shared.py +258 -16
- datahub/sdk/_utils.py +35 -0
- datahub/sdk/container.py +30 -6
- datahub/sdk/dataset.py +118 -20
- datahub/sdk/{_entity.py → entity.py} +24 -1
- datahub/sdk/entity_client.py +1 -1
- datahub/sdk/main_client.py +23 -0
- datahub/sdk/resolver_client.py +17 -29
- datahub/sdk/search_client.py +50 -0
- datahub/sdk/search_filters.py +374 -0
- datahub/specific/dataset.py +3 -4
- datahub/sql_parsing/_sqlglot_patch.py +2 -10
- datahub/sql_parsing/schema_resolver.py +1 -1
- datahub/sql_parsing/split_statements.py +220 -126
- datahub/sql_parsing/sql_parsing_common.py +7 -0
- datahub/sql_parsing/sqlglot_lineage.py +1 -1
- datahub/sql_parsing/sqlglot_utils.py +1 -4
- datahub/testing/check_sql_parser_result.py +5 -6
- datahub/testing/compare_metadata_json.py +7 -6
- datahub/testing/pytest_hooks.py +56 -0
- datahub/upgrade/upgrade.py +2 -2
- datahub/utilities/file_backed_collections.py +3 -14
- datahub/utilities/ingest_utils.py +106 -0
- datahub/utilities/mapping.py +1 -1
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/sentinels.py +22 -0
- datahub/utilities/unified_diff.py +5 -1
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
|
@@ -1,15 +1,20 @@
|
|
|
1
|
+
import time
|
|
1
2
|
from dataclasses import dataclass
|
|
2
|
-
from typing import Any, Callable, Iterable, Optional, TypeVar, Union
|
|
3
|
+
from typing import Any, Callable, Iterable, List, Optional, TypeVar, Union
|
|
3
4
|
|
|
4
5
|
from mlflow import MlflowClient
|
|
5
|
-
from mlflow.entities import Run
|
|
6
|
+
from mlflow.entities import Experiment, Run
|
|
6
7
|
from mlflow.entities.model_registry import ModelVersion, RegisteredModel
|
|
7
8
|
from mlflow.store.entities import PagedList
|
|
8
9
|
from pydantic.fields import Field
|
|
9
10
|
|
|
10
11
|
import datahub.emitter.mce_builder as builder
|
|
12
|
+
from datahub.api.entities.dataprocess.dataprocess_instance import (
|
|
13
|
+
DataProcessInstance,
|
|
14
|
+
)
|
|
11
15
|
from datahub.configuration.source_common import EnvConfigMixin
|
|
12
16
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
17
|
+
from datahub.emitter.mcp_builder import ContainerKey
|
|
13
18
|
from datahub.ingestion.api.common import PipelineContext
|
|
14
19
|
from datahub.ingestion.api.decorators import (
|
|
15
20
|
SupportStatus,
|
|
@@ -18,24 +23,62 @@ from datahub.ingestion.api.decorators import (
|
|
|
18
23
|
platform_name,
|
|
19
24
|
support_status,
|
|
20
25
|
)
|
|
21
|
-
from datahub.ingestion.api.source import
|
|
26
|
+
from datahub.ingestion.api.source import (
|
|
27
|
+
MetadataWorkUnitProcessor,
|
|
28
|
+
SourceCapability,
|
|
29
|
+
SourceReport,
|
|
30
|
+
)
|
|
22
31
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
32
|
+
from datahub.ingestion.source.common.subtypes import MLAssetSubTypes
|
|
33
|
+
from datahub.ingestion.source.state.stale_entity_removal_handler import (
|
|
34
|
+
StaleEntityRemovalHandler,
|
|
35
|
+
StaleEntityRemovalSourceReport,
|
|
36
|
+
)
|
|
37
|
+
from datahub.ingestion.source.state.stateful_ingestion_base import (
|
|
38
|
+
StatefulIngestionConfigBase,
|
|
39
|
+
StatefulIngestionSourceBase,
|
|
40
|
+
)
|
|
23
41
|
from datahub.metadata.schema_classes import (
|
|
42
|
+
AuditStampClass,
|
|
43
|
+
ContainerClass,
|
|
44
|
+
DataPlatformInstanceClass,
|
|
45
|
+
DataProcessInstanceOutputClass,
|
|
46
|
+
DataProcessInstancePropertiesClass,
|
|
47
|
+
DataProcessInstanceRunEventClass,
|
|
48
|
+
DataProcessInstanceRunResultClass,
|
|
49
|
+
DataProcessRunStatusClass,
|
|
50
|
+
EdgeClass,
|
|
24
51
|
GlobalTagsClass,
|
|
52
|
+
MetadataAttributionClass,
|
|
25
53
|
MLHyperParamClass,
|
|
26
54
|
MLMetricClass,
|
|
27
55
|
MLModelGroupPropertiesClass,
|
|
28
56
|
MLModelPropertiesClass,
|
|
57
|
+
MLTrainingRunPropertiesClass,
|
|
58
|
+
PlatformResourceInfoClass,
|
|
59
|
+
SubTypesClass,
|
|
29
60
|
TagAssociationClass,
|
|
30
61
|
TagPropertiesClass,
|
|
62
|
+
TimeStampClass,
|
|
63
|
+
VersionPropertiesClass,
|
|
31
64
|
VersionTagClass,
|
|
32
65
|
_Aspect,
|
|
33
66
|
)
|
|
67
|
+
from datahub.metadata.urns import (
|
|
68
|
+
DataPlatformUrn,
|
|
69
|
+
MlModelUrn,
|
|
70
|
+
VersionSetUrn,
|
|
71
|
+
)
|
|
72
|
+
from datahub.sdk.container import Container
|
|
34
73
|
|
|
35
74
|
T = TypeVar("T")
|
|
36
75
|
|
|
37
76
|
|
|
38
|
-
class
|
|
77
|
+
class ContainerKeyWithId(ContainerKey):
|
|
78
|
+
id: str
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
|
|
39
82
|
tracking_uri: Optional[str] = Field(
|
|
40
83
|
default=None,
|
|
41
84
|
description=(
|
|
@@ -79,7 +122,7 @@ class MLflowRegisteredModelStageInfo:
|
|
|
79
122
|
"Extract descriptions for MLflow Registered Models and Model Versions",
|
|
80
123
|
)
|
|
81
124
|
@capability(SourceCapability.TAGS, "Extract tags for MLflow Registered Model Stages")
|
|
82
|
-
class MLflowSource(
|
|
125
|
+
class MLflowSource(StatefulIngestionSourceBase):
|
|
83
126
|
platform = "mlflow"
|
|
84
127
|
registered_model_stages_info = (
|
|
85
128
|
MLflowRegisteredModelStageInfo(
|
|
@@ -105,9 +148,10 @@ class MLflowSource(Source):
|
|
|
105
148
|
)
|
|
106
149
|
|
|
107
150
|
def __init__(self, ctx: PipelineContext, config: MLflowConfig):
|
|
108
|
-
super().__init__(ctx)
|
|
151
|
+
super().__init__(config, ctx)
|
|
152
|
+
self.ctx = ctx
|
|
109
153
|
self.config = config
|
|
110
|
-
self.report =
|
|
154
|
+
self.report = StaleEntityRemovalSourceReport()
|
|
111
155
|
self.client = MlflowClient(
|
|
112
156
|
tracking_uri=self.config.tracking_uri,
|
|
113
157
|
registry_uri=self.config.registry_uri,
|
|
@@ -116,8 +160,17 @@ class MLflowSource(Source):
|
|
|
116
160
|
def get_report(self) -> SourceReport:
|
|
117
161
|
return self.report
|
|
118
162
|
|
|
163
|
+
def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
|
|
164
|
+
return [
|
|
165
|
+
*super().get_workunit_processors(),
|
|
166
|
+
StaleEntityRemovalHandler.create(
|
|
167
|
+
self, self.config, self.ctx
|
|
168
|
+
).workunit_processor,
|
|
169
|
+
]
|
|
170
|
+
|
|
119
171
|
def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
|
|
120
172
|
yield from self._get_tags_workunits()
|
|
173
|
+
yield from self._get_experiment_workunits()
|
|
121
174
|
yield from self._get_ml_model_workunits()
|
|
122
175
|
|
|
123
176
|
def _get_tags_workunits(self) -> Iterable[MetadataWorkUnit]:
|
|
@@ -151,22 +204,162 @@ class MLflowSource(Source):
|
|
|
151
204
|
aspect=aspect,
|
|
152
205
|
).as_workunit()
|
|
153
206
|
|
|
154
|
-
def
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
207
|
+
def _get_experiment_workunits(self) -> Iterable[MetadataWorkUnit]:
|
|
208
|
+
experiments = self._get_mlflow_experiments()
|
|
209
|
+
for experiment in experiments:
|
|
210
|
+
yield from self._get_experiment_container_workunit(experiment)
|
|
211
|
+
|
|
212
|
+
runs = self._get_mlflow_runs_from_experiment(experiment)
|
|
213
|
+
if runs:
|
|
214
|
+
for run in runs:
|
|
215
|
+
yield from self._get_run_workunits(experiment, run)
|
|
216
|
+
|
|
217
|
+
def _get_experiment_custom_properties(self, experiment):
|
|
218
|
+
experiment_custom_props = getattr(experiment, "tags", {}) or {}
|
|
219
|
+
experiment_custom_props.pop("mlflow.note.content", None)
|
|
220
|
+
experiment_custom_props["artifacts_location"] = experiment.artifact_location
|
|
221
|
+
return experiment_custom_props
|
|
222
|
+
|
|
223
|
+
def _get_experiment_container_workunit(
|
|
224
|
+
self, experiment: Experiment
|
|
225
|
+
) -> Iterable[MetadataWorkUnit]:
|
|
226
|
+
experiment_container = Container(
|
|
227
|
+
container_key=ContainerKeyWithId(
|
|
228
|
+
platform=str(DataPlatformUrn(platform_name=self.platform)),
|
|
229
|
+
id=experiment.name,
|
|
230
|
+
),
|
|
231
|
+
subtype=MLAssetSubTypes.MLFLOW_EXPERIMENT,
|
|
232
|
+
display_name=experiment.name,
|
|
233
|
+
description=experiment.tags.get("mlflow.note.content"),
|
|
234
|
+
extra_properties=self._get_experiment_custom_properties(experiment),
|
|
235
|
+
)
|
|
236
|
+
|
|
237
|
+
yield from experiment_container.as_workunits()
|
|
238
|
+
|
|
239
|
+
def _get_run_metrics(self, run: Run) -> List[MLMetricClass]:
|
|
240
|
+
return [
|
|
241
|
+
MLMetricClass(name=k, value=str(v)) for k, v in run.data.metrics.items()
|
|
242
|
+
]
|
|
243
|
+
|
|
244
|
+
def _get_run_params(self, run: Run) -> List[MLHyperParamClass]:
|
|
245
|
+
return [
|
|
246
|
+
MLHyperParamClass(name=k, value=str(v)) for k, v in run.data.params.items()
|
|
247
|
+
]
|
|
248
|
+
|
|
249
|
+
def _convert_run_result_type(
|
|
250
|
+
self, status: str
|
|
251
|
+
) -> DataProcessInstanceRunResultClass:
|
|
252
|
+
if status == "FINISHED":
|
|
253
|
+
return DataProcessInstanceRunResultClass(
|
|
254
|
+
type="SUCCESS", nativeResultType=self.platform
|
|
255
|
+
)
|
|
256
|
+
elif status == "FAILED":
|
|
257
|
+
return DataProcessInstanceRunResultClass(
|
|
258
|
+
type="FAILURE", nativeResultType=self.platform
|
|
259
|
+
)
|
|
260
|
+
else:
|
|
261
|
+
return DataProcessInstanceRunResultClass(
|
|
262
|
+
type="SKIPPED", nativeResultType=self.platform
|
|
263
|
+
)
|
|
264
|
+
|
|
265
|
+
def _get_run_workunits(
|
|
266
|
+
self, experiment: Experiment, run: Run
|
|
267
|
+
) -> Iterable[MetadataWorkUnit]:
|
|
268
|
+
experiment_key = ContainerKeyWithId(
|
|
269
|
+
platform=str(DataPlatformUrn(self.platform)), id=experiment.name
|
|
270
|
+
)
|
|
271
|
+
|
|
272
|
+
data_process_instance = DataProcessInstance(
|
|
273
|
+
id=run.info.run_id,
|
|
274
|
+
orchestrator=self.platform,
|
|
275
|
+
template_urn=None,
|
|
276
|
+
)
|
|
277
|
+
|
|
278
|
+
created_time = run.info.start_time or int(time.time() * 1000)
|
|
279
|
+
user_id = run.info.user_id if run.info.user_id else "mlflow"
|
|
280
|
+
guid_dict_user = {"platform": self.platform, "user": user_id}
|
|
281
|
+
platform_user_urn = (
|
|
282
|
+
f"urn:li:platformResource:{builder.datahub_guid(guid_dict_user)}"
|
|
283
|
+
)
|
|
284
|
+
|
|
285
|
+
yield MetadataChangeProposalWrapper(
|
|
286
|
+
entityUrn=platform_user_urn,
|
|
287
|
+
aspect=PlatformResourceInfoClass(
|
|
288
|
+
resourceType="user",
|
|
289
|
+
primaryKey=user_id,
|
|
290
|
+
),
|
|
291
|
+
).as_workunit()
|
|
292
|
+
|
|
293
|
+
yield MetadataChangeProposalWrapper(
|
|
294
|
+
entityUrn=str(data_process_instance.urn),
|
|
295
|
+
aspect=DataProcessInstancePropertiesClass(
|
|
296
|
+
name=run.info.run_name or run.info.run_id,
|
|
297
|
+
created=AuditStampClass(
|
|
298
|
+
time=created_time,
|
|
299
|
+
actor=platform_user_urn,
|
|
300
|
+
),
|
|
301
|
+
externalUrl=self._make_external_url_from_run(experiment, run),
|
|
302
|
+
customProperties=getattr(run, "tags", {}) or {},
|
|
303
|
+
),
|
|
304
|
+
).as_workunit()
|
|
305
|
+
|
|
306
|
+
yield MetadataChangeProposalWrapper(
|
|
307
|
+
entityUrn=str(data_process_instance.urn),
|
|
308
|
+
aspect=ContainerClass(container=experiment_key.as_urn()),
|
|
309
|
+
).as_workunit()
|
|
310
|
+
|
|
311
|
+
model_versions = self.get_mlflow_model_versions_from_run(run.info.run_id)
|
|
312
|
+
if model_versions:
|
|
313
|
+
model_version_urn = self._make_ml_model_urn(model_versions[0])
|
|
314
|
+
yield MetadataChangeProposalWrapper(
|
|
315
|
+
entityUrn=str(data_process_instance.urn),
|
|
316
|
+
aspect=DataProcessInstanceOutputClass(
|
|
317
|
+
outputs=[],
|
|
318
|
+
outputEdges=[
|
|
319
|
+
EdgeClass(destinationUrn=model_version_urn),
|
|
320
|
+
],
|
|
321
|
+
),
|
|
322
|
+
).as_workunit()
|
|
323
|
+
|
|
324
|
+
metrics = self._get_run_metrics(run)
|
|
325
|
+
hyperparams = self._get_run_params(run)
|
|
326
|
+
yield MetadataChangeProposalWrapper(
|
|
327
|
+
entityUrn=str(data_process_instance.urn),
|
|
328
|
+
aspect=MLTrainingRunPropertiesClass(
|
|
329
|
+
hyperParams=hyperparams,
|
|
330
|
+
trainingMetrics=metrics,
|
|
331
|
+
outputUrls=[run.info.artifact_uri],
|
|
332
|
+
id=run.info.run_id,
|
|
333
|
+
),
|
|
334
|
+
).as_workunit()
|
|
335
|
+
|
|
336
|
+
if run.info.end_time:
|
|
337
|
+
duration_millis = run.info.end_time - run.info.start_time
|
|
338
|
+
|
|
339
|
+
yield MetadataChangeProposalWrapper(
|
|
340
|
+
entityUrn=str(data_process_instance.urn),
|
|
341
|
+
aspect=DataProcessInstanceRunEventClass(
|
|
342
|
+
status=DataProcessRunStatusClass.COMPLETE,
|
|
343
|
+
timestampMillis=run.info.end_time,
|
|
344
|
+
result=DataProcessInstanceRunResultClass(
|
|
345
|
+
type=self._convert_run_result_type(run.info.status).type,
|
|
346
|
+
nativeResultType=self.platform,
|
|
347
|
+
),
|
|
348
|
+
durationMillis=duration_millis,
|
|
349
|
+
),
|
|
350
|
+
).as_workunit()
|
|
351
|
+
|
|
352
|
+
yield MetadataChangeProposalWrapper(
|
|
353
|
+
entityUrn=str(data_process_instance.urn),
|
|
354
|
+
aspect=DataPlatformInstanceClass(
|
|
355
|
+
platform=str(DataPlatformUrn(self.platform))
|
|
356
|
+
),
|
|
357
|
+
).as_workunit()
|
|
358
|
+
|
|
359
|
+
yield MetadataChangeProposalWrapper(
|
|
360
|
+
entityUrn=str(data_process_instance.urn),
|
|
361
|
+
aspect=SubTypesClass(typeNames=[MLAssetSubTypes.MLFLOW_TRAINING_RUN]),
|
|
362
|
+
).as_workunit()
|
|
170
363
|
|
|
171
364
|
def _get_mlflow_registered_models(self) -> Iterable[RegisteredModel]:
|
|
172
365
|
"""
|
|
@@ -179,6 +372,19 @@ class MLflowSource(Source):
|
|
|
179
372
|
)
|
|
180
373
|
return registered_models
|
|
181
374
|
|
|
375
|
+
def _get_mlflow_experiments(self) -> Iterable[Experiment]:
|
|
376
|
+
experiments: Iterable[Experiment] = self._traverse_mlflow_search_func(
|
|
377
|
+
search_func=self.client.search_experiments,
|
|
378
|
+
)
|
|
379
|
+
return experiments
|
|
380
|
+
|
|
381
|
+
def _get_mlflow_runs_from_experiment(self, experiment: Experiment) -> Iterable[Run]:
|
|
382
|
+
runs: Iterable[Run] = self._traverse_mlflow_search_func(
|
|
383
|
+
search_func=self.client.search_runs,
|
|
384
|
+
experiment_ids=[experiment.experiment_id],
|
|
385
|
+
)
|
|
386
|
+
return runs
|
|
387
|
+
|
|
182
388
|
@staticmethod
|
|
183
389
|
def _traverse_mlflow_search_func(
|
|
184
390
|
search_func: Callable[..., PagedList[T]],
|
|
@@ -195,6 +401,13 @@ class MLflowSource(Source):
|
|
|
195
401
|
if not next_page_token:
|
|
196
402
|
return
|
|
197
403
|
|
|
404
|
+
def _get_latest_version(self, registered_model: RegisteredModel) -> Optional[str]:
|
|
405
|
+
return (
|
|
406
|
+
str(registered_model.latest_versions[0].version)
|
|
407
|
+
if registered_model.latest_versions
|
|
408
|
+
else None
|
|
409
|
+
)
|
|
410
|
+
|
|
198
411
|
def _get_ml_group_workunit(
|
|
199
412
|
self,
|
|
200
413
|
registered_model: RegisteredModel,
|
|
@@ -206,7 +419,20 @@ class MLflowSource(Source):
|
|
|
206
419
|
ml_model_group_properties = MLModelGroupPropertiesClass(
|
|
207
420
|
customProperties=registered_model.tags,
|
|
208
421
|
description=registered_model.description,
|
|
209
|
-
|
|
422
|
+
created=TimeStampClass(
|
|
423
|
+
time=registered_model.creation_timestamp, actor=None
|
|
424
|
+
),
|
|
425
|
+
lastModified=TimeStampClass(
|
|
426
|
+
time=registered_model.last_updated_timestamp,
|
|
427
|
+
actor=None,
|
|
428
|
+
),
|
|
429
|
+
version=VersionTagClass(
|
|
430
|
+
versionTag=self._get_latest_version(registered_model),
|
|
431
|
+
metadataAttribution=MetadataAttributionClass(
|
|
432
|
+
time=registered_model.last_updated_timestamp,
|
|
433
|
+
actor="urn:li:corpuser:datahub",
|
|
434
|
+
),
|
|
435
|
+
),
|
|
210
436
|
)
|
|
211
437
|
wu = self._create_workunit(
|
|
212
438
|
urn=ml_model_group_urn,
|
|
@@ -236,6 +462,16 @@ class MLflowSource(Source):
|
|
|
236
462
|
)
|
|
237
463
|
return model_versions
|
|
238
464
|
|
|
465
|
+
def get_mlflow_model_versions_from_run(self, run_id):
|
|
466
|
+
filter_string = f"run_id = '{run_id}'"
|
|
467
|
+
|
|
468
|
+
model_versions: Iterable[ModelVersion] = self._traverse_mlflow_search_func(
|
|
469
|
+
search_func=self.client.search_model_versions,
|
|
470
|
+
filter_string=filter_string,
|
|
471
|
+
)
|
|
472
|
+
|
|
473
|
+
return list(model_versions)
|
|
474
|
+
|
|
239
475
|
def _get_mlflow_run(self, model_version: ModelVersion) -> Union[None, Run]:
|
|
240
476
|
"""
|
|
241
477
|
Get a Run associated with a Model Version. Some MVs may exist without Run.
|
|
@@ -246,6 +482,67 @@ class MLflowSource(Source):
|
|
|
246
482
|
else:
|
|
247
483
|
return None
|
|
248
484
|
|
|
485
|
+
def _get_ml_model_workunits(self) -> Iterable[MetadataWorkUnit]:
|
|
486
|
+
"""
|
|
487
|
+
Traverse each Registered Model in Model Registry and generate a corresponding workunit.
|
|
488
|
+
"""
|
|
489
|
+
registered_models = self._get_mlflow_registered_models()
|
|
490
|
+
for registered_model in registered_models:
|
|
491
|
+
version_set_urn = self._get_version_set_urn(registered_model)
|
|
492
|
+
yield self._get_ml_group_workunit(registered_model)
|
|
493
|
+
model_versions = self._get_mlflow_model_versions(registered_model)
|
|
494
|
+
for model_version in model_versions:
|
|
495
|
+
run = self._get_mlflow_run(model_version)
|
|
496
|
+
yield self._get_ml_model_properties_workunit(
|
|
497
|
+
registered_model=registered_model,
|
|
498
|
+
model_version=model_version,
|
|
499
|
+
run=run,
|
|
500
|
+
)
|
|
501
|
+
yield self._get_ml_model_version_properties_workunit(
|
|
502
|
+
model_version=model_version,
|
|
503
|
+
version_set_urn=version_set_urn,
|
|
504
|
+
)
|
|
505
|
+
yield self._get_global_tags_workunit(model_version=model_version)
|
|
506
|
+
|
|
507
|
+
def _get_version_set_urn(self, registered_model: RegisteredModel) -> VersionSetUrn:
|
|
508
|
+
guid_dict = {"platform": self.platform, "name": registered_model.name}
|
|
509
|
+
version_set_urn = VersionSetUrn(
|
|
510
|
+
id=builder.datahub_guid(guid_dict),
|
|
511
|
+
entity_type=MlModelUrn.ENTITY_TYPE,
|
|
512
|
+
)
|
|
513
|
+
|
|
514
|
+
return version_set_urn
|
|
515
|
+
|
|
516
|
+
def _get_ml_model_version_properties_workunit(
|
|
517
|
+
self,
|
|
518
|
+
model_version: ModelVersion,
|
|
519
|
+
version_set_urn: VersionSetUrn,
|
|
520
|
+
) -> MetadataWorkUnit:
|
|
521
|
+
ml_model_urn = self._make_ml_model_urn(model_version)
|
|
522
|
+
|
|
523
|
+
# get mlmodel name from ml model urn
|
|
524
|
+
ml_model_version_properties = VersionPropertiesClass(
|
|
525
|
+
version=VersionTagClass(
|
|
526
|
+
versionTag=str(model_version.version),
|
|
527
|
+
metadataAttribution=MetadataAttributionClass(
|
|
528
|
+
time=model_version.creation_timestamp,
|
|
529
|
+
actor="urn:li:corpuser:datahub",
|
|
530
|
+
),
|
|
531
|
+
),
|
|
532
|
+
versionSet=str(version_set_urn),
|
|
533
|
+
sortId=str(model_version.version).zfill(10),
|
|
534
|
+
aliases=[
|
|
535
|
+
VersionTagClass(versionTag=alias) for alias in model_version.aliases
|
|
536
|
+
],
|
|
537
|
+
)
|
|
538
|
+
|
|
539
|
+
wu = MetadataChangeProposalWrapper(
|
|
540
|
+
entityUrn=str(ml_model_urn),
|
|
541
|
+
aspect=ml_model_version_properties,
|
|
542
|
+
).as_workunit()
|
|
543
|
+
|
|
544
|
+
return wu
|
|
545
|
+
|
|
249
546
|
def _get_ml_model_properties_workunit(
|
|
250
547
|
self,
|
|
251
548
|
registered_model: RegisteredModel,
|
|
@@ -259,28 +556,47 @@ class MLflowSource(Source):
|
|
|
259
556
|
"""
|
|
260
557
|
ml_model_group_urn = self._make_ml_model_group_urn(registered_model)
|
|
261
558
|
ml_model_urn = self._make_ml_model_urn(model_version)
|
|
559
|
+
|
|
262
560
|
if run:
|
|
263
|
-
hyperparams
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
561
|
+
# Use the same metrics and hyperparams from the run
|
|
562
|
+
hyperparams = self._get_run_params(run)
|
|
563
|
+
training_metrics = self._get_run_metrics(run)
|
|
564
|
+
run_urn = DataProcessInstance(
|
|
565
|
+
id=run.info.run_id,
|
|
566
|
+
orchestrator=self.platform,
|
|
567
|
+
).urn
|
|
568
|
+
|
|
569
|
+
training_jobs = [str(run_urn)] if run_urn else []
|
|
270
570
|
else:
|
|
271
571
|
hyperparams = None
|
|
272
572
|
training_metrics = None
|
|
573
|
+
training_jobs = []
|
|
574
|
+
|
|
575
|
+
created_time = model_version.creation_timestamp
|
|
576
|
+
created_actor = (
|
|
577
|
+
f"urn:li:platformResource:{model_version.user_id}"
|
|
578
|
+
if model_version.user_id
|
|
579
|
+
else None
|
|
580
|
+
)
|
|
581
|
+
model_version_tags = [f"{k}:{v}" for k, v in model_version.tags.items()]
|
|
582
|
+
|
|
273
583
|
ml_model_properties = MLModelPropertiesClass(
|
|
274
584
|
customProperties=model_version.tags,
|
|
275
585
|
externalUrl=self._make_external_url(model_version),
|
|
586
|
+
lastModified=TimeStampClass(
|
|
587
|
+
time=model_version.last_updated_timestamp,
|
|
588
|
+
actor=None,
|
|
589
|
+
),
|
|
276
590
|
description=model_version.description,
|
|
277
|
-
|
|
278
|
-
|
|
591
|
+
created=TimeStampClass(
|
|
592
|
+
time=created_time,
|
|
593
|
+
actor=created_actor,
|
|
594
|
+
),
|
|
279
595
|
hyperParams=hyperparams,
|
|
280
596
|
trainingMetrics=training_metrics,
|
|
281
|
-
|
|
282
|
-
tags=list(model_version.tags.keys()),
|
|
597
|
+
tags=model_version_tags,
|
|
283
598
|
groups=[ml_model_group_urn],
|
|
599
|
+
trainingJobs=training_jobs,
|
|
284
600
|
)
|
|
285
601
|
wu = self._create_workunit(urn=ml_model_urn, aspect=ml_model_properties)
|
|
286
602
|
return wu
|
|
@@ -314,6 +630,15 @@ class MLflowSource(Source):
|
|
|
314
630
|
else:
|
|
315
631
|
return None
|
|
316
632
|
|
|
633
|
+
def _make_external_url_from_run(
|
|
634
|
+
self, experiment: Experiment, run: Run
|
|
635
|
+
) -> Union[None, str]:
|
|
636
|
+
base_uri = self.client.tracking_uri
|
|
637
|
+
if base_uri.startswith("http"):
|
|
638
|
+
return f"{base_uri.rstrip('/')}/#/experiments/{experiment.experiment_id}/runs/{run.info.run_id}"
|
|
639
|
+
else:
|
|
640
|
+
return None
|
|
641
|
+
|
|
317
642
|
def _get_global_tags_workunit(
|
|
318
643
|
self,
|
|
319
644
|
model_version: ModelVersion,
|
|
@@ -333,3 +658,8 @@ class MLflowSource(Source):
|
|
|
333
658
|
aspect=global_tags,
|
|
334
659
|
)
|
|
335
660
|
return wu
|
|
661
|
+
|
|
662
|
+
@classmethod
|
|
663
|
+
def create(cls, config_dict: dict, ctx: PipelineContext) -> "MLflowSource":
|
|
664
|
+
config = MLflowConfig.parse_obj(config_dict)
|
|
665
|
+
return cls(ctx, config)
|
datahub/ingestion/source/mode.py
CHANGED
|
@@ -23,7 +23,9 @@ from tenacity import retry_if_exception_type, stop_after_attempt, wait_exponenti
|
|
|
23
23
|
|
|
24
24
|
import datahub.emitter.mce_builder as builder
|
|
25
25
|
from datahub.configuration.common import AllowDenyPattern, ConfigModel
|
|
26
|
-
from datahub.configuration.source_common import
|
|
26
|
+
from datahub.configuration.source_common import (
|
|
27
|
+
DatasetLineageProviderConfigBase,
|
|
28
|
+
)
|
|
27
29
|
from datahub.configuration.validate_field_removal import pydantic_removed_field
|
|
28
30
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
29
31
|
from datahub.emitter.mcp_builder import (
|
|
@@ -137,7 +139,10 @@ class ModeAPIConfig(ConfigModel):
|
|
|
137
139
|
)
|
|
138
140
|
|
|
139
141
|
|
|
140
|
-
class ModeConfig(
|
|
142
|
+
class ModeConfig(
|
|
143
|
+
StatefulIngestionConfigBase,
|
|
144
|
+
DatasetLineageProviderConfigBase,
|
|
145
|
+
):
|
|
141
146
|
# See https://mode.com/developer/api-reference/authentication/
|
|
142
147
|
# for authentication
|
|
143
148
|
connect_uri: str = Field(
|
|
@@ -154,7 +159,12 @@ class ModeConfig(StatefulIngestionConfigBase, DatasetLineageProviderConfigBase):
|
|
|
154
159
|
)
|
|
155
160
|
|
|
156
161
|
workspace: str = Field(
|
|
157
|
-
description="The Mode workspace
|
|
162
|
+
description="The Mode workspace username. If you navigate to Workspace Settings > Details, "
|
|
163
|
+
"the url will be `https://app.mode.com/organizations/<workspace-username>`. "
|
|
164
|
+
# The lowercase comment is derived from a comment in a Mode API example.
|
|
165
|
+
# https://mode.com/developer/api-cookbook/management/get-all-reports/
|
|
166
|
+
# > "Note: workspace_name value should be all lowercase"
|
|
167
|
+
"This is distinct from the workspace's display name, and should be all lowercase."
|
|
158
168
|
)
|
|
159
169
|
_default_schema = pydantic_removed_field("default_schema")
|
|
160
170
|
|
|
@@ -372,7 +382,7 @@ class ModeSource(StatefulIngestionSourceBase):
|
|
|
372
382
|
]
|
|
373
383
|
|
|
374
384
|
def _dashboard_urn(self, report_info: dict) -> str:
|
|
375
|
-
return builder.make_dashboard_urn(self.platform, report_info.get("id", ""))
|
|
385
|
+
return builder.make_dashboard_urn(self.platform, str(report_info.get("id", "")))
|
|
376
386
|
|
|
377
387
|
def _parse_last_run_at(self, report_info: dict) -> Optional[int]:
|
|
378
388
|
# Mode queries are refreshed, and that timestamp is reflected correctly here.
|
|
@@ -759,9 +769,9 @@ class ModeSource(StatefulIngestionSourceBase):
|
|
|
759
769
|
return platform, database
|
|
760
770
|
else:
|
|
761
771
|
self.report.report_warning(
|
|
762
|
-
title="
|
|
763
|
-
message=
|
|
764
|
-
f"{data_source_id}",
|
|
772
|
+
title="Unable to construct upstream lineage",
|
|
773
|
+
message="We did not find a data source / connection with a matching ID, meaning that we do not know the platform/database to use in lineage.",
|
|
774
|
+
context=f"Data Source ID: {data_source_id}",
|
|
765
775
|
)
|
|
766
776
|
return None, None
|
|
767
777
|
|
|
@@ -1489,7 +1499,7 @@ class ModeSource(StatefulIngestionSourceBase):
|
|
|
1489
1499
|
sleep_time = error_response.headers.get("retry-after")
|
|
1490
1500
|
if sleep_time is not None:
|
|
1491
1501
|
time.sleep(float(sleep_time))
|
|
1492
|
-
raise HTTPError429
|
|
1502
|
+
raise HTTPError429 from None
|
|
1493
1503
|
|
|
1494
1504
|
raise http_error
|
|
1495
1505
|
|
|
@@ -7,7 +7,9 @@ import pandas as pd
|
|
|
7
7
|
from neo4j import GraphDatabase
|
|
8
8
|
from pydantic.fields import Field
|
|
9
9
|
|
|
10
|
-
from datahub.configuration.source_common import
|
|
10
|
+
from datahub.configuration.source_common import (
|
|
11
|
+
EnvConfigMixin,
|
|
12
|
+
)
|
|
11
13
|
from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn
|
|
12
14
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
13
15
|
from datahub.ingestion.api.common import PipelineContext
|
|
@@ -17,9 +19,19 @@ from datahub.ingestion.api.decorators import (
|
|
|
17
19
|
platform_name,
|
|
18
20
|
support_status,
|
|
19
21
|
)
|
|
20
|
-
from datahub.ingestion.api.source import
|
|
22
|
+
from datahub.ingestion.api.source import (
|
|
23
|
+
MetadataWorkUnitProcessor,
|
|
24
|
+
)
|
|
21
25
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
22
26
|
from datahub.ingestion.source.common.subtypes import DatasetSubTypes
|
|
27
|
+
from datahub.ingestion.source.state.stale_entity_removal_handler import (
|
|
28
|
+
StaleEntityRemovalHandler,
|
|
29
|
+
)
|
|
30
|
+
from datahub.ingestion.source.state.stateful_ingestion_base import (
|
|
31
|
+
StatefulIngestionConfigBase,
|
|
32
|
+
StatefulIngestionReport,
|
|
33
|
+
StatefulIngestionSourceBase,
|
|
34
|
+
)
|
|
23
35
|
from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaFieldDataType
|
|
24
36
|
from datahub.metadata.schema_classes import (
|
|
25
37
|
AuditStampClass,
|
|
@@ -52,7 +64,7 @@ _type_mapping: Dict[Union[Type, str], Type] = {
|
|
|
52
64
|
}
|
|
53
65
|
|
|
54
66
|
|
|
55
|
-
class Neo4jConfig(EnvConfigMixin):
|
|
67
|
+
class Neo4jConfig(EnvConfigMixin, StatefulIngestionConfigBase):
|
|
56
68
|
username: str = Field(description="Neo4j Username")
|
|
57
69
|
password: str = Field(description="Neo4j Password")
|
|
58
70
|
uri: str = Field(description="The URI for the Neo4j server")
|
|
@@ -60,7 +72,7 @@ class Neo4jConfig(EnvConfigMixin):
|
|
|
60
72
|
|
|
61
73
|
|
|
62
74
|
@dataclass
|
|
63
|
-
class Neo4jSourceReport(
|
|
75
|
+
class Neo4jSourceReport(StatefulIngestionReport):
|
|
64
76
|
obj_failures: int = 0
|
|
65
77
|
obj_created: int = 0
|
|
66
78
|
|
|
@@ -68,7 +80,7 @@ class Neo4jSourceReport(SourceReport):
|
|
|
68
80
|
@platform_name("Neo4j", id="neo4j")
|
|
69
81
|
@config_class(Neo4jConfig)
|
|
70
82
|
@support_status(SupportStatus.CERTIFIED)
|
|
71
|
-
class Neo4jSource(
|
|
83
|
+
class Neo4jSource(StatefulIngestionSourceBase):
|
|
72
84
|
NODE = "node"
|
|
73
85
|
RELATIONSHIP = "relationship"
|
|
74
86
|
PLATFORM = "neo4j"
|
|
@@ -76,7 +88,7 @@ class Neo4jSource(Source):
|
|
|
76
88
|
def __init__(self, ctx: PipelineContext, config: Neo4jConfig):
|
|
77
89
|
self.ctx = ctx
|
|
78
90
|
self.config = config
|
|
79
|
-
self.report = Neo4jSourceReport()
|
|
91
|
+
self.report: Neo4jSourceReport = Neo4jSourceReport()
|
|
80
92
|
|
|
81
93
|
@classmethod
|
|
82
94
|
def create(cls, config_dict, ctx):
|
|
@@ -280,7 +292,15 @@ class Neo4jSource(Source):
|
|
|
280
292
|
return record["properties"]
|
|
281
293
|
|
|
282
294
|
def get_relationships(self, record: dict) -> dict:
|
|
283
|
-
return record.get("relationships",
|
|
295
|
+
return record.get("relationships", {})
|
|
296
|
+
|
|
297
|
+
def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
|
|
298
|
+
return [
|
|
299
|
+
*super().get_workunit_processors(),
|
|
300
|
+
StaleEntityRemovalHandler.create(
|
|
301
|
+
self, self.config, self.ctx
|
|
302
|
+
).workunit_processor,
|
|
303
|
+
]
|
|
284
304
|
|
|
285
305
|
def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
|
|
286
306
|
df = self.get_neo4j_metadata(
|