acryl-datahub 0.15.0.6rc2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in the public registry. It is provided for informational purposes only.

This version of acryl-datahub has been flagged as potentially problematic.

Files changed (205)
  1. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2522 -2493
  2. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +205 -192
  3. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +4 -3
  7. datahub/api/entities/dataset/dataset.py +731 -42
  8. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  9. datahub/cli/check_cli.py +72 -19
  10. datahub/cli/docker_cli.py +3 -3
  11. datahub/cli/iceberg_cli.py +31 -7
  12. datahub/cli/ingest_cli.py +30 -93
  13. datahub/cli/lite_cli.py +4 -2
  14. datahub/cli/specific/dataproduct_cli.py +1 -1
  15. datahub/cli/specific/dataset_cli.py +128 -14
  16. datahub/configuration/common.py +10 -2
  17. datahub/configuration/git.py +1 -3
  18. datahub/configuration/kafka.py +1 -1
  19. datahub/emitter/mce_builder.py +28 -13
  20. datahub/emitter/mcp_builder.py +4 -1
  21. datahub/emitter/response_helper.py +145 -0
  22. datahub/emitter/rest_emitter.py +323 -10
  23. datahub/ingestion/api/decorators.py +1 -1
  24. datahub/ingestion/api/source_helpers.py +4 -0
  25. datahub/ingestion/fs/s3_fs.py +2 -2
  26. datahub/ingestion/glossary/classification_mixin.py +1 -5
  27. datahub/ingestion/graph/client.py +41 -22
  28. datahub/ingestion/graph/entity_versioning.py +3 -3
  29. datahub/ingestion/graph/filters.py +64 -37
  30. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  31. datahub/ingestion/run/pipeline.py +112 -148
  32. datahub/ingestion/run/sink_callback.py +77 -0
  33. datahub/ingestion/sink/datahub_rest.py +8 -0
  34. datahub/ingestion/source/abs/config.py +2 -4
  35. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  36. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
  37. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
  38. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  39. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  40. datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
  41. datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
  42. datahub/ingestion/source/common/subtypes.py +12 -0
  43. datahub/ingestion/source/csv_enricher.py +3 -3
  44. datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
  45. datahub/ingestion/source/dbt/dbt_common.py +8 -5
  46. datahub/ingestion/source/dbt/dbt_core.py +11 -9
  47. datahub/ingestion/source/dbt/dbt_tests.py +4 -8
  48. datahub/ingestion/source/delta_lake/config.py +8 -1
  49. datahub/ingestion/source/delta_lake/report.py +4 -2
  50. datahub/ingestion/source/delta_lake/source.py +20 -5
  51. datahub/ingestion/source/dremio/dremio_api.py +4 -8
  52. datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
  53. datahub/ingestion/source/dynamodb/dynamodb.py +6 -0
  54. datahub/ingestion/source/elastic_search.py +26 -6
  55. datahub/ingestion/source/feast.py +27 -8
  56. datahub/ingestion/source/file.py +6 -3
  57. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  58. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  59. datahub/ingestion/source/ge_data_profiler.py +12 -15
  60. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  61. datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
  62. datahub/ingestion/source/identity/okta.py +37 -7
  63. datahub/ingestion/source/kafka/kafka.py +1 -1
  64. datahub/ingestion/source/kafka_connect/common.py +2 -7
  65. datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
  66. datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
  67. datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
  68. datahub/ingestion/source/looker/looker_common.py +6 -5
  69. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  70. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  71. datahub/ingestion/source/looker/looker_source.py +1 -1
  72. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  73. datahub/ingestion/source/looker/lookml_source.py +3 -2
  74. datahub/ingestion/source/metabase.py +57 -35
  75. datahub/ingestion/source/metadata/business_glossary.py +45 -3
  76. datahub/ingestion/source/metadata/lineage.py +2 -2
  77. datahub/ingestion/source/mlflow.py +365 -35
  78. datahub/ingestion/source/mode.py +18 -8
  79. datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
  80. datahub/ingestion/source/nifi.py +37 -11
  81. datahub/ingestion/source/openapi.py +1 -1
  82. datahub/ingestion/source/openapi_parser.py +49 -17
  83. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  84. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  85. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  86. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  87. datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
  88. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
  89. datahub/ingestion/source/preset.py +7 -4
  90. datahub/ingestion/source/pulsar.py +3 -2
  91. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  92. datahub/ingestion/source/redash.py +31 -7
  93. datahub/ingestion/source/redshift/config.py +4 -0
  94. datahub/ingestion/source/redshift/datashares.py +236 -0
  95. datahub/ingestion/source/redshift/lineage.py +6 -2
  96. datahub/ingestion/source/redshift/lineage_v2.py +24 -9
  97. datahub/ingestion/source/redshift/profile.py +1 -1
  98. datahub/ingestion/source/redshift/query.py +133 -33
  99. datahub/ingestion/source/redshift/redshift.py +46 -73
  100. datahub/ingestion/source/redshift/redshift_schema.py +186 -6
  101. datahub/ingestion/source/redshift/report.py +3 -0
  102. datahub/ingestion/source/s3/config.py +5 -5
  103. datahub/ingestion/source/s3/source.py +20 -41
  104. datahub/ingestion/source/salesforce.py +550 -275
  105. datahub/ingestion/source/schema_inference/object.py +1 -1
  106. datahub/ingestion/source/sigma/sigma.py +1 -1
  107. datahub/ingestion/source/slack/slack.py +31 -10
  108. datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
  109. datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
  110. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  111. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  112. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  113. datahub/ingestion/source/sql/athena.py +10 -16
  114. datahub/ingestion/source/sql/druid.py +1 -5
  115. datahub/ingestion/source/sql/hive.py +15 -6
  116. datahub/ingestion/source/sql/hive_metastore.py +3 -2
  117. datahub/ingestion/source/sql/mssql/job_models.py +29 -0
  118. datahub/ingestion/source/sql/mssql/source.py +11 -5
  119. datahub/ingestion/source/sql/oracle.py +127 -63
  120. datahub/ingestion/source/sql/sql_common.py +16 -18
  121. datahub/ingestion/source/sql/sql_types.py +2 -2
  122. datahub/ingestion/source/sql/teradata.py +19 -5
  123. datahub/ingestion/source/sql/trino.py +2 -2
  124. datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
  125. datahub/ingestion/source/superset.py +222 -62
  126. datahub/ingestion/source/tableau/tableau.py +22 -6
  127. datahub/ingestion/source/tableau/tableau_common.py +3 -2
  128. datahub/ingestion/source/unity/ge_profiler.py +2 -1
  129. datahub/ingestion/source/unity/source.py +11 -1
  130. datahub/ingestion/source/vertexai.py +697 -0
  131. datahub/ingestion/source_config/pulsar.py +3 -1
  132. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  133. datahub/lite/duckdb_lite.py +3 -10
  134. datahub/lite/lite_local.py +1 -1
  135. datahub/lite/lite_util.py +4 -3
  136. datahub/metadata/_schema_classes.py +714 -417
  137. datahub/metadata/_urns/urn_defs.py +1673 -1649
  138. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  139. datahub/metadata/schema.avsc +16438 -16603
  140. datahub/metadata/schemas/AssertionInfo.avsc +3 -1
  141. datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
  142. datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
  143. datahub/metadata/schemas/ChartInfo.avsc +1 -0
  144. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  145. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  146. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  147. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  148. datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
  149. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
  150. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  151. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  152. datahub/metadata/schemas/DomainKey.avsc +2 -1
  153. datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
  154. datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
  155. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  156. datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
  157. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  158. datahub/metadata/schemas/InputFields.avsc +3 -1
  159. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  160. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  161. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
  162. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  163. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  164. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  165. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
  166. datahub/metadata/schemas/PostKey.avsc +2 -1
  167. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  168. datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
  169. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
  170. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  171. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  172. datahub/pydantic/__init__.py +0 -0
  173. datahub/pydantic/compat.py +58 -0
  174. datahub/sdk/__init__.py +30 -12
  175. datahub/sdk/_all_entities.py +1 -1
  176. datahub/sdk/_attribution.py +4 -0
  177. datahub/sdk/_shared.py +258 -16
  178. datahub/sdk/_utils.py +35 -0
  179. datahub/sdk/container.py +30 -6
  180. datahub/sdk/dataset.py +118 -20
  181. datahub/sdk/{_entity.py → entity.py} +24 -1
  182. datahub/sdk/entity_client.py +1 -1
  183. datahub/sdk/main_client.py +23 -0
  184. datahub/sdk/resolver_client.py +17 -29
  185. datahub/sdk/search_client.py +50 -0
  186. datahub/sdk/search_filters.py +374 -0
  187. datahub/specific/dataset.py +3 -4
  188. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  189. datahub/sql_parsing/schema_resolver.py +1 -1
  190. datahub/sql_parsing/split_statements.py +220 -126
  191. datahub/sql_parsing/sql_parsing_common.py +7 -0
  192. datahub/sql_parsing/sqlglot_lineage.py +1 -1
  193. datahub/sql_parsing/sqlglot_utils.py +1 -4
  194. datahub/testing/check_sql_parser_result.py +5 -6
  195. datahub/testing/compare_metadata_json.py +7 -6
  196. datahub/testing/pytest_hooks.py +56 -0
  197. datahub/upgrade/upgrade.py +2 -2
  198. datahub/utilities/file_backed_collections.py +3 -14
  199. datahub/utilities/ingest_utils.py +106 -0
  200. datahub/utilities/mapping.py +1 -1
  201. datahub/utilities/memory_footprint.py +3 -2
  202. datahub/utilities/sentinels.py +22 -0
  203. datahub/utilities/unified_diff.py +5 -1
  204. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
  205. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
--- a/datahub/ingestion/source/mlflow.py
+++ b/datahub/ingestion/source/mlflow.py
@@ -1,15 +1,20 @@
+import time
 from dataclasses import dataclass
-from typing import Any, Callable, Iterable, Optional, TypeVar, Union
+from typing import Any, Callable, Iterable, List, Optional, TypeVar, Union
 
 from mlflow import MlflowClient
-from mlflow.entities import Run
+from mlflow.entities import Experiment, Run
 from mlflow.entities.model_registry import ModelVersion, RegisteredModel
 from mlflow.store.entities import PagedList
 from pydantic.fields import Field
 
 import datahub.emitter.mce_builder as builder
+from datahub.api.entities.dataprocess.dataprocess_instance import (
+    DataProcessInstance,
+)
 from datahub.configuration.source_common import EnvConfigMixin
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.mcp_builder import ContainerKey
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -18,24 +23,62 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceCapability, SourceReport
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+    SourceCapability,
+    SourceReport,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import MLAssetSubTypes
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.schema_classes import (
+    AuditStampClass,
+    ContainerClass,
+    DataPlatformInstanceClass,
+    DataProcessInstanceOutputClass,
+    DataProcessInstancePropertiesClass,
+    DataProcessInstanceRunEventClass,
+    DataProcessInstanceRunResultClass,
+    DataProcessRunStatusClass,
+    EdgeClass,
     GlobalTagsClass,
+    MetadataAttributionClass,
     MLHyperParamClass,
     MLMetricClass,
     MLModelGroupPropertiesClass,
     MLModelPropertiesClass,
+    MLTrainingRunPropertiesClass,
+    PlatformResourceInfoClass,
+    SubTypesClass,
     TagAssociationClass,
     TagPropertiesClass,
+    TimeStampClass,
+    VersionPropertiesClass,
     VersionTagClass,
     _Aspect,
 )
+from datahub.metadata.urns import (
+    DataPlatformUrn,
+    MlModelUrn,
+    VersionSetUrn,
+)
+from datahub.sdk.container import Container
 
 T = TypeVar("T")
 
 
-class MLflowConfig(EnvConfigMixin):
+class ContainerKeyWithId(ContainerKey):
+    id: str
+
+
+class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
     tracking_uri: Optional[str] = Field(
         default=None,
         description=(
@@ -79,7 +122,7 @@ class MLflowRegisteredModelStageInfo:
     "Extract descriptions for MLflow Registered Models and Model Versions",
 )
 @capability(SourceCapability.TAGS, "Extract tags for MLflow Registered Model Stages")
-class MLflowSource(Source):
+class MLflowSource(StatefulIngestionSourceBase):
     platform = "mlflow"
     registered_model_stages_info = (
         MLflowRegisteredModelStageInfo(
@@ -105,9 +148,10 @@ class MLflowSource(Source):
     )
 
     def __init__(self, ctx: PipelineContext, config: MLflowConfig):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
+        self.ctx = ctx
         self.config = config
-        self.report = SourceReport()
+        self.report = StaleEntityRemovalSourceReport()
         self.client = MlflowClient(
             tracking_uri=self.config.tracking_uri,
             registry_uri=self.config.registry_uri,
@@ -116,8 +160,17 @@ class MLflowSource(Source):
     def get_report(self) -> SourceReport:
         return self.report
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         yield from self._get_tags_workunits()
+        yield from self._get_experiment_workunits()
         yield from self._get_ml_model_workunits()
 
     def _get_tags_workunits(self) -> Iterable[MetadataWorkUnit]:
@@ -151,22 +204,162 @@ class MLflowSource(Source):
             aspect=aspect,
         ).as_workunit()
 
-    def _get_ml_model_workunits(self) -> Iterable[MetadataWorkUnit]:
-        """
-        Traverse each Registered Model in Model Registry and generate a corresponding workunit.
-        """
-        registered_models = self._get_mlflow_registered_models()
-        for registered_model in registered_models:
-            yield self._get_ml_group_workunit(registered_model)
-            model_versions = self._get_mlflow_model_versions(registered_model)
-            for model_version in model_versions:
-                run = self._get_mlflow_run(model_version)
-                yield self._get_ml_model_properties_workunit(
-                    registered_model=registered_model,
-                    model_version=model_version,
-                    run=run,
-                )
-                yield self._get_global_tags_workunit(model_version=model_version)
+    def _get_experiment_workunits(self) -> Iterable[MetadataWorkUnit]:
+        experiments = self._get_mlflow_experiments()
+        for experiment in experiments:
+            yield from self._get_experiment_container_workunit(experiment)
+
+            runs = self._get_mlflow_runs_from_experiment(experiment)
+            if runs:
+                for run in runs:
+                    yield from self._get_run_workunits(experiment, run)
+
+    def _get_experiment_custom_properties(self, experiment):
+        experiment_custom_props = getattr(experiment, "tags", {}) or {}
+        experiment_custom_props.pop("mlflow.note.content", None)
+        experiment_custom_props["artifacts_location"] = experiment.artifact_location
+        return experiment_custom_props
+
+    def _get_experiment_container_workunit(
+        self, experiment: Experiment
+    ) -> Iterable[MetadataWorkUnit]:
+        experiment_container = Container(
+            container_key=ContainerKeyWithId(
+                platform=str(DataPlatformUrn(platform_name=self.platform)),
+                id=experiment.name,
+            ),
+            subtype=MLAssetSubTypes.MLFLOW_EXPERIMENT,
+            display_name=experiment.name,
+            description=experiment.tags.get("mlflow.note.content"),
+            extra_properties=self._get_experiment_custom_properties(experiment),
+        )
+
+        yield from experiment_container.as_workunits()
+
+    def _get_run_metrics(self, run: Run) -> List[MLMetricClass]:
+        return [
+            MLMetricClass(name=k, value=str(v)) for k, v in run.data.metrics.items()
+        ]
+
+    def _get_run_params(self, run: Run) -> List[MLHyperParamClass]:
+        return [
+            MLHyperParamClass(name=k, value=str(v)) for k, v in run.data.params.items()
+        ]
+
+    def _convert_run_result_type(
+        self, status: str
+    ) -> DataProcessInstanceRunResultClass:
+        if status == "FINISHED":
+            return DataProcessInstanceRunResultClass(
+                type="SUCCESS", nativeResultType=self.platform
+            )
+        elif status == "FAILED":
+            return DataProcessInstanceRunResultClass(
+                type="FAILURE", nativeResultType=self.platform
+            )
+        else:
+            return DataProcessInstanceRunResultClass(
+                type="SKIPPED", nativeResultType=self.platform
+            )
+
+    def _get_run_workunits(
+        self, experiment: Experiment, run: Run
+    ) -> Iterable[MetadataWorkUnit]:
+        experiment_key = ContainerKeyWithId(
+            platform=str(DataPlatformUrn(self.platform)), id=experiment.name
+        )
+
+        data_process_instance = DataProcessInstance(
+            id=run.info.run_id,
+            orchestrator=self.platform,
+            template_urn=None,
+        )
+
+        created_time = run.info.start_time or int(time.time() * 1000)
+        user_id = run.info.user_id if run.info.user_id else "mlflow"
+        guid_dict_user = {"platform": self.platform, "user": user_id}
+        platform_user_urn = (
+            f"urn:li:platformResource:{builder.datahub_guid(guid_dict_user)}"
+        )
+
+        yield MetadataChangeProposalWrapper(
+            entityUrn=platform_user_urn,
+            aspect=PlatformResourceInfoClass(
+                resourceType="user",
+                primaryKey=user_id,
+            ),
+        ).as_workunit()
+
+        yield MetadataChangeProposalWrapper(
+            entityUrn=str(data_process_instance.urn),
+            aspect=DataProcessInstancePropertiesClass(
+                name=run.info.run_name or run.info.run_id,
+                created=AuditStampClass(
+                    time=created_time,
+                    actor=platform_user_urn,
+                ),
+                externalUrl=self._make_external_url_from_run(experiment, run),
+                customProperties=getattr(run, "tags", {}) or {},
+            ),
+        ).as_workunit()
+
+        yield MetadataChangeProposalWrapper(
+            entityUrn=str(data_process_instance.urn),
+            aspect=ContainerClass(container=experiment_key.as_urn()),
+        ).as_workunit()
+
+        model_versions = self.get_mlflow_model_versions_from_run(run.info.run_id)
+        if model_versions:
+            model_version_urn = self._make_ml_model_urn(model_versions[0])
+            yield MetadataChangeProposalWrapper(
+                entityUrn=str(data_process_instance.urn),
+                aspect=DataProcessInstanceOutputClass(
+                    outputs=[],
+                    outputEdges=[
+                        EdgeClass(destinationUrn=model_version_urn),
+                    ],
+                ),
+            ).as_workunit()
+
+        metrics = self._get_run_metrics(run)
+        hyperparams = self._get_run_params(run)
+        yield MetadataChangeProposalWrapper(
+            entityUrn=str(data_process_instance.urn),
+            aspect=MLTrainingRunPropertiesClass(
+                hyperParams=hyperparams,
+                trainingMetrics=metrics,
+                outputUrls=[run.info.artifact_uri],
+                id=run.info.run_id,
+            ),
+        ).as_workunit()
+
+        if run.info.end_time:
+            duration_millis = run.info.end_time - run.info.start_time
+
+            yield MetadataChangeProposalWrapper(
+                entityUrn=str(data_process_instance.urn),
+                aspect=DataProcessInstanceRunEventClass(
+                    status=DataProcessRunStatusClass.COMPLETE,
+                    timestampMillis=run.info.end_time,
+                    result=DataProcessInstanceRunResultClass(
+                        type=self._convert_run_result_type(run.info.status).type,
+                        nativeResultType=self.platform,
+                    ),
+                    durationMillis=duration_millis,
+                ),
+            ).as_workunit()
+
+        yield MetadataChangeProposalWrapper(
+            entityUrn=str(data_process_instance.urn),
+            aspect=DataPlatformInstanceClass(
+                platform=str(DataPlatformUrn(self.platform))
+            ),
+        ).as_workunit()
+
+        yield MetadataChangeProposalWrapper(
+            entityUrn=str(data_process_instance.urn),
+            aspect=SubTypesClass(typeNames=[MLAssetSubTypes.MLFLOW_TRAINING_RUN]),
+        ).as_workunit()
 
     def _get_mlflow_registered_models(self) -> Iterable[RegisteredModel]:
         """
@@ -179,6 +372,19 @@ class MLflowSource(Source):
         )
         return registered_models
 
+    def _get_mlflow_experiments(self) -> Iterable[Experiment]:
+        experiments: Iterable[Experiment] = self._traverse_mlflow_search_func(
+            search_func=self.client.search_experiments,
+        )
+        return experiments
+
+    def _get_mlflow_runs_from_experiment(self, experiment: Experiment) -> Iterable[Run]:
+        runs: Iterable[Run] = self._traverse_mlflow_search_func(
+            search_func=self.client.search_runs,
+            experiment_ids=[experiment.experiment_id],
+        )
+        return runs
+
     @staticmethod
     def _traverse_mlflow_search_func(
         search_func: Callable[..., PagedList[T]],
@@ -195,6 +401,13 @@ class MLflowSource(Source):
             if not next_page_token:
                 return
 
+    def _get_latest_version(self, registered_model: RegisteredModel) -> Optional[str]:
+        return (
+            str(registered_model.latest_versions[0].version)
+            if registered_model.latest_versions
+            else None
+        )
+
     def _get_ml_group_workunit(
         self,
         registered_model: RegisteredModel,
@@ -206,7 +419,20 @@ class MLflowSource(Source):
        ml_model_group_properties = MLModelGroupPropertiesClass(
            customProperties=registered_model.tags,
            description=registered_model.description,
-           createdAt=registered_model.creation_timestamp,
+           created=TimeStampClass(
+               time=registered_model.creation_timestamp, actor=None
+           ),
+           lastModified=TimeStampClass(
+               time=registered_model.last_updated_timestamp,
+               actor=None,
+           ),
+           version=VersionTagClass(
+               versionTag=self._get_latest_version(registered_model),
+               metadataAttribution=MetadataAttributionClass(
+                   time=registered_model.last_updated_timestamp,
+                   actor="urn:li:corpuser:datahub",
+               ),
+           ),
        )
        wu = self._create_workunit(
            urn=ml_model_group_urn,
@@ -236,6 +462,16 @@ class MLflowSource(Source):
         )
         return model_versions
 
+    def get_mlflow_model_versions_from_run(self, run_id):
+        filter_string = f"run_id = '{run_id}'"
+
+        model_versions: Iterable[ModelVersion] = self._traverse_mlflow_search_func(
+            search_func=self.client.search_model_versions,
+            filter_string=filter_string,
+        )
+
+        return list(model_versions)
+
     def _get_mlflow_run(self, model_version: ModelVersion) -> Union[None, Run]:
         """
         Get a Run associated with a Model Version. Some MVs may exist without Run.
@@ -246,6 +482,67 @@ class MLflowSource(Source):
         else:
             return None
 
+    def _get_ml_model_workunits(self) -> Iterable[MetadataWorkUnit]:
+        """
+        Traverse each Registered Model in Model Registry and generate a corresponding workunit.
+        """
+        registered_models = self._get_mlflow_registered_models()
+        for registered_model in registered_models:
+            version_set_urn = self._get_version_set_urn(registered_model)
+            yield self._get_ml_group_workunit(registered_model)
+            model_versions = self._get_mlflow_model_versions(registered_model)
+            for model_version in model_versions:
+                run = self._get_mlflow_run(model_version)
+                yield self._get_ml_model_properties_workunit(
+                    registered_model=registered_model,
+                    model_version=model_version,
+                    run=run,
+                )
+                yield self._get_ml_model_version_properties_workunit(
+                    model_version=model_version,
+                    version_set_urn=version_set_urn,
+                )
+                yield self._get_global_tags_workunit(model_version=model_version)
+
+    def _get_version_set_urn(self, registered_model: RegisteredModel) -> VersionSetUrn:
+        guid_dict = {"platform": self.platform, "name": registered_model.name}
+        version_set_urn = VersionSetUrn(
+            id=builder.datahub_guid(guid_dict),
+            entity_type=MlModelUrn.ENTITY_TYPE,
+        )
+
+        return version_set_urn
+
+    def _get_ml_model_version_properties_workunit(
+        self,
+        model_version: ModelVersion,
+        version_set_urn: VersionSetUrn,
+    ) -> MetadataWorkUnit:
+        ml_model_urn = self._make_ml_model_urn(model_version)
+
+        # get mlmodel name from ml model urn
+        ml_model_version_properties = VersionPropertiesClass(
+            version=VersionTagClass(
+                versionTag=str(model_version.version),
+                metadataAttribution=MetadataAttributionClass(
+                    time=model_version.creation_timestamp,
+                    actor="urn:li:corpuser:datahub",
+                ),
+            ),
+            versionSet=str(version_set_urn),
+            sortId=str(model_version.version).zfill(10),
+            aliases=[
+                VersionTagClass(versionTag=alias) for alias in model_version.aliases
+            ],
+        )
+
+        wu = MetadataChangeProposalWrapper(
+            entityUrn=str(ml_model_urn),
+            aspect=ml_model_version_properties,
+        ).as_workunit()
+
+        return wu
+
     def _get_ml_model_properties_workunit(
         self,
         registered_model: RegisteredModel,
@@ -259,28 +556,47 @@ class MLflowSource(Source):
         """
         ml_model_group_urn = self._make_ml_model_group_urn(registered_model)
         ml_model_urn = self._make_ml_model_urn(model_version)
+
         if run:
-            hyperparams = [
-                MLHyperParamClass(name=k, value=str(v))
-                for k, v in run.data.params.items()
-            ]
-            training_metrics = [
-                MLMetricClass(name=k, value=str(v)) for k, v in run.data.metrics.items()
-            ]
+            # Use the same metrics and hyperparams from the run
+            hyperparams = self._get_run_params(run)
+            training_metrics = self._get_run_metrics(run)
+            run_urn = DataProcessInstance(
+                id=run.info.run_id,
+                orchestrator=self.platform,
+            ).urn
+
+            training_jobs = [str(run_urn)] if run_urn else []
         else:
             hyperparams = None
             training_metrics = None
+            training_jobs = []
+
+        created_time = model_version.creation_timestamp
+        created_actor = (
+            f"urn:li:platformResource:{model_version.user_id}"
+            if model_version.user_id
+            else None
+        )
+        model_version_tags = [f"{k}:{v}" for k, v in model_version.tags.items()]
+
         ml_model_properties = MLModelPropertiesClass(
            customProperties=model_version.tags,
            externalUrl=self._make_external_url(model_version),
+           lastModified=TimeStampClass(
+               time=model_version.last_updated_timestamp,
+               actor=None,
+           ),
            description=model_version.description,
-           date=model_version.creation_timestamp,
-           version=VersionTagClass(versionTag=str(model_version.version)),
+           created=TimeStampClass(
+               time=created_time,
+               actor=created_actor,
+           ),
            hyperParams=hyperparams,
            trainingMetrics=training_metrics,
-           # mlflow tags are dicts, but datahub tags are lists. currently use only keys from mlflow tags
-           tags=list(model_version.tags.keys()),
+           tags=model_version_tags,
            groups=[ml_model_group_urn],
+           trainingJobs=training_jobs,
        )
        wu = self._create_workunit(urn=ml_model_urn, aspect=ml_model_properties)
        return wu
@@ -314,6 +630,15 @@ class MLflowSource(Source):
         else:
             return None
 
+    def _make_external_url_from_run(
+        self, experiment: Experiment, run: Run
+    ) -> Union[None, str]:
+        base_uri = self.client.tracking_uri
+        if base_uri.startswith("http"):
+            return f"{base_uri.rstrip('/')}/#/experiments/{experiment.experiment_id}/runs/{run.info.run_id}"
+        else:
+            return None
+
     def _get_global_tags_workunit(
         self,
         model_version: ModelVersion,
@@ -333,3 +658,8 @@ class MLflowSource(Source):
             aspect=global_tags,
         )
         return wu
+
+    @classmethod
+    def create(cls, config_dict: dict, ctx: PipelineContext) -> "MLflowSource":
+        config = MLflowConfig.parse_obj(config_dict)
+        return cls(ctx, config)
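
The mlflow source is rebuilt here around stateful ingestion: experiments become containers, runs become DataProcessInstance entities linked to the model versions they produced, and registered-model versions gain VersionProperties. A minimal sketch of driving the updated source programmatically follows; the URLs are placeholders, and we assume a stable `pipeline_name` is needed so the stateful-ingestion checkpoints persist against the DataHub instance behind the `datahub-rest` sink:

from datahub.ingestion.run.pipeline import Pipeline

# Hypothetical recipe: swap in your own MLflow tracking server and DataHub GMS.
pipeline = Pipeline.create(
    {
        # The pipeline_name keys the stateful-ingestion state across runs.
        "pipeline_name": "mlflow_stateful_demo",
        "source": {
            "type": "mlflow",
            "config": {
                "tracking_uri": "http://localhost:5000",
                "stateful_ingestion": {"enabled": True},
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()
pipeline.raise_from_status()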
--- a/datahub/ingestion/source/mode.py
+++ b/datahub/ingestion/source/mode.py
@@ -23,7 +23,9 @@ from tenacity import retry_if_exception_type, stop_after_attempt, wait_exponential
 
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
-from datahub.configuration.source_common import DatasetLineageProviderConfigBase
+from datahub.configuration.source_common import (
+    DatasetLineageProviderConfigBase,
+)
 from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import (
@@ -137,7 +139,10 @@ class ModeAPIConfig(ConfigModel):
     )
 
 
-class ModeConfig(StatefulIngestionConfigBase, DatasetLineageProviderConfigBase):
+class ModeConfig(
+    StatefulIngestionConfigBase,
+    DatasetLineageProviderConfigBase,
+):
     # See https://mode.com/developer/api-reference/authentication/
     # for authentication
     connect_uri: str = Field(
@@ -154,7 +159,12 @@ class ModeConfig(StatefulIngestionConfigBase, DatasetLineageProviderConfigBase):
     )
 
     workspace: str = Field(
-        description="The Mode workspace name. Find it in Settings > Workspace > Details."
+        description="The Mode workspace username. If you navigate to Workspace Settings > Details, "
+        "the url will be `https://app.mode.com/organizations/<workspace-username>`. "
+        # The lowercase comment is derived from a comment in a Mode API example.
+        # https://mode.com/developer/api-cookbook/management/get-all-reports/
+        # > "Note: workspace_name value should be all lowercase"
+        "This is distinct from the workspace's display name, and should be all lowercase."
     )
     _default_schema = pydantic_removed_field("default_schema")
 
@@ -372,7 +382,7 @@ class ModeSource(StatefulIngestionSourceBase):
     ]
 
     def _dashboard_urn(self, report_info: dict) -> str:
-        return builder.make_dashboard_urn(self.platform, report_info.get("id", ""))
+        return builder.make_dashboard_urn(self.platform, str(report_info.get("id", "")))
 
     def _parse_last_run_at(self, report_info: dict) -> Optional[int]:
         # Mode queries are refreshed, and that timestamp is reflected correctly here.
@@ -759,9 +769,9 @@ class ModeSource(StatefulIngestionSourceBase):
             return platform, database
         else:
             self.report.report_warning(
-                title="Failed to create Data Platform Urn",
-                message=f"Cannot create datasource urn for datasource id: "
-                f"{data_source_id}",
+                title="Unable to construct upstream lineage",
+                message="We did not find a data source / connection with a matching ID, meaning that we do not know the platform/database to use in lineage.",
+                context=f"Data Source ID: {data_source_id}",
             )
             return None, None
 
@@ -1489,7 +1499,7 @@ class ModeSource(StatefulIngestionSourceBase):
            sleep_time = error_response.headers.get("retry-after")
            if sleep_time is not None:
                time.sleep(float(sleep_time))
-           raise HTTPError429
+           raise HTTPError429 from None
 
        raise http_error
 
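The `raise HTTPError429 from None` change in the rate-limit handler suppresses Python's implicit exception chaining, so callers see a single clean traceback instead of the handled HTTPError followed by "During handling of the above exception, another exception occurred". A self-contained illustration (`RateLimited` and `on_http_error` are stand-in names, not part of the source):

class RateLimited(Exception):
    """Stand-in for the source's HTTPError429."""


def on_http_error(err: Exception) -> None:
    try:
        raise err  # simulate being inside the `except HTTPError:` handler
    except Exception:
        # Without `from None`, the traceback would also print `err` under
        # "During handling of the above exception, another exception occurred".
        raise RateLimited from None


on_http_error(ValueError("HTTP 429"))  # traceback shows only RateLimited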
--- a/datahub/ingestion/source/neo4j/neo4j_source.py
+++ b/datahub/ingestion/source/neo4j/neo4j_source.py
@@ -7,7 +7,9 @@ import pandas as pd
 from neo4j import GraphDatabase
 from pydantic.fields import Field
 
-from datahub.configuration.source_common import EnvConfigMixin
+from datahub.configuration.source_common import (
+    EnvConfigMixin,
+)
 from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
@@ -17,9 +19,19 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionReport,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaFieldDataType
 from datahub.metadata.schema_classes import (
     AuditStampClass,
@@ -52,7 +64,7 @@ _type_mapping: Dict[Union[Type, str], Type] = {
 }
 
 
-class Neo4jConfig(EnvConfigMixin):
+class Neo4jConfig(EnvConfigMixin, StatefulIngestionConfigBase):
     username: str = Field(description="Neo4j Username")
     password: str = Field(description="Neo4j Password")
     uri: str = Field(description="The URI for the Neo4j server")
@@ -60,7 +72,7 @@ class Neo4jConfig(EnvConfigMixin):
 
 
 @dataclass
-class Neo4jSourceReport(SourceReport):
+class Neo4jSourceReport(StatefulIngestionReport):
     obj_failures: int = 0
     obj_created: int = 0
 
@@ -68,7 +80,7 @@ class Neo4jSourceReport(SourceReport):
 @platform_name("Neo4j", id="neo4j")
 @config_class(Neo4jConfig)
 @support_status(SupportStatus.CERTIFIED)
-class Neo4jSource(Source):
+class Neo4jSource(StatefulIngestionSourceBase):
     NODE = "node"
     RELATIONSHIP = "relationship"
     PLATFORM = "neo4j"
@@ -76,7 +88,7 @@ class Neo4jSource(Source):
     def __init__(self, ctx: PipelineContext, config: Neo4jConfig):
         self.ctx = ctx
         self.config = config
-        self.report = Neo4jSourceReport()
+        self.report: Neo4jSourceReport = Neo4jSourceReport()
 
     @classmethod
     def create(cls, config_dict, ctx):
@@ -280,7 +292,15 @@ class Neo4jSource(Source):
         return record["properties"]
 
     def get_relationships(self, record: dict) -> dict:
-        return record.get("relationships", None)
+        return record.get("relationships", {})
+
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         df = self.get_neo4j_metadata(
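
Both the mlflow and neo4j sources now override `get_workunit_processors` to append `StaleEntityRemovalHandler.create(...).workunit_processor`. Conceptually, each processor maps a workunit stream to a workunit stream, and the framework folds the list over the output of `get_workunits_internal()`; the stale-entity handler records every urn it sees and emits soft-delete workunits for urns that disappeared since the last run. A minimal sketch of that composition (illustrative names, not the framework's actual internals):

from typing import Callable, Iterable, Iterator, List, Optional

WorkUnit = object  # stand-in for MetadataWorkUnit
Processor = Callable[[Iterable[WorkUnit]], Iterator[WorkUnit]]


def apply_processors(
    stream: Iterable[WorkUnit],
    processors: List[Optional[Processor]],
) -> Iterable[WorkUnit]:
    # Thread the stream through each non-None processor in order. The
    # stale-entity-removal processor, for instance, records every urn it
    # sees and appends soft-delete workunits for urns absent this run.
    for processor in processors:
        if processor is not None:
            stream = processor(stream)
    return stream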