acryl-datahub 0.15.0rc24__py3-none-any.whl → 0.15.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (120)
  1. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/METADATA +2408 -2412
  2. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/RECORD +116 -106
  3. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/entry_points.txt +1 -1
  5. datahub/__init__.py +1 -1
  6. datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
  7. datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
  8. datahub/configuration/common.py +2 -5
  9. datahub/configuration/source_common.py +13 -0
  10. datahub/emitter/mce_builder.py +20 -4
  11. datahub/emitter/mcp_builder.py +2 -7
  12. datahub/emitter/mcp_patch_builder.py +37 -13
  13. datahub/emitter/rest_emitter.py +25 -3
  14. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +98 -0
  15. datahub/ingestion/api/closeable.py +3 -3
  16. datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
  17. datahub/ingestion/api/report.py +4 -1
  18. datahub/ingestion/api/sink.py +4 -3
  19. datahub/ingestion/api/source.py +4 -0
  20. datahub/ingestion/api/source_helpers.py +2 -6
  21. datahub/ingestion/glossary/classifier.py +2 -3
  22. datahub/ingestion/graph/client.py +6 -3
  23. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
  24. datahub/ingestion/source/aws/aws_common.py +231 -27
  25. datahub/ingestion/source/aws/glue.py +12 -2
  26. datahub/ingestion/source/bigquery_v2/bigquery.py +10 -18
  27. datahub/ingestion/source/bigquery_v2/bigquery_config.py +3 -9
  28. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
  29. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -17
  30. datahub/ingestion/source/bigquery_v2/lineage.py +9 -22
  31. datahub/ingestion/source/datahub/config.py +22 -1
  32. datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
  33. datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
  34. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  35. datahub/ingestion/source/dbt/dbt_cloud.py +10 -3
  36. datahub/ingestion/source/gc/datahub_gc.py +21 -5
  37. datahub/ingestion/source/gc/dataprocess_cleanup.py +23 -10
  38. datahub/ingestion/source/gc/execution_request_cleanup.py +61 -16
  39. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +178 -83
  40. datahub/ingestion/source/iceberg/iceberg.py +27 -1
  41. datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
  42. datahub/ingestion/source/kafka_connect/__init__.py +0 -0
  43. datahub/ingestion/source/kafka_connect/common.py +202 -0
  44. datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
  45. datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
  46. datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
  47. datahub/ingestion/source/looker/looker_common.py +63 -2
  48. datahub/ingestion/source/looker/looker_dataclasses.py +7 -9
  49. datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
  50. datahub/ingestion/source/looker/looker_source.py +31 -4
  51. datahub/ingestion/source/looker/looker_usage.py +23 -17
  52. datahub/ingestion/source/mlflow.py +30 -5
  53. datahub/ingestion/source/mode.py +40 -27
  54. datahub/ingestion/source/powerbi/config.py +1 -14
  55. datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
  56. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
  57. datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +16 -2
  58. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +16 -15
  59. datahub/ingestion/source/s3/source.py +1 -1
  60. datahub/ingestion/source/snowflake/snowflake_config.py +13 -34
  61. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +14 -64
  62. datahub/ingestion/source/snowflake/snowflake_queries.py +44 -14
  63. datahub/ingestion/source/snowflake/snowflake_query.py +5 -10
  64. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +53 -7
  65. datahub/ingestion/source/snowflake/snowflake_shares.py +1 -1
  66. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +2 -5
  67. datahub/ingestion/source/snowflake/snowflake_utils.py +22 -18
  68. datahub/ingestion/source/snowflake/snowflake_v2.py +38 -34
  69. datahub/ingestion/source/sql/hive.py +621 -8
  70. datahub/ingestion/source/sql/hive_metastore.py +7 -0
  71. datahub/ingestion/source/sql/mssql/job_models.py +30 -1
  72. datahub/ingestion/source/sql/mssql/source.py +15 -1
  73. datahub/ingestion/source/sql/sql_common.py +41 -102
  74. datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
  75. datahub/ingestion/source/sql/sql_report.py +2 -0
  76. datahub/ingestion/source/state/checkpoint.py +2 -1
  77. datahub/ingestion/source/tableau/tableau.py +122 -45
  78. datahub/ingestion/source/tableau/tableau_common.py +18 -0
  79. datahub/ingestion/source/tableau/tableau_constant.py +3 -1
  80. datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
  81. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  82. datahub/ingestion/source/unity/proxy.py +8 -27
  83. datahub/ingestion/source/usage/usage_common.py +15 -1
  84. datahub/ingestion/source_report/ingestion_stage.py +3 -0
  85. datahub/metadata/_schema_classes.py +256 -3
  86. datahub/metadata/_urns/urn_defs.py +168 -168
  87. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
  88. datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py +2 -0
  89. datahub/metadata/schema.avsc +252 -33
  90. datahub/metadata/schemas/DataJobKey.avsc +2 -1
  91. datahub/metadata/schemas/DataProcessInstanceKey.avsc +5 -1
  92. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  93. datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
  94. datahub/metadata/schemas/MLModelGroupProperties.avsc +82 -0
  95. datahub/metadata/schemas/MLModelProperties.avsc +62 -2
  96. datahub/metadata/schemas/MLTrainingRunProperties.avsc +171 -0
  97. datahub/metadata/schemas/MetadataChangeEvent.avsc +94 -2
  98. datahub/specific/aspect_helpers/__init__.py +0 -0
  99. datahub/specific/aspect_helpers/custom_properties.py +79 -0
  100. datahub/specific/aspect_helpers/ownership.py +67 -0
  101. datahub/specific/aspect_helpers/structured_properties.py +72 -0
  102. datahub/specific/aspect_helpers/tags.py +42 -0
  103. datahub/specific/aspect_helpers/terms.py +43 -0
  104. datahub/specific/chart.py +28 -184
  105. datahub/specific/dashboard.py +31 -196
  106. datahub/specific/datajob.py +34 -189
  107. datahub/specific/dataproduct.py +24 -86
  108. datahub/specific/dataset.py +48 -133
  109. datahub/specific/form.py +12 -32
  110. datahub/specific/structured_property.py +9 -9
  111. datahub/sql_parsing/sql_parsing_aggregator.py +10 -9
  112. datahub/sql_parsing/sqlglot_lineage.py +15 -5
  113. datahub/sql_parsing/tool_meta_extractor.py +119 -5
  114. datahub/utilities/time.py +8 -3
  115. datahub/utilities/urns/_urn_base.py +5 -7
  116. datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
  117. datahub/specific/custom_properties.py +0 -37
  118. datahub/specific/ownership.py +0 -48
  119. datahub/specific/structured_properties.py +0 -53
  120. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/top_level.txt +0 -0
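Of note in the list above: entries 42–46 introduce a new datahub/ingestion/source/kafka_connect/ package (common.py, kafka_connect.py, sink_connectors.py, source_connectors.py), while entry 116 removes the old monolithic datahub/ingestion/source/kafka/kafka_connect.py. The refactored source is shown in full in the diff below. As a rough, hypothetical sketch (not part of this diff) of how such a source is typically wired into an ingestion pipeline: the source type name "kafka-connect" and the endpoint/credentials are assumptions; only the connect_uri, username, and password field names come from the code shown below.

# Minimal, hypothetical ingestion recipe for the refactored kafka-connect source.
# connect_uri / username / password appear in KafkaConnectSource.__init__;
# the URL and credentials here are placeholders.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "kafka-connect",
            "config": {
                "connect_uri": "http://localhost:8083",
                "username": "connect-user",      # optional; enables basic auth
                "password": "connect-password",  # optional; enables basic auth
            },
        },
        # Print the emitted metadata instead of pushing it to a DataHub instance.
        "sink": {"type": "console"},
    }
)
pipeline.run()
pipeline.raise_from_status()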
datahub/ingestion/source/kafka_connect/kafka_connect.py (new file)
@@ -0,0 +1,367 @@
+ import logging
+ from typing import Iterable, List, Optional, Type
+
+ import jpype
+ import jpype.imports
+ import requests
+
+ import datahub.emitter.mce_builder as builder
+ import datahub.metadata.schema_classes as models
+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
+ from datahub.ingestion.api.common import PipelineContext
+ from datahub.ingestion.api.decorators import (
+     SourceCapability,
+     SupportStatus,
+     capability,
+     config_class,
+     platform_name,
+     support_status,
+ )
+ from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source
+ from datahub.ingestion.api.workunit import MetadataWorkUnit
+ from datahub.ingestion.source.kafka_connect.common import (
+     CONNECTOR_CLASS,
+     SINK,
+     SOURCE,
+     BaseConnector,
+     ConnectorManifest,
+     KafkaConnectLineage,
+     KafkaConnectSourceConfig,
+     KafkaConnectSourceReport,
+     get_platform_instance,
+     transform_connector_config,
+ )
+ from datahub.ingestion.source.kafka_connect.sink_connectors import (
+     BIGQUERY_SINK_CONNECTOR_CLASS,
+     S3_SINK_CONNECTOR_CLASS,
+     SNOWFLAKE_SINK_CONNECTOR_CLASS,
+     BigQuerySinkConnector,
+     ConfluentS3SinkConnector,
+     SnowflakeSinkConnector,
+ )
+ from datahub.ingestion.source.kafka_connect.source_connectors import (
+     DEBEZIUM_SOURCE_CONNECTOR_PREFIX,
+     JDBC_SOURCE_CONNECTOR_CLASS,
+     MONGO_SOURCE_CONNECTOR_CLASS,
+     ConfigDrivenSourceConnector,
+     ConfluentJDBCSourceConnector,
+     DebeziumSourceConnector,
+     MongoSourceConnector,
+ )
+ from datahub.ingestion.source.state.stale_entity_removal_handler import (
+     StaleEntityRemovalHandler,
+ )
+ from datahub.ingestion.source.state.stateful_ingestion_base import (
+     StatefulIngestionSourceBase,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ @platform_name("Kafka Connect")
+ @config_class(KafkaConnectSourceConfig)
+ @support_status(SupportStatus.CERTIFIED)
+ @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
+ @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
+ @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
+ class KafkaConnectSource(StatefulIngestionSourceBase):
+     config: KafkaConnectSourceConfig
+     report: KafkaConnectSourceReport
+     platform: str = "kafka-connect"
+
+     def __init__(self, config: KafkaConnectSourceConfig, ctx: PipelineContext):
+         super().__init__(config, ctx)
+         self.config = config
+         self.report = KafkaConnectSourceReport()
+         self.session = requests.Session()
+         self.session.headers.update(
+             {
+                 "Accept": "application/json",
+                 "Content-Type": "application/json",
+             }
+         )
+
+         # Test the connection
+         if self.config.username is not None and self.config.password is not None:
+             logger.info(
+                 f"Connecting to {self.config.connect_uri} with Authentication..."
+             )
+             self.session.auth = (self.config.username, self.config.password)
+
+         test_response = self.session.get(f"{self.config.connect_uri}/connectors")
+         test_response.raise_for_status()
+         logger.info(f"Connection to {self.config.connect_uri} is ok")
+         if not jpype.isJVMStarted():
+             jpype.startJVM()
+
+     @classmethod
+     def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
+         config = KafkaConnectSourceConfig.parse_obj(config_dict)
+         return cls(config, ctx)
+
+     def get_connectors_manifest(self) -> Iterable[ConnectorManifest]:
+         """Get Kafka Connect connectors manifest using REST API.
+         Enrich with lineages metadata.
+         """
+
+         connector_response = self.session.get(
+             f"{self.config.connect_uri}/connectors",
+         )
+
+         payload = connector_response.json()
+
+         for connector_name in payload:
+             connector_url = f"{self.config.connect_uri}/connectors/{connector_name}"
+             connector_manifest = self._get_connector_manifest(
+                 connector_name, connector_url
+             )
+             if (
+                 connector_manifest is None
+                 or not self.config.connector_patterns.allowed(connector_manifest.name)
+             ):
+                 self.report.report_dropped(connector_name)
+                 continue
+
+             if self.config.provided_configs:
+                 transform_connector_config(
+                     connector_manifest.config, self.config.provided_configs
+                 )
+             connector_manifest.url = connector_url
+             connector_manifest.topic_names = self._get_connector_topics(connector_name)
+             connector_class_value = connector_manifest.config.get(CONNECTOR_CLASS) or ""
+
+             class_type: Type[BaseConnector] = BaseConnector
+
+             # Populate Source Connector metadata
+             if connector_manifest.type == SOURCE:
+                 connector_manifest.tasks = self._get_connector_tasks(connector_name)
+
+                 # JDBC source connector lineages
+                 if connector_class_value == JDBC_SOURCE_CONNECTOR_CLASS:
+                     class_type = ConfluentJDBCSourceConnector
+                 elif connector_class_value.startswith(DEBEZIUM_SOURCE_CONNECTOR_PREFIX):
+                     class_type = DebeziumSourceConnector
+                 elif connector_class_value == MONGO_SOURCE_CONNECTOR_CLASS:
+                     class_type = MongoSourceConnector
+                 elif any(
+                     [
+                         connector.connector_name == connector_manifest.name
+                         for connector in self.config.generic_connectors
+                     ]
+                 ):
+                     class_type = ConfigDrivenSourceConnector
+                 else:
+                     self.report.report_dropped(connector_manifest.name)
+                     self.report.warning(
+                         "Lineage for Source Connector not supported. "
+                         "Please refer to Kafka Connect docs to use `generic_connectors` config.",
+                         context=f"{connector_manifest.name} of type {connector_class_value}",
+                     )
+                     continue
+             elif connector_manifest.type == SINK:
+                 if connector_class_value == BIGQUERY_SINK_CONNECTOR_CLASS:
+                     class_type = BigQuerySinkConnector
+                 elif connector_class_value == S3_SINK_CONNECTOR_CLASS:
+                     class_type = ConfluentS3SinkConnector
+                 elif connector_class_value == SNOWFLAKE_SINK_CONNECTOR_CLASS:
+                     class_type = SnowflakeSinkConnector
+                 else:
+                     self.report.report_dropped(connector_manifest.name)
+                     self.report.warning(
+                         "Lineage for Sink Connector not supported.",
+                         context=f"{connector_manifest.name} of type {connector_class_value}",
+                     )
+
+             connector_class = class_type(connector_manifest, self.config, self.report)
+             connector_manifest.lineages = connector_class.extract_lineages()
+             connector_manifest.flow_property_bag = (
+                 connector_class.extract_flow_property_bag()
+             )
+
+             yield connector_manifest
+
+     def _get_connector_manifest(
+         self, connector_name: str, connector_url: str
+     ) -> Optional[ConnectorManifest]:
+         try:
+             connector_response = self.session.get(connector_url)
+             connector_response.raise_for_status()
+         except Exception as e:
+             self.report.warning(
+                 "Failed to get connector details", connector_name, exc=e
+             )
+             return None
+         manifest = connector_response.json()
+         connector_manifest = ConnectorManifest(**manifest)
+         return connector_manifest
+
+     def _get_connector_tasks(self, connector_name: str) -> dict:
+         try:
+             response = self.session.get(
+                 f"{self.config.connect_uri}/connectors/{connector_name}/tasks",
+             )
+             response.raise_for_status()
+         except Exception as e:
+             self.report.warning(
+                 "Error getting connector tasks", context=connector_name, exc=e
+             )
+             return {}
+
+         return response.json()
+
+     def _get_connector_topics(self, connector_name: str) -> List[str]:
+         try:
+             response = self.session.get(
+                 f"{self.config.connect_uri}/connectors/{connector_name}/topics",
+             )
+             response.raise_for_status()
+         except Exception as e:
+             self.report.warning(
+                 "Error getting connector topics", context=connector_name, exc=e
+             )
+             return []
+
+         return response.json()[connector_name]["topics"]
+
+     def construct_flow_workunit(self, connector: ConnectorManifest) -> MetadataWorkUnit:
+         connector_name = connector.name
+         connector_type = connector.type
+         connector_class = connector.config.get(CONNECTOR_CLASS)
+         flow_property_bag = connector.flow_property_bag
+         # connector_url = connector.url  # NOTE: this will expose connector credential when used
+         flow_urn = builder.make_data_flow_urn(
+             self.platform,
+             connector_name,
+             self.config.env,
+             self.config.platform_instance,
+         )
+
+         return MetadataChangeProposalWrapper(
+             entityUrn=flow_urn,
+             aspect=models.DataFlowInfoClass(
+                 name=connector_name,
+                 description=f"{connector_type.capitalize()} connector using `{connector_class}` plugin.",
+                 customProperties=flow_property_bag,
+                 # externalUrl=connector_url,  # NOTE: this will expose connector credential when used
+             ),
+         ).as_workunit()
+
+     def construct_job_workunits(
+         self, connector: ConnectorManifest
+     ) -> Iterable[MetadataWorkUnit]:
+         connector_name = connector.name
+         flow_urn = builder.make_data_flow_urn(
+             self.platform,
+             connector_name,
+             self.config.env,
+             self.config.platform_instance,
+         )
+
+         lineages = connector.lineages
+         if lineages:
+             for lineage in lineages:
+                 source_dataset = lineage.source_dataset
+                 source_platform = lineage.source_platform
+                 target_dataset = lineage.target_dataset
+                 target_platform = lineage.target_platform
+                 job_property_bag = lineage.job_property_bag
+
+                 source_platform_instance = get_platform_instance(
+                     self.config, connector_name, source_platform
+                 )
+                 target_platform_instance = get_platform_instance(
+                     self.config, connector_name, target_platform
+                 )
+
+                 job_id = self.get_job_id(lineage, connector, self.config)
+                 job_urn = builder.make_data_job_urn_with_flow(flow_urn, job_id)
+
+                 inlets = (
+                     [
+                         self.make_lineage_dataset_urn(
+                             source_platform, source_dataset, source_platform_instance
+                         )
+                     ]
+                     if source_dataset
+                     else []
+                 )
+                 outlets = [
+                     self.make_lineage_dataset_urn(
+                         target_platform, target_dataset, target_platform_instance
+                     )
+                 ]
+
+                 yield MetadataChangeProposalWrapper(
+                     entityUrn=job_urn,
+                     aspect=models.DataJobInfoClass(
+                         name=f"{connector_name}:{job_id}",
+                         type="COMMAND",
+                         customProperties=job_property_bag,
+                     ),
+                 ).as_workunit()
+
+                 yield MetadataChangeProposalWrapper(
+                     entityUrn=job_urn,
+                     aspect=models.DataJobInputOutputClass(
+                         inputDatasets=inlets,
+                         outputDatasets=outlets,
+                     ),
+                 ).as_workunit()
+
+     def get_job_id(
+         self,
+         lineage: KafkaConnectLineage,
+         connector: ConnectorManifest,
+         config: KafkaConnectSourceConfig,
+     ) -> str:
+         connector_class = connector.config.get(CONNECTOR_CLASS)
+
+         # Note - This block is only to maintain backward compatibility of Job URN
+         if (
+             connector_class
+             and connector.type == SOURCE
+             and (
+                 "JdbcSourceConnector" in connector_class
+                 or connector_class.startswith("io.debezium.connector")
+             )
+             and lineage.source_dataset
+             and config.connect_to_platform_map
+             and config.connect_to_platform_map.get(connector.name)
+             and config.connect_to_platform_map[connector.name].get(
+                 lineage.source_platform
+             )
+         ):
+             return f"{config.connect_to_platform_map[connector.name][lineage.source_platform]}.{lineage.source_dataset}"
+
+         return (
+             lineage.source_dataset
+             if lineage.source_dataset
+             else f"unknown_source.{lineage.target_dataset}"
+         )
+
+     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+         return [
+             *super().get_workunit_processors(),
+             StaleEntityRemovalHandler.create(
+                 self, self.config, self.ctx
+             ).workunit_processor,
+         ]
+
+     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+         for connector in self.get_connectors_manifest():
+             yield self.construct_flow_workunit(connector)
+             yield from self.construct_job_workunits(connector)
+             self.report.report_connector_scanned(connector.name)
+
+     def get_report(self) -> KafkaConnectSourceReport:
+         return self.report
+
+     def make_lineage_dataset_urn(
+         self, platform: str, name: str, platform_instance: Optional[str]
+     ) -> str:
+         if self.config.convert_lineage_urns_to_lowercase:
+             name = name.lower()
+
+         return builder.make_dataset_urn_with_platform_instance(
+             platform, name, platform_instance, self.config.env
+         )
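For orientation (not part of the diff): construct_flow_workunit and construct_job_workunits above build standard DataHub dataFlow/dataJob URNs via mce_builder. A small sketch with a hypothetical connector name, the default PROD env, and no platform instance:

import datahub.emitter.mce_builder as builder

# Hypothetical connector "mysql-source-orders"; get_job_id() falls back to the
# lineage's source dataset (e.g. "ordersdb.orders") when no
# connect_to_platform_map override applies.
flow_urn = builder.make_data_flow_urn("kafka-connect", "mysql-source-orders", "PROD")
job_urn = builder.make_data_job_urn_with_flow(flow_urn, "ordersdb.orders")

print(flow_urn)  # urn:li:dataFlow:(kafka-connect,mysql-source-orders,PROD)
print(job_urn)   # urn:li:dataJob:(urn:li:dataFlow:(kafka-connect,mysql-source-orders,PROD),ordersdb.orders)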