acryl-datahub 0.15.0rc24__py3-none-any.whl → 0.15.0.1__py3-none-any.whl

This diff shows the contents of two publicly released versions of this package as they appear in their respective public registries. It is provided for informational purposes only.


Files changed (120)
  1. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/METADATA +2408 -2412
  2. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/RECORD +116 -106
  3. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/entry_points.txt +1 -1
  5. datahub/__init__.py +1 -1
  6. datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
  7. datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
  8. datahub/configuration/common.py +2 -5
  9. datahub/configuration/source_common.py +13 -0
  10. datahub/emitter/mce_builder.py +20 -4
  11. datahub/emitter/mcp_builder.py +2 -7
  12. datahub/emitter/mcp_patch_builder.py +37 -13
  13. datahub/emitter/rest_emitter.py +25 -3
  14. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +98 -0
  15. datahub/ingestion/api/closeable.py +3 -3
  16. datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
  17. datahub/ingestion/api/report.py +4 -1
  18. datahub/ingestion/api/sink.py +4 -3
  19. datahub/ingestion/api/source.py +4 -0
  20. datahub/ingestion/api/source_helpers.py +2 -6
  21. datahub/ingestion/glossary/classifier.py +2 -3
  22. datahub/ingestion/graph/client.py +6 -3
  23. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
  24. datahub/ingestion/source/aws/aws_common.py +231 -27
  25. datahub/ingestion/source/aws/glue.py +12 -2
  26. datahub/ingestion/source/bigquery_v2/bigquery.py +10 -18
  27. datahub/ingestion/source/bigquery_v2/bigquery_config.py +3 -9
  28. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
  29. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -17
  30. datahub/ingestion/source/bigquery_v2/lineage.py +9 -22
  31. datahub/ingestion/source/datahub/config.py +22 -1
  32. datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
  33. datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
  34. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  35. datahub/ingestion/source/dbt/dbt_cloud.py +10 -3
  36. datahub/ingestion/source/gc/datahub_gc.py +21 -5
  37. datahub/ingestion/source/gc/dataprocess_cleanup.py +23 -10
  38. datahub/ingestion/source/gc/execution_request_cleanup.py +61 -16
  39. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +178 -83
  40. datahub/ingestion/source/iceberg/iceberg.py +27 -1
  41. datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
  42. datahub/ingestion/source/kafka_connect/__init__.py +0 -0
  43. datahub/ingestion/source/kafka_connect/common.py +202 -0
  44. datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
  45. datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
  46. datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
  47. datahub/ingestion/source/looker/looker_common.py +63 -2
  48. datahub/ingestion/source/looker/looker_dataclasses.py +7 -9
  49. datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
  50. datahub/ingestion/source/looker/looker_source.py +31 -4
  51. datahub/ingestion/source/looker/looker_usage.py +23 -17
  52. datahub/ingestion/source/mlflow.py +30 -5
  53. datahub/ingestion/source/mode.py +40 -27
  54. datahub/ingestion/source/powerbi/config.py +1 -14
  55. datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
  56. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
  57. datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +16 -2
  58. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +16 -15
  59. datahub/ingestion/source/s3/source.py +1 -1
  60. datahub/ingestion/source/snowflake/snowflake_config.py +13 -34
  61. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +14 -64
  62. datahub/ingestion/source/snowflake/snowflake_queries.py +44 -14
  63. datahub/ingestion/source/snowflake/snowflake_query.py +5 -10
  64. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +53 -7
  65. datahub/ingestion/source/snowflake/snowflake_shares.py +1 -1
  66. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +2 -5
  67. datahub/ingestion/source/snowflake/snowflake_utils.py +22 -18
  68. datahub/ingestion/source/snowflake/snowflake_v2.py +38 -34
  69. datahub/ingestion/source/sql/hive.py +621 -8
  70. datahub/ingestion/source/sql/hive_metastore.py +7 -0
  71. datahub/ingestion/source/sql/mssql/job_models.py +30 -1
  72. datahub/ingestion/source/sql/mssql/source.py +15 -1
  73. datahub/ingestion/source/sql/sql_common.py +41 -102
  74. datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
  75. datahub/ingestion/source/sql/sql_report.py +2 -0
  76. datahub/ingestion/source/state/checkpoint.py +2 -1
  77. datahub/ingestion/source/tableau/tableau.py +122 -45
  78. datahub/ingestion/source/tableau/tableau_common.py +18 -0
  79. datahub/ingestion/source/tableau/tableau_constant.py +3 -1
  80. datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
  81. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  82. datahub/ingestion/source/unity/proxy.py +8 -27
  83. datahub/ingestion/source/usage/usage_common.py +15 -1
  84. datahub/ingestion/source_report/ingestion_stage.py +3 -0
  85. datahub/metadata/_schema_classes.py +256 -3
  86. datahub/metadata/_urns/urn_defs.py +168 -168
  87. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
  88. datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py +2 -0
  89. datahub/metadata/schema.avsc +252 -33
  90. datahub/metadata/schemas/DataJobKey.avsc +2 -1
  91. datahub/metadata/schemas/DataProcessInstanceKey.avsc +5 -1
  92. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  93. datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
  94. datahub/metadata/schemas/MLModelGroupProperties.avsc +82 -0
  95. datahub/metadata/schemas/MLModelProperties.avsc +62 -2
  96. datahub/metadata/schemas/MLTrainingRunProperties.avsc +171 -0
  97. datahub/metadata/schemas/MetadataChangeEvent.avsc +94 -2
  98. datahub/specific/aspect_helpers/__init__.py +0 -0
  99. datahub/specific/aspect_helpers/custom_properties.py +79 -0
  100. datahub/specific/aspect_helpers/ownership.py +67 -0
  101. datahub/specific/aspect_helpers/structured_properties.py +72 -0
  102. datahub/specific/aspect_helpers/tags.py +42 -0
  103. datahub/specific/aspect_helpers/terms.py +43 -0
  104. datahub/specific/chart.py +28 -184
  105. datahub/specific/dashboard.py +31 -196
  106. datahub/specific/datajob.py +34 -189
  107. datahub/specific/dataproduct.py +24 -86
  108. datahub/specific/dataset.py +48 -133
  109. datahub/specific/form.py +12 -32
  110. datahub/specific/structured_property.py +9 -9
  111. datahub/sql_parsing/sql_parsing_aggregator.py +10 -9
  112. datahub/sql_parsing/sqlglot_lineage.py +15 -5
  113. datahub/sql_parsing/tool_meta_extractor.py +119 -5
  114. datahub/utilities/time.py +8 -3
  115. datahub/utilities/urns/_urn_base.py +5 -7
  116. datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
  117. datahub/specific/custom_properties.py +0 -37
  118. datahub/specific/ownership.py +0 -48
  119. datahub/specific/structured_properties.py +0 -53
  120. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py

@@ -1,9 +1,10 @@
  import logging
  import time
- from concurrent.futures import ThreadPoolExecutor, as_completed
+ from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait
  from dataclasses import dataclass, field
  from datetime import datetime, timezone
- from typing import List, Optional
+ from threading import Lock
+ from typing import Dict, Iterable, List, Optional

  from pydantic import Field

@@ -18,12 +19,28 @@ from datahub.utilities.urns._urn_base import Urn

  logger = logging.getLogger(__name__)

+ QUERY_QUERY_ENTITY = """
+ query listQueries($input: ScrollAcrossEntitiesInput!) {
+     scrollAcrossEntities(input: $input) {
+         nextScrollId
+         count
+         searchResults {
+             entity {
+                 ... on QueryEntity {
+                     urn
+                 }
+             }
+         }
+     }
+ }
+ """
+

  class SoftDeletedEntitiesCleanupConfig(ConfigModel):
      enabled: bool = Field(
          default=True, description="Whether to do soft deletion cleanup."
      )
-     retention_days: Optional[int] = Field(
+     retention_days: int = Field(
          10,
          description="Number of days to retain metadata in DataHub",
      )
@@ -62,25 +79,34 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
          default=None,
          description="Query to filter entities",
      )
+
      limit_entities_delete: Optional[int] = Field(
          25000, description="Max number of entities to delete."
      )

-     runtime_limit_seconds: Optional[int] = Field(
-         None,
+     futures_max_at_time: int = Field(
+         1000, description="Max number of futures to have at a time."
+     )
+
+     runtime_limit_seconds: int = Field(
+         7200,  # 2 hours by default
          description="Runtime limit in seconds",
      )


  @dataclass
  class SoftDeletedEntitiesReport(SourceReport):
-     num_soft_deleted_entity_removed: int = 0
-     num_soft_deleted_entity_removed_by_type: TopKDict[str, int] = field(
-         default_factory=TopKDict
-     )
-     sample_soft_deleted_removed_aspects_by_type: TopKDict[str, LossyList[str]] = field(
+     num_queries_found: int = 0
+     num_soft_deleted_entity_processed: int = 0
+     num_soft_deleted_retained_due_to_age: int = 0
+     num_soft_deleted_entity_removal_started: int = 0
+     num_hard_deleted: int = 0
+     num_hard_deleted_by_type: TopKDict[str, int] = field(default_factory=TopKDict)
+     sample_hard_deleted_aspects_by_type: TopKDict[str, LossyList[str]] = field(
          default_factory=TopKDict
      )
+     runtime_limit_reached: bool = False
+     deletion_limit_reached: bool = False


  class SoftDeletedEntitiesCleanup:
@@ -103,48 +129,55 @@ class SoftDeletedEntitiesCleanup:
          self.config = config
          self.report = report
          self.dry_run = dry_run
+         self.start_time = 0.0
+         self._report_lock: Lock = Lock()
+         self.last_print_time = 0.0
+
+     def _increment_retained_count(self) -> None:
+         """Thread-safe method to update report fields"""
+         with self._report_lock:
+             self.report.num_soft_deleted_retained_due_to_age += 1
+
+     def _increment_removal_started_count(self) -> None:
+         """Thread-safe method to update report fields"""
+         with self._report_lock:
+             self.report.num_soft_deleted_entity_removal_started += 1
+
+     def _update_report(self, urn: str, entity_type: str) -> None:
+         """Thread-safe method to update report fields"""
+         with self._report_lock:
+             self.report.num_hard_deleted += 1
+
+             current_count = self.report.num_hard_deleted_by_type.get(entity_type, 0)
+             self.report.num_hard_deleted_by_type[entity_type] = current_count + 1
+             if entity_type not in self.report.sample_hard_deleted_aspects_by_type:
+                 self.report.sample_hard_deleted_aspects_by_type[
+                     entity_type
+                 ] = LossyList()
+             self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn)

      def delete_entity(self, urn: str) -> None:
          assert self.ctx.graph

          entity_urn = Urn.from_string(urn)
-         self.report.num_soft_deleted_entity_removed += 1
-         self.report.num_soft_deleted_entity_removed_by_type[entity_urn.entity_type] = (
-             self.report.num_soft_deleted_entity_removed_by_type.get(
-                 entity_urn.entity_type, 0
-             )
-             + 1
-         )
-         if (
-             entity_urn.entity_type
-             not in self.report.sample_soft_deleted_removed_aspects_by_type
-         ):
-             self.report.sample_soft_deleted_removed_aspects_by_type[
-                 entity_urn.entity_type
-             ] = LossyList()
-         self.report.sample_soft_deleted_removed_aspects_by_type[
-             entity_urn.entity_type
-         ].append(urn)
-
          if self.dry_run:
              logger.info(
                  f"Dry run is on otherwise it would have deleted {urn} with hard deletion"
              )
              return
-
+         if self._deletion_limit_reached() or self._times_up():
+             return
+         self._increment_removal_started_count()
          self.ctx.graph.delete_entity(urn=urn, hard=True)
          self.ctx.graph.delete_references_to_urn(
              urn=urn,
              dry_run=False,
          )
+         self._update_report(urn, entity_urn.entity_type)

      def delete_soft_deleted_entity(self, urn: str) -> None:
          assert self.ctx.graph

-         if self.config.retention_days is None:
-             logger.info("Retention days is not set, skipping soft delete cleanup")
-             return
-
          retention_time = (
              int(datetime.now(timezone.utc).timestamp())
              - self.config.retention_days * 24 * 60 * 60
@@ -157,15 +190,84 @@ class SoftDeletedEntitiesCleanup:
          ]["created"]["time"] < (retention_time * 1000):
              logger.debug(f"Hard deleting {urn}")
              self.delete_entity(urn)
+         else:
+             self._increment_retained_count()
+
+     def _print_report(self) -> None:
+         time_taken = round(time.time() - self.last_print_time, 1)
+         # Print report every 2 minutes
+         if time_taken > 120:
+             self.last_print_time = time.time()
+             logger.info(f"\n{self.report.as_string()}")
+
+     def _process_futures(self, futures: Dict[Future, str]) -> Dict[Future, str]:
+         done, not_done = wait(futures, return_when=FIRST_COMPLETED)
+         futures = {future: urn for future, urn in futures.items() if future in not_done}
+
+         for future in done:
+             self._print_report()
+             if future.exception():
+                 self.report.failure(
+                     title="Failed to delete entity",
+                     message="Failed to delete entity",
+                     context=futures[future],
+                     exc=future.exception(),
+                 )
+             self.report.num_soft_deleted_entity_processed += 1
+             if (
+                 self.report.num_soft_deleted_entity_processed % self.config.batch_size
+                 == 0
+             ):
+                 if self.config.delay:
+                     logger.debug(
+                         f"Sleeping for {self.config.delay} seconds before further processing batch"
+                     )
+                     time.sleep(self.config.delay)
+         return futures

-     def cleanup_soft_deleted_entities(self) -> None:
-         if not self.config.enabled:
-             return
+     def _get_soft_deleted_queries(self) -> Iterable[str]:
          assert self.ctx.graph
-         start_time = time.time()
-
-         deleted_count_retention = 0
-         urns = self.ctx.graph.get_urns_by_filter(
+         scroll_id: Optional[str] = None
+         while True:
+             try:
+                 result = self.ctx.graph.execute_graphql(
+                     QUERY_QUERY_ENTITY,
+                     {
+                         "input": {
+                             "types": ["QUERY"],
+                             "query": "*",
+                             "scrollId": scroll_id if scroll_id else None,
+                             "count": self.config.batch_size,
+                             "orFilters": [
+                                 {
+                                     "and": [
+                                         {
+                                             "field": "removed",
+                                             "values": ["true"],
+                                             "condition": "EQUAL",
+                                         }
+                                     ]
+                                 }
+                             ],
+                         }
+                     },
+                 )
+             except Exception as e:
+                 self.report.failure(
+                     f"While trying to get queries with {scroll_id}", exc=e
+                 )
+                 break
+             scroll_across_entities = result.get("scrollAcrossEntities")
+             if not scroll_across_entities or not scroll_across_entities.get("count"):
+                 break
+             scroll_id = scroll_across_entities.get("nextScrollId")
+             self.report.num_queries_found += scroll_across_entities.get("count")
+             for query in scroll_across_entities.get("searchResults"):
+                 yield query["entity"]["urn"]
+
+     def _get_urns(self) -> Iterable[str]:
+         assert self.ctx.graph
+         yield from self.ctx.graph.get_urns_by_filter(
              entity_types=self.config.entity_types,
              platform=self.config.platform,
              env=self.config.env,
@@ -173,52 +275,45 @@ class SoftDeletedEntitiesCleanup:
              status=RemovedStatusFilter.ONLY_SOFT_DELETED,
              batch_size=self.config.batch_size,
          )
+         yield from self._get_soft_deleted_queries()
+
+     def _times_up(self) -> bool:
+         if (
+             self.config.runtime_limit_seconds
+             and time.time() - self.start_time > self.config.runtime_limit_seconds
+         ):
+             with self._report_lock:
+                 self.report.runtime_limit_reached = True
+             return True
+         return False
+
+     def _deletion_limit_reached(self) -> bool:
+         if (
+             self.config.limit_entities_delete
+             and self.report.num_hard_deleted > self.config.limit_entities_delete
+         ):
+             with self._report_lock:
+                 self.report.deletion_limit_reached = True
+             return True
+         return False
+
+     def cleanup_soft_deleted_entities(self) -> None:
+         if not self.config.enabled:
+             return
+         self.start_time = time.time()

-         futures = {}
+         futures: Dict[Future, str] = dict()
          with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
-             num_urns_submitted = 0
-             for urn in urns:
-                 num_urns_submitted += 1
-                 if (
-                     self.config.limit_entities_delete
-                     and num_urns_submitted > self.config.limit_entities_delete
-                 ):
-                     logger.info(
-                         f"Limit of {self.config.limit_entities_delete} entities reached. Stopping"
-                     )
+             for urn in self._get_urns():
+                 self._print_report()
+                 while len(futures) >= self.config.futures_max_at_time:
+                     futures = self._process_futures(futures)
+                 if self._deletion_limit_reached() or self._times_up():
                      break
-                 if (
-                     self.config.runtime_limit_seconds
-                     and time.time() - start_time > self.config.runtime_limit_seconds
-                 ):
-                     logger.info(
-                         f"Runtime limit of {self.config.runtime_limit_seconds} seconds reached. Stopping"
-                     )
-                     break
-
                  future = executor.submit(self.delete_soft_deleted_entity, urn)
                  futures[future] = urn

-             if not futures:
-                 return
-             for future in as_completed(futures):
-                 if future.exception():
-                     logger.error(
-                         f"Failed to delete entity {futures[future]}: {future.exception()}"
-                     )
-                     self.report.failure(
-                         f"Failed to delete entity {futures[future]}",
-                         exc=future.exception(),
-                     )
-                 deleted_count_retention += 1
-
-                 if deleted_count_retention % self.config.batch_size == 0:
-                     logger.info(
-                         f"Processed {deleted_count_retention} soft deleted entity and deleted {self.report.num_soft_deleted_entity_removed} entities so far"
-                     )
-
-                     if self.config.delay:
-                         logger.debug(
-                             f"Sleeping for {self.config.delay} seconds before getting next batch"
-                         )
-                         time.sleep(self.config.delay)
+             logger.info(f"Waiting for {len(futures)} futures to complete")
+             while len(futures) > 0:
+                 self._print_report()
+                 futures = self._process_futures(futures)
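
The soft_deleted_entity_cleanup.py rewrite above replaces submit-everything-then-as_completed with a bounded pool of in-flight futures (futures_max_at_time), drained via wait(..., return_when=FIRST_COMPLETED) so runtime and deletion limits can be re-checked between submissions. A minimal standalone sketch of that bounded-futures pattern, assuming an illustrative work() function, limits, and URNs (not DataHub APIs):

```python
# Sketch of the bounded-futures pattern: never keep more than `max_in_flight`
# tasks outstanding; drain completed ones with wait(FIRST_COMPLETED).
import time
from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait
from typing import Dict, List


def work(item: str) -> str:
    time.sleep(0.01)  # stand-in for a slow per-item call (e.g. a hard delete)
    return item


def process_all(items: List[str], max_in_flight: int = 4) -> None:
    futures: Dict[Future, str] = {}
    with ThreadPoolExecutor(max_workers=2) as executor:
        for item in items:
            # Block submission while too many tasks are outstanding.
            while len(futures) >= max_in_flight:
                done, not_done = wait(futures, return_when=FIRST_COMPLETED)
                for f in done:
                    if f.exception():
                        print(f"failed: {futures[f]}: {f.exception()}")
                futures = {f: i for f, i in futures.items() if f in not_done}
            futures[executor.submit(work, item)] = item

        # Drain whatever is still running once submission stops.
        while futures:
            done, not_done = wait(futures, return_when=FIRST_COMPLETED)
            futures = {f: i for f, i in futures.items() if f in not_done}


if __name__ == "__main__":
    process_all([f"urn:li:dataset:example-{i}" for i in range(20)])
```

Compared with iterating as_completed over an unbounded dict, this keeps the queued work proportional to max_in_flight and gives the caller a natural point to check limits before submitting more.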
datahub/ingestion/source/iceberg/iceberg.py

@@ -10,6 +10,7 @@ from pyiceberg.exceptions import (
      NoSuchNamespaceError,
      NoSuchPropertyException,
      NoSuchTableError,
+     ServerError,
  )
  from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit
  from pyiceberg.table import Table
@@ -145,6 +146,13 @@ class IcebergSource(StatefulIngestionSourceBase):
          self.report.report_no_listed_namespaces(len(namespaces))
          tables_count = 0
          for namespace in namespaces:
+             namespace_repr = ".".join(namespace)
+             if not self.config.namespace_pattern.allowed(namespace_repr):
+                 LOGGER.info(
+                     f"Namespace {namespace_repr} is not allowed by config pattern, skipping"
+                 )
+                 self.report.report_dropped(f"{namespace_repr}.*")
+                 continue
              try:
                  tables = catalog.list_tables(namespace)
                  tables_count += len(tables)
@@ -181,6 +189,9 @@ class IcebergSource(StatefulIngestionSourceBase):
          if not self.config.table_pattern.allowed(dataset_name):
              # Dataset name is rejected by pattern, report as dropped.
              self.report.report_dropped(dataset_name)
+             LOGGER.debug(
+                 f"Skipping table {dataset_name} due to not being allowed by the config pattern"
+             )
              return
          try:
              if not hasattr(thread_local, "local_catalog"):
@@ -219,6 +230,22 @@ class IcebergSource(StatefulIngestionSourceBase):
              LOGGER.warning(
                  f"NoSuchTableError while processing table {dataset_path}, skipping it.",
              )
+         except FileNotFoundError as e:
+             self.report.report_warning(
+                 "file-not-found",
+                 f"Encountered FileNotFoundError when trying to read manifest file for {dataset_name}. {e}",
+             )
+             LOGGER.warning(
+                 f"FileNotFoundError while processing table {dataset_path}, skipping it."
+             )
+         except ServerError as e:
+             self.report.report_warning(
+                 "iceberg-rest-server-error",
+                 f"Iceberg Rest Catalog returned 500 status due to an unhandled exception for {dataset_name}. Exception: {e}",
+             )
+             LOGGER.warning(
+                 f"Iceberg Rest Catalog server error (500 status) encountered when processing table {dataset_path}, skipping it."
+             )
          except Exception as e:
              self.report.report_failure("general", f"Failed to create workunit: {e}")
              LOGGER.exception(
@@ -269,7 +296,6 @@ class IcebergSource(StatefulIngestionSourceBase):
          ] = table.current_snapshot().manifest_list
          dataset_properties = DatasetPropertiesClass(
              name=table.name()[-1],
-             tags=[],
              description=table.metadata.properties.get("comment", None),
              customProperties=custom_properties,
          )
datahub/ingestion/source/iceberg/iceberg_common.py

@@ -68,6 +68,10 @@ class IcebergSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin)
          default=AllowDenyPattern.allow_all(),
          description="Regex patterns for tables to filter in ingestion.",
      )
+     namespace_pattern: AllowDenyPattern = Field(
+         default=AllowDenyPattern.allow_all(),
+         description="Regex patterns for namespaces to filter in ingestion.",
+     )
      user_ownership_property: Optional[str] = Field(
          default="owner",
          description="Iceberg table property to look for a `CorpUser` owner. Can only hold a single user value. If property has no value, no owner information will be emitted.",
datahub/ingestion/source/kafka_connect/__init__.py — file without changes
datahub/ingestion/source/kafka_connect/common.py (new file)

@@ -0,0 +1,202 @@
+ import logging
+ from dataclasses import dataclass, field
+ from typing import Dict, Iterable, List, Optional
+
+ from pydantic.fields import Field
+
+ from datahub.configuration.common import AllowDenyPattern, ConfigModel
+ from datahub.configuration.source_common import (
+     DatasetLineageProviderConfigBase,
+     PlatformInstanceConfigMixin,
+ )
+ from datahub.ingestion.source.state.stale_entity_removal_handler import (
+     StaleEntityRemovalSourceReport,
+     StatefulStaleMetadataRemovalConfig,
+ )
+ from datahub.ingestion.source.state.stateful_ingestion_base import (
+     StatefulIngestionConfigBase,
+ )
+
+ logger = logging.getLogger(__name__)
+
+ KAFKA = "kafka"
+ SOURCE = "source"
+ SINK = "sink"
+ CONNECTOR_CLASS = "connector.class"
+
+
+ class ProvidedConfig(ConfigModel):
+     provider: str
+     path_key: str
+     value: str
+
+
+ class GenericConnectorConfig(ConfigModel):
+     connector_name: str
+     source_dataset: str
+     source_platform: str
+
+
+ class KafkaConnectSourceConfig(
+     PlatformInstanceConfigMixin,
+     DatasetLineageProviderConfigBase,
+     StatefulIngestionConfigBase,
+ ):
+     # See the Connect REST Interface for details
+     # https://docs.confluent.io/platform/current/connect/references/restapi.html#
+     connect_uri: str = Field(
+         default="http://localhost:8083/", description="URI to connect to."
+     )
+     username: Optional[str] = Field(default=None, description="Kafka Connect username.")
+     password: Optional[str] = Field(default=None, description="Kafka Connect password.")
+     cluster_name: Optional[str] = Field(
+         default="connect-cluster", description="Cluster to ingest from."
+     )
+     # convert lineage dataset's urns to lowercase
+     convert_lineage_urns_to_lowercase: bool = Field(
+         default=False,
+         description="Whether to convert the urns of ingested lineage dataset to lowercase",
+     )
+     connector_patterns: AllowDenyPattern = Field(
+         default=AllowDenyPattern.allow_all(),
+         description="regex patterns for connectors to filter for ingestion.",
+     )
+     provided_configs: Optional[List[ProvidedConfig]] = Field(
+         default=None, description="Provided Configurations"
+     )
+     connect_to_platform_map: Optional[Dict[str, Dict[str, str]]] = Field(
+         default=None,
+         description='Platform instance mapping when multiple instances for a platform is available. Entry for a platform should be in either `platform_instance_map` or `connect_to_platform_map`. e.g.`connect_to_platform_map: { "postgres-connector-finance-db": "postgres": "core_finance_instance" }`',
+     )
+     platform_instance_map: Optional[Dict[str, str]] = Field(
+         default=None,
+         description='Platform instance mapping to use when constructing URNs. e.g.`platform_instance_map: { "hive": "warehouse" }`',
+     )
+     generic_connectors: List[GenericConnectorConfig] = Field(
+         default=[],
+         description="Provide lineage graph for sources connectors other than Confluent JDBC Source Connector, Debezium Source Connector, and Mongo Source Connector",
+     )
+
+     stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
+
+
+ @dataclass
+ class KafkaConnectSourceReport(StaleEntityRemovalSourceReport):
+     connectors_scanned: int = 0
+     filtered: List[str] = field(default_factory=list)
+
+     def report_connector_scanned(self, connector: str) -> None:
+         self.connectors_scanned += 1
+
+     def report_dropped(self, connector: str) -> None:
+         self.filtered.append(connector)
+
+
+ @dataclass
+ class KafkaConnectLineage:
+     """Class to store Kafka Connect lineage mapping, Each instance is potential DataJob"""
+
+     source_platform: str
+     target_dataset: str
+     target_platform: str
+     job_property_bag: Optional[Dict[str, str]] = None
+     source_dataset: Optional[str] = None
+
+
+ @dataclass
+ class ConnectorManifest:
+     """Each instance is potential DataFlow"""
+
+     name: str
+     type: str
+     config: Dict
+     tasks: Dict
+     url: Optional[str] = None
+     flow_property_bag: Optional[Dict[str, str]] = None
+     lineages: List[KafkaConnectLineage] = field(default_factory=list)
+     topic_names: Iterable[str] = field(default_factory=list)
+
+
+ def remove_prefix(text: str, prefix: str) -> str:
+     if text.startswith(prefix):
+         index = len(prefix)
+         return text[index:]
+     return text
+
+
+ def unquote(
+     string: str, leading_quote: str = '"', trailing_quote: Optional[str] = None
+ ) -> str:
+     """
+     If string starts and ends with a quote, unquote it
+     """
+     trailing_quote = trailing_quote if trailing_quote else leading_quote
+     if string.startswith(leading_quote) and string.endswith(trailing_quote):
+         string = string[1:-1]
+     return string
+
+
+ def get_dataset_name(
+     database_name: Optional[str],
+     source_table: str,
+ ) -> str:
+     if database_name:
+         dataset_name = database_name + "." + source_table
+     else:
+         dataset_name = source_table
+
+     return dataset_name
+
+
+ def get_platform_instance(
+     config: KafkaConnectSourceConfig, connector_name: str, platform: str
+ ) -> Optional[str]:
+     instance_name = None
+     if (
+         config.connect_to_platform_map
+         and config.connect_to_platform_map.get(connector_name)
+         and config.connect_to_platform_map[connector_name].get(platform)
+     ):
+         instance_name = config.connect_to_platform_map[connector_name][platform]
+         if config.platform_instance_map and config.platform_instance_map.get(platform):
+             logger.warning(
+                 f"Same source platform {platform} configured in both platform_instance_map and connect_to_platform_map."
+                 "Will prefer connector specific platform instance from connect_to_platform_map."
+             )
+     elif config.platform_instance_map and config.platform_instance_map.get(platform):
+         instance_name = config.platform_instance_map[platform]
+     logger.info(
+         f"Instance name assigned is: {instance_name} for Connector Name {connector_name} and platform {platform}"
+     )
+     return instance_name
+
+
+ def transform_connector_config(
+     connector_config: Dict, provided_configs: List[ProvidedConfig]
+ ) -> None:
+     """This method will update provided configs in connector config values, if any"""
+     lookupsByProvider = {}
+     for pconfig in provided_configs:
+         lookupsByProvider[f"${{{pconfig.provider}:{pconfig.path_key}}}"] = pconfig.value
+     for k, v in connector_config.items():
+         for key, value in lookupsByProvider.items():
+             if key in v:
+                 connector_config[k] = connector_config[k].replace(key, value)
+
+
+ # TODO: Find a more automated way to discover new platforms with 3 level naming hierarchy.
+ def has_three_level_hierarchy(platform: str) -> bool:
+     return platform in ["postgres", "trino", "redshift", "snowflake"]
+
+
+ @dataclass
+ class BaseConnector:
+     connector_manifest: ConnectorManifest
+     config: KafkaConnectSourceConfig
+     report: KafkaConnectSourceReport
+
+     def extract_lineages(self) -> List[KafkaConnectLineage]:
+         return []
+
+     def extract_flow_property_bag(self) -> Optional[Dict[str, str]]:
+         return None
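
A quick tour of the helpers the new kafka_connect/common.py module defines; the import path follows the file listing above, and the sample values plus the expected outputs in the comments follow directly from the definitions shown (they are illustrative only):

```python
from datahub.ingestion.source.kafka_connect.common import (
    ProvidedConfig,
    get_dataset_name,
    has_three_level_hierarchy,
    remove_prefix,
    transform_connector_config,
    unquote,
)

print(remove_prefix("prod.orders", "prod."))    # -> orders
print(unquote('"my_topic"'))                    # -> my_topic
print(get_dataset_name("finance", "invoices"))  # -> finance.invoices
print(has_three_level_hierarchy("postgres"))    # -> True

# Resolve ${provider:key} placeholders in a connector config in place.
config = {"connection.url": "${env:DB_URL}/mydb"}
provided = [
    ProvidedConfig(provider="env", path_key="DB_URL", value="jdbc:postgresql://db:5432")
]
transform_connector_config(config, provided)
print(config["connection.url"])                 # -> jdbc:postgresql://db:5432/mydb
```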