acryl-datahub 0.15.0rc24__py3-none-any.whl → 0.15.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (120)
  1. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/METADATA +2408 -2412
  2. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/RECORD +116 -106
  3. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/entry_points.txt +1 -1
  5. datahub/__init__.py +1 -1
  6. datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
  7. datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
  8. datahub/configuration/common.py +2 -5
  9. datahub/configuration/source_common.py +13 -0
  10. datahub/emitter/mce_builder.py +20 -4
  11. datahub/emitter/mcp_builder.py +2 -7
  12. datahub/emitter/mcp_patch_builder.py +37 -13
  13. datahub/emitter/rest_emitter.py +25 -3
  14. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +98 -0
  15. datahub/ingestion/api/closeable.py +3 -3
  16. datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
  17. datahub/ingestion/api/report.py +4 -1
  18. datahub/ingestion/api/sink.py +4 -3
  19. datahub/ingestion/api/source.py +4 -0
  20. datahub/ingestion/api/source_helpers.py +2 -6
  21. datahub/ingestion/glossary/classifier.py +2 -3
  22. datahub/ingestion/graph/client.py +6 -3
  23. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
  24. datahub/ingestion/source/aws/aws_common.py +231 -27
  25. datahub/ingestion/source/aws/glue.py +12 -2
  26. datahub/ingestion/source/bigquery_v2/bigquery.py +10 -18
  27. datahub/ingestion/source/bigquery_v2/bigquery_config.py +3 -9
  28. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
  29. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -17
  30. datahub/ingestion/source/bigquery_v2/lineage.py +9 -22
  31. datahub/ingestion/source/datahub/config.py +22 -1
  32. datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
  33. datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
  34. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  35. datahub/ingestion/source/dbt/dbt_cloud.py +10 -3
  36. datahub/ingestion/source/gc/datahub_gc.py +21 -5
  37. datahub/ingestion/source/gc/dataprocess_cleanup.py +23 -10
  38. datahub/ingestion/source/gc/execution_request_cleanup.py +61 -16
  39. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +178 -83
  40. datahub/ingestion/source/iceberg/iceberg.py +27 -1
  41. datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
  42. datahub/ingestion/source/kafka_connect/__init__.py +0 -0
  43. datahub/ingestion/source/kafka_connect/common.py +202 -0
  44. datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
  45. datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
  46. datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
  47. datahub/ingestion/source/looker/looker_common.py +63 -2
  48. datahub/ingestion/source/looker/looker_dataclasses.py +7 -9
  49. datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
  50. datahub/ingestion/source/looker/looker_source.py +31 -4
  51. datahub/ingestion/source/looker/looker_usage.py +23 -17
  52. datahub/ingestion/source/mlflow.py +30 -5
  53. datahub/ingestion/source/mode.py +40 -27
  54. datahub/ingestion/source/powerbi/config.py +1 -14
  55. datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
  56. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
  57. datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +16 -2
  58. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +16 -15
  59. datahub/ingestion/source/s3/source.py +1 -1
  60. datahub/ingestion/source/snowflake/snowflake_config.py +13 -34
  61. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +14 -64
  62. datahub/ingestion/source/snowflake/snowflake_queries.py +44 -14
  63. datahub/ingestion/source/snowflake/snowflake_query.py +5 -10
  64. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +53 -7
  65. datahub/ingestion/source/snowflake/snowflake_shares.py +1 -1
  66. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +2 -5
  67. datahub/ingestion/source/snowflake/snowflake_utils.py +22 -18
  68. datahub/ingestion/source/snowflake/snowflake_v2.py +38 -34
  69. datahub/ingestion/source/sql/hive.py +621 -8
  70. datahub/ingestion/source/sql/hive_metastore.py +7 -0
  71. datahub/ingestion/source/sql/mssql/job_models.py +30 -1
  72. datahub/ingestion/source/sql/mssql/source.py +15 -1
  73. datahub/ingestion/source/sql/sql_common.py +41 -102
  74. datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
  75. datahub/ingestion/source/sql/sql_report.py +2 -0
  76. datahub/ingestion/source/state/checkpoint.py +2 -1
  77. datahub/ingestion/source/tableau/tableau.py +122 -45
  78. datahub/ingestion/source/tableau/tableau_common.py +18 -0
  79. datahub/ingestion/source/tableau/tableau_constant.py +3 -1
  80. datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
  81. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  82. datahub/ingestion/source/unity/proxy.py +8 -27
  83. datahub/ingestion/source/usage/usage_common.py +15 -1
  84. datahub/ingestion/source_report/ingestion_stage.py +3 -0
  85. datahub/metadata/_schema_classes.py +256 -3
  86. datahub/metadata/_urns/urn_defs.py +168 -168
  87. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
  88. datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py +2 -0
  89. datahub/metadata/schema.avsc +252 -33
  90. datahub/metadata/schemas/DataJobKey.avsc +2 -1
  91. datahub/metadata/schemas/DataProcessInstanceKey.avsc +5 -1
  92. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  93. datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
  94. datahub/metadata/schemas/MLModelGroupProperties.avsc +82 -0
  95. datahub/metadata/schemas/MLModelProperties.avsc +62 -2
  96. datahub/metadata/schemas/MLTrainingRunProperties.avsc +171 -0
  97. datahub/metadata/schemas/MetadataChangeEvent.avsc +94 -2
  98. datahub/specific/aspect_helpers/__init__.py +0 -0
  99. datahub/specific/aspect_helpers/custom_properties.py +79 -0
  100. datahub/specific/aspect_helpers/ownership.py +67 -0
  101. datahub/specific/aspect_helpers/structured_properties.py +72 -0
  102. datahub/specific/aspect_helpers/tags.py +42 -0
  103. datahub/specific/aspect_helpers/terms.py +43 -0
  104. datahub/specific/chart.py +28 -184
  105. datahub/specific/dashboard.py +31 -196
  106. datahub/specific/datajob.py +34 -189
  107. datahub/specific/dataproduct.py +24 -86
  108. datahub/specific/dataset.py +48 -133
  109. datahub/specific/form.py +12 -32
  110. datahub/specific/structured_property.py +9 -9
  111. datahub/sql_parsing/sql_parsing_aggregator.py +10 -9
  112. datahub/sql_parsing/sqlglot_lineage.py +15 -5
  113. datahub/sql_parsing/tool_meta_extractor.py +119 -5
  114. datahub/utilities/time.py +8 -3
  115. datahub/utilities/urns/_urn_base.py +5 -7
  116. datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
  117. datahub/specific/custom_properties.py +0 -37
  118. datahub/specific/ownership.py +0 -48
  119. datahub/specific/structured_properties.py +0 -53
  120. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/datahub/ingestion/source/kafka_connect/source_connectors.py
@@ -0,0 +1,570 @@
+import logging
+import re
+from dataclasses import dataclass
+from typing import Dict, Iterable, List, Optional, Tuple
+
+from sqlalchemy.engine.url import make_url
+
+from datahub.ingestion.source.kafka_connect.common import (
+    CONNECTOR_CLASS,
+    KAFKA,
+    BaseConnector,
+    ConnectorManifest,
+    KafkaConnectLineage,
+    get_dataset_name,
+    has_three_level_hierarchy,
+    remove_prefix,
+    unquote,
+)
+from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
+    get_platform_from_sqlalchemy_uri,
+)
+
+
+@dataclass
+class ConfluentJDBCSourceConnector(BaseConnector):
+    REGEXROUTER = "org.apache.kafka.connect.transforms.RegexRouter"
+    KNOWN_TOPICROUTING_TRANSFORMS = [REGEXROUTER]
+    # https://kafka.apache.org/documentation/#connect_included_transformation
+    KAFKA_NONTOPICROUTING_TRANSFORMS = [
+        "InsertField",
+        "InsertField$Key",
+        "InsertField$Value",
+        "ReplaceField",
+        "ReplaceField$Key",
+        "ReplaceField$Value",
+        "MaskField",
+        "MaskField$Key",
+        "MaskField$Value",
+        "ValueToKey",
+        "ValueToKey$Key",
+        "ValueToKey$Value",
+        "HoistField",
+        "HoistField$Key",
+        "HoistField$Value",
+        "ExtractField",
+        "ExtractField$Key",
+        "ExtractField$Value",
+        "SetSchemaMetadata",
+        "SetSchemaMetadata$Key",
+        "SetSchemaMetadata$Value",
+        "Flatten",
+        "Flatten$Key",
+        "Flatten$Value",
+        "Cast",
+        "Cast$Key",
+        "Cast$Value",
+        "HeadersFrom",
+        "HeadersFrom$Key",
+        "HeadersFrom$Value",
+        "TimestampConverter",
+        "Filter",
+        "InsertHeader",
+        "DropHeaders",
+    ]
+    # https://docs.confluent.io/platform/current/connect/transforms/overview.html
+    CONFLUENT_NONTOPICROUTING_TRANSFORMS = [
+        "Drop",
+        "Drop$Key",
+        "Drop$Value",
+        "Filter",
+        "Filter$Key",
+        "Filter$Value",
+        "TombstoneHandler",
+    ]
+    KNOWN_NONTOPICROUTING_TRANSFORMS = (
+        KAFKA_NONTOPICROUTING_TRANSFORMS
+        + [
+            f"org.apache.kafka.connect.transforms.{t}"
+            for t in KAFKA_NONTOPICROUTING_TRANSFORMS
+        ]
+        + CONFLUENT_NONTOPICROUTING_TRANSFORMS
+        + [
+            f"io.confluent.connect.transforms.{t}"
+            for t in CONFLUENT_NONTOPICROUTING_TRANSFORMS
+        ]
+    )
+
+    @dataclass
+    class JdbcParser:
+        db_connection_url: str
+        source_platform: str
+        database_name: str
+        topic_prefix: str
+        query: str
+        transforms: list
+
+    def get_parser(
+        self,
+        connector_manifest: ConnectorManifest,
+    ) -> JdbcParser:
+        url = remove_prefix(
+            str(connector_manifest.config.get("connection.url")), "jdbc:"
+        )
+        url_instance = make_url(url)
+        source_platform = get_platform_from_sqlalchemy_uri(str(url_instance))
+        database_name = url_instance.database
+        assert database_name
+        db_connection_url = f"{url_instance.drivername}://{url_instance.host}:{url_instance.port}/{database_name}"
+
+        topic_prefix = self.connector_manifest.config.get("topic.prefix", None)
+
+        query = self.connector_manifest.config.get("query", None)
+
+        transform_names = (
+            self.connector_manifest.config.get("transforms", "").split(",")
+            if self.connector_manifest.config.get("transforms")
+            else []
+        )
+
+        transforms = []
+        for name in transform_names:
+            transform = {"name": name}
+            transforms.append(transform)
+            for key in self.connector_manifest.config.keys():
+                if key.startswith(f"transforms.{name}."):
+                    transform[
+                        key.replace(f"transforms.{name}.", "")
+                    ] = self.connector_manifest.config[key]
+
+        return self.JdbcParser(
+            db_connection_url,
+            source_platform,
+            database_name,
+            topic_prefix,
+            query,
+            transforms,
+        )
+
+    def default_get_lineages(
+        self,
+        topic_prefix: str,
+        database_name: str,
+        source_platform: str,
+        topic_names: Optional[Iterable[str]] = None,
+        include_source_dataset: bool = True,
+    ) -> List[KafkaConnectLineage]:
+        lineages: List[KafkaConnectLineage] = []
+        if not topic_names:
+            topic_names = self.connector_manifest.topic_names
+        table_name_tuples: List[Tuple] = self.get_table_names()
+        for topic in topic_names:
+            # All good for NO_TRANSFORM or (SINGLE_TRANSFORM and KNOWN_NONTOPICROUTING_TRANSFORM) or (not SINGLE_TRANSFORM and all(KNOWN_NONTOPICROUTING_TRANSFORM))
+            source_table: str = (
+                remove_prefix(topic, topic_prefix) if topic_prefix else topic
+            )
+            # include schema name for three-level hierarchies
+            if has_three_level_hierarchy(source_platform):
+                table_name_tuple: Tuple = next(
+                    iter([t for t in table_name_tuples if t and t[-1] == source_table]),
+                    (),
+                )
+                if len(table_name_tuple) > 1:
+                    source_table = f"{table_name_tuple[-2]}.{source_table}"
+                else:
+                    include_source_dataset = False
+                    self.report.warning(
+                        "Could not find schema for table"
+                        f"{self.connector_manifest.name} : {source_table}",
+                    )
+            dataset_name: str = get_dataset_name(database_name, source_table)
+            lineage = KafkaConnectLineage(
+                source_dataset=dataset_name if include_source_dataset else None,
+                source_platform=source_platform,
+                target_dataset=topic,
+                target_platform=KAFKA,
+            )
+            lineages.append(lineage)
+        return lineages
+
+    def get_table_names(self) -> List[Tuple]:
+        sep: str = "."
+        leading_quote_char: str = '"'
+        trailing_quote_char: str = leading_quote_char
+
+        table_ids: List[str] = []
+        if self.connector_manifest.tasks:
+            table_ids = (
+                ",".join(
+                    [
+                        task["config"].get("tables")
+                        for task in self.connector_manifest.tasks
+                    ]
+                )
+            ).split(",")
+            quote_method = self.connector_manifest.config.get(
+                "quote.sql.identifiers", "always"
+            )
+            if (
+                quote_method == "always"
+                and table_ids
+                and table_ids[0]
+                and table_ids[-1]
+            ):
+                leading_quote_char = table_ids[0][0]
+                trailing_quote_char = table_ids[-1][-1]
+                # This will only work for single character quotes
+        elif self.connector_manifest.config.get("table.whitelist"):
+            table_ids = self.connector_manifest.config.get("table.whitelist").split(",")  # type: ignore
+
+        # List of Tuple containing (schema, table)
+        tables: List[Tuple] = [
+            (
+                (
+                    unquote(
+                        table_id.split(sep)[-2], leading_quote_char, trailing_quote_char
+                    )
+                    if len(table_id.split(sep)) > 1
+                    else ""
+                ),
+                unquote(
+                    table_id.split(sep)[-1], leading_quote_char, trailing_quote_char
+                ),
+            )
+            for table_id in table_ids
+        ]
+        return tables
+
+    def extract_flow_property_bag(self) -> Dict[str, str]:
+        flow_property_bag = {
+            k: v
+            for k, v in self.connector_manifest.config.items()
+            if k not in ["connection.password", "connection.user"]
+        }
+
+        # Mask/Remove properties that may reveal credentials
+        flow_property_bag["connection.url"] = self.get_parser(
+            self.connector_manifest
+        ).db_connection_url
+
+        return flow_property_bag
+
+    def extract_lineages(self) -> List[KafkaConnectLineage]:
+        lineages: List[KafkaConnectLineage] = list()
+        parser = self.get_parser(self.connector_manifest)
+        source_platform = parser.source_platform
+        database_name = parser.database_name
+        query = parser.query
+        topic_prefix = parser.topic_prefix
+        transforms = parser.transforms
+
+        logging.debug(
+            f"Extracting source platform: {source_platform} and database name: {database_name} from connection url "
+        )
+
+        if not self.connector_manifest.topic_names:
+            return lineages
+
+        if query:
+            # Lineage source_table can be extracted by parsing query
+            for topic in self.connector_manifest.topic_names:
+                # default method - as per earlier implementation
+                dataset_name: str = get_dataset_name(database_name, topic)
+
+                lineage = KafkaConnectLineage(
+                    source_dataset=None,
+                    source_platform=source_platform,
+                    target_dataset=topic,
+                    target_platform=KAFKA,
+                )
+                lineages.append(lineage)
+            self.report.warning(
+                "Could not find input dataset, the connector has query configuration set",
+                self.connector_manifest.name,
+            )
+            return lineages
+
+        SINGLE_TRANSFORM = len(transforms) == 1
+        NO_TRANSFORM = len(transforms) == 0
+        UNKNOWN_TRANSFORM = any(
+            [
+                transform["type"]
+                not in self.KNOWN_TOPICROUTING_TRANSFORMS
+                + self.KNOWN_NONTOPICROUTING_TRANSFORMS
+                for transform in transforms
+            ]
+        )
+        ALL_TRANSFORMS_NON_TOPICROUTING = all(
+            [
+                transform["type"] in self.KNOWN_NONTOPICROUTING_TRANSFORMS
+                for transform in transforms
+            ]
+        )
+
+        if NO_TRANSFORM or ALL_TRANSFORMS_NON_TOPICROUTING:
+            return self.default_get_lineages(
+                database_name=database_name,
+                source_platform=source_platform,
+                topic_prefix=topic_prefix,
+            )
+
+        if SINGLE_TRANSFORM and transforms[0]["type"] == self.REGEXROUTER:
+            tables = self.get_table_names()
+            topic_names = list(self.connector_manifest.topic_names)
+
+            from java.util.regex import Pattern
+
+            for table in tables:
+                source_table: str = table[-1]
+                topic = topic_prefix + source_table if topic_prefix else source_table
+
+                transform_regex = Pattern.compile(transforms[0]["regex"])
+                transform_replacement = transforms[0]["replacement"]
+
+                matcher = transform_regex.matcher(topic)
+                if matcher.matches():
+                    topic = str(matcher.replaceFirst(transform_replacement))
+
+                # Additional check to confirm that the topic present
+                # in connector topics
+
+                if topic in self.connector_manifest.topic_names:
+                    # include schema name for three-level hierarchies
+                    if has_three_level_hierarchy(source_platform) and len(table) > 1:
+                        source_table = f"{table[-2]}.{table[-1]}"
+
+                    dataset_name = get_dataset_name(database_name, source_table)
+
+                    lineage = KafkaConnectLineage(
+                        source_dataset=dataset_name,
+                        source_platform=source_platform,
+                        target_dataset=topic,
+                        target_platform=KAFKA,
+                    )
+                    topic_names.remove(topic)
+                    lineages.append(lineage)
+
+            if topic_names:
+                lineages.extend(
+                    self.default_get_lineages(
+                        database_name=database_name,
+                        source_platform=source_platform,
+                        topic_prefix=topic_prefix,
+                        topic_names=topic_names,
+                        include_source_dataset=False,
+                    )
+                )
+                self.report.warning(
+                    "Could not find input dataset for connector topics",
+                    f"{self.connector_manifest.name} : {topic_names}",
+                )
+            return lineages
+        else:
+            include_source_dataset = True
+            if SINGLE_TRANSFORM and UNKNOWN_TRANSFORM:
+                self.report.warning(
+                    "Could not find input dataset, connector has unknown transform",
+                    f"{self.connector_manifest.name} : {transforms[0]['type']}",
+                )
+                include_source_dataset = False
+            if not SINGLE_TRANSFORM and UNKNOWN_TRANSFORM:
+                self.report.warning(
+                    "Could not find input dataset, connector has one or more unknown transforms",
+                    self.connector_manifest.name,
+                )
+                include_source_dataset = False
+            lineages = self.default_get_lineages(
+                database_name=database_name,
+                source_platform=source_platform,
+                topic_prefix=topic_prefix,
+                include_source_dataset=include_source_dataset,
+            )
+            return lineages
+
+
+@dataclass
+class MongoSourceConnector(BaseConnector):
+    # https://www.mongodb.com/docs/kafka-connector/current/source-connector/
+
+    @dataclass
+    class MongoSourceParser:
+        db_connection_url: Optional[str]
+        source_platform: str
+        database_name: Optional[str]
+        topic_prefix: Optional[str]
+        transforms: List[str]
+
+    def get_parser(
+        self,
+        connector_manifest: ConnectorManifest,
+    ) -> MongoSourceParser:
+        parser = self.MongoSourceParser(
+            db_connection_url=connector_manifest.config.get("connection.uri"),
+            source_platform="mongodb",
+            database_name=connector_manifest.config.get("database"),
+            topic_prefix=connector_manifest.config.get("topic_prefix"),
+            transforms=(
+                connector_manifest.config["transforms"].split(",")
+                if "transforms" in connector_manifest.config
+                else []
+            ),
+        )
+
+        return parser
+
+    def extract_lineages(self) -> List[KafkaConnectLineage]:
+        lineages: List[KafkaConnectLineage] = list()
+        parser = self.get_parser(self.connector_manifest)
+        source_platform = parser.source_platform
+        topic_naming_pattern = r"mongodb\.(\w+)\.(\w+)"
+
+        if not self.connector_manifest.topic_names:
+            return lineages
+
+        for topic in self.connector_manifest.topic_names:
+            found = re.search(re.compile(topic_naming_pattern), topic)
+
+            if found:
+                table_name = get_dataset_name(found.group(1), found.group(2))
+
+                lineage = KafkaConnectLineage(
+                    source_dataset=table_name,
+                    source_platform=source_platform,
+                    target_dataset=topic,
+                    target_platform=KAFKA,
+                )
+                lineages.append(lineage)
+        return lineages
+
+
+@dataclass
+class DebeziumSourceConnector(BaseConnector):
+    @dataclass
+    class DebeziumParser:
+        source_platform: str
+        server_name: Optional[str]
+        database_name: Optional[str]
+
+    def get_server_name(self, connector_manifest: ConnectorManifest) -> str:
+        if "topic.prefix" in connector_manifest.config:
+            return connector_manifest.config["topic.prefix"]
+        else:
+            return connector_manifest.config.get("database.server.name", "")
+
+    def get_parser(
+        self,
+        connector_manifest: ConnectorManifest,
+    ) -> DebeziumParser:
+        connector_class = connector_manifest.config.get(CONNECTOR_CLASS, "")
+
+        if connector_class == "io.debezium.connector.mysql.MySqlConnector":
+            parser = self.DebeziumParser(
+                source_platform="mysql",
+                server_name=self.get_server_name(connector_manifest),
+                database_name=None,
+            )
+        elif connector_class == "MySqlConnector":
+            parser = self.DebeziumParser(
+                source_platform="mysql",
+                server_name=self.get_server_name(connector_manifest),
+                database_name=None,
+            )
+        elif connector_class == "io.debezium.connector.mongodb.MongoDbConnector":
+            parser = self.DebeziumParser(
+                source_platform="mongodb",
+                server_name=self.get_server_name(connector_manifest),
+                database_name=None,
+            )
+        elif connector_class == "io.debezium.connector.postgresql.PostgresConnector":
+            parser = self.DebeziumParser(
+                source_platform="postgres",
+                server_name=self.get_server_name(connector_manifest),
+                database_name=connector_manifest.config.get("database.dbname"),
+            )
+        elif connector_class == "io.debezium.connector.oracle.OracleConnector":
+            parser = self.DebeziumParser(
+                source_platform="oracle",
+                server_name=self.get_server_name(connector_manifest),
+                database_name=connector_manifest.config.get("database.dbname"),
+            )
+        elif connector_class == "io.debezium.connector.sqlserver.SqlServerConnector":
+            database_name = connector_manifest.config.get(
+                "database.names"
+            ) or connector_manifest.config.get("database.dbname")
+
+            if "," in str(database_name):
+                raise Exception(
+                    f"Only one database is supported for Debezium's SQL Server connector. Found: {database_name}"
+                )
+
+            parser = self.DebeziumParser(
+                source_platform="mssql",
+                server_name=self.get_server_name(connector_manifest),
+                database_name=database_name,
+            )
+        elif connector_class == "io.debezium.connector.db2.Db2Connector":
+            parser = self.DebeziumParser(
+                source_platform="db2",
+                server_name=self.get_server_name(connector_manifest),
+                database_name=connector_manifest.config.get("database.dbname"),
+            )
+        elif connector_class == "io.debezium.connector.vitess.VitessConnector":
+            parser = self.DebeziumParser(
+                source_platform="vitess",
+                server_name=self.get_server_name(connector_manifest),
+                database_name=connector_manifest.config.get("vitess.keyspace"),
+            )
+        else:
+            raise ValueError(f"Connector class '{connector_class}' is unknown.")
+
+        return parser
+
+    def extract_lineages(self) -> List[KafkaConnectLineage]:
+        lineages: List[KafkaConnectLineage] = list()
+
+        try:
+            parser = self.get_parser(self.connector_manifest)
+            source_platform = parser.source_platform
+            server_name = parser.server_name
+            database_name = parser.database_name
+            topic_naming_pattern = rf"({server_name})\.(\w+\.\w+)"
+
+            if not self.connector_manifest.topic_names:
+                return lineages
+
+            for topic in self.connector_manifest.topic_names:
+                found = re.search(re.compile(topic_naming_pattern), topic)
+
+                if found:
+                    table_name = get_dataset_name(database_name, found.group(2))
+
+                    lineage = KafkaConnectLineage(
+                        source_dataset=table_name,
+                        source_platform=source_platform,
+                        target_dataset=topic,
+                        target_platform=KAFKA,
+                    )
+                    lineages.append(lineage)
+            return lineages
+        except Exception as e:
+            self.report.warning(
+                "Error resolving lineage for connector",
+                self.connector_manifest.name,
+                exc=e,
+            )
+
+        return []
+
+
+@dataclass
+class ConfigDrivenSourceConnector(BaseConnector):
+    def extract_lineages(self) -> List[KafkaConnectLineage]:
+        lineages = []
+        for connector in self.config.generic_connectors:
+            if connector.connector_name == self.connector_manifest.name:
+                target_connector = connector
+                break
+        for topic in self.connector_manifest.topic_names:
+            lineage = KafkaConnectLineage(
+                source_dataset=target_connector.source_dataset,
+                source_platform=target_connector.source_platform,
+                target_dataset=topic,
+                target_platform=KAFKA,
+            )
+            lineages.append(lineage)
+        return lineages
+
+
+JDBC_SOURCE_CONNECTOR_CLASS = "io.confluent.connect.jdbc.JdbcSourceConnector"
+DEBEZIUM_SOURCE_CONNECTOR_PREFIX = "io.debezium.connector"
+MONGO_SOURCE_CONNECTOR_CLASS = "com.mongodb.kafka.connect.MongoSourceConnector"
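
Note on the RegexRouter handling above: for a connector with a single RegexRouter transform, the JDBC source predicts each table's output topic by applying the connector's transforms.<name>.regex / transforms.<name>.replacement pair to the (optionally prefixed) table name, using a java.util.regex bridge so the semantics match Kafka Connect. The following is only a rough sketch of that renaming step using Python's re module, with a hypothetical connector config (Java-style $1 group references are rewritten as \1 for Python):

import re

# Hypothetical RegexRouter settings, as they would appear in a connector config:
#   transforms=rename
#   transforms.rename.type=org.apache.kafka.connect.transforms.RegexRouter
#   transforms.rename.regex=dbserver1\.public\.(.*)
#   transforms.rename.replacement=$1
transform = {"regex": r"dbserver1\.public\.(.*)", "replacement": "$1"}

def route_topic(topic: str, transform: dict) -> str:
    """Approximate RegexRouter: rename the topic only if the regex matches the whole name."""
    pattern = re.compile(transform["regex"])
    # Java replacement strings use $1, $2, ...; Python's re expects \1, \2, ...
    replacement = re.sub(r"\$(\d+)", r"\\\1", transform["replacement"])
    if pattern.fullmatch(topic):
        return pattern.sub(replacement, topic, count=1)
    return topic

print(route_topic("dbserver1.public.orders", transform))  # -> "orders"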
--- a/datahub/ingestion/source/looker/looker_common.py
+++ b/datahub/ingestion/source/looker/looker_common.py
@@ -31,6 +31,10 @@ from looker_sdk.sdk.api40.models import (
 from pydantic.class_validators import validator
 
 import datahub.emitter.mce_builder as builder
+from datahub.api.entities.platformresource.platform_resource import (
+    PlatformResource,
+    PlatformResourceKey,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import ContainerKey, create_embed_mcp
 from datahub.ingestion.api.report import Report
@@ -106,7 +110,7 @@ from datahub.utilities.lossy_collections import LossyList, LossySet
 from datahub.utilities.url_util import remove_port_from_url
 
 CORPUSER_DATAHUB = "urn:li:corpuser:datahub"
-
+LOOKER = "looker"
 logger = logging.getLogger(__name__)
 
 
@@ -1404,6 +1408,15 @@ class LookerDashboardSourceReport(StaleEntityRemovalSourceReport):
     dashboards_with_activity: LossySet[str] = dataclasses_field(
         default_factory=LossySet
    )
+
+    # Entities that don't seem to exist, so we don't emit usage aspects for them despite having usage data
+    dashboards_skipped_for_usage: LossySet[str] = dataclasses_field(
+        default_factory=LossySet
+    )
+    charts_skipped_for_usage: LossySet[str] = dataclasses_field(
+        default_factory=LossySet
+    )
+
     stage_latency: List[StageLatency] = dataclasses_field(default_factory=list)
     _looker_explore_registry: Optional[LookerExploreRegistry] = None
     total_explores: int = 0
@@ -1411,6 +1424,7 @@ class LookerDashboardSourceReport(StaleEntityRemovalSourceReport):
 
     resolved_user_ids: int = 0
     email_ids_missing: int = 0  # resolved users with missing email addresses
+    looker_user_count: int = 0
 
     _looker_api: Optional[LookerAPI] = None
     query_latency: Dict[str, datetime.timedelta] = dataclasses_field(
@@ -1614,9 +1628,21 @@ class LookerDashboard:
 class LookerUserRegistry:
     looker_api_wrapper: LookerAPI
     fields: str = ",".join(["id", "email", "display_name", "first_name", "last_name"])
+    _user_cache: Dict[str, LookerUser] = {}
 
-    def __init__(self, looker_api: LookerAPI):
+    def __init__(self, looker_api: LookerAPI, report: LookerDashboardSourceReport):
         self.looker_api_wrapper = looker_api
+        self.report = report
+        self._initialize_user_cache()
+
+    def _initialize_user_cache(self) -> None:
+        raw_users: Sequence[User] = self.looker_api_wrapper.all_users(
+            user_fields=self.fields
+        )
+
+        for raw_user in raw_users:
+            looker_user = LookerUser.create_looker_user(raw_user)
+            self._user_cache[str(looker_user.id)] = looker_user
 
     def get_by_id(self, id_: str) -> Optional[LookerUser]:
         if not id_:
@@ -1624,6 +1650,9 @@ class LookerUserRegistry:
 
         logger.debug(f"Will get user {id_}")
 
+        if str(id_) in self._user_cache:
+            return self._user_cache.get(str(id_))
+
         raw_user: Optional[User] = self.looker_api_wrapper.get_user(
             str(id_), user_fields=self.fields
         )
@@ -1632,3 +1661,35 @@
 
         looker_user = LookerUser.create_looker_user(raw_user)
         return looker_user
+
+    def to_platform_resource(
+        self, platform_instance: Optional[str]
+    ) -> Iterable[MetadataChangeProposalWrapper]:
+        try:
+            platform_resource_key = PlatformResourceKey(
+                platform=LOOKER,
+                resource_type="USER_ID_MAPPING",
+                platform_instance=platform_instance,
+                primary_key="",
+            )
+
+            # Extract user email mappings
+            user_email_cache = {
+                user_id: user.email
+                for user_id, user in self._user_cache.items()
+                if user.email
+            }
+
+            platform_resource = PlatformResource.create(
+                key=platform_resource_key,
+                value=user_email_cache,
+            )
+
+            self.report.looker_user_count = len(user_email_cache)
+            yield from platform_resource.to_mcps()
+
+        except Exception as exc:
+            self.report.warning(
+                message="Failed to generate platform resource for looker id mappings",
+                exc=exc,
+            )
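
Note on the new to_platform_resource method above: it packages the registry's cached Looker user-id → email map into a USER_ID_MAPPING platform resource so that other ingestion paths can later resolve Looker user ids to DataHub users. A minimal sketch of how such a mapping might be consumed downstream; the cache contents and the email-to-username convention here are hypothetical, not part of the package:

from typing import Optional

from datahub.emitter.mce_builder import make_user_urn

# Hypothetical snapshot of the user cache that to_platform_resource() serializes
# (Looker user id -> email); real values come from the Looker API at ingestion time.
user_email_cache = {"27": "alice@example.com", "31": "bob@example.com"}

def resolve_looker_user(user_id: str) -> Optional[str]:
    """Resolve a Looker user id to a DataHub corpuser urn via the cached email."""
    email = user_email_cache.get(str(user_id))
    if not email:
        return None
    # Assumes the corpuser id is the local part of the email; deployments may differ.
    return make_user_urn(email.split("@")[0])

print(resolve_looker_user("27"))  # -> urn:li:corpuser:alice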