acryl-datahub 0.15.0rc24__py3-none-any.whl → 0.15.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic; see the advisory linked from the registry page for more details.

Files changed (120)
  1. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/METADATA +2408 -2412
  2. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/RECORD +116 -106
  3. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/entry_points.txt +1 -1
  5. datahub/__init__.py +1 -1
  6. datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
  7. datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
  8. datahub/configuration/common.py +2 -5
  9. datahub/configuration/source_common.py +13 -0
  10. datahub/emitter/mce_builder.py +20 -4
  11. datahub/emitter/mcp_builder.py +2 -7
  12. datahub/emitter/mcp_patch_builder.py +37 -13
  13. datahub/emitter/rest_emitter.py +25 -3
  14. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +98 -0
  15. datahub/ingestion/api/closeable.py +3 -3
  16. datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
  17. datahub/ingestion/api/report.py +4 -1
  18. datahub/ingestion/api/sink.py +4 -3
  19. datahub/ingestion/api/source.py +4 -0
  20. datahub/ingestion/api/source_helpers.py +2 -6
  21. datahub/ingestion/glossary/classifier.py +2 -3
  22. datahub/ingestion/graph/client.py +6 -3
  23. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
  24. datahub/ingestion/source/aws/aws_common.py +231 -27
  25. datahub/ingestion/source/aws/glue.py +12 -2
  26. datahub/ingestion/source/bigquery_v2/bigquery.py +10 -18
  27. datahub/ingestion/source/bigquery_v2/bigquery_config.py +3 -9
  28. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
  29. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -17
  30. datahub/ingestion/source/bigquery_v2/lineage.py +9 -22
  31. datahub/ingestion/source/datahub/config.py +22 -1
  32. datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
  33. datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
  34. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  35. datahub/ingestion/source/dbt/dbt_cloud.py +10 -3
  36. datahub/ingestion/source/gc/datahub_gc.py +21 -5
  37. datahub/ingestion/source/gc/dataprocess_cleanup.py +23 -10
  38. datahub/ingestion/source/gc/execution_request_cleanup.py +61 -16
  39. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +178 -83
  40. datahub/ingestion/source/iceberg/iceberg.py +27 -1
  41. datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
  42. datahub/ingestion/source/kafka_connect/__init__.py +0 -0
  43. datahub/ingestion/source/kafka_connect/common.py +202 -0
  44. datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
  45. datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
  46. datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
  47. datahub/ingestion/source/looker/looker_common.py +63 -2
  48. datahub/ingestion/source/looker/looker_dataclasses.py +7 -9
  49. datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
  50. datahub/ingestion/source/looker/looker_source.py +31 -4
  51. datahub/ingestion/source/looker/looker_usage.py +23 -17
  52. datahub/ingestion/source/mlflow.py +30 -5
  53. datahub/ingestion/source/mode.py +40 -27
  54. datahub/ingestion/source/powerbi/config.py +1 -14
  55. datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
  56. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
  57. datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +16 -2
  58. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +16 -15
  59. datahub/ingestion/source/s3/source.py +1 -1
  60. datahub/ingestion/source/snowflake/snowflake_config.py +13 -34
  61. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +14 -64
  62. datahub/ingestion/source/snowflake/snowflake_queries.py +44 -14
  63. datahub/ingestion/source/snowflake/snowflake_query.py +5 -10
  64. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +53 -7
  65. datahub/ingestion/source/snowflake/snowflake_shares.py +1 -1
  66. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +2 -5
  67. datahub/ingestion/source/snowflake/snowflake_utils.py +22 -18
  68. datahub/ingestion/source/snowflake/snowflake_v2.py +38 -34
  69. datahub/ingestion/source/sql/hive.py +621 -8
  70. datahub/ingestion/source/sql/hive_metastore.py +7 -0
  71. datahub/ingestion/source/sql/mssql/job_models.py +30 -1
  72. datahub/ingestion/source/sql/mssql/source.py +15 -1
  73. datahub/ingestion/source/sql/sql_common.py +41 -102
  74. datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
  75. datahub/ingestion/source/sql/sql_report.py +2 -0
  76. datahub/ingestion/source/state/checkpoint.py +2 -1
  77. datahub/ingestion/source/tableau/tableau.py +122 -45
  78. datahub/ingestion/source/tableau/tableau_common.py +18 -0
  79. datahub/ingestion/source/tableau/tableau_constant.py +3 -1
  80. datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
  81. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  82. datahub/ingestion/source/unity/proxy.py +8 -27
  83. datahub/ingestion/source/usage/usage_common.py +15 -1
  84. datahub/ingestion/source_report/ingestion_stage.py +3 -0
  85. datahub/metadata/_schema_classes.py +256 -3
  86. datahub/metadata/_urns/urn_defs.py +168 -168
  87. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
  88. datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py +2 -0
  89. datahub/metadata/schema.avsc +252 -33
  90. datahub/metadata/schemas/DataJobKey.avsc +2 -1
  91. datahub/metadata/schemas/DataProcessInstanceKey.avsc +5 -1
  92. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  93. datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
  94. datahub/metadata/schemas/MLModelGroupProperties.avsc +82 -0
  95. datahub/metadata/schemas/MLModelProperties.avsc +62 -2
  96. datahub/metadata/schemas/MLTrainingRunProperties.avsc +171 -0
  97. datahub/metadata/schemas/MetadataChangeEvent.avsc +94 -2
  98. datahub/specific/aspect_helpers/__init__.py +0 -0
  99. datahub/specific/aspect_helpers/custom_properties.py +79 -0
  100. datahub/specific/aspect_helpers/ownership.py +67 -0
  101. datahub/specific/aspect_helpers/structured_properties.py +72 -0
  102. datahub/specific/aspect_helpers/tags.py +42 -0
  103. datahub/specific/aspect_helpers/terms.py +43 -0
  104. datahub/specific/chart.py +28 -184
  105. datahub/specific/dashboard.py +31 -196
  106. datahub/specific/datajob.py +34 -189
  107. datahub/specific/dataproduct.py +24 -86
  108. datahub/specific/dataset.py +48 -133
  109. datahub/specific/form.py +12 -32
  110. datahub/specific/structured_property.py +9 -9
  111. datahub/sql_parsing/sql_parsing_aggregator.py +10 -9
  112. datahub/sql_parsing/sqlglot_lineage.py +15 -5
  113. datahub/sql_parsing/tool_meta_extractor.py +119 -5
  114. datahub/utilities/time.py +8 -3
  115. datahub/utilities/urns/_urn_base.py +5 -7
  116. datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
  117. datahub/specific/custom_properties.py +0 -37
  118. datahub/specific/ownership.py +0 -48
  119. datahub/specific/structured_properties.py +0 -53
  120. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/top_level.txt +0 -0
@@ -1,1468 +0,0 @@
1
- import logging
2
- import re
3
- from dataclasses import dataclass, field
4
- from typing import Dict, Iterable, List, Optional, Tuple
5
-
6
- import jpype
7
- import jpype.imports
8
- import requests
9
- from pydantic.fields import Field
10
- from sqlalchemy.engine.url import make_url
11
-
12
- import datahub.emitter.mce_builder as builder
13
- import datahub.metadata.schema_classes as models
14
- from datahub.configuration.common import AllowDenyPattern, ConfigModel
15
- from datahub.configuration.source_common import (
16
- DatasetLineageProviderConfigBase,
17
- PlatformInstanceConfigMixin,
18
- )
19
- from datahub.emitter.mcp import MetadataChangeProposalWrapper
20
- from datahub.ingestion.api.common import PipelineContext
21
- from datahub.ingestion.api.decorators import (
22
- SourceCapability,
23
- SupportStatus,
24
- capability,
25
- config_class,
26
- platform_name,
27
- support_status,
28
- )
29
- from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source
30
- from datahub.ingestion.api.workunit import MetadataWorkUnit
31
- from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
32
- get_platform_from_sqlalchemy_uri,
33
- )
34
- from datahub.ingestion.source.state.stale_entity_removal_handler import (
35
- StaleEntityRemovalHandler,
36
- StaleEntityRemovalSourceReport,
37
- StatefulStaleMetadataRemovalConfig,
38
- )
39
- from datahub.ingestion.source.state.stateful_ingestion_base import (
40
- StatefulIngestionConfigBase,
41
- StatefulIngestionSourceBase,
42
- )
43
-
44
- logger = logging.getLogger(__name__)
45
-
46
- KAFKA = "kafka"
47
- SOURCE = "source"
48
- SINK = "sink"
49
- CONNECTOR_CLASS = "connector.class"
50
-
51
-
52
- class ProvidedConfig(ConfigModel):
53
- provider: str
54
- path_key: str
55
- value: str
56
-
57
-
58
- class GenericConnectorConfig(ConfigModel):
59
- connector_name: str
60
- source_dataset: str
61
- source_platform: str
62
-
63
-
64
- class KafkaConnectSourceConfig(
65
- PlatformInstanceConfigMixin,
66
- DatasetLineageProviderConfigBase,
67
- StatefulIngestionConfigBase,
68
- ):
69
- # See the Connect REST Interface for details
70
- # https://docs.confluent.io/platform/current/connect/references/restapi.html#
71
- connect_uri: str = Field(
72
- default="http://localhost:8083/", description="URI to connect to."
73
- )
74
- username: Optional[str] = Field(default=None, description="Kafka Connect username.")
75
- password: Optional[str] = Field(default=None, description="Kafka Connect password.")
76
- cluster_name: Optional[str] = Field(
77
- default="connect-cluster", description="Cluster to ingest from."
78
- )
79
- # convert lineage dataset's urns to lowercase
80
- convert_lineage_urns_to_lowercase: bool = Field(
81
- default=False,
82
- description="Whether to convert the urns of ingested lineage dataset to lowercase",
83
- )
84
- connector_patterns: AllowDenyPattern = Field(
85
- default=AllowDenyPattern.allow_all(),
86
- description="regex patterns for connectors to filter for ingestion.",
87
- )
88
- provided_configs: Optional[List[ProvidedConfig]] = Field(
89
- default=None, description="Provided Configurations"
90
- )
91
- connect_to_platform_map: Optional[Dict[str, Dict[str, str]]] = Field(
92
- default=None,
93
- description='Platform instance mapping when multiple instances for a platform is available. Entry for a platform should be in either `platform_instance_map` or `connect_to_platform_map`. e.g.`connect_to_platform_map: { "postgres-connector-finance-db": "postgres": "core_finance_instance" }`',
94
- )
95
- platform_instance_map: Optional[Dict[str, str]] = Field(
96
- default=None,
97
- description='Platform instance mapping to use when constructing URNs. e.g.`platform_instance_map: { "hive": "warehouse" }`',
98
- )
99
- generic_connectors: List[GenericConnectorConfig] = Field(
100
- default=[],
101
- description="Provide lineage graph for sources connectors other than Confluent JDBC Source Connector, Debezium Source Connector, and Mongo Source Connector",
102
- )
103
-
104
- stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
105
-
106
-
107
- @dataclass
108
- class KafkaConnectSourceReport(StaleEntityRemovalSourceReport):
109
- connectors_scanned: int = 0
110
- filtered: List[str] = field(default_factory=list)
111
-
112
- def report_connector_scanned(self, connector: str) -> None:
113
- self.connectors_scanned += 1
114
-
115
- def report_dropped(self, connector: str) -> None:
116
- self.filtered.append(connector)
117
-
118
-
119
- @dataclass
120
- class KafkaConnectLineage:
121
- """Class to store Kafka Connect lineage mapping, Each instance is potential DataJob"""
122
-
123
- source_platform: str
124
- target_dataset: str
125
- target_platform: str
126
- job_property_bag: Optional[Dict[str, str]] = None
127
- source_dataset: Optional[str] = None
128
-
129
-
130
- @dataclass
131
- class ConnectorManifest:
132
- """Each instance is potential DataFlow"""
133
-
134
- name: str
135
- type: str
136
- config: Dict
137
- tasks: Dict
138
- url: Optional[str] = None
139
- flow_property_bag: Optional[Dict[str, str]] = None
140
- lineages: List[KafkaConnectLineage] = field(default_factory=list)
141
- topic_names: Iterable[str] = field(default_factory=list)
142
-
143
-
144
- def remove_prefix(text: str, prefix: str) -> str:
145
- if text.startswith(prefix):
146
- index = len(prefix)
147
- return text[index:]
148
- return text
149
-
150
-
151
- def unquote(
152
- string: str, leading_quote: str = '"', trailing_quote: Optional[str] = None
153
- ) -> str:
154
- """
155
- If string starts and ends with a quote, unquote it
156
- """
157
- trailing_quote = trailing_quote if trailing_quote else leading_quote
158
- if string.startswith(leading_quote) and string.endswith(trailing_quote):
159
- string = string[1:-1]
160
- return string
161
-
162
-
163
- def get_dataset_name(
164
- database_name: Optional[str],
165
- source_table: str,
166
- ) -> str:
167
- if database_name:
168
- dataset_name = database_name + "." + source_table
169
- else:
170
- dataset_name = source_table
171
-
172
- return dataset_name
173
-
174
-
175
- def get_platform_instance(
176
- config: KafkaConnectSourceConfig, connector_name: str, platform: str
177
- ) -> Optional[str]:
178
- instance_name = None
179
- if (
180
- config.connect_to_platform_map
181
- and config.connect_to_platform_map.get(connector_name)
182
- and config.connect_to_platform_map[connector_name].get(platform)
183
- ):
184
- instance_name = config.connect_to_platform_map[connector_name][platform]
185
- if config.platform_instance_map and config.platform_instance_map.get(platform):
186
- logger.warning(
187
- f"Same source platform {platform} configured in both platform_instance_map and connect_to_platform_map."
188
- "Will prefer connector specific platform instance from connect_to_platform_map."
189
- )
190
- elif config.platform_instance_map and config.platform_instance_map.get(platform):
191
- instance_name = config.platform_instance_map[platform]
192
- logger.info(
193
- f"Instance name assigned is: {instance_name} for Connector Name {connector_name} and platform {platform}"
194
- )
195
- return instance_name
196
-
197
-
198
- @dataclass
199
- class ConfluentJDBCSourceConnector:
200
- connector_manifest: ConnectorManifest
201
- report: KafkaConnectSourceReport
202
-
203
- def __init__(
204
- self,
205
- connector_manifest: ConnectorManifest,
206
- config: KafkaConnectSourceConfig,
207
- report: KafkaConnectSourceReport,
208
- ) -> None:
209
- self.connector_manifest = connector_manifest
210
- self.config = config
211
- self.report = report
212
- self._extract_lineages()
213
-
214
- REGEXROUTER = "org.apache.kafka.connect.transforms.RegexRouter"
215
- KNOWN_TOPICROUTING_TRANSFORMS = [REGEXROUTER]
216
- # https://kafka.apache.org/documentation/#connect_included_transformation
217
- KAFKA_NONTOPICROUTING_TRANSFORMS = [
218
- "InsertField",
219
- "InsertField$Key",
220
- "InsertField$Value",
221
- "ReplaceField",
222
- "ReplaceField$Key",
223
- "ReplaceField$Value",
224
- "MaskField",
225
- "MaskField$Key",
226
- "MaskField$Value",
227
- "ValueToKey",
228
- "ValueToKey$Key",
229
- "ValueToKey$Value",
230
- "HoistField",
231
- "HoistField$Key",
232
- "HoistField$Value",
233
- "ExtractField",
234
- "ExtractField$Key",
235
- "ExtractField$Value",
236
- "SetSchemaMetadata",
237
- "SetSchemaMetadata$Key",
238
- "SetSchemaMetadata$Value",
239
- "Flatten",
240
- "Flatten$Key",
241
- "Flatten$Value",
242
- "Cast",
243
- "Cast$Key",
244
- "Cast$Value",
245
- "HeadersFrom",
246
- "HeadersFrom$Key",
247
- "HeadersFrom$Value",
248
- "TimestampConverter",
249
- "Filter",
250
- "InsertHeader",
251
- "DropHeaders",
252
- ]
253
- # https://docs.confluent.io/platform/current/connect/transforms/overview.html
254
- CONFLUENT_NONTOPICROUTING_TRANSFORMS = [
255
- "Drop",
256
- "Drop$Key",
257
- "Drop$Value",
258
- "Filter",
259
- "Filter$Key",
260
- "Filter$Value",
261
- "TombstoneHandler",
262
- ]
263
- KNOWN_NONTOPICROUTING_TRANSFORMS = (
264
- KAFKA_NONTOPICROUTING_TRANSFORMS
265
- + [
266
- f"org.apache.kafka.connect.transforms.{t}"
267
- for t in KAFKA_NONTOPICROUTING_TRANSFORMS
268
- ]
269
- + CONFLUENT_NONTOPICROUTING_TRANSFORMS
270
- + [
271
- f"io.confluent.connect.transforms.{t}"
272
- for t in CONFLUENT_NONTOPICROUTING_TRANSFORMS
273
- ]
274
- )
275
-
276
- @dataclass
277
- class JdbcParser:
278
- db_connection_url: str
279
- source_platform: str
280
- database_name: str
281
- topic_prefix: str
282
- query: str
283
- transforms: list
284
-
285
- def get_parser(
286
- self,
287
- connector_manifest: ConnectorManifest,
288
- ) -> JdbcParser:
289
- url = remove_prefix(
290
- str(connector_manifest.config.get("connection.url")), "jdbc:"
291
- )
292
- url_instance = make_url(url)
293
- source_platform = get_platform_from_sqlalchemy_uri(str(url_instance))
294
- database_name = url_instance.database
295
- assert database_name
296
- db_connection_url = f"{url_instance.drivername}://{url_instance.host}:{url_instance.port}/{database_name}"
297
-
298
- topic_prefix = self.connector_manifest.config.get("topic.prefix", None)
299
-
300
- query = self.connector_manifest.config.get("query", None)
301
-
302
- transform_names = (
303
- self.connector_manifest.config.get("transforms", "").split(",")
304
- if self.connector_manifest.config.get("transforms")
305
- else []
306
- )
307
-
308
- transforms = []
309
- for name in transform_names:
310
- transform = {"name": name}
311
- transforms.append(transform)
312
- for key in self.connector_manifest.config.keys():
313
- if key.startswith(f"transforms.{name}."):
314
- transform[
315
- key.replace(f"transforms.{name}.", "")
316
- ] = self.connector_manifest.config[key]
317
-
318
- return self.JdbcParser(
319
- db_connection_url,
320
- source_platform,
321
- database_name,
322
- topic_prefix,
323
- query,
324
- transforms,
325
- )
326
-
327
- def default_get_lineages(
328
- self,
329
- topic_prefix: str,
330
- database_name: str,
331
- source_platform: str,
332
- topic_names: Optional[Iterable[str]] = None,
333
- include_source_dataset: bool = True,
334
- ) -> List[KafkaConnectLineage]:
335
- lineages: List[KafkaConnectLineage] = []
336
- if not topic_names:
337
- topic_names = self.connector_manifest.topic_names
338
- table_name_tuples: List[Tuple] = self.get_table_names()
339
- for topic in topic_names:
340
- # All good for NO_TRANSFORM or (SINGLE_TRANSFORM and KNOWN_NONTOPICROUTING_TRANSFORM) or (not SINGLE_TRANSFORM and all(KNOWN_NONTOPICROUTING_TRANSFORM))
341
- source_table: str = (
342
- remove_prefix(topic, topic_prefix) if topic_prefix else topic
343
- )
344
- # include schema name for three-level hierarchies
345
- if has_three_level_hierarchy(source_platform):
346
- table_name_tuple: Tuple = next(
347
- iter([t for t in table_name_tuples if t and t[-1] == source_table]),
348
- (),
349
- )
350
- if len(table_name_tuple) > 1:
351
- source_table = f"{table_name_tuple[-2]}.{source_table}"
352
- else:
353
- include_source_dataset = False
354
- self.report.warning(
355
- "Could not find schema for table"
356
- f"{self.connector_manifest.name} : {source_table}",
357
- )
358
- dataset_name: str = get_dataset_name(database_name, source_table)
359
- lineage = KafkaConnectLineage(
360
- source_dataset=dataset_name if include_source_dataset else None,
361
- source_platform=source_platform,
362
- target_dataset=topic,
363
- target_platform=KAFKA,
364
- )
365
- lineages.append(lineage)
366
- return lineages
367
-
368
- def get_table_names(self) -> List[Tuple]:
369
- sep: str = "."
370
- leading_quote_char: str = '"'
371
- trailing_quote_char: str = leading_quote_char
372
-
373
- table_ids: List[str] = []
374
- if self.connector_manifest.tasks:
375
- table_ids = (
376
- ",".join(
377
- [
378
- task["config"].get("tables")
379
- for task in self.connector_manifest.tasks
380
- ]
381
- )
382
- ).split(",")
383
- quote_method = self.connector_manifest.config.get(
384
- "quote.sql.identifiers", "always"
385
- )
386
- if (
387
- quote_method == "always"
388
- and table_ids
389
- and table_ids[0]
390
- and table_ids[-1]
391
- ):
392
- leading_quote_char = table_ids[0][0]
393
- trailing_quote_char = table_ids[-1][-1]
394
- # This will only work for single character quotes
395
- elif self.connector_manifest.config.get("table.whitelist"):
396
- table_ids = self.connector_manifest.config.get("table.whitelist").split(",") # type: ignore
397
-
398
- # List of Tuple containing (schema, table)
399
- tables: List[Tuple] = [
400
- (
401
- (
402
- unquote(
403
- table_id.split(sep)[-2], leading_quote_char, trailing_quote_char
404
- )
405
- if len(table_id.split(sep)) > 1
406
- else ""
407
- ),
408
- unquote(
409
- table_id.split(sep)[-1], leading_quote_char, trailing_quote_char
410
- ),
411
- )
412
- for table_id in table_ids
413
- ]
414
- return tables
415
-
416
- def _extract_lineages(self):
417
- lineages: List[KafkaConnectLineage] = list()
418
- parser = self.get_parser(self.connector_manifest)
419
- source_platform = parser.source_platform
420
- database_name = parser.database_name
421
- query = parser.query
422
- topic_prefix = parser.topic_prefix
423
- transforms = parser.transforms
424
- self.connector_manifest.flow_property_bag = self.connector_manifest.config
425
-
426
- # Mask/Remove properties that may reveal credentials
427
- self.connector_manifest.flow_property_bag[
428
- "connection.url"
429
- ] = parser.db_connection_url
430
- if "connection.password" in self.connector_manifest.flow_property_bag:
431
- del self.connector_manifest.flow_property_bag["connection.password"]
432
- if "connection.user" in self.connector_manifest.flow_property_bag:
433
- del self.connector_manifest.flow_property_bag["connection.user"]
434
-
435
- logging.debug(
436
- f"Extracting source platform: {source_platform} and database name: {database_name} from connection url "
437
- )
438
-
439
- if not self.connector_manifest.topic_names:
440
- self.connector_manifest.lineages = lineages
441
- return
442
-
443
- if query:
444
- # Lineage source_table can be extracted by parsing query
445
- for topic in self.connector_manifest.topic_names:
446
- # default method - as per earlier implementation
447
- dataset_name: str = get_dataset_name(database_name, topic)
448
-
449
- lineage = KafkaConnectLineage(
450
- source_dataset=None,
451
- source_platform=source_platform,
452
- target_dataset=topic,
453
- target_platform=KAFKA,
454
- )
455
- lineages.append(lineage)
456
- self.report.warning(
457
- "Could not find input dataset, the connector has query configuration set",
458
- self.connector_manifest.name,
459
- )
460
- self.connector_manifest.lineages = lineages
461
- return
462
-
463
- SINGLE_TRANSFORM = len(transforms) == 1
464
- NO_TRANSFORM = len(transforms) == 0
465
- UNKNOWN_TRANSFORM = any(
466
- [
467
- transform["type"]
468
- not in self.KNOWN_TOPICROUTING_TRANSFORMS
469
- + self.KNOWN_NONTOPICROUTING_TRANSFORMS
470
- for transform in transforms
471
- ]
472
- )
473
- ALL_TRANSFORMS_NON_TOPICROUTING = all(
474
- [
475
- transform["type"] in self.KNOWN_NONTOPICROUTING_TRANSFORMS
476
- for transform in transforms
477
- ]
478
- )
479
-
480
- if NO_TRANSFORM or ALL_TRANSFORMS_NON_TOPICROUTING:
481
- self.connector_manifest.lineages = self.default_get_lineages(
482
- database_name=database_name,
483
- source_platform=source_platform,
484
- topic_prefix=topic_prefix,
485
- )
486
- return
487
-
488
- if SINGLE_TRANSFORM and transforms[0]["type"] == self.REGEXROUTER:
489
- tables = self.get_table_names()
490
- topic_names = list(self.connector_manifest.topic_names)
491
-
492
- from java.util.regex import Pattern
493
-
494
- for table in tables:
495
- source_table: str = table[-1]
496
- topic = topic_prefix + source_table if topic_prefix else source_table
497
-
498
- transform_regex = Pattern.compile(transforms[0]["regex"])
499
- transform_replacement = transforms[0]["replacement"]
500
-
501
- matcher = transform_regex.matcher(topic)
502
- if matcher.matches():
503
- topic = str(matcher.replaceFirst(transform_replacement))
504
-
505
- # Additional check to confirm that the topic present
506
- # in connector topics
507
-
508
- if topic in self.connector_manifest.topic_names:
509
- # include schema name for three-level hierarchies
510
- if has_three_level_hierarchy(source_platform) and len(table) > 1:
511
- source_table = f"{table[-2]}.{table[-1]}"
512
-
513
- dataset_name = get_dataset_name(database_name, source_table)
514
-
515
- lineage = KafkaConnectLineage(
516
- source_dataset=dataset_name,
517
- source_platform=source_platform,
518
- target_dataset=topic,
519
- target_platform=KAFKA,
520
- )
521
- topic_names.remove(topic)
522
- lineages.append(lineage)
523
-
524
- if topic_names:
525
- lineages.extend(
526
- self.default_get_lineages(
527
- database_name=database_name,
528
- source_platform=source_platform,
529
- topic_prefix=topic_prefix,
530
- topic_names=topic_names,
531
- include_source_dataset=False,
532
- )
533
- )
534
- self.report.warning(
535
- "Could not find input dataset for connector topics",
536
- f"{self.connector_manifest.name} : {topic_names}",
537
- )
538
- self.connector_manifest.lineages = lineages
539
- return
540
- else:
541
- include_source_dataset = True
542
- if SINGLE_TRANSFORM and UNKNOWN_TRANSFORM:
543
- self.report.warning(
544
- "Could not find input dataset, connector has unknown transform",
545
- f"{self.connector_manifest.name} : {transforms[0]['type']}",
546
- )
547
- include_source_dataset = False
548
- if not SINGLE_TRANSFORM and UNKNOWN_TRANSFORM:
549
- self.report.warning(
550
- "Could not find input dataset, connector has one or more unknown transforms",
551
- self.connector_manifest.name,
552
- )
553
- include_source_dataset = False
554
- lineages = self.default_get_lineages(
555
- database_name=database_name,
556
- source_platform=source_platform,
557
- topic_prefix=topic_prefix,
558
- include_source_dataset=include_source_dataset,
559
- )
560
- self.connector_manifest.lineages = lineages
561
- return
562
-
563
-
564
- @dataclass
565
- class MongoSourceConnector:
566
- # https://www.mongodb.com/docs/kafka-connector/current/source-connector/
567
-
568
- connector_manifest: ConnectorManifest
569
-
570
- def __init__(
571
- self, connector_manifest: ConnectorManifest, config: KafkaConnectSourceConfig
572
- ) -> None:
573
- self.connector_manifest = connector_manifest
574
- self.config = config
575
- self._extract_lineages()
576
-
577
- @dataclass
578
- class MongoSourceParser:
579
- db_connection_url: Optional[str]
580
- source_platform: str
581
- database_name: Optional[str]
582
- topic_prefix: Optional[str]
583
- transforms: List[str]
584
-
585
- def get_parser(
586
- self,
587
- connector_manifest: ConnectorManifest,
588
- ) -> MongoSourceParser:
589
- parser = self.MongoSourceParser(
590
- db_connection_url=connector_manifest.config.get("connection.uri"),
591
- source_platform="mongodb",
592
- database_name=connector_manifest.config.get("database"),
593
- topic_prefix=connector_manifest.config.get("topic_prefix"),
594
- transforms=(
595
- connector_manifest.config["transforms"].split(",")
596
- if "transforms" in connector_manifest.config
597
- else []
598
- ),
599
- )
600
-
601
- return parser
602
-
603
- def _extract_lineages(self):
604
- lineages: List[KafkaConnectLineage] = list()
605
- parser = self.get_parser(self.connector_manifest)
606
- source_platform = parser.source_platform
607
- topic_naming_pattern = r"mongodb\.(\w+)\.(\w+)"
608
-
609
- if not self.connector_manifest.topic_names:
610
- return lineages
611
-
612
- for topic in self.connector_manifest.topic_names:
613
- found = re.search(re.compile(topic_naming_pattern), topic)
614
-
615
- if found:
616
- table_name = get_dataset_name(found.group(1), found.group(2))
617
-
618
- lineage = KafkaConnectLineage(
619
- source_dataset=table_name,
620
- source_platform=source_platform,
621
- target_dataset=topic,
622
- target_platform=KAFKA,
623
- )
624
- lineages.append(lineage)
625
- self.connector_manifest.lineages = lineages
626
-
627
-
628
- @dataclass
629
- class DebeziumSourceConnector:
630
- connector_manifest: ConnectorManifest
631
- report: KafkaConnectSourceReport
632
-
633
- def __init__(
634
- self,
635
- connector_manifest: ConnectorManifest,
636
- config: KafkaConnectSourceConfig,
637
- report: KafkaConnectSourceReport,
638
- ) -> None:
639
- self.connector_manifest = connector_manifest
640
- self.config = config
641
- self.report = report
642
- self._extract_lineages()
643
-
644
- @dataclass
645
- class DebeziumParser:
646
- source_platform: str
647
- server_name: Optional[str]
648
- database_name: Optional[str]
649
-
650
- def get_server_name(self, connector_manifest: ConnectorManifest) -> str:
651
- if "topic.prefix" in connector_manifest.config:
652
- return connector_manifest.config["topic.prefix"]
653
- else:
654
- return connector_manifest.config.get("database.server.name", "")
655
-
656
- def get_parser(
657
- self,
658
- connector_manifest: ConnectorManifest,
659
- ) -> DebeziumParser:
660
- connector_class = connector_manifest.config.get(CONNECTOR_CLASS, "")
661
-
662
- if connector_class == "io.debezium.connector.mysql.MySqlConnector":
663
- parser = self.DebeziumParser(
664
- source_platform="mysql",
665
- server_name=self.get_server_name(connector_manifest),
666
- database_name=None,
667
- )
668
- elif connector_class == "MySqlConnector":
669
- parser = self.DebeziumParser(
670
- source_platform="mysql",
671
- server_name=self.get_server_name(connector_manifest),
672
- database_name=None,
673
- )
674
- elif connector_class == "io.debezium.connector.mongodb.MongoDbConnector":
675
- parser = self.DebeziumParser(
676
- source_platform="mongodb",
677
- server_name=self.get_server_name(connector_manifest),
678
- database_name=None,
679
- )
680
- elif connector_class == "io.debezium.connector.postgresql.PostgresConnector":
681
- parser = self.DebeziumParser(
682
- source_platform="postgres",
683
- server_name=self.get_server_name(connector_manifest),
684
- database_name=connector_manifest.config.get("database.dbname"),
685
- )
686
- elif connector_class == "io.debezium.connector.oracle.OracleConnector":
687
- parser = self.DebeziumParser(
688
- source_platform="oracle",
689
- server_name=self.get_server_name(connector_manifest),
690
- database_name=connector_manifest.config.get("database.dbname"),
691
- )
692
- elif connector_class == "io.debezium.connector.sqlserver.SqlServerConnector":
693
- database_name = connector_manifest.config.get(
694
- "database.names"
695
- ) or connector_manifest.config.get("database.dbname")
696
-
697
- if "," in str(database_name):
698
- raise Exception(
699
- f"Only one database is supported for Debezium's SQL Server connector. Found: {database_name}"
700
- )
701
-
702
- parser = self.DebeziumParser(
703
- source_platform="mssql",
704
- server_name=self.get_server_name(connector_manifest),
705
- database_name=database_name,
706
- )
707
- elif connector_class == "io.debezium.connector.db2.Db2Connector":
708
- parser = self.DebeziumParser(
709
- source_platform="db2",
710
- server_name=self.get_server_name(connector_manifest),
711
- database_name=connector_manifest.config.get("database.dbname"),
712
- )
713
- elif connector_class == "io.debezium.connector.vitess.VitessConnector":
714
- parser = self.DebeziumParser(
715
- source_platform="vitess",
716
- server_name=self.get_server_name(connector_manifest),
717
- database_name=connector_manifest.config.get("vitess.keyspace"),
718
- )
719
- else:
720
- raise ValueError(f"Connector class '{connector_class}' is unknown.")
721
-
722
- return parser
723
-
724
- def _extract_lineages(self):
725
- lineages: List[KafkaConnectLineage] = list()
726
-
727
- try:
728
- parser = self.get_parser(self.connector_manifest)
729
- source_platform = parser.source_platform
730
- server_name = parser.server_name
731
- database_name = parser.database_name
732
- topic_naming_pattern = rf"({server_name})\.(\w+\.\w+)"
733
-
734
- if not self.connector_manifest.topic_names:
735
- return lineages
736
-
737
- for topic in self.connector_manifest.topic_names:
738
- found = re.search(re.compile(topic_naming_pattern), topic)
739
-
740
- if found:
741
- table_name = get_dataset_name(database_name, found.group(2))
742
-
743
- lineage = KafkaConnectLineage(
744
- source_dataset=table_name,
745
- source_platform=source_platform,
746
- target_dataset=topic,
747
- target_platform=KAFKA,
748
- )
749
- lineages.append(lineage)
750
- self.connector_manifest.lineages = lineages
751
- except Exception as e:
752
- self.report.warning(
753
- "Error resolving lineage for connector",
754
- self.connector_manifest.name,
755
- exc=e,
756
- )
757
-
758
- return
759
-
760
-
761
- @dataclass
762
- class BigQuerySinkConnector:
763
- connector_manifest: ConnectorManifest
764
- report: KafkaConnectSourceReport
765
-
766
- def __init__(
767
- self, connector_manifest: ConnectorManifest, report: KafkaConnectSourceReport
768
- ) -> None:
769
- self.connector_manifest = connector_manifest
770
- self.report = report
771
- self._extract_lineages()
772
-
773
- @dataclass
774
- class BQParser:
775
- project: str
776
- target_platform: str
777
- sanitizeTopics: str
778
- transforms: list
779
- topicsToTables: Optional[str] = None
780
- datasets: Optional[str] = None
781
- defaultDataset: Optional[str] = None
782
- version: str = "v1"
783
-
784
- def get_parser(
785
- self,
786
- connector_manifest: ConnectorManifest,
787
- ) -> BQParser:
788
- project = connector_manifest.config["project"]
789
- sanitizeTopics = connector_manifest.config.get("sanitizeTopics", "false")
790
- transform_names = (
791
- self.connector_manifest.config.get("transforms", "").split(",")
792
- if self.connector_manifest.config.get("transforms")
793
- else []
794
- )
795
- transforms = []
796
- for name in transform_names:
797
- transform = {"name": name}
798
- transforms.append(transform)
799
- for key in self.connector_manifest.config.keys():
800
- if key.startswith(f"transforms.{name}."):
801
- transform[
802
- key.replace(f"transforms.{name}.", "")
803
- ] = self.connector_manifest.config[key]
804
-
805
- if "defaultDataset" in connector_manifest.config:
806
- defaultDataset = connector_manifest.config["defaultDataset"]
807
- return self.BQParser(
808
- project=project,
809
- defaultDataset=defaultDataset,
810
- target_platform="bigquery",
811
- sanitizeTopics=sanitizeTopics.lower() == "true",
812
- version="v2",
813
- transforms=transforms,
814
- )
815
- else:
816
- # version 1.6.x and similar configs supported
817
- datasets = connector_manifest.config["datasets"]
818
- topicsToTables = connector_manifest.config.get("topicsToTables")
819
-
820
- return self.BQParser(
821
- project=project,
822
- topicsToTables=topicsToTables,
823
- datasets=datasets,
824
- target_platform="bigquery",
825
- sanitizeTopics=sanitizeTopics.lower() == "true",
826
- transforms=transforms,
827
- )
828
-
829
- def get_list(self, property: str) -> Iterable[Tuple[str, str]]:
830
- entries = property.split(",")
831
- for entry in entries:
832
- key, val = entry.rsplit("=")
833
- yield (key.strip(), val.strip())
834
-
835
- def get_dataset_for_topic_v1(self, topic: str, parser: BQParser) -> Optional[str]:
836
- topicregex_dataset_map: Dict[str, str] = dict(self.get_list(parser.datasets)) # type: ignore
837
- from java.util.regex import Pattern
838
-
839
- for pattern, dataset in topicregex_dataset_map.items():
840
- patternMatcher = Pattern.compile(pattern).matcher(topic)
841
- if patternMatcher.matches():
842
- return dataset
843
- return None
844
-
845
- def sanitize_table_name(self, table_name):
846
- table_name = re.sub("[^a-zA-Z0-9_]", "_", table_name)
847
- if re.match("^[^a-zA-Z_].*", table_name):
848
- table_name = "_" + table_name
849
-
850
- return table_name
851
-
852
- def get_dataset_table_for_topic(
853
- self, topic: str, parser: BQParser
854
- ) -> Optional[str]:
855
- if parser.version == "v2":
856
- dataset = parser.defaultDataset
857
- parts = topic.split(":")
858
- if len(parts) == 2:
859
- dataset = parts[0]
860
- table = parts[1]
861
- else:
862
- table = parts[0]
863
- else:
864
- dataset = self.get_dataset_for_topic_v1(topic, parser)
865
- if dataset is None:
866
- return None
867
-
868
- table = topic
869
- if parser.topicsToTables:
870
- topicregex_table_map: Dict[str, str] = dict(
871
- self.get_list(parser.topicsToTables) # type: ignore
872
- )
873
- from java.util.regex import Pattern
874
-
875
- for pattern, tbl in topicregex_table_map.items():
876
- patternMatcher = Pattern.compile(pattern).matcher(topic)
877
- if patternMatcher.matches():
878
- table = tbl
879
- break
880
-
881
- if parser.sanitizeTopics:
882
- table = self.sanitize_table_name(table)
883
- return f"{dataset}.{table}"
884
-
885
- def apply_transformations(
886
- self, topic: str, transforms: List[Dict[str, str]]
887
- ) -> str:
888
- for transform in transforms:
889
- if transform["type"] == "org.apache.kafka.connect.transforms.RegexRouter":
890
- regex = transform["regex"]
891
- replacement = transform["replacement"]
892
- pattern = re.compile(regex)
893
- if pattern.match(topic):
894
- topic = pattern.sub(replacement, topic, count=1)
895
- return topic
896
-
897
- def _extract_lineages(self):
898
- lineages: List[KafkaConnectLineage] = list()
899
- parser = self.get_parser(self.connector_manifest)
900
- if not parser:
901
- return lineages
902
- target_platform = parser.target_platform
903
- project = parser.project
904
- transforms = parser.transforms
905
- self.connector_manifest.flow_property_bag = self.connector_manifest.config
906
- # Mask/Remove properties that may reveal credentials
907
- if "keyfile" in self.connector_manifest.flow_property_bag:
908
- del self.connector_manifest.flow_property_bag["keyfile"]
909
-
910
- for topic in self.connector_manifest.topic_names:
911
- transformed_topic = self.apply_transformations(topic, transforms)
912
- dataset_table = self.get_dataset_table_for_topic(transformed_topic, parser)
913
- if dataset_table is None:
914
- self.report.warning(
915
- "Could not find target dataset for topic, please check your connector configuration"
916
- f"{self.connector_manifest.name} : {transformed_topic} ",
917
- )
918
- continue
919
- target_dataset = f"{project}.{dataset_table}"
920
-
921
- lineages.append(
922
- KafkaConnectLineage(
923
- source_dataset=transformed_topic,
924
- source_platform=KAFKA,
925
- target_dataset=target_dataset,
926
- target_platform=target_platform,
927
- )
928
- )
929
- self.connector_manifest.lineages = lineages
930
- return
931
-
932
-
933
- @dataclass
934
- class SnowflakeSinkConnector:
935
- connector_manifest: ConnectorManifest
936
- report: KafkaConnectSourceReport
937
-
938
- def __init__(
939
- self, connector_manifest: ConnectorManifest, report: KafkaConnectSourceReport
940
- ) -> None:
941
- self.connector_manifest = connector_manifest
942
- self.report = report
943
- self._extract_lineages()
944
-
945
- @dataclass
946
- class SnowflakeParser:
947
- database_name: str
948
- schema_name: str
949
- topics_to_tables: Dict[str, str]
950
-
951
- def get_table_name_from_topic_name(self, topic_name: str) -> str:
952
- """
953
- This function converts the topic name to a valid Snowflake table name using some rules.
954
- Refer below link for more info
955
- https://docs.snowflake.com/en/user-guide/kafka-connector-overview#target-tables-for-kafka-topics
956
- """
957
- table_name = re.sub("[^a-zA-Z0-9_]", "_", topic_name)
958
- if re.match("^[^a-zA-Z_].*", table_name):
959
- table_name = "_" + table_name
960
- # Connector may append original topic's hash code as suffix for conflict resolution
961
- # if generated table names for 2 topics are similar. This corner case is not handled here.
962
- # Note that Snowflake recommends to choose topic names that follow the rules for
963
- # Snowflake identifier names so this case is not recommended by snowflake.
964
- return table_name
965
-
966
- def get_parser(
967
- self,
968
- connector_manifest: ConnectorManifest,
969
- ) -> SnowflakeParser:
970
- database_name = connector_manifest.config["snowflake.database.name"]
971
- schema_name = connector_manifest.config["snowflake.schema.name"]
972
-
973
- # Fetch user provided topic to table map
974
- provided_topics_to_tables: Dict[str, str] = {}
975
- if connector_manifest.config.get("snowflake.topic2table.map"):
976
- for each in connector_manifest.config["snowflake.topic2table.map"].split(
977
- ","
978
- ):
979
- topic, table = each.split(":")
980
- provided_topics_to_tables[topic.strip()] = table.strip()
981
-
982
- topics_to_tables: Dict[str, str] = {}
983
- # Extract lineage for only those topics whose data ingestion started
984
- for topic in connector_manifest.topic_names:
985
- if topic in provided_topics_to_tables:
986
- # If user provided which table to get mapped with this topic
987
- topics_to_tables[topic] = provided_topics_to_tables[topic]
988
- else:
989
- # Else connector converts topic name to a valid Snowflake table name.
990
- topics_to_tables[topic] = self.get_table_name_from_topic_name(topic)
991
-
992
- return self.SnowflakeParser(
993
- database_name=database_name,
994
- schema_name=schema_name,
995
- topics_to_tables=topics_to_tables,
996
- )
997
-
998
- def _extract_lineages(self):
999
- self.connector_manifest.flow_property_bag = self.connector_manifest.config
1000
-
1001
- # For all snowflake sink connector properties, refer below link
1002
- # https://docs.snowflake.com/en/user-guide/kafka-connector-install#configuring-the-kafka-connector
1003
- # remove private keys, secrets from properties
1004
- secret_properties = [
1005
- "snowflake.private.key",
1006
- "snowflake.private.key.passphrase",
1007
- "value.converter.basic.auth.user.info",
1008
- ]
1009
- for k in secret_properties:
1010
- if k in self.connector_manifest.flow_property_bag:
1011
- del self.connector_manifest.flow_property_bag[k]
1012
-
1013
- lineages: List[KafkaConnectLineage] = list()
1014
- parser = self.get_parser(self.connector_manifest)
1015
-
1016
- for topic, table in parser.topics_to_tables.items():
1017
- target_dataset = f"{parser.database_name}.{parser.schema_name}.{table}"
1018
- lineages.append(
1019
- KafkaConnectLineage(
1020
- source_dataset=topic,
1021
- source_platform=KAFKA,
1022
- target_dataset=target_dataset,
1023
- target_platform="snowflake",
1024
- )
1025
- )
1026
-
1027
- self.connector_manifest.lineages = lineages
1028
- return
1029
-
1030
-
1031
- @dataclass
1032
- class ConfluentS3SinkConnector:
1033
- connector_manifest: ConnectorManifest
1034
-
1035
- def __init__(
1036
- self, connector_manifest: ConnectorManifest, report: KafkaConnectSourceReport
1037
- ) -> None:
1038
- self.connector_manifest = connector_manifest
1039
- self.report = report
1040
- self._extract_lineages()
1041
-
1042
- @dataclass
1043
- class S3SinkParser:
1044
- target_platform: str
1045
- bucket: str
1046
- topics_dir: str
1047
- topics: Iterable[str]
1048
-
1049
- def _get_parser(self, connector_manifest: ConnectorManifest) -> S3SinkParser:
1050
- # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#s3
1051
- bucket = connector_manifest.config.get("s3.bucket.name")
1052
- if not bucket:
1053
- raise ValueError(
1054
- "Could not find 's3.bucket.name' in connector configuration"
1055
- )
1056
-
1057
- # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#storage
1058
- topics_dir = connector_manifest.config.get("topics.dir", "topics")
1059
-
1060
- return self.S3SinkParser(
1061
- target_platform="s3",
1062
- bucket=bucket,
1063
- topics_dir=topics_dir,
1064
- topics=connector_manifest.topic_names,
1065
- )
1066
-
1067
- def _extract_lineages(self):
1068
- self.connector_manifest.flow_property_bag = self.connector_manifest.config
1069
-
1070
- # remove keys, secrets from properties
1071
- secret_properties = [
1072
- "aws.access.key.id",
1073
- "aws.secret.access.key",
1074
- "s3.sse.customer.key",
1075
- "s3.proxy.password",
1076
- ]
1077
- for k in secret_properties:
1078
- if k in self.connector_manifest.flow_property_bag:
1079
- del self.connector_manifest.flow_property_bag[k]
1080
-
1081
- try:
1082
- parser = self._get_parser(self.connector_manifest)
1083
-
1084
- lineages: List[KafkaConnectLineage] = list()
1085
- for topic in parser.topics:
1086
- target_dataset = f"{parser.bucket}/{parser.topics_dir}/{topic}"
1087
-
1088
- lineages.append(
1089
- KafkaConnectLineage(
1090
- source_dataset=topic,
1091
- source_platform="kafka",
1092
- target_dataset=target_dataset,
1093
- target_platform=parser.target_platform,
1094
- )
1095
- )
1096
- self.connector_manifest.lineages = lineages
1097
- except Exception as e:
1098
- self.report.warning(
1099
- "Error resolving lineage for connector",
1100
- self.connector_manifest.name,
1101
- exc=e,
1102
- )
1103
-
1104
- return
1105
-
1106
-
1107
- def transform_connector_config(
1108
- connector_config: Dict, provided_configs: List[ProvidedConfig]
1109
- ) -> None:
1110
- """This method will update provided configs in connector config values, if any"""
1111
- lookupsByProvider = {}
1112
- for pconfig in provided_configs:
1113
- lookupsByProvider[f"${{{pconfig.provider}:{pconfig.path_key}}}"] = pconfig.value
1114
- for k, v in connector_config.items():
1115
- for key, value in lookupsByProvider.items():
1116
- if key in v:
1117
- connector_config[k] = connector_config[k].replace(key, value)
1118
-
1119
-
1120
- @platform_name("Kafka Connect")
1121
- @config_class(KafkaConnectSourceConfig)
1122
- @support_status(SupportStatus.CERTIFIED)
1123
- @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
1124
- @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
1125
- @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
1126
- class KafkaConnectSource(StatefulIngestionSourceBase):
1127
- config: KafkaConnectSourceConfig
1128
- report: KafkaConnectSourceReport
1129
- platform: str = "kafka-connect"
1130
-
1131
- def __init__(self, config: KafkaConnectSourceConfig, ctx: PipelineContext):
1132
- super().__init__(config, ctx)
1133
- self.config = config
1134
- self.report = KafkaConnectSourceReport()
1135
- self.session = requests.Session()
1136
- self.session.headers.update(
1137
- {
1138
- "Accept": "application/json",
1139
- "Content-Type": "application/json",
1140
- }
1141
- )
1142
-
1143
- # Test the connection
1144
- if self.config.username is not None and self.config.password is not None:
1145
- logger.info(
1146
- f"Connecting to {self.config.connect_uri} with Authentication..."
1147
- )
1148
- self.session.auth = (self.config.username, self.config.password)
1149
-
1150
- test_response = self.session.get(f"{self.config.connect_uri}/connectors")
1151
- test_response.raise_for_status()
1152
- logger.info(f"Connection to {self.config.connect_uri} is ok")
1153
- if not jpype.isJVMStarted():
1154
- jpype.startJVM()
1155
-
1156
- @classmethod
1157
- def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
1158
- config = KafkaConnectSourceConfig.parse_obj(config_dict)
1159
- return cls(config, ctx)
1160
-
1161
- def get_connectors_manifest(self) -> List[ConnectorManifest]:
1162
- """Get Kafka Connect connectors manifest using REST API.
1163
- Enrich with lineages metadata.
1164
- """
1165
- connectors_manifest = list()
1166
-
1167
- connector_response = self.session.get(
1168
- f"{self.config.connect_uri}/connectors",
1169
- )
1170
-
1171
- payload = connector_response.json()
1172
-
1173
- for connector_name in payload:
1174
- connector_url = f"{self.config.connect_uri}/connectors/{connector_name}"
1175
- connector_manifest = self._get_connector_manifest(
1176
- connector_name, connector_url
1177
- )
1178
- if (
1179
- connector_manifest is None
1180
- or not self.config.connector_patterns.allowed(connector_manifest.name)
1181
- ):
1182
- self.report.report_dropped(connector_name)
1183
- continue
1184
-
1185
- if self.config.provided_configs:
1186
- transform_connector_config(
1187
- connector_manifest.config, self.config.provided_configs
1188
- )
1189
- # Initialize connector lineages
1190
- connector_manifest.lineages = list()
1191
- connector_manifest.url = connector_url
1192
-
1193
- connector_manifest.topic_names = self._get_connector_topics(connector_name)
1194
-
1195
- # Populate Source Connector metadata
1196
- if connector_manifest.type == SOURCE:
1197
- connector_manifest.tasks = self._get_connector_tasks(connector_name)
1198
-
1199
- # JDBC source connector lineages
1200
- if connector_manifest.config.get(CONNECTOR_CLASS).__eq__(
1201
- "io.confluent.connect.jdbc.JdbcSourceConnector"
1202
- ):
1203
- connector_manifest = ConfluentJDBCSourceConnector(
1204
- connector_manifest=connector_manifest,
1205
- config=self.config,
1206
- report=self.report,
1207
- ).connector_manifest
1208
- elif connector_manifest.config.get(CONNECTOR_CLASS, "").startswith(
1209
- "io.debezium.connector"
1210
- ):
1211
- connector_manifest = DebeziumSourceConnector(
1212
- connector_manifest=connector_manifest,
1213
- config=self.config,
1214
- report=self.report,
1215
- ).connector_manifest
1216
- elif (
1217
- connector_manifest.config.get(CONNECTOR_CLASS, "")
1218
- == "com.mongodb.kafka.connect.MongoSourceConnector"
1219
- ):
1220
- connector_manifest = MongoSourceConnector(
1221
- connector_manifest=connector_manifest, config=self.config
1222
- ).connector_manifest
1223
- else:
1224
- # Find the target connector object in the list, or log an error if unknown.
1225
- target_connector = None
1226
- for connector in self.config.generic_connectors:
1227
- if connector.connector_name == connector_manifest.name:
1228
- target_connector = connector
1229
- break
1230
- if not target_connector:
1231
- logger.warning(
1232
- f"Detected undefined connector {connector_manifest.name}, which is not in the customized connector list. Please refer to Kafka Connect ingestion recipe to define this customized connector."
1233
- )
1234
- continue
1235
-
1236
- for topic in connector_manifest.topic_names:
1237
- lineage = KafkaConnectLineage(
1238
- source_dataset=target_connector.source_dataset,
1239
- source_platform=target_connector.source_platform,
1240
- target_dataset=topic,
1241
- target_platform=KAFKA,
1242
- )
1243
-
1244
- connector_manifest.lineages.append(lineage)
1245
-
1246
- if connector_manifest.type == SINK:
1247
- if connector_manifest.config.get(CONNECTOR_CLASS).__eq__(
1248
- "com.wepay.kafka.connect.bigquery.BigQuerySinkConnector"
1249
- ):
1250
- connector_manifest = BigQuerySinkConnector(
1251
- connector_manifest=connector_manifest, report=self.report
1252
- ).connector_manifest
1253
- elif connector_manifest.config.get("connector.class").__eq__(
1254
- "io.confluent.connect.s3.S3SinkConnector"
1255
- ):
1256
- connector_manifest = ConfluentS3SinkConnector(
1257
- connector_manifest=connector_manifest, report=self.report
1258
- ).connector_manifest
1259
- elif connector_manifest.config.get("connector.class").__eq__(
1260
- "com.snowflake.kafka.connector.SnowflakeSinkConnector"
1261
- ):
1262
- connector_manifest = SnowflakeSinkConnector(
1263
- connector_manifest=connector_manifest, report=self.report
1264
- ).connector_manifest
1265
- else:
1266
- self.report.report_dropped(connector_manifest.name)
1267
- logger.warning(
1268
- f"Skipping connector {connector_manifest.name}. Lineage for Connector not yet implemented"
1269
- )
1270
- pass
1271
-
1272
- connectors_manifest.append(connector_manifest)
1273
-
1274
- return connectors_manifest
1275
-
1276
- def _get_connector_manifest(
1277
- self, connector_name: str, connector_url: str
1278
- ) -> Optional[ConnectorManifest]:
1279
- try:
1280
- connector_response = self.session.get(connector_url)
1281
- connector_response.raise_for_status()
1282
- except Exception as e:
1283
- self.report.warning(
1284
- "Failed to get connector details", connector_name, exc=e
1285
- )
1286
- return None
1287
- manifest = connector_response.json()
1288
- connector_manifest = ConnectorManifest(**manifest)
1289
- return connector_manifest
1290
-
1291
- def _get_connector_tasks(self, connector_name: str) -> dict:
1292
- try:
1293
- response = self.session.get(
1294
- f"{self.config.connect_uri}/connectors/{connector_name}/tasks",
1295
- )
1296
- response.raise_for_status()
1297
- except Exception as e:
1298
- self.report.warning(
1299
- "Error getting connector tasks", context=connector_name, exc=e
1300
- )
1301
- return {}
1302
-
1303
- return response.json()
1304
-
1305
- def _get_connector_topics(self, connector_name: str) -> List[str]:
1306
- try:
1307
- response = self.session.get(
1308
- f"{self.config.connect_uri}/connectors/{connector_name}/topics",
1309
- )
1310
- response.raise_for_status()
1311
- except Exception as e:
1312
- self.report.warning(
1313
- "Error getting connector topics", context=connector_name, exc=e
1314
- )
1315
- return []
1316
-
1317
- return response.json()[connector_name]["topics"]
1318
-
1319
- def construct_flow_workunit(self, connector: ConnectorManifest) -> MetadataWorkUnit:
1320
- connector_name = connector.name
1321
- connector_type = connector.type
1322
- connector_class = connector.config.get(CONNECTOR_CLASS)
1323
- flow_property_bag = connector.flow_property_bag
1324
- # connector_url = connector.url # NOTE: this will expose connector credential when used
1325
- flow_urn = builder.make_data_flow_urn(
1326
- self.platform,
1327
- connector_name,
1328
- self.config.env,
1329
- self.config.platform_instance,
1330
- )
1331
-
1332
- return MetadataChangeProposalWrapper(
1333
- entityUrn=flow_urn,
1334
- aspect=models.DataFlowInfoClass(
1335
- name=connector_name,
1336
- description=f"{connector_type.capitalize()} connector using `{connector_class}` plugin.",
1337
- customProperties=flow_property_bag,
1338
- # externalUrl=connector_url, # NOTE: this will expose connector credential when used
1339
- ),
1340
- ).as_workunit()
1341
-
1342
- def construct_job_workunits(
1343
- self, connector: ConnectorManifest
1344
- ) -> Iterable[MetadataWorkUnit]:
1345
- connector_name = connector.name
1346
- flow_urn = builder.make_data_flow_urn(
1347
- self.platform,
1348
- connector_name,
1349
- self.config.env,
1350
- self.config.platform_instance,
1351
- )
1352
-
1353
- lineages = connector.lineages
1354
- if lineages:
1355
- for lineage in lineages:
1356
- source_dataset = lineage.source_dataset
1357
- source_platform = lineage.source_platform
1358
- target_dataset = lineage.target_dataset
1359
- target_platform = lineage.target_platform
1360
- job_property_bag = lineage.job_property_bag
1361
-
1362
- source_platform_instance = get_platform_instance(
1363
- self.config, connector_name, source_platform
1364
- )
1365
- target_platform_instance = get_platform_instance(
1366
- self.config, connector_name, target_platform
1367
- )
1368
-
1369
- job_id = self.get_job_id(lineage, connector, self.config)
1370
- job_urn = builder.make_data_job_urn_with_flow(flow_urn, job_id)
1371
-
1372
- inlets = (
1373
- [
1374
- self.make_lineage_dataset_urn(
1375
- source_platform, source_dataset, source_platform_instance
1376
- )
1377
- ]
1378
- if source_dataset
1379
- else []
1380
- )
1381
- outlets = [
1382
- self.make_lineage_dataset_urn(
1383
- target_platform, target_dataset, target_platform_instance
1384
- )
1385
- ]
1386
-
1387
- yield MetadataChangeProposalWrapper(
1388
- entityUrn=job_urn,
1389
- aspect=models.DataJobInfoClass(
1390
- name=f"{connector_name}:{job_id}",
1391
- type="COMMAND",
1392
- customProperties=job_property_bag,
1393
- ),
1394
- ).as_workunit()
1395
-
1396
- yield MetadataChangeProposalWrapper(
1397
- entityUrn=job_urn,
1398
- aspect=models.DataJobInputOutputClass(
1399
- inputDatasets=inlets,
1400
- outputDatasets=outlets,
1401
- ),
1402
- ).as_workunit()
1403
-
1404
- def get_job_id(
1405
- self,
1406
- lineage: KafkaConnectLineage,
1407
- connector: ConnectorManifest,
1408
- config: KafkaConnectSourceConfig,
1409
- ) -> str:
1410
- connector_class = connector.config.get(CONNECTOR_CLASS)
1411
-
1412
- # Note - This block is only to maintain backward compatibility of Job URN
1413
- if (
1414
- connector_class
1415
- and connector.type == SOURCE
1416
- and (
1417
- "JdbcSourceConnector" in connector_class
1418
- or connector_class.startswith("io.debezium.connector")
1419
- )
1420
- and lineage.source_dataset
1421
- and config.connect_to_platform_map
1422
- and config.connect_to_platform_map.get(connector.name)
1423
- and config.connect_to_platform_map[connector.name].get(
1424
- lineage.source_platform
1425
- )
1426
- ):
1427
- return f"{config.connect_to_platform_map[connector.name][lineage.source_platform]}.{lineage.source_dataset}"
1428
-
1429
- return (
1430
- lineage.source_dataset
1431
- if lineage.source_dataset
1432
- else f"unknown_source.{lineage.target_dataset}"
1433
- )
1434
-
1435
- def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
1436
- return [
1437
- *super().get_workunit_processors(),
1438
- StaleEntityRemovalHandler.create(
1439
- self, self.config, self.ctx
1440
- ).workunit_processor,
1441
- ]
1442
-
1443
- def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
1444
- connectors_manifest = self.get_connectors_manifest()
1445
- for connector in connectors_manifest:
1446
- name = connector.name
1447
-
1448
- yield self.construct_flow_workunit(connector)
1449
- yield from self.construct_job_workunits(connector)
1450
- self.report.report_connector_scanned(name)
1451
-
1452
- def get_report(self) -> KafkaConnectSourceReport:
1453
- return self.report
1454
-
1455
- def make_lineage_dataset_urn(
1456
- self, platform: str, name: str, platform_instance: Optional[str]
1457
- ) -> str:
1458
- if self.config.convert_lineage_urns_to_lowercase:
1459
- name = name.lower()
1460
-
1461
- return builder.make_dataset_urn_with_platform_instance(
1462
- platform, name, platform_instance, self.config.env
1463
- )
1464
-
1465
-
1466
- # TODO: Find a more automated way to discover new platforms with 3 level naming hierarchy.
1467
- def has_three_level_hierarchy(platform: str) -> bool:
1468
- return platform in ["postgres", "trino", "redshift", "snowflake"]