acryl-datahub 0.15.0.6rc3__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in the public registry. It is provided for informational purposes only.

Files changed (204)
  1. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2552 -2523
  2. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +204 -191
  3. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +4 -3
  7. datahub/api/entities/dataset/dataset.py +731 -42
  8. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  9. datahub/cli/check_cli.py +72 -19
  10. datahub/cli/docker_cli.py +3 -3
  11. datahub/cli/iceberg_cli.py +1 -1
  12. datahub/cli/ingest_cli.py +30 -93
  13. datahub/cli/lite_cli.py +4 -2
  14. datahub/cli/specific/dataproduct_cli.py +1 -1
  15. datahub/cli/specific/dataset_cli.py +128 -14
  16. datahub/configuration/common.py +10 -2
  17. datahub/configuration/git.py +1 -3
  18. datahub/configuration/kafka.py +1 -1
  19. datahub/emitter/mce_builder.py +28 -13
  20. datahub/emitter/mcp_builder.py +4 -1
  21. datahub/emitter/response_helper.py +145 -0
  22. datahub/emitter/rest_emitter.py +323 -10
  23. datahub/ingestion/api/decorators.py +1 -1
  24. datahub/ingestion/api/source_helpers.py +4 -0
  25. datahub/ingestion/fs/s3_fs.py +2 -2
  26. datahub/ingestion/glossary/classification_mixin.py +1 -5
  27. datahub/ingestion/graph/client.py +41 -22
  28. datahub/ingestion/graph/entity_versioning.py +3 -3
  29. datahub/ingestion/graph/filters.py +64 -37
  30. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  31. datahub/ingestion/run/pipeline.py +112 -148
  32. datahub/ingestion/run/sink_callback.py +77 -0
  33. datahub/ingestion/sink/datahub_rest.py +8 -0
  34. datahub/ingestion/source/abs/config.py +2 -4
  35. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  36. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
  37. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
  38. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  39. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  40. datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
  41. datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
  42. datahub/ingestion/source/common/subtypes.py +12 -0
  43. datahub/ingestion/source/csv_enricher.py +3 -3
  44. datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
  45. datahub/ingestion/source/dbt/dbt_common.py +3 -5
  46. datahub/ingestion/source/dbt/dbt_tests.py +4 -8
  47. datahub/ingestion/source/delta_lake/config.py +8 -1
  48. datahub/ingestion/source/delta_lake/report.py +4 -2
  49. datahub/ingestion/source/delta_lake/source.py +20 -5
  50. datahub/ingestion/source/dremio/dremio_api.py +4 -8
  51. datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
  52. datahub/ingestion/source/dynamodb/dynamodb.py +1 -0
  53. datahub/ingestion/source/elastic_search.py +26 -6
  54. datahub/ingestion/source/feast.py +27 -8
  55. datahub/ingestion/source/file.py +6 -3
  56. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  57. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  58. datahub/ingestion/source/ge_data_profiler.py +12 -15
  59. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  60. datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
  61. datahub/ingestion/source/identity/okta.py +37 -7
  62. datahub/ingestion/source/kafka/kafka.py +1 -1
  63. datahub/ingestion/source/kafka_connect/common.py +2 -7
  64. datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
  65. datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
  66. datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
  67. datahub/ingestion/source/looker/looker_common.py +3 -3
  68. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  69. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  70. datahub/ingestion/source/looker/looker_source.py +1 -1
  71. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  72. datahub/ingestion/source/looker/lookml_source.py +3 -2
  73. datahub/ingestion/source/metabase.py +57 -35
  74. datahub/ingestion/source/metadata/business_glossary.py +45 -3
  75. datahub/ingestion/source/metadata/lineage.py +2 -2
  76. datahub/ingestion/source/mlflow.py +365 -35
  77. datahub/ingestion/source/mode.py +18 -8
  78. datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
  79. datahub/ingestion/source/nifi.py +37 -11
  80. datahub/ingestion/source/openapi.py +1 -1
  81. datahub/ingestion/source/openapi_parser.py +49 -17
  82. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  83. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  84. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  85. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  86. datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
  87. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
  88. datahub/ingestion/source/preset.py +7 -4
  89. datahub/ingestion/source/pulsar.py +3 -2
  90. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  91. datahub/ingestion/source/redash.py +31 -7
  92. datahub/ingestion/source/redshift/config.py +4 -0
  93. datahub/ingestion/source/redshift/datashares.py +236 -0
  94. datahub/ingestion/source/redshift/lineage.py +6 -2
  95. datahub/ingestion/source/redshift/lineage_v2.py +24 -9
  96. datahub/ingestion/source/redshift/profile.py +1 -1
  97. datahub/ingestion/source/redshift/query.py +133 -33
  98. datahub/ingestion/source/redshift/redshift.py +46 -73
  99. datahub/ingestion/source/redshift/redshift_schema.py +186 -6
  100. datahub/ingestion/source/redshift/report.py +3 -0
  101. datahub/ingestion/source/s3/config.py +5 -5
  102. datahub/ingestion/source/s3/source.py +20 -41
  103. datahub/ingestion/source/salesforce.py +550 -275
  104. datahub/ingestion/source/schema_inference/object.py +1 -1
  105. datahub/ingestion/source/sigma/sigma.py +1 -1
  106. datahub/ingestion/source/slack/slack.py +31 -10
  107. datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
  108. datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
  109. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  110. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  111. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  112. datahub/ingestion/source/sql/athena.py +10 -16
  113. datahub/ingestion/source/sql/druid.py +1 -5
  114. datahub/ingestion/source/sql/hive.py +15 -6
  115. datahub/ingestion/source/sql/hive_metastore.py +3 -2
  116. datahub/ingestion/source/sql/mssql/job_models.py +29 -0
  117. datahub/ingestion/source/sql/mssql/source.py +11 -5
  118. datahub/ingestion/source/sql/oracle.py +127 -63
  119. datahub/ingestion/source/sql/sql_common.py +6 -12
  120. datahub/ingestion/source/sql/sql_types.py +2 -2
  121. datahub/ingestion/source/sql/teradata.py +7 -5
  122. datahub/ingestion/source/sql/trino.py +2 -2
  123. datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
  124. datahub/ingestion/source/superset.py +222 -62
  125. datahub/ingestion/source/tableau/tableau.py +22 -6
  126. datahub/ingestion/source/tableau/tableau_common.py +3 -2
  127. datahub/ingestion/source/unity/ge_profiler.py +2 -1
  128. datahub/ingestion/source/unity/source.py +11 -1
  129. datahub/ingestion/source/vertexai.py +697 -0
  130. datahub/ingestion/source_config/pulsar.py +3 -1
  131. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  132. datahub/lite/duckdb_lite.py +3 -10
  133. datahub/lite/lite_local.py +1 -1
  134. datahub/lite/lite_util.py +4 -3
  135. datahub/metadata/_schema_classes.py +714 -417
  136. datahub/metadata/_urns/urn_defs.py +1673 -1649
  137. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  138. datahub/metadata/schema.avsc +16438 -16603
  139. datahub/metadata/schemas/AssertionInfo.avsc +3 -1
  140. datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
  141. datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
  142. datahub/metadata/schemas/ChartInfo.avsc +1 -0
  143. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  144. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  145. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  146. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  147. datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
  148. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
  149. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  150. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  151. datahub/metadata/schemas/DomainKey.avsc +2 -1
  152. datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
  153. datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
  154. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  155. datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
  156. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  157. datahub/metadata/schemas/InputFields.avsc +3 -1
  158. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  159. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  160. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
  161. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  162. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  163. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  164. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
  165. datahub/metadata/schemas/PostKey.avsc +2 -1
  166. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  167. datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
  168. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
  169. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  170. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  171. datahub/pydantic/__init__.py +0 -0
  172. datahub/pydantic/compat.py +58 -0
  173. datahub/sdk/__init__.py +30 -12
  174. datahub/sdk/_all_entities.py +1 -1
  175. datahub/sdk/_attribution.py +4 -0
  176. datahub/sdk/_shared.py +251 -16
  177. datahub/sdk/_utils.py +35 -0
  178. datahub/sdk/container.py +29 -5
  179. datahub/sdk/dataset.py +118 -20
  180. datahub/sdk/{_entity.py → entity.py} +24 -1
  181. datahub/sdk/entity_client.py +1 -1
  182. datahub/sdk/main_client.py +23 -0
  183. datahub/sdk/resolver_client.py +17 -29
  184. datahub/sdk/search_client.py +50 -0
  185. datahub/sdk/search_filters.py +374 -0
  186. datahub/specific/dataset.py +3 -4
  187. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  188. datahub/sql_parsing/schema_resolver.py +1 -1
  189. datahub/sql_parsing/split_statements.py +20 -13
  190. datahub/sql_parsing/sql_parsing_common.py +7 -0
  191. datahub/sql_parsing/sqlglot_lineage.py +1 -1
  192. datahub/sql_parsing/sqlglot_utils.py +1 -4
  193. datahub/testing/check_sql_parser_result.py +5 -6
  194. datahub/testing/compare_metadata_json.py +7 -6
  195. datahub/testing/pytest_hooks.py +56 -0
  196. datahub/upgrade/upgrade.py +2 -2
  197. datahub/utilities/file_backed_collections.py +3 -14
  198. datahub/utilities/ingest_utils.py +106 -0
  199. datahub/utilities/mapping.py +1 -1
  200. datahub/utilities/memory_footprint.py +3 -2
  201. datahub/utilities/sentinels.py +22 -0
  202. datahub/utilities/unified_diff.py +5 -1
  203. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
  204. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
@@ -1,11 +1,15 @@
 import logging
+import threading
 from dataclasses import dataclass, field
 from typing import Any, Dict, Optional
 
 from humanfriendly import format_timespan
 from pydantic import Field, validator
 from pyiceberg.catalog import Catalog, load_catalog
+from pyiceberg.catalog.rest import RestCatalog
+from requests.adapters import HTTPAdapter
 from sortedcontainers import SortedList
+from urllib3.util import Retry
 
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.configuration.source_common import DatasetSourceConfigMixin
@@ -25,6 +29,23 @@ from datahub.utilities.stats_collections import TopKDict, int_top_k_dict
 
 logger = logging.getLogger(__name__)
 
+DEFAULT_REST_TIMEOUT = 120
+DEFAULT_REST_RETRY_POLICY = {"total": 3, "backoff_factor": 0.1}
+
+
+class TimeoutHTTPAdapter(HTTPAdapter):
+    def __init__(self, *args, **kwargs):
+        if "timeout" in kwargs:
+            self.timeout = kwargs["timeout"]
+            del kwargs["timeout"]
+        super().__init__(*args, **kwargs)
+
+    def send(self, request, **kwargs):
+        timeout = kwargs.get("timeout")
+        if timeout is None and hasattr(self, "timeout"):
+            kwargs["timeout"] = self.timeout
+        return super().send(request, **kwargs)
+
 
 class IcebergProfilingConfig(ConfigModel):
     enabled: bool = Field(
@@ -145,7 +166,26 @@ class IcebergSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin)
         logger.debug(
             "Initializing the catalog %s with config: %s", catalog_name, catalog_config
         )
-        return load_catalog(name=catalog_name, **catalog_config)
+        catalog = load_catalog(name=catalog_name, **catalog_config)
+        if isinstance(catalog, RestCatalog):
+            logger.debug(
+                "Recognized REST catalog type being configured, attempting to configure HTTP Adapter for the session"
+            )
+            retry_policy: Dict[str, Any] = DEFAULT_REST_RETRY_POLICY.copy()
+            retry_policy.update(catalog_config.get("connection", {}).get("retry", {}))
+            retries = Retry(**retry_policy)
+            logger.debug(f"Retry policy to be set: {retry_policy}")
+            timeout = catalog_config.get("connection", {}).get(
+                "timeout", DEFAULT_REST_TIMEOUT
+            )
+            logger.debug(f"Timeout to be set: {timeout}")
+            catalog._session.mount(
+                "http://", TimeoutHTTPAdapter(timeout=timeout, max_retries=retries)
+            )
+            catalog._session.mount(
+                "https://", TimeoutHTTPAdapter(timeout=timeout, max_retries=retries)
+            )
+        return catalog
 
 
 class TopTableTimings:
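
The TimeoutHTTPAdapter and the mounts above give every REST catalog request a default timeout and retry policy, overridable through the catalog config's connection.timeout and connection.retry keys. A minimal standalone sketch of the same pattern outside the Iceberg source (the session, adapter name, and values here are illustrative, not part of the package):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util import Retry


class DefaultTimeoutAdapter(HTTPAdapter):
    """Injects a default timeout when the caller does not pass one explicitly."""

    def __init__(self, *args, timeout=120, **kwargs):
        self.timeout = timeout
        super().__init__(*args, **kwargs)

    def send(self, request, **kwargs):
        # Respect an explicit per-request timeout; otherwise fall back to the default.
        if kwargs.get("timeout") is None:
            kwargs["timeout"] = self.timeout
        return super().send(request, **kwargs)


session = requests.Session()
retries = Retry(total=3, backoff_factor=0.1)  # mirrors DEFAULT_REST_RETRY_POLICY
adapter = DefaultTimeoutAdapter(timeout=120, max_retries=retries)
session.mount("http://", adapter)
session.mount("https://", adapter)
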
@@ -156,18 +196,21 @@ class TopTableTimings:
     def __init__(self, size: int = 10):
         self._size = size
         self.top_entites = SortedList(key=lambda x: -x.get(self._VALUE_FIELD, 0))
+        self._lock = threading.Lock()
 
     def add(self, entity: Dict[str, Any]) -> None:
         if self._VALUE_FIELD not in entity:
             return
-        self.top_entites.add(entity)
-        if len(self.top_entites) > self._size:
-            self.top_entites.pop()
+        with self._lock:
+            self.top_entites.add(entity)
+            if len(self.top_entites) > self._size:
+                self.top_entites.pop()
 
     def __str__(self) -> str:
-        if len(self.top_entites) == 0:
-            return "no timings reported"
-        return str(list(self.top_entites))
+        with self._lock:
+            if len(self.top_entites) == 0:
+                return "no timings reported"
+            return str(list(self.top_entites))
 
 
 class TimingClass:
@@ -175,24 +218,31 @@ class TimingClass:
 
     def __init__(self):
         self.times = SortedList()
+        self._lock = threading.Lock()
 
     def add_timing(self, t: float) -> None:
-        self.times.add(t)
+        with self._lock:
+            self.times.add(t)
 
     def __str__(self) -> str:
-        if len(self.times) == 0:
-            return "no timings reported"
-        total = sum(self.times)
-        avg = total / len(self.times)
-        return str(
-            {
-                "average_time": format_timespan(avg, detailed=True, max_units=3),
-                "min_time": format_timespan(self.times[0], detailed=True, max_units=3),
-                "max_time": format_timespan(self.times[-1], detailed=True, max_units=3),
-                # total_time does not provide correct information in case we run in more than 1 thread
-                "total_time": format_timespan(total, detailed=True, max_units=3),
-            }
-        )
+        with self._lock:
+            if len(self.times) == 0:
+                return "no timings reported"
+            total = sum(self.times)
+            avg = total / len(self.times)
+            return str(
+                {
+                    "average_time": format_timespan(avg, detailed=True, max_units=3),
+                    "min_time": format_timespan(
+                        self.times[0], detailed=True, max_units=3
+                    ),
+                    "max_time": format_timespan(
+                        self.times[-1], detailed=True, max_units=3
+                    ),
+                    # total_time does not provide correct information in case we run in more than 1 thread
+                    "total_time": format_timespan(total, detailed=True, max_units=3),
                }
+            )
 
 
 @dataclass
@@ -5,7 +5,7 @@ import urllib
 from collections import defaultdict
 from dataclasses import dataclass, field
 from time import sleep
-from typing import Dict, Iterable, List, Optional, Union
+from typing import Dict, Iterable, List, Optional, Set, Union
 
 import nest_asyncio
 from okta.client import Client as OktaClient
@@ -14,7 +14,6 @@ from okta.models import Group, GroupProfile, User, UserProfile, UserStatus
 from pydantic import validator
 from pydantic.fields import Field
 
-from datahub.configuration.common import ConfigModel
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -56,7 +55,7 @@ logger = logging.getLogger(__name__)
 nest_asyncio.apply()
 
 
-class OktaConfig(StatefulIngestionConfigBase, ConfigModel):
+class OktaConfig(StatefulIngestionConfigBase):
     # Required: Domain of the Okta deployment. Example: dev-33231928.okta.com
     okta_domain: str = Field(
         description="The location of your Okta Domain, without a protocol. Can be found in Okta Developer console. e.g. dev-33231928.okta.com",
@@ -77,6 +76,10 @@ class OktaConfig(StatefulIngestionConfigBase, ConfigModel):
         default=True,
         description="Whether group membership should be ingested into DataHub. ingest_groups must be True if this is True.",
     )
+    ingest_groups_users: bool = Field(
+        default=True,
+        description="Only ingest users belonging to the selected groups. This option is only useful when `ingest_users` is set to False and `ingest_group_membership` to True.",
+    )
 
     # Optional: Customize the mapping to DataHub Username from an attribute appearing in the Okta User
     # profile. Reference: https://developer.okta.com/docs/reference/api/users/
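
The new ingest_groups_users flag above lets the source emit users for group members even when the full user listing is disabled. A hedged sketch of the relevant flags (field names are taken from this diff; the surrounding recipe structure is assumed):

# Hypothetical Okta source config; only flags that appear in this diff are shown.
okta_source_config = {
    "okta_domain": "dev-33231928.okta.com",  # example domain from the field description
    "ingest_users": False,                   # skip listing every Okta user
    "ingest_group_membership": True,         # still resolve group members
    "ingest_groups_users": True,             # new: emit only those members as users
}
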
@@ -344,6 +347,7 @@ class OktaSource(StatefulIngestionSourceBase):
                     aspect=StatusClass(removed=False),
                 ).as_workunit()
 
+        okta_users: Set[User] = set()
         # Step 2: Populate GroupMembership Aspects for CorpUsers
         datahub_corp_user_urn_to_group_membership: Dict[str, GroupMembershipClass] = (
             defaultdict(lambda: GroupMembershipClass(groups=[]))
@@ -372,6 +376,9 @@ class OktaSource(StatefulIngestionSourceBase):
                     self.report.report_failure("okta_user_mapping", error_str)
                     continue
 
+                if self.config.ingest_groups_users:
+                    okta_users.add(okta_user)
+
                 # Update the GroupMembership aspect for this group member.
                 datahub_corp_user_urn_to_group_membership[
                     datahub_corp_user_urn
@@ -379,7 +386,10 @@ class OktaSource(StatefulIngestionSourceBase):
 
         # Step 3: Produce MetadataWorkUnits for CorpUsers.
         if self.config.ingest_users:
-            okta_users = self._get_okta_users(event_loop)
+            # we can just throw away collected okta users so far and fetch them all
+            okta_users = set(self._get_okta_users(event_loop))
+
+        if okta_users:
             filtered_okta_users = filter(self._filter_okta_user, okta_users)
             datahub_corp_user_snapshots = self._map_okta_users(filtered_okta_users)
             for user_count, datahub_corp_user_snapshot in enumerate(
@@ -558,9 +568,7 @@ class OktaSource(StatefulIngestionSourceBase):
         if (
             self.config.include_deprovisioned_users is False
             and okta_user.status == UserStatus.DEPROVISIONED
-        ):
-            return False
-        elif (
+        ) or (
             self.config.include_suspended_users is False
             and okta_user.status == UserStatus.SUSPENDED
         ):
@@ -658,6 +666,27 @@ class OktaSource(StatefulIngestionSourceBase):
             self.config.okta_profile_to_username_regex,
         )
 
+    def _map_okta_user_profile_custom_properties(
+        self, profile: UserProfile
+    ) -> Dict[str, str]:
+        # filter out the common fields that are already mapped to the CorpUserInfo aspect and the private ones
+        return {
+            k: str(v)
+            for k, v in profile.__dict__.items()
+            if v
+            and k
+            not in [
+                "displayName",
+                "firstName",
+                "lastName",
+                "email",
+                "title",
+                "countryCode",
+                "department",
+            ]
+            and not k.startswith("_")
+        }
+
     # Converts Okta User Profile into a CorpUserInfo.
     def _map_okta_user_profile(self, profile: UserProfile) -> CorpUserInfoClass:
         # TODO: Extract user's manager if provided.
@@ -675,6 +704,7 @@ class OktaSource(StatefulIngestionSourceBase):
             title=profile.title,
             countryCode=profile.countryCode,
             departmentName=profile.department,
+            customProperties=self._map_okta_user_profile_custom_properties(profile),
         )
 
     def _make_corp_group_urn(self, name: str) -> str:
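
The custom-properties helper above copies any non-empty Okta profile attribute that is not already mapped onto CorpUserInfo and is not private. A small illustration of that comprehension on a made-up profile dict (the attribute names "division", "costCenter" and "_links" are hypothetical):

profile_fields = {
    "firstName": "Ada",          # already mapped -> dropped
    "email": "ada@example.com",  # already mapped -> dropped
    "division": "R&D",           # custom attribute -> kept
    "costCenter": None,          # empty value -> dropped
    "_links": {"self": "..."},   # private attribute -> dropped
}
custom_properties = {
    k: str(v)
    for k, v in profile_fields.items()
    if v
    and k
    not in [
        "displayName",
        "firstName",
        "lastName",
        "email",
        "title",
        "countryCode",
        "department",
    ]
    and not k.startswith("_")
}
print(custom_properties)  # {'division': 'R&D'}
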
@@ -272,7 +272,7 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
             return schema_registry_class.create(config, report)
         except Exception as e:
             logger.debug(e, exc_info=e)
-            raise ImportError(config.schema_registry_class)
+            raise ImportError(config.schema_registry_class) from e
 
     def __init__(self, config: KafkaSourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
@@ -110,7 +110,7 @@ class ConnectorManifest:
 
     name: str
     type: str
-    config: Dict
+    config: Dict[str, str]
     tasks: Dict
     url: Optional[str] = None
     flow_property_bag: Optional[Dict[str, str]] = None
@@ -141,12 +141,7 @@ def get_dataset_name(
     database_name: Optional[str],
     source_table: str,
 ) -> str:
-    if database_name:
-        dataset_name = database_name + "." + source_table
-    else:
-        dataset_name = source_table
-
-    return dataset_name
+    return database_name + "." + source_table if database_name else source_table
 
 
 def get_platform_instance(
@@ -1,5 +1,5 @@
 import logging
-from typing import Iterable, List, Optional, Type
+from typing import Dict, Iterable, List, Optional, Type
 
 import jpype
 import jpype.imports
@@ -121,7 +121,11 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
             connector_manifest.config, self.config.provided_configs
         )
         connector_manifest.url = connector_url
-        connector_manifest.topic_names = self._get_connector_topics(connector_name)
+        connector_manifest.topic_names = self._get_connector_topics(
+            connector_name=connector_name,
+            config=connector_manifest.config,
+            connector_type=connector_manifest.type,
+        )
         connector_class_value = connector_manifest.config.get(CONNECTOR_CLASS) or ""
 
         class_type: Type[BaseConnector] = BaseConnector
@@ -203,7 +207,9 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
 
         return response.json()
 
-    def _get_connector_topics(self, connector_name: str) -> List[str]:
+    def _get_connector_topics(
+        self, connector_name: str, config: Dict[str, str], connector_type: str
+    ) -> List[str]:
         try:
             response = self.session.get(
                 f"{self.config.connect_uri}/connectors/{connector_name}/topics",
@@ -215,7 +221,21 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
             )
             return []
 
-        return response.json()[connector_name]["topics"]
+        processed_topics = response.json()[connector_name]["topics"]
+
+        if connector_type == SINK:
+            try:
+                return SinkTopicFilter().filter_stale_topics(processed_topics, config)
+            except Exception as e:
+                self.report.warning(
+                    title="Error parsing sink conector topics configuration",
+                    message="Some stale lineage tasks might show up for connector",
+                    context=connector_name,
+                    exc=e,
+                )
+                return processed_topics
+        else:
+            return processed_topics
 
     def construct_flow_workunit(self, connector: ConnectorManifest) -> MetadataWorkUnit:
         connector_name = connector.name
@@ -359,3 +379,76 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
         return builder.make_dataset_urn_with_platform_instance(
             platform, name, platform_instance, self.config.env
         )
+
+
+class SinkTopicFilter:
+    """Helper class to filter Kafka Connect topics based on configuration."""
+
+    def filter_stale_topics(
+        self,
+        processed_topics: List[str],
+        sink_config: Dict[str, str],
+    ) -> List[str]:
+        """
+        Kafka-connect's /topics API returns the set of topic names the connector has been using
+        since its creation or since the last time its set of active topics was reset. This means-
+        if a topic was ever used by a connector, it will be returned, even if it is no longer used.
+        To remove these stale topics from the list, we double-check the list returned by the API
+        against the sink connector's config.
+        Sink connectors configure exactly one of `topics` or `topics.regex`
+        https://kafka.apache.org/documentation/#sinkconnectorconfigs_topics
+
+        Args:
+            processed_topics: List of topics currently being processed
+            sink_config: Configuration dictionary for the sink connector
+
+        Returns:
+            List of filtered topics that match the configuration
+
+        Raises:
+            ValueError: If sink connector configuration is missing both 'topics' and 'topics.regex' fields
+
+        """
+        # Absence of topics config is a defensive NOOP,
+        # although this should never happen in real world
+        if not self.has_topic_config(sink_config):
+            logger.warning(
+                f"Found sink without topics config {sink_config.get(CONNECTOR_CLASS)}"
+            )
+            return processed_topics
+
+        # Handle explicit topic list
+        if sink_config.get("topics"):
+            return self._filter_by_topic_list(processed_topics, sink_config["topics"])
+        else:
+            # Handle regex pattern
+            return self._filter_by_topic_regex(
+                processed_topics, sink_config["topics.regex"]
+            )
+
+    def has_topic_config(self, sink_config: Dict[str, str]) -> bool:
+        """Check if sink config has either topics or topics.regex."""
+        return bool(sink_config.get("topics") or sink_config.get("topics.regex"))
+
+    def _filter_by_topic_list(
+        self, processed_topics: List[str], topics_config: str
+    ) -> List[str]:
+        """Filter topics based on explicit topic list from config."""
+        config_topics = [
+            topic.strip() for topic in topics_config.split(",") if topic.strip()
+        ]
+        return [topic for topic in processed_topics if topic in config_topics]
+
+    def _filter_by_topic_regex(
+        self, processed_topics: List[str], regex_pattern: str
+    ) -> List[str]:
+        """Filter topics based on regex pattern from config."""
+        from java.util.regex import Pattern
+
+        regex_matcher = Pattern.compile(regex_pattern)
+
+        return [
+            topic
+            for topic in processed_topics
+            if regex_matcher.matcher(topic).matches()
+        ]
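
The docstring above explains the intent: topics reported by the /topics endpoint are cross-checked against the sink connector's own topics / topics.regex configuration so that topics the connector no longer consumes are dropped from lineage. A rough usage sketch, assuming the class is importable from this module and skipping the regex branch (which goes through the JVM java.util.regex bridge):

from datahub.ingestion.source.kafka_connect.kafka_connect import SinkTopicFilter

# Topics the /topics endpoint reported, including one the connector no longer reads.
reported_topics = ["orders", "payments", "legacy-orders"]

# Sink connector config with an explicit topic list.
sink_config = {"topics": "orders, payments"}

print(SinkTopicFilter().filter_stale_topics(reported_topics, sink_config))
# expected: ['orders', 'payments'] -- 'legacy-orders' is treated as stale
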
@@ -175,7 +175,7 @@ class BigQuerySinkConnector(BaseConnector):
     class BQParser:
         project: str
         target_platform: str
-        sanitizeTopics: str
+        sanitizeTopics: bool
         transforms: list
         topicsToTables: Optional[str] = None
        datasets: Optional[str] = None
@@ -187,7 +187,7 @@ class BigQuerySinkConnector(BaseConnector):
         connector_manifest: ConnectorManifest,
     ) -> BQParser:
         project = connector_manifest.config["project"]
-        sanitizeTopics = connector_manifest.config.get("sanitizeTopics", "false")
+        sanitizeTopics = connector_manifest.config.get("sanitizeTopics") or "false"
         transform_names = (
             self.connector_manifest.config.get("transforms", "").split(",")
             if self.connector_manifest.config.get("transforms")
@@ -107,9 +107,9 @@ class ConfluentJDBCSourceConnector(BaseConnector):
         assert database_name
         db_connection_url = f"{url_instance.drivername}://{url_instance.host}:{url_instance.port}/{database_name}"
 
-        topic_prefix = self.connector_manifest.config.get("topic.prefix", None)
+        topic_prefix = self.connector_manifest.config.get("topic.prefix") or ""
 
-        query = self.connector_manifest.config.get("query", None)
+        query = self.connector_manifest.config.get("query") or ""
 
         transform_names = (
             self.connector_manifest.config.get("transforms", "").split(",")
@@ -447,13 +447,10 @@ class DebeziumSourceConnector(BaseConnector):
     ) -> DebeziumParser:
         connector_class = connector_manifest.config.get(CONNECTOR_CLASS, "")
 
-        if connector_class == "io.debezium.connector.mysql.MySqlConnector":
-            parser = self.DebeziumParser(
-                source_platform="mysql",
-                server_name=self.get_server_name(connector_manifest),
-                database_name=None,
-            )
-        elif connector_class == "MySqlConnector":
+        if (
+            connector_class == "io.debezium.connector.mysql.MySqlConnector"
+            or connector_class == "MySqlConnector"
+        ):
             parser = self.DebeziumParser(
                 source_platform="mysql",
                 server_name=self.get_server_name(connector_manifest),
@@ -923,7 +923,7 @@ class LookerExplore:
             tags=cast(List, dict.get("tags")) if dict.get("tags") is not None else [],
         )
 
-    @classmethod  # noqa: C901
+    @classmethod
     def from_api(  # noqa: C901
         cls,
         model: str,
@@ -931,7 +931,7 @@ class LookerExplore:
         client: LookerAPI,
         reporter: SourceReport,
         source_config: LookerDashboardSourceConfig,
-    ) -> Optional["LookerExplore"]:  # noqa: C901
+    ) -> Optional["LookerExplore"]:
         try:
             explore = client.lookml_model_explore(model, explore_name)
             views: Set[str] = set()
@@ -1183,7 +1183,7 @@ class LookerExplore:
         base_url = remove_port_from_url(base_url)
         return f"{base_url}/embed/explore/{self.model_name}/{self.name}"
 
-    def _to_metadata_events(  # noqa: C901
+    def _to_metadata_events(
         self,
         config: LookerCommonConfig,
         reporter: SourceReport,
@@ -33,14 +33,14 @@ class LookerViewFileLoader:
         base_projects_folder: Dict[str, pathlib.Path],
         reporter: LookMLSourceReport,
         source_config: LookMLSourceConfig,
-        manifest_constants: Dict[str, LookerConstant] = {},
+        manifest_constants: Optional[Dict[str, LookerConstant]] = None,
     ) -> None:
         self.viewfile_cache: Dict[str, Optional[LookerViewFile]] = {}
         self._root_project_name = root_project_name
         self._base_projects_folder = base_projects_folder
         self.reporter = reporter
         self.source_config = source_config
-        self.manifest_constants = manifest_constants
+        self.manifest_constants = manifest_constants or {}
 
     def _load_viewfile(
         self, project_name: str, path: str, reporter: LookMLSourceReport
@@ -205,8 +205,9 @@ class LookerAPI:
     def folder_ancestors(
         self,
         folder_id: str,
-        fields: Union[str, List[str]] = ["id", "name", "parent_id"],
+        fields: Optional[Union[str, List[str]]] = None,
     ) -> Sequence[Folder]:
+        fields = fields or ["id", "name", "parent_id"]
         self.client_stats.folder_calls += 1
         try:
             return self.client.folder_ancestors(
@@ -383,7 +383,7 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
 
         self.reachable_explores[(model, explore)].append(via)
 
-    def _get_looker_dashboard_element(  # noqa: C901
+    def _get_looker_dashboard_element(
         self, element: DashboardElement
     ) -> Optional[LookerDashboardElement]:
         # Dashboard elements can use raw usage_queries against explores
@@ -464,9 +464,10 @@ def process_lookml_template_language(
     source_config: LookMLSourceConfig,
     view_lkml_file_dict: dict,
     reporter: LookMLSourceReport,
-    manifest_constants: Dict[str, "LookerConstant"] = {},
+    manifest_constants: Optional[Dict[str, "LookerConstant"]] = None,
     resolve_constants: bool = False,
 ) -> None:
+    manifest_constants = manifest_constants or {}
     if "views" not in view_lkml_file_dict:
         return
 
@@ -507,9 +508,10 @@ def load_and_preprocess_file(
     path: Union[str, pathlib.Path],
     source_config: LookMLSourceConfig,
     reporter: LookMLSourceReport,
-    manifest_constants: Dict[str, "LookerConstant"] = {},
+    manifest_constants: Optional[Dict[str, "LookerConstant"]] = None,
     resolve_constants: bool = False,
 ) -> dict:
+    manifest_constants = manifest_constants or {}
     parsed = load_lkml(path)
 
     process_lookml_template_language(
@@ -501,7 +501,7 @@ class LookMLSource(StatefulIngestionSourceBase):
            raise ValueError(
                f"Could not locate a project name for model {model_name}. Consider configuring a static project name "
                f"in your config file"
-            )
+            ) from None
 
     def get_manifest_if_present(self, folder: pathlib.Path) -> Optional[LookerManifest]:
         manifest_file = folder / "manifest.lkml"
@@ -1006,8 +1006,9 @@ class LookMLSource(StatefulIngestionSourceBase):
     def report_skipped_unreachable_views(
         self,
         viewfile_loader: LookerViewFileLoader,
-        processed_view_map: Dict[str, Set[str]] = {},
+        processed_view_map: Optional[Dict[str, Set[str]]] = None,
    ) -> None:
+        processed_view_map = processed_view_map or {}
         view_files: Dict[str, List[pathlib.Path]] = {}
         for project, folder_path in self.base_projects_folder.items():
             folder = pathlib.Path(folder_path)
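
Several of the Looker hunks above make the same change: a mutable default argument such as manifest_constants: Dict[...] = {} becomes Optional[...] = None, with an "x = x or {}" line added to the body. A minimal illustration of why this matters (the function names here are made up):

def buggy(item, seen={}):
    # The default dict is created once and shared by every call.
    seen[item] = True
    return seen


def fixed(item, seen=None):
    seen = seen or {}
    seen[item] = True
    return seen


print(buggy("a"), buggy("b"))  # {'a': True, 'b': True} {'a': True, 'b': True} -- state leaks across calls
print(fixed("a"), fixed("b"))  # {'a': True} {'b': True}
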