acryl-datahub 1.1.0.4rc3__py3-none-any.whl → 1.1.0.5__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (149)
  1. {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5.dist-info}/METADATA +2499 -2501
  2. {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5.dist-info}/RECORD +149 -131
  3. {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5.dist-info}/entry_points.txt +1 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +1 -1
  6. datahub/cli/check_cli.py +65 -11
  7. datahub/cli/cli_utils.py +63 -0
  8. datahub/cli/container_cli.py +5 -0
  9. datahub/cli/delete_cli.py +3 -4
  10. datahub/cli/docker_check.py +107 -12
  11. datahub/cli/docker_cli.py +149 -227
  12. datahub/cli/exists_cli.py +0 -2
  13. datahub/cli/get_cli.py +0 -2
  14. datahub/cli/iceberg_cli.py +5 -0
  15. datahub/cli/ingest_cli.py +3 -15
  16. datahub/cli/migrate.py +2 -0
  17. datahub/cli/put_cli.py +1 -4
  18. datahub/cli/quickstart_versioning.py +50 -7
  19. datahub/cli/specific/assertions_cli.py +0 -4
  20. datahub/cli/specific/datacontract_cli.py +0 -3
  21. datahub/cli/specific/dataproduct_cli.py +0 -11
  22. datahub/cli/specific/dataset_cli.py +1 -8
  23. datahub/cli/specific/forms_cli.py +0 -4
  24. datahub/cli/specific/group_cli.py +0 -2
  25. datahub/cli/specific/structuredproperties_cli.py +1 -4
  26. datahub/cli/specific/user_cli.py +0 -2
  27. datahub/cli/state_cli.py +0 -2
  28. datahub/cli/timeline_cli.py +0 -2
  29. datahub/emitter/rest_emitter.py +24 -8
  30. datahub/entrypoints.py +4 -3
  31. datahub/ingestion/api/decorators.py +15 -3
  32. datahub/ingestion/api/report.py +332 -3
  33. datahub/ingestion/api/sink.py +3 -0
  34. datahub/ingestion/api/source.py +47 -45
  35. datahub/ingestion/autogenerated/__init__.py +0 -0
  36. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  37. datahub/ingestion/autogenerated/lineage.json +401 -0
  38. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  39. datahub/ingestion/extractor/schema_util.py +13 -4
  40. datahub/ingestion/graph/client.py +73 -30
  41. datahub/ingestion/run/pipeline.py +54 -2
  42. datahub/ingestion/sink/datahub_rest.py +12 -0
  43. datahub/ingestion/source/abs/source.py +1 -1
  44. datahub/ingestion/source/aws/glue.py +1 -1
  45. datahub/ingestion/source/azure/azure_common.py +2 -2
  46. datahub/ingestion/source/bigquery_v2/bigquery.py +32 -23
  47. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  48. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  49. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  50. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  51. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  52. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  53. datahub/ingestion/source/common/subtypes.py +45 -0
  54. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  55. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  56. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  57. datahub/ingestion/source/dbt/dbt_cloud.py +7 -2
  58. datahub/ingestion/source/dbt/dbt_common.py +3 -1
  59. datahub/ingestion/source/dremio/dremio_api.py +38 -27
  60. datahub/ingestion/source/dremio/dremio_source.py +7 -7
  61. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  62. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  63. datahub/ingestion/source/ge_data_profiler.py +28 -20
  64. datahub/ingestion/source/hex/api.py +26 -1
  65. datahub/ingestion/source/identity/azure_ad.py +1 -1
  66. datahub/ingestion/source/identity/okta.py +1 -14
  67. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  68. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  69. datahub/ingestion/source/mlflow.py +11 -1
  70. datahub/ingestion/source/mock_data/__init__.py +0 -0
  71. datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
  72. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  73. datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
  74. datahub/ingestion/source/powerbi/powerbi.py +0 -5
  75. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  76. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  77. datahub/ingestion/source/preset.py +2 -2
  78. datahub/ingestion/source/redshift/usage.py +4 -3
  79. datahub/ingestion/source/s3/report.py +4 -2
  80. datahub/ingestion/source/s3/source.py +367 -115
  81. datahub/ingestion/source/salesforce.py +6 -3
  82. datahub/ingestion/source/sigma/sigma.py +6 -1
  83. datahub/ingestion/source/slack/slack.py +2 -1
  84. datahub/ingestion/source/snowflake/snowflake_config.py +27 -1
  85. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  86. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  87. datahub/ingestion/source/snowflake/snowflake_v2.py +14 -2
  88. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  89. datahub/ingestion/source/sql/athena.py +119 -12
  90. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  91. datahub/ingestion/source/sql/hive_metastore.py +0 -10
  92. datahub/ingestion/source/sql/mssql/source.py +24 -15
  93. datahub/ingestion/source/sql/oracle.py +1 -1
  94. datahub/ingestion/source/sql/sql_common.py +11 -0
  95. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  96. datahub/ingestion/source/sql/teradata.py +997 -235
  97. datahub/ingestion/source/sql/vertica.py +10 -6
  98. datahub/ingestion/source/sql_queries.py +2 -2
  99. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  100. datahub/ingestion/source/superset.py +57 -2
  101. datahub/ingestion/source/tableau/tableau.py +57 -37
  102. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  103. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  104. datahub/ingestion/source/unity/proxy.py +4 -3
  105. datahub/ingestion/source/unity/source.py +56 -30
  106. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  107. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  108. datahub/metadata/_internal_schema_classes.py +1253 -536
  109. datahub/metadata/_urns/urn_defs.py +1797 -1685
  110. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  111. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  112. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  113. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
  114. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  115. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  116. datahub/metadata/schema.avsc +16614 -16538
  117. datahub/metadata/schemas/ContainerProperties.avsc +2 -0
  118. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  119. datahub/metadata/schemas/DataFlowInfo.avsc +2 -0
  120. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  121. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
  122. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  123. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  124. datahub/metadata/schemas/DataJobInfo.avsc +2 -0
  125. datahub/metadata/schemas/DataProcessKey.avsc +2 -0
  126. datahub/metadata/schemas/DatasetKey.avsc +4 -1
  127. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  128. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +2 -0
  129. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  130. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -0
  131. datahub/metadata/schemas/MLModelGroupKey.avsc +2 -0
  132. datahub/metadata/schemas/MLModelKey.avsc +2 -0
  133. datahub/metadata/schemas/MetadataChangeEvent.avsc +2 -0
  134. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  135. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  136. datahub/sdk/datajob.py +39 -15
  137. datahub/sdk/lineage_client.py +2 -0
  138. datahub/sdk/main_client.py +14 -2
  139. datahub/sdk/search_client.py +4 -3
  140. datahub/specific/dataproduct.py +4 -0
  141. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  142. datahub/sql_parsing/sqlglot_lineage.py +40 -13
  143. datahub/telemetry/telemetry.py +17 -11
  144. datahub/upgrade/upgrade.py +46 -13
  145. datahub/utilities/server_config_util.py +8 -0
  146. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  147. {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5.dist-info}/WHEEL +0 -0
  148. {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5.dist-info}/licenses/LICENSE +0 -0
  149. {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5.dist-info}/top_level.txt +0 -0
--- a/datahub/ingestion/source/fivetran/fivetran.py
+++ b/datahub/ingestion/source/fivetran/fivetran.py
@@ -1,8 +1,8 @@
 import logging
-from typing import Dict, Iterable, List, Optional
+from typing import Dict, Iterable, List, Optional, Union
 
 import datahub.emitter.mce_builder as builder
-from datahub.api.entities.datajob import DataFlow, DataJob
+from datahub.api.entities.datajob import DataJob as DataJobV1
 from datahub.api.entities.dataprocess.dataprocess_instance import (
     DataProcessInstance,
     InstanceRunResult,
@@ -42,8 +42,10 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     FineGrainedLineageDownstreamType,
     FineGrainedLineageUpstreamType,
 )
-from datahub.utilities.urns.data_flow_urn import DataFlowUrn
-from datahub.utilities.urns.dataset_urn import DatasetUrn
+from datahub.metadata.urns import CorpUserUrn, DataFlowUrn, DatasetUrn
+from datahub.sdk.dataflow import DataFlow
+from datahub.sdk.datajob import DataJob
+from datahub.sdk.entity import Entity
 
 # Logger instance
 logger = logging.getLogger(__name__)
@@ -75,8 +77,8 @@ class FivetranSource(StatefulIngestionSourceBase):
         self.audit_log = FivetranLogAPI(self.config.fivetran_log_config)
 
     def _extend_lineage(self, connector: Connector, datajob: DataJob) -> Dict[str, str]:
-        input_dataset_urn_list: List[DatasetUrn] = []
-        output_dataset_urn_list: List[DatasetUrn] = []
+        input_dataset_urn_list: List[Union[str, DatasetUrn]] = []
+        output_dataset_urn_list: List[Union[str, DatasetUrn]] = []
         fine_grained_lineage: List[FineGrainedLineage] = []
 
         # TODO: Once Fivetran exposes the database via the API, we shouldn't ask for it via config.
@@ -178,9 +180,9 @@ class FivetranSource(StatefulIngestionSourceBase):
                 )
             )
 
-        datajob.inlets.extend(input_dataset_urn_list)
-        datajob.outlets.extend(output_dataset_urn_list)
-        datajob.fine_grained_lineages.extend(fine_grained_lineage)
+        datajob.set_inlets(input_dataset_urn_list)
+        datajob.set_outlets(output_dataset_urn_list)
+        datajob.set_fine_grained_lineages(fine_grained_lineage)
 
         return dict(
             **{
@@ -197,10 +199,10 @@ class FivetranSource(StatefulIngestionSourceBase):
 
     def _generate_dataflow_from_connector(self, connector: Connector) -> DataFlow:
         return DataFlow(
-            orchestrator=Constant.ORCHESTRATOR,
-            id=connector.connector_id,
+            platform=Constant.ORCHESTRATOR,
+            name=connector.connector_id,
             env=self.config.env,
-            name=connector.connector_name,
+            display_name=connector.connector_name,
             platform_instance=self.config.platform_instance,
         )
 
@@ -213,11 +215,11 @@ class FivetranSource(StatefulIngestionSourceBase):
         )
         owner_email = self.audit_log.get_user_email(connector.user_id)
         datajob = DataJob(
-            id=connector.connector_id,
+            name=connector.connector_id,
             flow_urn=dataflow_urn,
             platform_instance=self.config.platform_instance,
-            name=connector.connector_name,
-            owners={owner_email} if owner_email else set(),
+            display_name=connector.connector_name,
+            owners=[CorpUserUrn(owner_email)] if owner_email else None,
         )
 
         # Map connector source and destination table with dataset entity
@@ -232,16 +234,24 @@ class FivetranSource(StatefulIngestionSourceBase):
             "sync_frequency": str(connector.sync_frequency),
             "destination_id": connector.destination_id,
         }
-        datajob.properties = {
-            **connector_properties,
-            **lineage_properties,
-        }
+
+        datajob.set_custom_properties({**connector_properties, **lineage_properties})
 
         return datajob
 
     def _generate_dpi_from_job(self, job: Job, datajob: DataJob) -> DataProcessInstance:
+        # hack: convert to old instance for DataProcessInstance.from_datajob compatibility
+        datajob_v1 = DataJobV1(
+            id=datajob.name,
+            flow_urn=datajob.flow_urn,
+            platform_instance=self.config.platform_instance,
+            name=datajob.name,
+            inlets=datajob.inlets,
+            outlets=datajob.outlets,
+            fine_grained_lineages=datajob.fine_grained_lineages,
+        )
         return DataProcessInstance.from_datajob(
-            datajob=datajob,
+            datajob=datajob_v1,
             id=job.job_id,
             clone_inlets=True,
             clone_outlets=True,
@@ -278,17 +288,15 @@ class FivetranSource(StatefulIngestionSourceBase):
 
     def _get_connector_workunits(
         self, connector: Connector
-    ) -> Iterable[MetadataWorkUnit]:
+    ) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         self.report.report_connectors_scanned()
         # Create dataflow entity with same name as connector name
         dataflow = self._generate_dataflow_from_connector(connector)
-        for mcp in dataflow.generate_mcp():
-            yield mcp.as_workunit()
+        yield dataflow
 
         # Map Fivetran's connector entity with Datahub's datajob entity
         datajob = self._generate_datajob_from_connector(connector)
-        for mcp in datajob.generate_mcp(materialize_iolets=False):
-            yield mcp.as_workunit()
+        yield datajob
 
         # Map Fivetran's job/sync history entity with Datahub's data process entity
         if len(connector.jobs) >= MAX_JOBS_PER_CONNECTOR:
@@ -310,7 +318,7 @@ class FivetranSource(StatefulIngestionSourceBase):
             ).workunit_processor,
         ]
 
-    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+    def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
        """
        Datahub Ingestion framework invoke this method
        """
--- a/datahub/ingestion/source/gcs/gcs_source.py
+++ b/datahub/ingestion/source/gcs/gcs_source.py
@@ -16,6 +16,7 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceCapability
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.data_lake_common.config import PathSpecsConfigMixin
 from datahub.ingestion.source.data_lake_common.data_lake_utils import PLATFORM_GCS
 from datahub.ingestion.source.data_lake_common.object_store import (
@@ -82,7 +83,14 @@ class GCSSourceReport(DataLakeSourceReport):
 @platform_name("Google Cloud Storage", id=PLATFORM_GCS)
 @config_class(GCSSourceConfig)
 @support_status(SupportStatus.INCUBATING)
-@capability(SourceCapability.CONTAINERS, "Enabled by default")
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.GCS_BUCKET,
+        SourceCapabilityModifier.FOLDER,
+    ],
+)
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @capability(SourceCapability.DATA_PROFILING, "Not supported", supported=False)
 class GCSSource(StatefulIngestionSourceBase):
@@ -112,6 +120,7 @@ class GCSSource(StatefulIngestionSourceBase):
             env=self.config.env,
             max_rows=self.config.max_rows,
             number_of_files_to_sample=self.config.number_of_files_to_sample,
+            platform=PLATFORM_GCS,  # Ensure GCS platform is used for correct container subtypes
         )
         return s3_config
 
@@ -138,7 +147,9 @@
 
     def create_equivalent_s3_source(self, ctx: PipelineContext) -> S3Source:
         config = self.create_equivalent_s3_config()
-        s3_source = S3Source(config, PipelineContext(ctx.run_id))
+        # Create a new context for S3 source without graph to avoid duplicate checkpointer registration
+        s3_ctx = PipelineContext(run_id=ctx.run_id, pipeline_name=ctx.pipeline_name)
+        s3_source = S3Source(config, s3_ctx)
         return self.s3_source_overrides(s3_source)
 
     def s3_source_overrides(self, source: S3Source) -> S3Source:
--- a/datahub/ingestion/source/ge_data_profiler.py
+++ b/datahub/ingestion/source/ge_data_profiler.py
@@ -1213,26 +1213,34 @@ class DatahubGEProfiler:
             f"Will profile {len(requests)} table(s) with {max_workers} worker(s) - this may take a while"
         )
 
-        with PerfTimer() as timer, unittest.mock.patch(
-            "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_unique_count",
-            get_column_unique_count_dh_patch,
-        ), unittest.mock.patch(
-            "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_bigquery",
-            _get_column_quantiles_bigquery_patch,
-        ), unittest.mock.patch(
-            "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_awsathena",
-            _get_column_quantiles_awsathena_patch,
-        ), unittest.mock.patch(
-            "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_median",
-            _get_column_median_patch,
-        ), concurrent.futures.ThreadPoolExecutor(
-            max_workers=max_workers
-        ) as async_executor, SQLAlchemyQueryCombiner(
-            enabled=self.config.query_combiner_enabled,
-            catch_exceptions=self.config.catch_exceptions,
-            is_single_row_query_method=_is_single_row_query_method,
-            serial_execution_fallback_enabled=True,
-        ).activate() as query_combiner:
+        with (
+            PerfTimer() as timer,
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_unique_count",
+                get_column_unique_count_dh_patch,
+            ),
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_bigquery",
+                _get_column_quantiles_bigquery_patch,
+            ),
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_awsathena",
+                _get_column_quantiles_awsathena_patch,
+            ),
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_median",
+                _get_column_median_patch,
+            ),
+            concurrent.futures.ThreadPoolExecutor(
+                max_workers=max_workers
+            ) as async_executor,
+            SQLAlchemyQueryCombiner(
+                enabled=self.config.query_combiner_enabled,
+                catch_exceptions=self.config.catch_exceptions,
+                is_single_row_query_method=_is_single_row_query_method,
+                serial_execution_fallback_enabled=True,
+            ).activate() as query_combiner,
+        ):
             # Submit the profiling requests to the thread pool executor.
             async_profiles = collections.deque(
                 async_executor.submit(
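
Note on the ge_data_profiler.py change above: this is a pure reformatting of the same context managers into the parenthesized with-statement form (officially supported since Python 3.10); behavior is unchanged. A generic illustration of the syntax, not DataHub code:

    from contextlib import nullcontext

    # Equivalent to: with nullcontext("a") as a, nullcontext("b") as b:
    with (
        nullcontext("a") as a,
        nullcontext("b") as b,
    ):
        print(a, b)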
--- a/datahub/ingestion/source/hex/api.py
+++ b/datahub/ingestion/source/hex/api.py
@@ -5,7 +5,9 @@ from typing import Any, Dict, Generator, List, Optional, Union
 
 import requests
 from pydantic import BaseModel, Field, ValidationError, validator
+from requests.adapters import HTTPAdapter
 from typing_extensions import assert_never
+from urllib3.util.retry import Retry
 
 from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.source.hex.constants import (
@@ -220,6 +222,7 @@ class HexApi:
         self.base_url = base_url
         self.report = report
         self.page_size = page_size
+        self.session = self._create_retry_session()
 
     def _list_projects_url(self):
         return f"{self.base_url}/projects"
@@ -227,6 +230,28 @@
     def _auth_header(self):
         return {"Authorization": f"Bearer {self.token}"}
 
+    def _create_retry_session(self) -> requests.Session:
+        """Create a requests session with retry logic for rate limiting.
+
+        Hex API rate limit: 60 requests per minute
+        https://learn.hex.tech/docs/api/api-overview#kernel-and-rate-limits
+        """
+        session = requests.Session()
+
+        # Configure retry strategy for 429 (Too Many Requests) with exponential backoff
+        retry_strategy = Retry(
+            total=5,  # Maximum number of retries
+            status_forcelist=[429],  # Only retry on 429 status code
+            backoff_factor=2,  # Exponential backoff: 2, 4, 8, 16, 32 seconds
+            raise_on_status=True,  # Raise exception after max retries
+        )
+
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        session.mount("http://", adapter)
+        session.mount("https://", adapter)
+
+        return session
+
     def fetch_projects(
         self,
         include_components: bool = True,
@@ -259,7 +284,7 @@
             logger.debug(f"Fetching projects page with params: {params}")
             self.report.fetch_projects_page_calls += 1
             try:
-                response = requests.get(
+                response = self.session.get(
                     url=self._list_projects_url(),
                     headers=self._auth_header(),
                     params=params,
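
Note on the hex/api.py changes above: the retry session is built entirely from standard requests/urllib3 primitives; only the 429 status and the retry budget are Hex-specific tuning. A self-contained sketch of the same pattern against a placeholder endpoint and token:

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    session = requests.Session()
    retries = Retry(total=5, status_forcelist=[429], backoff_factor=2, raise_on_status=True)
    session.mount("https://", HTTPAdapter(max_retries=retries))

    # GET requests that hit a 429 are retried with exponential backoff before raising.
    response = session.get(
        "https://api.example.com/projects",            # placeholder URL
        headers={"Authorization": "Bearer PLACEHOLDER"},  # placeholder token
        timeout=30,
    )
    response.raise_for_status()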
--- a/datahub/ingestion/source/identity/azure_ad.py
+++ b/datahub/ingestion/source/identity/azure_ad.py
@@ -167,7 +167,7 @@ class AzureADSourceReport(StaleEntityRemovalSourceReport):
 @config_class(AzureADConfig)
 @support_status(SupportStatus.CERTIFIED)
 @capability(
-    SourceCapability.DELETION_DETECTION, "Optionally enabled via stateful_ingestion"
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
 )
 class AzureADSource(StatefulIngestionSourceBase):
     """
--- a/datahub/ingestion/source/identity/okta.py
+++ b/datahub/ingestion/source/identity/okta.py
@@ -41,7 +41,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import (
 )
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import (
-    ChangeTypeClass,
     CorpGroupInfoClass,
     CorpUserInfoClass,
     GroupMembershipClass,
@@ -202,7 +201,7 @@ class OktaSourceReport(StaleEntityRemovalSourceReport):
 @support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.DESCRIPTIONS, "Optionally enabled via configuration")
 @capability(
-    SourceCapability.DELETION_DETECTION, "Optionally enabled via stateful_ingestion"
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
 )
 class OktaSource(StatefulIngestionSourceBase):
     """
@@ -332,18 +331,12 @@ class OktaSource(StatefulIngestionSourceBase):
             yield MetadataWorkUnit(id=wu_id, mce=mce)
 
             yield MetadataChangeProposalWrapper(
-                entityType="corpGroup",
                 entityUrn=datahub_corp_group_snapshot.urn,
-                changeType=ChangeTypeClass.UPSERT,
-                aspectName="origin",
                 aspect=OriginClass(OriginTypeClass.EXTERNAL, "OKTA"),
             ).as_workunit()
 
             yield MetadataChangeProposalWrapper(
-                entityType="corpGroup",
                 entityUrn=datahub_corp_group_snapshot.urn,
-                changeType=ChangeTypeClass.UPSERT,
-                aspectName="status",
                 aspect=StatusClass(removed=False),
             ).as_workunit()
 
@@ -418,18 +411,12 @@ class OktaSource(StatefulIngestionSourceBase):
             yield MetadataWorkUnit(id=wu_id, mce=mce)
 
             yield MetadataChangeProposalWrapper(
-                entityType="corpuser",
                 entityUrn=datahub_corp_user_snapshot.urn,
-                changeType=ChangeTypeClass.UPSERT,
-                aspectName="origin",
                 aspect=OriginClass(OriginTypeClass.EXTERNAL, "OKTA"),
             ).as_workunit()
 
             yield MetadataChangeProposalWrapper(
-                entityType="corpuser",
                 entityUrn=datahub_corp_user_snapshot.urn,
-                changeType=ChangeTypeClass.UPSERT,
-                aspectName="status",
                 aspect=StatusClass(removed=False),
             ).as_workunit()
 
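
Note on the okta.py changes above: the dropped entityType, changeType, and aspectName arguments are inferred by MetadataChangeProposalWrapper from the entity URN and the aspect object (changeType defaults to UPSERT), so the emitted metadata is unchanged; this also makes the explicit ChangeTypeClass import unnecessary. A minimal sketch with a placeholder URN:

    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.metadata.schema_classes import StatusClass

    # entityType "corpuser", aspectName "status", and changeType UPSERT are all inferred.
    mcp = MetadataChangeProposalWrapper(
        entityUrn="urn:li:corpuser:jdoe",  # placeholder URN
        aspect=StatusClass(removed=False),
    )
    workunit = mcp.as_workunit()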