acryl_datahub-1.1.0.4rc3-py3-none-any.whl → acryl_datahub-1.1.0.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5.dist-info}/METADATA +2499 -2501
- {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5.dist-info}/RECORD +149 -131
- {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -1
- datahub/cli/check_cli.py +65 -11
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +3 -4
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +149 -227
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +50 -7
- datahub/cli/specific/assertions_cli.py +0 -4
- datahub/cli/specific/datacontract_cli.py +0 -3
- datahub/cli/specific/dataproduct_cli.py +0 -11
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +0 -2
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/emitter/rest_emitter.py +24 -8
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +332 -3
- datahub/ingestion/api/sink.py +3 -0
- datahub/ingestion/api/source.py +47 -45
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3449 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/graph/client.py +73 -30
- datahub/ingestion/run/pipeline.py +54 -2
- datahub/ingestion/sink/datahub_rest.py +12 -0
- datahub/ingestion/source/abs/source.py +1 -1
- datahub/ingestion/source/aws/glue.py +1 -1
- datahub/ingestion/source/azure/azure_common.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +32 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +45 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +7 -2
- datahub/ingestion/source/dbt/dbt_common.py +3 -1
- datahub/ingestion/source/dremio/dremio_api.py +38 -27
- datahub/ingestion/source/dremio/dremio_source.py +7 -7
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +28 -20
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/mlflow.py +11 -1
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
- datahub/ingestion/source/powerbi/powerbi.py +0 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/preset.py +2 -2
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/sigma/sigma.py +6 -1
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +27 -1
- datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +14 -2
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +119 -12
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/sql/hive_metastore.py +0 -10
- datahub/ingestion/source/sql/mssql/source.py +24 -15
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/sql_common.py +11 -0
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +997 -235
- datahub/ingestion/source/sql/vertica.py +10 -6
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
- datahub/ingestion/source/superset.py +57 -2
- datahub/ingestion/source/tableau/tableau.py +57 -37
- datahub/ingestion/source/tableau/tableau_common.py +4 -2
- datahub/ingestion/source/tableau/tableau_constant.py +0 -4
- datahub/ingestion/source/unity/proxy.py +4 -3
- datahub/ingestion/source/unity/source.py +56 -30
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +1253 -536
- datahub/metadata/_urns/urn_defs.py +1797 -1685
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +16614 -16538
- datahub/metadata/schemas/ContainerProperties.avsc +2 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +2 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/DataJobInfo.avsc +2 -0
- datahub/metadata/schemas/DataProcessKey.avsc +2 -0
- datahub/metadata/schemas/DatasetKey.avsc +4 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +2 -0
- datahub/metadata/schemas/LogicalParent.avsc +140 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +2 -0
- datahub/metadata/schemas/MLModelKey.avsc +2 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +2 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/sdk/datajob.py +39 -15
- datahub/sdk/lineage_client.py +2 -0
- datahub/sdk/main_client.py +14 -2
- datahub/sdk/search_client.py +4 -3
- datahub/specific/dataproduct.py +4 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
- datahub/sql_parsing/sqlglot_lineage.py +40 -13
- datahub/telemetry/telemetry.py +17 -11
- datahub/upgrade/upgrade.py +46 -13
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/fivetran/fivetran.py:

@@ -1,8 +1,8 @@
 import logging
-from typing import Dict, Iterable, List, Optional
+from typing import Dict, Iterable, List, Optional, Union
 
 import datahub.emitter.mce_builder as builder
-from datahub.api.entities.datajob import …
+from datahub.api.entities.datajob import DataJob as DataJobV1
 from datahub.api.entities.dataprocess.dataprocess_instance import (
     DataProcessInstance,
     InstanceRunResult,
@@ -42,8 +42,10 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     FineGrainedLineageDownstreamType,
     FineGrainedLineageUpstreamType,
 )
-from datahub.…
-from datahub.…
+from datahub.metadata.urns import CorpUserUrn, DataFlowUrn, DatasetUrn
+from datahub.sdk.dataflow import DataFlow
+from datahub.sdk.datajob import DataJob
+from datahub.sdk.entity import Entity
 
 # Logger instance
 logger = logging.getLogger(__name__)
@@ -75,8 +77,8 @@ class FivetranSource(StatefulIngestionSourceBase):
         self.audit_log = FivetranLogAPI(self.config.fivetran_log_config)
 
     def _extend_lineage(self, connector: Connector, datajob: DataJob) -> Dict[str, str]:
-        input_dataset_urn_list: List[DatasetUrn] = []
-        output_dataset_urn_list: List[DatasetUrn] = []
+        input_dataset_urn_list: List[Union[str, DatasetUrn]] = []
+        output_dataset_urn_list: List[Union[str, DatasetUrn]] = []
         fine_grained_lineage: List[FineGrainedLineage] = []
 
         # TODO: Once Fivetran exposes the database via the API, we shouldn't ask for it via config.
@@ -178,9 +180,9 @@ class FivetranSource(StatefulIngestionSourceBase):
             )
         )
 
-        datajob.…
-        datajob.…
-        datajob.…
+        datajob.set_inlets(input_dataset_urn_list)
+        datajob.set_outlets(output_dataset_urn_list)
+        datajob.set_fine_grained_lineages(fine_grained_lineage)
 
         return dict(
             **{
@@ -197,10 +199,10 @@ class FivetranSource(StatefulIngestionSourceBase):
 
     def _generate_dataflow_from_connector(self, connector: Connector) -> DataFlow:
         return DataFlow(
-            …
-            …
+            platform=Constant.ORCHESTRATOR,
+            name=connector.connector_id,
             env=self.config.env,
-            …
+            display_name=connector.connector_name,
             platform_instance=self.config.platform_instance,
         )
 
@@ -213,11 +215,11 @@ class FivetranSource(StatefulIngestionSourceBase):
         )
         owner_email = self.audit_log.get_user_email(connector.user_id)
         datajob = DataJob(
-            …
+            name=connector.connector_id,
             flow_urn=dataflow_urn,
             platform_instance=self.config.platform_instance,
-            …
-            owners=…
+            display_name=connector.connector_name,
+            owners=[CorpUserUrn(owner_email)] if owner_email else None,
         )
 
         # Map connector source and destination table with dataset entity
@@ -232,16 +234,24 @@ class FivetranSource(StatefulIngestionSourceBase):
             "sync_frequency": str(connector.sync_frequency),
             "destination_id": connector.destination_id,
         }
-        …
-        …
-            **lineage_properties,
-        }
+
+        datajob.set_custom_properties({**connector_properties, **lineage_properties})
 
         return datajob
 
     def _generate_dpi_from_job(self, job: Job, datajob: DataJob) -> DataProcessInstance:
+        # hack: convert to old instance for DataProcessInstance.from_datajob compatibility
+        datajob_v1 = DataJobV1(
+            id=datajob.name,
+            flow_urn=datajob.flow_urn,
+            platform_instance=self.config.platform_instance,
+            name=datajob.name,
+            inlets=datajob.inlets,
+            outlets=datajob.outlets,
+            fine_grained_lineages=datajob.fine_grained_lineages,
+        )
         return DataProcessInstance.from_datajob(
-            datajob=…
+            datajob=datajob_v1,
             id=job.job_id,
             clone_inlets=True,
             clone_outlets=True,
@@ -278,17 +288,15 @@ class FivetranSource(StatefulIngestionSourceBase):
 
     def _get_connector_workunits(
         self, connector: Connector
-    ) -> Iterable[MetadataWorkUnit]:
+    ) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         self.report.report_connectors_scanned()
         # Create dataflow entity with same name as connector name
         dataflow = self._generate_dataflow_from_connector(connector)
-        …
-        yield mcp.as_workunit()
+        yield dataflow
 
         # Map Fivetran's connector entity with Datahub's datajob entity
         datajob = self._generate_datajob_from_connector(connector)
-        …
-        yield mcp.as_workunit()
+        yield datajob
 
         # Map Fivetran's job/sync history entity with Datahub's data process entity
         if len(connector.jobs) >= MAX_JOBS_PER_CONNECTOR:
@@ -310,7 +318,7 @@ class FivetranSource(StatefulIngestionSourceBase):
             ).workunit_processor,
         ]
 
-    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+    def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         """
         Datahub Ingestion framework invoke this method
         """
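
The Fivetran hunks above move the source from the old builder-style entities to the SDK DataFlow/DataJob classes, which the source now yields directly as entities instead of emitting one MetadataChangeProposalWrapper per aspect. A minimal standalone sketch of that pattern, using only the constructor arguments and setters visible in the diff; the connector id, display name, owner email, and dataset URNs below are hypothetical placeholders, and the assumption that an SDK entity exposes its URN via a .urn property is mine:

from datahub.metadata.urns import CorpUserUrn
from datahub.sdk.dataflow import DataFlow
from datahub.sdk.datajob import DataJob

# Hypothetical connector metadata, mirroring the shape used in the diff above.
dataflow = DataFlow(
    platform="fivetran",
    name="example_connector_id",
    env="PROD",
    display_name="Example Connector",
)
datajob = DataJob(
    name="example_connector_id",
    flow_urn=dataflow.urn,  # assumed: SDK entities expose their URN via .urn
    display_name="Example Connector",
    owners=[CorpUserUrn("owner@example.com")],
)
# The setters accept plain URN strings as well as typed URNs
# (List[Union[str, DatasetUrn]] in the hunk above).
datajob.set_inlets(["urn:li:dataset:(urn:li:dataPlatform:postgres,db.schema.src_table,PROD)"])
datajob.set_outlets(["urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.dst_table,PROD)"])
datajob.set_custom_properties({"destination_id": "example_destination"})

# A source built this way can then yield the entities directly:
#     yield dataflow
#     yield datajob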
datahub/ingestion/source/gcs/gcs_source.py:

@@ -16,6 +16,7 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceCapability
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.data_lake_common.config import PathSpecsConfigMixin
 from datahub.ingestion.source.data_lake_common.data_lake_utils import PLATFORM_GCS
 from datahub.ingestion.source.data_lake_common.object_store import (
@@ -82,7 +83,14 @@ class GCSSourceReport(DataLakeSourceReport):
 @platform_name("Google Cloud Storage", id=PLATFORM_GCS)
 @config_class(GCSSourceConfig)
 @support_status(SupportStatus.INCUBATING)
-@capability(…
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.GCS_BUCKET,
+        SourceCapabilityModifier.FOLDER,
+    ],
+)
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @capability(SourceCapability.DATA_PROFILING, "Not supported", supported=False)
 class GCSSource(StatefulIngestionSourceBase):
@@ -112,6 +120,7 @@ class GCSSource(StatefulIngestionSourceBase):
             env=self.config.env,
             max_rows=self.config.max_rows,
             number_of_files_to_sample=self.config.number_of_files_to_sample,
+            platform=PLATFORM_GCS,  # Ensure GCS platform is used for correct container subtypes
         )
         return s3_config
 
@@ -138,7 +147,9 @@ class GCSSource(StatefulIngestionSourceBase):
 
     def create_equivalent_s3_source(self, ctx: PipelineContext) -> S3Source:
         config = self.create_equivalent_s3_config()
-        …
+        # Create a new context for S3 source without graph to avoid duplicate checkpointer registration
+        s3_ctx = PipelineContext(run_id=ctx.run_id, pipeline_name=ctx.pipeline_name)
+        s3_source = S3Source(config, s3_ctx)
         return self.s3_source_overrides(s3_source)
 
     def s3_source_overrides(self, source: S3Source) -> S3Source:
datahub/ingestion/source/ge_data_profiler.py:

@@ -1213,26 +1213,34 @@ class DatahubGEProfiler:
             f"Will profile {len(requests)} table(s) with {max_workers} worker(s) - this may take a while"
         )
 
-        with …
-            …
+        with (
+            PerfTimer() as timer,
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_unique_count",
+                get_column_unique_count_dh_patch,
+            ),
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_bigquery",
+                _get_column_quantiles_bigquery_patch,
+            ),
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_awsathena",
+                _get_column_quantiles_awsathena_patch,
+            ),
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_median",
+                _get_column_median_patch,
+            ),
+            concurrent.futures.ThreadPoolExecutor(
+                max_workers=max_workers
+            ) as async_executor,
+            SQLAlchemyQueryCombiner(
+                enabled=self.config.query_combiner_enabled,
+                catch_exceptions=self.config.catch_exceptions,
+                is_single_row_query_method=_is_single_row_query_method,
+                serial_execution_fallback_enabled=True,
+            ).activate() as query_combiner,
+        ):
             # Submit the profiling requests to the thread pool executor.
             async_profiles = collections.deque(
                 async_executor.submit(
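
The profiler hunk above collapses a long chain of context managers into a single parenthesized with statement, a syntax that is officially supported from Python 3.10. A minimal sketch of the same construct (the file names are hypothetical, for illustration only):

# Parenthesized context managers: all managers are entered left to right and
# exited in reverse order, exactly like equivalent nested `with` blocks.
with (
    open("a.txt") as first,   # hypothetical input file
    open("b.txt") as second,  # hypothetical input file
):
    combined = first.read() + second.read()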
datahub/ingestion/source/hex/api.py:

@@ -5,7 +5,9 @@ from typing import Any, Dict, Generator, List, Optional, Union
 
 import requests
 from pydantic import BaseModel, Field, ValidationError, validator
+from requests.adapters import HTTPAdapter
 from typing_extensions import assert_never
+from urllib3.util.retry import Retry
 
 from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.source.hex.constants import (
@@ -220,6 +222,7 @@ class HexApi:
         self.base_url = base_url
         self.report = report
         self.page_size = page_size
+        self.session = self._create_retry_session()
 
     def _list_projects_url(self):
         return f"{self.base_url}/projects"
@@ -227,6 +230,28 @@ class HexApi:
     def _auth_header(self):
         return {"Authorization": f"Bearer {self.token}"}
 
+    def _create_retry_session(self) -> requests.Session:
+        """Create a requests session with retry logic for rate limiting.
+
+        Hex API rate limit: 60 requests per minute
+        https://learn.hex.tech/docs/api/api-overview#kernel-and-rate-limits
+        """
+        session = requests.Session()
+
+        # Configure retry strategy for 429 (Too Many Requests) with exponential backoff
+        retry_strategy = Retry(
+            total=5,  # Maximum number of retries
+            status_forcelist=[429],  # Only retry on 429 status code
+            backoff_factor=2,  # Exponential backoff: 2, 4, 8, 16, 32 seconds
+            raise_on_status=True,  # Raise exception after max retries
+        )
+
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        session.mount("http://", adapter)
+        session.mount("https://", adapter)
+
+        return session
+
     def fetch_projects(
         self,
         include_components: bool = True,
@@ -259,7 +284,7 @@ class HexApi:
             logger.debug(f"Fetching projects page with params: {params}")
             self.report.fetch_projects_page_calls += 1
             try:
-                response = …
+                response = self.session.get(
                     url=self._list_projects_url(),
                     headers=self._auth_header(),
                     params=params,
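
The Hex hunks above wire a urllib3 Retry policy into the requests session so that HTTP 429 responses from the Hex API are retried with exponential backoff instead of failing the ingestion run. A standalone sketch of the same pattern; the URL below is a hypothetical placeholder, not a real Hex endpoint:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
retry = Retry(
    total=5,                 # give up after five retries
    status_forcelist=[429],  # only retry rate-limit responses
    backoff_factor=2,        # exponential backoff between attempts
    raise_on_status=True,    # surface an error once retries are exhausted
)
session.mount("https://", HTTPAdapter(max_retries=retry))

# Requests that keep returning 429 are retried transparently by urllib3; once
# the retry budget is spent, requests raises requests.exceptions.RetryError.
response = session.get("https://api.example.com/v1/projects", timeout=30)
response.raise_for_status()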
datahub/ingestion/source/identity/azure_ad.py:

@@ -167,7 +167,7 @@ class AzureADSourceReport(StaleEntityRemovalSourceReport):
 @config_class(AzureADConfig)
 @support_status(SupportStatus.CERTIFIED)
 @capability(
-    SourceCapability.DELETION_DETECTION, "…
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
 )
 class AzureADSource(StatefulIngestionSourceBase):
     """
datahub/ingestion/source/identity/okta.py:

@@ -41,7 +41,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import (
 )
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import (
-    ChangeTypeClass,
     CorpGroupInfoClass,
     CorpUserInfoClass,
     GroupMembershipClass,
@@ -202,7 +201,7 @@ class OktaSourceReport(StaleEntityRemovalSourceReport):
 @support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.DESCRIPTIONS, "Optionally enabled via configuration")
 @capability(
-    SourceCapability.DELETION_DETECTION, "…
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
 )
 class OktaSource(StatefulIngestionSourceBase):
     """
@@ -332,18 +331,12 @@ class OktaSource(StatefulIngestionSourceBase):
             yield MetadataWorkUnit(id=wu_id, mce=mce)
 
             yield MetadataChangeProposalWrapper(
-                entityType="corpGroup",
                 entityUrn=datahub_corp_group_snapshot.urn,
-                changeType=ChangeTypeClass.UPSERT,
-                aspectName="origin",
                 aspect=OriginClass(OriginTypeClass.EXTERNAL, "OKTA"),
             ).as_workunit()
 
             yield MetadataChangeProposalWrapper(
-                entityType="corpGroup",
                 entityUrn=datahub_corp_group_snapshot.urn,
-                changeType=ChangeTypeClass.UPSERT,
-                aspectName="status",
                 aspect=StatusClass(removed=False),
             ).as_workunit()
 
@@ -418,18 +411,12 @@ class OktaSource(StatefulIngestionSourceBase):
|
|
|
418
411
|
yield MetadataWorkUnit(id=wu_id, mce=mce)
|
|
419
412
|
|
|
420
413
|
yield MetadataChangeProposalWrapper(
|
|
421
|
-
entityType="corpuser",
|
|
422
414
|
entityUrn=datahub_corp_user_snapshot.urn,
|
|
423
|
-
changeType=ChangeTypeClass.UPSERT,
|
|
424
|
-
aspectName="origin",
|
|
425
415
|
aspect=OriginClass(OriginTypeClass.EXTERNAL, "OKTA"),
|
|
426
416
|
).as_workunit()
|
|
427
417
|
|
|
428
418
|
yield MetadataChangeProposalWrapper(
|
|
429
|
-
entityType="corpuser",
|
|
430
419
|
entityUrn=datahub_corp_user_snapshot.urn,
|
|
431
|
-
changeType=ChangeTypeClass.UPSERT,
|
|
432
|
-
aspectName="status",
|
|
433
420
|
aspect=StatusClass(removed=False),
|
|
434
421
|
).as_workunit()
|
|
435
422
|
|