acryl-datahub 1.1.0.4rc2__py3-none-any.whl → 1.1.0.5__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/METADATA +2528 -2530
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/RECORD +156 -138
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -1
- datahub/cli/check_cli.py +65 -11
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +3 -4
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +149 -227
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +50 -7
- datahub/cli/specific/assertions_cli.py +0 -4
- datahub/cli/specific/datacontract_cli.py +0 -3
- datahub/cli/specific/dataproduct_cli.py +0 -11
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +0 -2
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/emitter/rest_emitter.py +41 -8
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +332 -3
- datahub/ingestion/api/sink.py +3 -0
- datahub/ingestion/api/source.py +47 -45
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3449 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/graph/client.py +73 -30
- datahub/ingestion/run/pipeline.py +54 -2
- datahub/ingestion/sink/datahub_rest.py +12 -0
- datahub/ingestion/source/abs/source.py +1 -1
- datahub/ingestion/source/aws/glue.py +1 -1
- datahub/ingestion/source/azure/azure_common.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +49 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +45 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +7 -2
- datahub/ingestion/source/dbt/dbt_common.py +3 -1
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_config.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
- datahub/ingestion/source/dremio/dremio_source.py +94 -81
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +76 -28
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/mlflow.py +11 -1
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
- datahub/ingestion/source/powerbi/powerbi.py +0 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/preset.py +2 -2
- datahub/ingestion/source/redshift/redshift.py +17 -0
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/sigma/sigma.py +6 -1
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +27 -1
- datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +14 -2
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +119 -12
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/sql/hive_metastore.py +0 -10
- datahub/ingestion/source/sql/mssql/source.py +24 -15
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/sql_common.py +11 -0
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +997 -235
- datahub/ingestion/source/sql/vertica.py +10 -6
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
- datahub/ingestion/source/superset.py +57 -2
- datahub/ingestion/source/tableau/tableau.py +57 -37
- datahub/ingestion/source/tableau/tableau_common.py +4 -2
- datahub/ingestion/source/tableau/tableau_constant.py +0 -4
- datahub/ingestion/source/unity/proxy.py +4 -3
- datahub/ingestion/source/unity/source.py +56 -30
- datahub/ingestion/source/usage/clickhouse_usage.py +1 -0
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +1253 -536
- datahub/metadata/_urns/urn_defs.py +1797 -1685
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +16614 -16538
- datahub/metadata/schemas/ContainerProperties.avsc +2 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +2 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/DataJobInfo.avsc +2 -0
- datahub/metadata/schemas/DataProcessKey.avsc +2 -0
- datahub/metadata/schemas/DatasetKey.avsc +4 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +2 -0
- datahub/metadata/schemas/LogicalParent.avsc +140 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +2 -0
- datahub/metadata/schemas/MLModelKey.avsc +2 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +2 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/sdk/datajob.py +39 -15
- datahub/sdk/lineage_client.py +2 -0
- datahub/sdk/main_client.py +14 -2
- datahub/sdk/search_client.py +4 -3
- datahub/specific/dataproduct.py +4 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
- datahub/sql_parsing/sqlglot_lineage.py +40 -13
- datahub/telemetry/telemetry.py +17 -11
- datahub/upgrade/upgrade.py +46 -13
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/dremio/dremio_sql_queries.py

@@ -1,3 +1,7 @@
+from datetime import datetime, timedelta
+from typing import Optional
+
+
 class DremioSQLQueries:
     QUERY_DATASETS_CE = """
         SELECT* FROM
@@ -235,28 +239,83 @@ class DremioSQLQueries:
            TABLE_NAME ASC
    """

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    @staticmethod
+    def _get_default_start_timestamp_millis() -> str:
+        """Get default start timestamp (1 day ago) in milliseconds precision format"""
+        one_day_ago = datetime.now() - timedelta(days=1)
+        return one_day_ago.strftime("%Y-%m-%d %H:%M:%S.%f")[
+            :-3
+        ]  # Truncate to milliseconds
+
+    @staticmethod
+    def _get_default_end_timestamp_millis() -> str:
+        """Get default end timestamp (now) in milliseconds precision format"""
+        now = datetime.now()
+        return now.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]  # Truncate to milliseconds
+
+    @staticmethod
+    def get_query_all_jobs(
+        start_timestamp_millis: Optional[str] = None,
+        end_timestamp_millis: Optional[str] = None,
+    ) -> str:
+        """
+        Get query for all jobs with optional time filtering.
+
+        Args:
+            start_timestamp_millis: Start timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to 1 day ago)
+            end_timestamp_millis: End timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to now)
+
+        Returns:
+            SQL query string with time filtering applied
+        """
+        if start_timestamp_millis is None:
+            start_timestamp_millis = (
+                DremioSQLQueries._get_default_start_timestamp_millis()
+            )
+        if end_timestamp_millis is None:
+            end_timestamp_millis = DremioSQLQueries._get_default_end_timestamp_millis()
+
+        return f"""
+        SELECT
+            job_id,
+            user_name,
+            submitted_ts,
+            query,
+            queried_datasets
+        FROM
+            SYS.JOBS_RECENT
+        WHERE
+            STATUS = 'COMPLETED'
+            AND LENGTH(queried_datasets)>0
+            AND user_name != '$dremio$'
+            AND query_type not like '%INTERNAL%'
+            AND submitted_ts >= TIMESTAMP '{start_timestamp_millis}'
+            AND submitted_ts <= TIMESTAMP '{end_timestamp_millis}'
+        """
+
+    @staticmethod
+    def get_query_all_jobs_cloud(
+        start_timestamp_millis: Optional[str] = None,
+        end_timestamp_millis: Optional[str] = None,
+    ) -> str:
+        """
+        Get query for all jobs in Dremio Cloud with optional time filtering.
+
+        Args:
+            start_timestamp_millis: Start timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to 7 days ago)
+            end_timestamp_millis: End timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to now)
+
+        Returns:
+            SQL query string with time filtering applied
+        """
+        if start_timestamp_millis is None:
+            start_timestamp_millis = (
+                DremioSQLQueries._get_default_start_timestamp_millis()
+            )
+        if end_timestamp_millis is None:
+            end_timestamp_millis = DremioSQLQueries._get_default_end_timestamp_millis()

-
-    # queried_datasets correctly documented as [varchar]
-    QUERY_ALL_JOBS_CLOUD = """
+        return f"""
        SELECT
            job_id,
            user_name,
@@ -270,6 +329,8 @@ class DremioSQLQueries:
            AND ARRAY_SIZE(queried_datasets)>0
            AND user_name != '$dremio$'
            AND query_type not like '%INTERNAL%'
+            AND submitted_ts >= TIMESTAMP '{start_timestamp_millis}'
+            AND submitted_ts <= TIMESTAMP '{end_timestamp_millis}'
    """

    QUERY_TYPES = [
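
The new Dremio job query builders above replace static query strings with time-filtered variants. A minimal usage sketch, assuming the explicit six-hour window below is only illustrative (the `_millis` helper is defined here just for the sketch; with no arguments, the defaults added in this release cover roughly the last day):

    from datetime import datetime, timedelta

    from datahub.ingestion.source.dremio.dremio_sql_queries import DremioSQLQueries

    def _millis(ts: datetime) -> str:
        # Same 'YYYY-MM-DD HH:MM:SS.mmm' format the new helpers produce.
        return ts.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]

    end = datetime.now()
    start = end - timedelta(hours=6)  # illustrative window

    # Explicit window for Dremio Software (SYS.JOBS_RECENT).
    sql = DremioSQLQueries.get_query_all_jobs(
        start_timestamp_millis=_millis(start),
        end_timestamp_millis=_millis(end),
    )

    # Without arguments, the default window (about the last day) applies.
    default_sql = DremioSQLQueries.get_query_all_jobs()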

datahub/ingestion/source/fivetran/fivetran.py

@@ -1,8 +1,8 @@
 import logging
-from typing import Dict, Iterable, List, Optional
+from typing import Dict, Iterable, List, Optional, Union

 import datahub.emitter.mce_builder as builder
-from datahub.api.entities.datajob import
+from datahub.api.entities.datajob import DataJob as DataJobV1
 from datahub.api.entities.dataprocess.dataprocess_instance import (
     DataProcessInstance,
     InstanceRunResult,
@@ -42,8 +42,10 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     FineGrainedLineageDownstreamType,
     FineGrainedLineageUpstreamType,
 )
-from datahub.
-from datahub.
+from datahub.metadata.urns import CorpUserUrn, DataFlowUrn, DatasetUrn
+from datahub.sdk.dataflow import DataFlow
+from datahub.sdk.datajob import DataJob
+from datahub.sdk.entity import Entity

 # Logger instance
 logger = logging.getLogger(__name__)
@@ -75,8 +77,8 @@ class FivetranSource(StatefulIngestionSourceBase):
         self.audit_log = FivetranLogAPI(self.config.fivetran_log_config)

     def _extend_lineage(self, connector: Connector, datajob: DataJob) -> Dict[str, str]:
-        input_dataset_urn_list: List[DatasetUrn] = []
-        output_dataset_urn_list: List[DatasetUrn] = []
+        input_dataset_urn_list: List[Union[str, DatasetUrn]] = []
+        output_dataset_urn_list: List[Union[str, DatasetUrn]] = []
         fine_grained_lineage: List[FineGrainedLineage] = []

         # TODO: Once Fivetran exposes the database via the API, we shouldn't ask for it via config.
@@ -178,9 +180,9 @@ class FivetranSource(StatefulIngestionSourceBase):
             )
         )

-        datajob.
-        datajob.
-        datajob.
+        datajob.set_inlets(input_dataset_urn_list)
+        datajob.set_outlets(output_dataset_urn_list)
+        datajob.set_fine_grained_lineages(fine_grained_lineage)

         return dict(
             **{
@@ -197,10 +199,10 @@ class FivetranSource(StatefulIngestionSourceBase):

     def _generate_dataflow_from_connector(self, connector: Connector) -> DataFlow:
         return DataFlow(
-
-
+            platform=Constant.ORCHESTRATOR,
+            name=connector.connector_id,
             env=self.config.env,
-
+            display_name=connector.connector_name,
             platform_instance=self.config.platform_instance,
         )

@@ -213,11 +215,11 @@ class FivetranSource(StatefulIngestionSourceBase):
         )
         owner_email = self.audit_log.get_user_email(connector.user_id)
         datajob = DataJob(
-
+            name=connector.connector_id,
             flow_urn=dataflow_urn,
             platform_instance=self.config.platform_instance,
-
-            owners=
+            display_name=connector.connector_name,
+            owners=[CorpUserUrn(owner_email)] if owner_email else None,
         )

         # Map connector source and destination table with dataset entity
@@ -232,16 +234,24 @@ class FivetranSource(StatefulIngestionSourceBase):
             "sync_frequency": str(connector.sync_frequency),
             "destination_id": connector.destination_id,
         }
-
-
-            **lineage_properties,
-        }
+
+        datajob.set_custom_properties({**connector_properties, **lineage_properties})

         return datajob

     def _generate_dpi_from_job(self, job: Job, datajob: DataJob) -> DataProcessInstance:
+        # hack: convert to old instance for DataProcessInstance.from_datajob compatibility
+        datajob_v1 = DataJobV1(
+            id=datajob.name,
+            flow_urn=datajob.flow_urn,
+            platform_instance=self.config.platform_instance,
+            name=datajob.name,
+            inlets=datajob.inlets,
+            outlets=datajob.outlets,
+            fine_grained_lineages=datajob.fine_grained_lineages,
+        )
         return DataProcessInstance.from_datajob(
-            datajob=
+            datajob=datajob_v1,
             id=job.job_id,
             clone_inlets=True,
             clone_outlets=True,
@@ -278,17 +288,15 @@ class FivetranSource(StatefulIngestionSourceBase):

     def _get_connector_workunits(
         self, connector: Connector
-    ) -> Iterable[MetadataWorkUnit]:
+    ) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         self.report.report_connectors_scanned()
         # Create dataflow entity with same name as connector name
         dataflow = self._generate_dataflow_from_connector(connector)
-
-        yield mcp.as_workunit()
+        yield dataflow

         # Map Fivetran's connector entity with Datahub's datajob entity
         datajob = self._generate_datajob_from_connector(connector)
-
-        yield mcp.as_workunit()
+        yield datajob

         # Map Fivetran's job/sync history entity with Datahub's data process entity
         if len(connector.jobs) >= MAX_JOBS_PER_CONNECTOR:
@@ -310,7 +318,7 @@ class FivetranSource(StatefulIngestionSourceBase):
             ).workunit_processor,
         ]

-    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+    def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         """
         Datahub Ingestion framework invoke this method
         """

datahub/ingestion/source/gcs/gcs_source.py

@@ -16,6 +16,7 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceCapability
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.data_lake_common.config import PathSpecsConfigMixin
 from datahub.ingestion.source.data_lake_common.data_lake_utils import PLATFORM_GCS
 from datahub.ingestion.source.data_lake_common.object_store import (
@@ -82,7 +83,14 @@ class GCSSourceReport(DataLakeSourceReport):
 @platform_name("Google Cloud Storage", id=PLATFORM_GCS)
 @config_class(GCSSourceConfig)
 @support_status(SupportStatus.INCUBATING)
-@capability(
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.GCS_BUCKET,
+        SourceCapabilityModifier.FOLDER,
+    ],
+)
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @capability(SourceCapability.DATA_PROFILING, "Not supported", supported=False)
 class GCSSource(StatefulIngestionSourceBase):
@@ -112,6 +120,7 @@ class GCSSource(StatefulIngestionSourceBase):
             env=self.config.env,
             max_rows=self.config.max_rows,
             number_of_files_to_sample=self.config.number_of_files_to_sample,
+            platform=PLATFORM_GCS,  # Ensure GCS platform is used for correct container subtypes
         )
         return s3_config

@@ -138,7 +147,9 @@ class GCSSource(StatefulIngestionSourceBase):

     def create_equivalent_s3_source(self, ctx: PipelineContext) -> S3Source:
         config = self.create_equivalent_s3_config()
-
+        # Create a new context for S3 source without graph to avoid duplicate checkpointer registration
+        s3_ctx = PipelineContext(run_id=ctx.run_id, pipeline_name=ctx.pipeline_name)
+        s3_source = S3Source(config, s3_ctx)
         return self.s3_source_overrides(s3_source)

     def s3_source_overrides(self, source: S3Source) -> S3Source:
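
The create_equivalent_s3_source change above builds the delegated S3Source with a fresh PipelineContext rather than reusing the GCS one. A minimal sketch of that pattern, assuming only run_id and pipeline_name need to carry over (the values are hypothetical):

    from datahub.ingestion.api.common import PipelineContext

    # Hypothetical parent context, standing in for the GCS source's own ctx.
    gcs_ctx = PipelineContext(run_id="gcs-run-1", pipeline_name="gcs_ingestion")

    # Fresh context for the delegated S3 source: same run_id and pipeline_name,
    # but no graph, so a second stateful-ingestion checkpointer is not registered.
    s3_ctx = PipelineContext(run_id=gcs_ctx.run_id, pipeline_name=gcs_ctx.pipeline_name)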

datahub/ingestion/source/ge_data_profiler.py

@@ -120,7 +120,6 @@ SNOWFLAKE = "snowflake"
 BIGQUERY = "bigquery"
 REDSHIFT = "redshift"
 DATABRICKS = "databricks"
-TRINO = "trino"

 # Type names for Databricks, to match Title Case types in sqlalchemy
 ProfilerTypeMapping.INT_TYPE_NAMES.append("Integer")
@@ -206,6 +205,17 @@ def get_column_unique_count_dh_patch(self: SqlAlchemyDataset, column: str) -> in
            )
        )
        return convert_to_json_serializable(element_values.fetchone()[0])
+    elif (
+        self.engine.dialect.name.lower() == GXSqlDialect.AWSATHENA
+        or self.engine.dialect.name.lower() == GXSqlDialect.TRINO
+    ):
+        return convert_to_json_serializable(
+            self.engine.execute(
+                sa.select(sa.func.approx_distinct(sa.column(column))).select_from(
+                    self._table
+                )
+            ).scalar()
+        )
    return convert_to_json_serializable(
        self.engine.execute(
            sa.select([sa.func.count(sa.func.distinct(sa.column(column)))]).select_from(
@@ -734,11 +744,41 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
     def _get_dataset_column_distinct_value_frequencies(
         self, column_profile: DatasetFieldProfileClass, column: str
     ) -> None:
-        if self.config.include_field_distinct_value_frequencies:
+        if not self.config.include_field_distinct_value_frequencies:
+            return
+        try:
+            results = self.dataset.engine.execute(
+                sa.select(
+                    [
+                        sa.column(column),
+                        sa.func.count(sa.column(column)),
+                    ]
+                )
+                .select_from(self.dataset._table)
+                .where(sa.column(column).is_not(None))
+                .group_by(sa.column(column))
+            ).fetchall()
+
             column_profile.distinctValueFrequencies = [
-                ValueFrequencyClass(value=str(value), frequency=count)
-                for value, count in
+                ValueFrequencyClass(value=str(value), frequency=int(count))
+                for value, count in results
             ]
+            # sort so output is deterministic. don't do it in SQL because not all column
+            # types are sortable in SQL (such as JSON data types on Athena/Trino).
+            column_profile.distinctValueFrequencies = sorted(
+                column_profile.distinctValueFrequencies, key=lambda x: x.value
+            )
+        except Exception as e:
+            logger.debug(
+                f"Caught exception while attempting to get distinct value frequencies for column {column}. {e}"
+            )
+
+            self.report.report_warning(
+                title="Profiling: Unable to Calculate Distinct Value Frequencies",
+                message="Distinct value frequencies for the column will not be accessible",
+                context=f"{self.dataset_name}.{column}",
+                exc=e,
+            )

     @_run_with_query_combiner
     def _get_dataset_column_histogram(
@@ -1173,26 +1213,34 @@ class DatahubGEProfiler:
            f"Will profile {len(requests)} table(s) with {max_workers} worker(s) - this may take a while"
        )

-        with
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        with (
+            PerfTimer() as timer,
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_unique_count",
+                get_column_unique_count_dh_patch,
+            ),
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_bigquery",
+                _get_column_quantiles_bigquery_patch,
+            ),
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_awsathena",
+                _get_column_quantiles_awsathena_patch,
+            ),
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_median",
+                _get_column_median_patch,
+            ),
+            concurrent.futures.ThreadPoolExecutor(
+                max_workers=max_workers
+            ) as async_executor,
+            SQLAlchemyQueryCombiner(
+                enabled=self.config.query_combiner_enabled,
+                catch_exceptions=self.config.catch_exceptions,
+                is_single_row_query_method=_is_single_row_query_method,
+                serial_execution_fallback_enabled=True,
+            ).activate() as query_combiner,
+        ):
            # Submit the profiling requests to the thread pool executor.
            async_profiles = collections.deque(
                async_executor.submit(
@@ -1395,12 +1443,12 @@ class DatahubGEProfiler:
            )
            return None
        finally:
-            if batch is not None and self.base_engine.engine.name.
-
-
+            if batch is not None and self.base_engine.engine.name.lower() in [
+                GXSqlDialect.TRINO,
+                GXSqlDialect.AWSATHENA,
            ]:
                if (
-                    self.base_engine.engine.name.
+                    self.base_engine.engine.name.lower() == GXSqlDialect.TRINO
                    or temp_view is not None
                ):
                    self._drop_temp_table(batch)
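
The get_column_unique_count_dh_patch change above switches Athena and Trino to approx_distinct instead of an exact COUNT(DISTINCT ...). A standalone SQLAlchemy sketch of the two expressions, independent of the Great Expectations internals patched above (table and column names are hypothetical; assumes SQLAlchemy 1.4+):

    import sqlalchemy as sa

    events = sa.table("events", sa.column("user_id"))  # hypothetical table

    # Exact distinct count, used for most dialects.
    exact = sa.select(
        sa.func.count(sa.func.distinct(sa.column("user_id")))
    ).select_from(events)

    # Approximate distinct count; Trino/Athena compute this with a sketch
    # (HyperLogLog), which is much cheaper on large tables.
    approx = sa.select(
        sa.func.approx_distinct(sa.column("user_id"))
    ).select_from(events)

    print(exact)   # roughly: SELECT count(distinct(user_id)) FROM events
    print(approx)  # roughly: SELECT approx_distinct(user_id) FROM events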

datahub/ingestion/source/hex/api.py

@@ -5,7 +5,9 @@ from typing import Any, Dict, Generator, List, Optional, Union

 import requests
 from pydantic import BaseModel, Field, ValidationError, validator
+from requests.adapters import HTTPAdapter
 from typing_extensions import assert_never
+from urllib3.util.retry import Retry

 from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.source.hex.constants import (
@@ -220,6 +222,7 @@ class HexApi:
         self.base_url = base_url
         self.report = report
         self.page_size = page_size
+        self.session = self._create_retry_session()

     def _list_projects_url(self):
         return f"{self.base_url}/projects"
@@ -227,6 +230,28 @@ class HexApi:
     def _auth_header(self):
         return {"Authorization": f"Bearer {self.token}"}

+    def _create_retry_session(self) -> requests.Session:
+        """Create a requests session with retry logic for rate limiting.
+
+        Hex API rate limit: 60 requests per minute
+        https://learn.hex.tech/docs/api/api-overview#kernel-and-rate-limits
+        """
+        session = requests.Session()
+
+        # Configure retry strategy for 429 (Too Many Requests) with exponential backoff
+        retry_strategy = Retry(
+            total=5,  # Maximum number of retries
+            status_forcelist=[429],  # Only retry on 429 status code
+            backoff_factor=2,  # Exponential backoff: 2, 4, 8, 16, 32 seconds
+            raise_on_status=True,  # Raise exception after max retries
+        )
+
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        session.mount("http://", adapter)
+        session.mount("https://", adapter)
+
+        return session
+
     def fetch_projects(
         self,
         include_components: bool = True,
@@ -259,7 +284,7 @@ class HexApi:
             logger.debug(f"Fetching projects page with params: {params}")
             self.report.fetch_projects_page_calls += 1
             try:
-                response =
+                response = self.session.get(
                     url=self._list_projects_url(),
                     headers=self._auth_header(),
                     params=params,
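
The HexApi retry session added above is plain requests/urllib3 machinery. A self-contained sketch of the same pattern, assuming a placeholder endpoint (any rate-limited HTTP API works the same way):

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    session = requests.Session()
    retry = Retry(
        total=5,                 # give up after five retries
        status_forcelist=[429],  # retry only on "Too Many Requests"
        backoff_factor=2,        # exponential backoff between attempts
        raise_on_status=True,    # surface an error once retries are exhausted
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)

    # Placeholder URL for illustration only.
    response = session.get("https://api.example.com/projects", timeout=30)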

datahub/ingestion/source/identity/azure_ad.py

@@ -167,7 +167,7 @@ class AzureADSourceReport(StaleEntityRemovalSourceReport):
 @config_class(AzureADConfig)
 @support_status(SupportStatus.CERTIFIED)
 @capability(
-    SourceCapability.DELETION_DETECTION, "
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
 )
 class AzureADSource(StatefulIngestionSourceBase):
     """

datahub/ingestion/source/identity/okta.py

@@ -41,7 +41,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import (
 )
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import (
-    ChangeTypeClass,
     CorpGroupInfoClass,
     CorpUserInfoClass,
     GroupMembershipClass,
@@ -202,7 +201,7 @@ class OktaSourceReport(StaleEntityRemovalSourceReport):
 @support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.DESCRIPTIONS, "Optionally enabled via configuration")
 @capability(
-    SourceCapability.DELETION_DETECTION, "
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
 )
 class OktaSource(StatefulIngestionSourceBase):
     """
@@ -332,18 +331,12 @@ class OktaSource(StatefulIngestionSourceBase):
            yield MetadataWorkUnit(id=wu_id, mce=mce)

            yield MetadataChangeProposalWrapper(
-                entityType="corpGroup",
                entityUrn=datahub_corp_group_snapshot.urn,
-                changeType=ChangeTypeClass.UPSERT,
-                aspectName="origin",
                aspect=OriginClass(OriginTypeClass.EXTERNAL, "OKTA"),
            ).as_workunit()

            yield MetadataChangeProposalWrapper(
-                entityType="corpGroup",
                entityUrn=datahub_corp_group_snapshot.urn,
-                changeType=ChangeTypeClass.UPSERT,
-                aspectName="status",
                aspect=StatusClass(removed=False),
            ).as_workunit()

@@ -418,18 +411,12 @@ class OktaSource(StatefulIngestionSourceBase):
            yield MetadataWorkUnit(id=wu_id, mce=mce)

            yield MetadataChangeProposalWrapper(
-                entityType="corpuser",
                entityUrn=datahub_corp_user_snapshot.urn,
-                changeType=ChangeTypeClass.UPSERT,
-                aspectName="origin",
                aspect=OriginClass(OriginTypeClass.EXTERNAL, "OKTA"),
            ).as_workunit()

            yield MetadataChangeProposalWrapper(
-                entityType="corpuser",
                entityUrn=datahub_corp_user_snapshot.urn,
-                changeType=ChangeTypeClass.UPSERT,
-                aspectName="status",
                aspect=StatusClass(removed=False),
            ).as_workunit()

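
The Okta cleanup above (and the matching Azure AD change) works because MetadataChangeProposalWrapper infers entityType from the URN and aspectName from the aspect, with changeType defaulting to UPSERT, so the removed arguments were redundant. A minimal sketch of the simplified form, assuming a hypothetical URN:

    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.metadata.schema_classes import StatusClass

    # entityType, aspectName, and changeType (UPSERT) are inferred from the URN and aspect.
    mcp = MetadataChangeProposalWrapper(
        entityUrn="urn:li:corpuser:jdoe",  # hypothetical URN
        aspect=StatusClass(removed=False),
    )
    workunit = mcp.as_workunit()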