acryl-datahub 1.1.1rc3__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff shows the changes between publicly available package versions as they were released to their public registry. It is provided for informational purposes only.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/METADATA +2559 -2532
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/RECORD +226 -190
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/entry_points.txt +2 -0
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +239 -0
- datahub/api/entities/external/external_tag.py +145 -0
- datahub/api/entities/external/lake_formation_external_entites.py +161 -0
- datahub/api/entities/external/restricted_text.py +247 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +124 -27
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +149 -227
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +12 -16
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +50 -7
- datahub/cli/specific/assertions_cli.py +0 -4
- datahub/cli/specific/datacontract_cli.py +0 -3
- datahub/cli/specific/dataproduct_cli.py +0 -11
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +0 -2
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/emitter/response_helper.py +86 -1
- datahub/emitter/rest_emitter.py +71 -13
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +332 -3
- datahub/ingestion/api/sink.py +3 -0
- datahub/ingestion/api/source.py +48 -44
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3449 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +100 -15
- datahub/ingestion/graph/config.py +1 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
- datahub/ingestion/run/pipeline.py +54 -2
- datahub/ingestion/sink/datahub_rest.py +13 -0
- datahub/ingestion/source/abs/source.py +1 -1
- datahub/ingestion/source/aws/aws_common.py +4 -0
- datahub/ingestion/source/aws/glue.py +489 -244
- datahub/ingestion/source/aws/tag_entities.py +292 -0
- datahub/ingestion/source/azure/azure_common.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +45 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/datahub/config.py +11 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +187 -35
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
- datahub/ingestion/source/dbt/dbt_common.py +6 -2
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_config.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
- datahub/ingestion/source/dremio/dremio_source.py +94 -81
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +76 -28
- datahub/ingestion/source/ge_profiling_config.py +11 -0
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/iceberg/iceberg.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/looker/looker_source.py +1 -0
- datahub/ingestion/source/mlflow.py +11 -1
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/openapi.py +12 -0
- datahub/ingestion/source/openapi_parser.py +56 -37
- datahub/ingestion/source/powerbi/powerbi.py +1 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/preset.py +2 -2
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +21 -1
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +43 -7
- datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
- datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
- datahub/ingestion/source/snowflake/snowflake_v2.py +33 -8
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +119 -11
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/sql/clickhouse.py +3 -1
- datahub/ingestion/source/sql/cockroachdb.py +0 -1
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive_metastore.py +3 -11
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/source.py +239 -34
- datahub/ingestion/source/sql/mysql.py +0 -1
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/postgres.py +0 -1
- datahub/ingestion/source/sql/sql_common.py +121 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +997 -235
- datahub/ingestion/source/sql/vertica.py +10 -6
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
- datahub/ingestion/source/superset.py +58 -3
- datahub/ingestion/source/tableau/tableau.py +58 -37
- datahub/ingestion/source/tableau/tableau_common.py +4 -2
- datahub/ingestion/source/tableau/tableau_constant.py +0 -4
- datahub/ingestion/source/unity/config.py +5 -0
- datahub/ingestion/source/unity/proxy.py +118 -0
- datahub/ingestion/source/unity/source.py +195 -17
- datahub/ingestion/source/unity/tag_entities.py +295 -0
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +1446 -559
- datahub/metadata/_urns/urn_defs.py +1721 -1553
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +18055 -17802
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +1 -0
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
- datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/LogicalParent.avsc +140 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/sdk/__init__.py +6 -0
- datahub/sdk/_all_entities.py +11 -0
- datahub/sdk/_shared.py +118 -1
- datahub/sdk/chart.py +315 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +432 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +8 -2
- datahub/sdk/entity_client.py +90 -2
- datahub/sdk/lineage_client.py +683 -82
- datahub/sdk/main_client.py +46 -16
- datahub/sdk/mlmodel.py +101 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +4 -3
- datahub/specific/chart.py +1 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
- datahub/sql_parsing/sqlglot_lineage.py +62 -13
- datahub/telemetry/telemetry.py +17 -11
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +46 -13
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/ge_data_profiler.py +76 -28

@@ -120,7 +120,6 @@ SNOWFLAKE = "snowflake"
 BIGQUERY = "bigquery"
 REDSHIFT = "redshift"
 DATABRICKS = "databricks"
-TRINO = "trino"

 # Type names for Databricks, to match Title Case types in sqlalchemy
 ProfilerTypeMapping.INT_TYPE_NAMES.append("Integer")
@@ -206,6 +205,17 @@ def get_column_unique_count_dh_patch(self: SqlAlchemyDataset, column: str) -> int:
             )
         )
         return convert_to_json_serializable(element_values.fetchone()[0])
+    elif (
+        self.engine.dialect.name.lower() == GXSqlDialect.AWSATHENA
+        or self.engine.dialect.name.lower() == GXSqlDialect.TRINO
+    ):
+        return convert_to_json_serializable(
+            self.engine.execute(
+                sa.select(sa.func.approx_distinct(sa.column(column))).select_from(
+                    self._table
+                )
+            ).scalar()
+        )
     return convert_to_json_serializable(
         self.engine.execute(
             sa.select([sa.func.count(sa.func.distinct(sa.column(column)))]).select_from(
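For context, a minimal standalone sketch (not DataHub code; table and column names are placeholders, and SQLAlchemy 1.4+ select() syntax is assumed) of the kind of query the new Athena/Trino branch builds. sa.func.approx_distinct is a generic SQLAlchemy function call, so it renders as the engine's native approx_distinct():

import sqlalchemy as sa

# Placeholder names: "events" / "user_id" stand in for the profiled table and column.
query = sa.select(sa.func.approx_distinct(sa.column("user_id"))).select_from(
    sa.table("events")
)
print(query)  # roughly: SELECT approx_distinct(user_id) AS ... FROM events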
@@ -734,11 +744,41 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
     def _get_dataset_column_distinct_value_frequencies(
         self, column_profile: DatasetFieldProfileClass, column: str
     ) -> None:
-        if self.config.include_field_distinct_value_frequencies:
+        if not self.config.include_field_distinct_value_frequencies:
+            return
+        try:
+            results = self.dataset.engine.execute(
+                sa.select(
+                    [
+                        sa.column(column),
+                        sa.func.count(sa.column(column)),
+                    ]
+                )
+                .select_from(self.dataset._table)
+                .where(sa.column(column).is_not(None))
+                .group_by(sa.column(column))
+            ).fetchall()
+
             column_profile.distinctValueFrequencies = [
-                ValueFrequencyClass(value=str(value), frequency=count)
-                for value, count in
+                ValueFrequencyClass(value=str(value), frequency=int(count))
+                for value, count in results
             ]
+            # sort so output is deterministic. don't do it in SQL because not all column
+            # types are sortable in SQL (such as JSON data types on Athena/Trino).
+            column_profile.distinctValueFrequencies = sorted(
+                column_profile.distinctValueFrequencies, key=lambda x: x.value
+            )
+        except Exception as e:
+            logger.debug(
+                f"Caught exception while attempting to get distinct value frequencies for column {column}. {e}"
+            )
+
+            self.report.report_warning(
+                title="Profiling: Unable to Calculate Distinct Value Frequencies",
+                message="Distinct value frequencies for the column will not be accessible",
+                context=f"{self.dataset_name}.{column}",
+                exc=e,
+            )

     @_run_with_query_combiner
     def _get_dataset_column_histogram(
@@ -1173,26 +1213,34 @@ class DatahubGEProfiler:
             f"Will profile {len(requests)} table(s) with {max_workers} worker(s) - this may take a while"
         )

-        with PerfTimer() as timer, unittest.mock.patch(
-            "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_unique_count",
-            get_column_unique_count_dh_patch,
-        ), unittest.mock.patch(
-            "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_bigquery",
-            _get_column_quantiles_bigquery_patch,
-        ), unittest.mock.patch(
-            "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_awsathena",
-            _get_column_quantiles_awsathena_patch,
-        ), unittest.mock.patch(
-            "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_median",
-            _get_column_median_patch,
-        ), concurrent.futures.ThreadPoolExecutor(
-            max_workers=max_workers
-        ) as async_executor, SQLAlchemyQueryCombiner(
-            enabled=self.config.query_combiner_enabled,
-            catch_exceptions=self.config.catch_exceptions,
-            is_single_row_query_method=_is_single_row_query_method,
-            serial_execution_fallback_enabled=True,
-        ).activate() as query_combiner:
+        with (
+            PerfTimer() as timer,
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_unique_count",
+                get_column_unique_count_dh_patch,
+            ),
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_bigquery",
+                _get_column_quantiles_bigquery_patch,
+            ),
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_awsathena",
+                _get_column_quantiles_awsathena_patch,
+            ),
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_median",
+                _get_column_median_patch,
+            ),
+            concurrent.futures.ThreadPoolExecutor(
+                max_workers=max_workers
+            ) as async_executor,
+            SQLAlchemyQueryCombiner(
+                enabled=self.config.query_combiner_enabled,
+                catch_exceptions=self.config.catch_exceptions,
+                is_single_row_query_method=_is_single_row_query_method,
+                serial_execution_fallback_enabled=True,
+            ).activate() as query_combiner,
+        ):
             # Submit the profiling requests to the thread pool executor.
             async_profiles = collections.deque(
                 async_executor.submit(
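The hunk above rewrites a long chain of context managers into the parenthesized with (...) form, which is officially supported since Python 3.10. A minimal generic sketch of the same pattern, using stand-in context managers rather than DataHub's:

from concurrent.futures import ThreadPoolExecutor
from contextlib import nullcontext

with (
    nullcontext("timer") as timer,  # stand-in for PerfTimer()
    ThreadPoolExecutor(max_workers=2) as executor,  # same shape as async_executor above
):
    future = executor.submit(len, timer)
    print(future.result())  # 5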
@@ -1395,12 +1443,12 @@ class DatahubGEProfiler:
                 )
                 return None
             finally:
-                if batch is not None and self.base_engine.engine.name.
-
-
+                if batch is not None and self.base_engine.engine.name.lower() in [
+                    GXSqlDialect.TRINO,
+                    GXSqlDialect.AWSATHENA,
                 ]:
                     if (
-                        self.base_engine.engine.name.
+                        self.base_engine.engine.name.lower() == GXSqlDialect.TRINO
                         or temp_view is not None
                     ):
                         self._drop_temp_table(batch)
datahub/ingestion/source/ge_profiling_config.py +11 -0

@@ -125,6 +125,7 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         description="Profile table only if it has been updated since these many number of days. "
         "If set to `null`, no constraint of last modified time for tables to profile. "
         "Supported only in `snowflake` and `BigQuery`.",
+        schema_extra={"supported_sources": ["snowflake", "bigquery"]},
     )

     profile_table_size_limit: Optional[int] = Field(
@@ -132,6 +133,9 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         description="Profile tables only if their size is less than specified GBs. If set to `null`, "
         "no limit on the size of tables to profile. Supported only in `Snowflake`, `BigQuery` and "
         "`Databricks`. Supported for `Oracle` based on calculated size from gathered stats.",
+        schema_extra={
+            "supported_sources": ["snowflake", "bigquery", "unity-catalog", "oracle"]
+        },
     )

     profile_table_row_limit: Optional[int] = Field(
@@ -139,12 +143,14 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         description="Profile tables only if their row count is less than specified count. "
         "If set to `null`, no limit on the row count of tables to profile. Supported only in "
         "`Snowflake`, `BigQuery`. Supported for `Oracle` based on gathered stats.",
+        schema_extra={"supported_sources": ["snowflake", "bigquery", "oracle"]},
     )

     profile_table_row_count_estimate_only: bool = Field(
         default=False,
         description="Use an approximate query for row count. This will be much faster but slightly "
         "less accurate. Only supported for Postgres and MySQL. ",
+        schema_extra={"supported_sources": ["postgres", "mysql"]},
     )

     # The query combiner enables us to combine multiple queries into a single query,
@@ -161,27 +167,32 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         default=True,
         description="Whether to profile partitioned tables. Only BigQuery and Aws Athena supports this. "
         "If enabled, latest partition data is used for profiling.",
+        schema_extra={"supported_sources": ["athena", "bigquery"]},
     )
     partition_datetime: Optional[datetime.datetime] = Field(
         default=None,
         description="If specified, profile only the partition which matches this datetime. "
         "If not specified, profile the latest partition. Only Bigquery supports this.",
+        schema_extra={"supported_sources": ["bigquery"]},
     )
     use_sampling: bool = Field(
         default=True,
         description="Whether to profile column level stats on sample of table. Only BigQuery and Snowflake support this. "
         "If enabled, profiling is done on rows sampled from table. Sampling is not done for smaller tables. ",
+        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
     )

     sample_size: int = Field(
         default=10000,
         description="Number of rows to be sampled from table for column level profiling."
         "Applicable only if `use_sampling` is set to True.",
+        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
     )

     profile_external_tables: bool = Field(
         default=False,
         description="Whether to profile external tables. Only Snowflake and Redshift supports this.",
+        schema_extra={"supported_sources": ["redshift", "snowflake"]},
     )

     tags_to_ignore_sampling: Optional[List[str]] = pydantic.Field(
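The schema_extra={"supported_sources": [...]} arguments above attach per-field metadata to the pydantic fields. A small hypothetical sketch (assuming pydantic v1-style Field behavior, where unrecognized keyword arguments are kept in field_info.extra) of how such metadata can be read back, for example by documentation tooling:

from pydantic import BaseModel, Field

class ProfilingOptions(BaseModel):
    use_sampling: bool = Field(
        default=True,
        description="Profile column-level stats on a sample of the table.",
        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
    )

# In pydantic v1, extra Field kwargs are preserved on the field's FieldInfo.
extra = ProfilingOptions.__fields__["use_sampling"].field_info.extra
print(extra["schema_extra"]["supported_sources"])  # ['bigquery', 'snowflake']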
datahub/ingestion/source/hex/api.py +26 -1

@@ -5,7 +5,9 @@ from typing import Any, Dict, Generator, List, Optional, Union

 import requests
 from pydantic import BaseModel, Field, ValidationError, validator
+from requests.adapters import HTTPAdapter
 from typing_extensions import assert_never
+from urllib3.util.retry import Retry

 from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.source.hex.constants import (
@@ -220,6 +222,7 @@ class HexApi:
         self.base_url = base_url
         self.report = report
         self.page_size = page_size
+        self.session = self._create_retry_session()

     def _list_projects_url(self):
         return f"{self.base_url}/projects"
@@ -227,6 +230,28 @@ class HexApi:
     def _auth_header(self):
         return {"Authorization": f"Bearer {self.token}"}

+    def _create_retry_session(self) -> requests.Session:
+        """Create a requests session with retry logic for rate limiting.
+
+        Hex API rate limit: 60 requests per minute
+        https://learn.hex.tech/docs/api/api-overview#kernel-and-rate-limits
+        """
+        session = requests.Session()
+
+        # Configure retry strategy for 429 (Too Many Requests) with exponential backoff
+        retry_strategy = Retry(
+            total=5,  # Maximum number of retries
+            status_forcelist=[429],  # Only retry on 429 status code
+            backoff_factor=2,  # Exponential backoff: 2, 4, 8, 16, 32 seconds
+            raise_on_status=True,  # Raise exception after max retries
+        )
+
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        session.mount("http://", adapter)
+        session.mount("https://", adapter)
+
+        return session
+
     def fetch_projects(
         self,
         include_components: bool = True,
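A standalone sketch of the same retry pattern in isolation (not DataHub code; the URL is a placeholder): every request sent through the session is transparently retried on HTTP 429 with exponential backoff before the response, or a retry error, reaches the caller.

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
retries = Retry(total=5, status_forcelist=[429], backoff_factor=2, raise_on_status=True)
session.mount("https://", HTTPAdapter(max_retries=retries))

# Placeholder endpoint; a real caller would pass auth headers as in HexApi above.
response = session.get("https://api.example.com/projects", timeout=30)
response.raise_for_status()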
@@ -259,7 +284,7 @@ class HexApi:
             logger.debug(f"Fetching projects page with params: {params}")
             self.report.fetch_projects_page_calls += 1
             try:
-                response =
+                response = self.session.get(
                     url=self._list_projects_url(),
                     headers=self._auth_header(),
                     params=params,
datahub/ingestion/source/iceberg/iceberg.py +3 -1

@@ -134,7 +134,9 @@ logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(
     SourceCapability.OWNERSHIP,
     "Automatically ingests ownership information from table properties based on `user_ownership_property` and `group_ownership_property`",
 )
-@capability(
+@capability(
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
+)
 class IcebergSource(StatefulIngestionSourceBase):
     """
     ## Integration Details
datahub/ingestion/source/identity/azure_ad.py +1 -1

@@ -167,7 +167,7 @@ class AzureADSourceReport(StaleEntityRemovalSourceReport):
 @config_class(AzureADConfig)
 @support_status(SupportStatus.CERTIFIED)
 @capability(
-    SourceCapability.DELETION_DETECTION, "
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
 )
 class AzureADSource(StatefulIngestionSourceBase):
     """
datahub/ingestion/source/identity/okta.py +1 -14

@@ -41,7 +41,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import (
 )
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import (
-    ChangeTypeClass,
     CorpGroupInfoClass,
     CorpUserInfoClass,
     GroupMembershipClass,
@@ -202,7 +201,7 @@ class OktaSourceReport(StaleEntityRemovalSourceReport):
 @support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.DESCRIPTIONS, "Optionally enabled via configuration")
 @capability(
-    SourceCapability.DELETION_DETECTION, "
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
 )
 class OktaSource(StatefulIngestionSourceBase):
     """
@@ -332,18 +331,12 @@ class OktaSource(StatefulIngestionSourceBase):
             yield MetadataWorkUnit(id=wu_id, mce=mce)

             yield MetadataChangeProposalWrapper(
-                entityType="corpGroup",
                 entityUrn=datahub_corp_group_snapshot.urn,
-                changeType=ChangeTypeClass.UPSERT,
-                aspectName="origin",
                 aspect=OriginClass(OriginTypeClass.EXTERNAL, "OKTA"),
             ).as_workunit()

             yield MetadataChangeProposalWrapper(
-                entityType="corpGroup",
                 entityUrn=datahub_corp_group_snapshot.urn,
-                changeType=ChangeTypeClass.UPSERT,
-                aspectName="status",
                 aspect=StatusClass(removed=False),
             ).as_workunit()

@@ -418,18 +411,12 @@ class OktaSource(StatefulIngestionSourceBase):
             yield MetadataWorkUnit(id=wu_id, mce=mce)

             yield MetadataChangeProposalWrapper(
-                entityType="corpuser",
                 entityUrn=datahub_corp_user_snapshot.urn,
-                changeType=ChangeTypeClass.UPSERT,
-                aspectName="origin",
                 aspect=OriginClass(OriginTypeClass.EXTERNAL, "OKTA"),
             ).as_workunit()

             yield MetadataChangeProposalWrapper(
-                entityType="corpuser",
                 entityUrn=datahub_corp_user_snapshot.urn,
-                changeType=ChangeTypeClass.UPSERT,
-                aspectName="status",
                 aspect=StatusClass(removed=False),
             ).as_workunit()

datahub/ingestion/source/kafka/kafka.py +16 -0

@@ -189,6 +189,22 @@ class KafkaConnectionTest:
     SourceCapability.SCHEMA_METADATA,
     "Schemas associated with each topic are extracted from the schema registry. Avro and Protobuf (certified), JSON (incubating). Schema references are supported.",
 )
+@capability(
+    SourceCapability.DATA_PROFILING,
+    "Not supported",
+    supported=False,
+)
+@capability(
+    SourceCapability.LINEAGE_COARSE,
+    "Not supported. If you use Kafka Connect, the kafka-connect source can generate lineage.",
+    supported=False,
+)
+@capability(
+    SourceCapability.LINEAGE_FINE,
+    "Not supported",
+    supported=False,
+)
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class KafkaSource(StatefulIngestionSourceBase, TestableSource):
     """
     This plugin extracts the following: