acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of acryl-datahub might be problematic.

Files changed (223)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/METADATA +2511 -2484
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/RECORD +223 -189
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/entry_points.txt +2 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +1 -1
  6. datahub/api/entities/external/__init__.py +0 -0
  7. datahub/api/entities/external/external_entities.py +239 -0
  8. datahub/api/entities/external/external_tag.py +145 -0
  9. datahub/api/entities/external/lake_formation_external_entites.py +161 -0
  10. datahub/api/entities/external/restricted_text.py +247 -0
  11. datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
  12. datahub/cli/check_cli.py +88 -7
  13. datahub/cli/cli_utils.py +63 -0
  14. datahub/cli/container_cli.py +5 -0
  15. datahub/cli/delete_cli.py +124 -27
  16. datahub/cli/docker_check.py +107 -12
  17. datahub/cli/docker_cli.py +149 -227
  18. datahub/cli/exists_cli.py +0 -2
  19. datahub/cli/get_cli.py +0 -2
  20. datahub/cli/iceberg_cli.py +5 -0
  21. datahub/cli/ingest_cli.py +3 -15
  22. datahub/cli/migrate.py +2 -0
  23. datahub/cli/put_cli.py +1 -4
  24. datahub/cli/quickstart_versioning.py +50 -7
  25. datahub/cli/specific/assertions_cli.py +0 -4
  26. datahub/cli/specific/datacontract_cli.py +0 -3
  27. datahub/cli/specific/dataproduct_cli.py +0 -11
  28. datahub/cli/specific/dataset_cli.py +1 -8
  29. datahub/cli/specific/forms_cli.py +0 -4
  30. datahub/cli/specific/group_cli.py +0 -2
  31. datahub/cli/specific/structuredproperties_cli.py +1 -4
  32. datahub/cli/specific/user_cli.py +0 -2
  33. datahub/cli/state_cli.py +0 -2
  34. datahub/cli/timeline_cli.py +0 -2
  35. datahub/configuration/pydantic_migration_helpers.py +7 -5
  36. datahub/emitter/rest_emitter.py +70 -12
  37. datahub/entrypoints.py +4 -3
  38. datahub/ingestion/api/decorators.py +15 -3
  39. datahub/ingestion/api/report.py +332 -3
  40. datahub/ingestion/api/sink.py +3 -0
  41. datahub/ingestion/api/source.py +48 -44
  42. datahub/ingestion/autogenerated/__init__.py +0 -0
  43. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  44. datahub/ingestion/autogenerated/lineage.json +401 -0
  45. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  46. datahub/ingestion/extractor/schema_util.py +13 -4
  47. datahub/ingestion/glossary/classification_mixin.py +5 -0
  48. datahub/ingestion/graph/client.py +100 -15
  49. datahub/ingestion/graph/config.py +1 -0
  50. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
  51. datahub/ingestion/run/pipeline.py +54 -2
  52. datahub/ingestion/sink/datahub_rest.py +13 -0
  53. datahub/ingestion/source/abs/source.py +1 -1
  54. datahub/ingestion/source/aws/aws_common.py +4 -0
  55. datahub/ingestion/source/aws/glue.py +489 -244
  56. datahub/ingestion/source/aws/tag_entities.py +292 -0
  57. datahub/ingestion/source/azure/azure_common.py +2 -2
  58. datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
  59. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  60. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  61. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
  62. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  63. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  64. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  65. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  66. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  67. datahub/ingestion/source/common/subtypes.py +45 -0
  68. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  69. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  70. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  71. datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
  72. datahub/ingestion/source/dbt/dbt_common.py +6 -2
  73. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  74. datahub/ingestion/source/debug/__init__.py +0 -0
  75. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  76. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  77. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  78. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  79. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  80. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  81. datahub/ingestion/source/file.py +3 -0
  82. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  83. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  84. datahub/ingestion/source/ge_data_profiler.py +76 -28
  85. datahub/ingestion/source/ge_profiling_config.py +11 -0
  86. datahub/ingestion/source/hex/api.py +26 -1
  87. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  88. datahub/ingestion/source/identity/azure_ad.py +1 -1
  89. datahub/ingestion/source/identity/okta.py +1 -14
  90. datahub/ingestion/source/kafka/kafka.py +16 -0
  91. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  92. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  93. datahub/ingestion/source/looker/looker_source.py +1 -0
  94. datahub/ingestion/source/mlflow.py +11 -1
  95. datahub/ingestion/source/mock_data/__init__.py +0 -0
  96. datahub/ingestion/source/mock_data/datahub_mock_data.py +507 -0
  97. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  98. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  99. datahub/ingestion/source/nifi.py +1 -1
  100. datahub/ingestion/source/powerbi/powerbi.py +1 -5
  101. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  102. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  103. datahub/ingestion/source/preset.py +2 -2
  104. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  105. datahub/ingestion/source/redshift/redshift.py +21 -1
  106. datahub/ingestion/source/redshift/usage.py +4 -3
  107. datahub/ingestion/source/s3/report.py +4 -2
  108. datahub/ingestion/source/s3/source.py +367 -115
  109. datahub/ingestion/source/sac/sac.py +3 -1
  110. datahub/ingestion/source/salesforce.py +6 -3
  111. datahub/ingestion/source/sigma/sigma.py +7 -1
  112. datahub/ingestion/source/slack/slack.py +2 -1
  113. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  114. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  115. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  116. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  117. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  118. datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
  119. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  120. datahub/ingestion/source/sql/athena.py +119 -11
  121. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  122. datahub/ingestion/source/sql/clickhouse.py +3 -1
  123. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  124. datahub/ingestion/source/sql/hana.py +3 -1
  125. datahub/ingestion/source/sql/hive_metastore.py +3 -11
  126. datahub/ingestion/source/sql/mariadb.py +0 -1
  127. datahub/ingestion/source/sql/mssql/source.py +239 -34
  128. datahub/ingestion/source/sql/mysql.py +0 -1
  129. datahub/ingestion/source/sql/oracle.py +1 -1
  130. datahub/ingestion/source/sql/postgres.py +0 -1
  131. datahub/ingestion/source/sql/sql_common.py +121 -34
  132. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  133. datahub/ingestion/source/sql/teradata.py +997 -235
  134. datahub/ingestion/source/sql/vertica.py +10 -6
  135. datahub/ingestion/source/sql_queries.py +2 -2
  136. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  137. datahub/ingestion/source/superset.py +58 -3
  138. datahub/ingestion/source/tableau/tableau.py +58 -37
  139. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  140. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  141. datahub/ingestion/source/unity/config.py +5 -0
  142. datahub/ingestion/source/unity/proxy.py +118 -0
  143. datahub/ingestion/source/unity/source.py +195 -17
  144. datahub/ingestion/source/unity/tag_entities.py +295 -0
  145. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  146. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  147. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  148. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  149. datahub/metadata/_internal_schema_classes.py +1522 -569
  150. datahub/metadata/_urns/urn_defs.py +1826 -1658
  151. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  152. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  153. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  154. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  155. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +29 -0
  156. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  157. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  158. datahub/metadata/schema.avsc +17758 -17097
  159. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  160. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  161. datahub/metadata/schemas/Applications.avsc +38 -0
  162. datahub/metadata/schemas/ChartKey.avsc +1 -0
  163. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  164. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  165. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  166. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  167. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  168. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  169. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  170. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +237 -0
  171. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  172. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  173. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  174. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  175. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  176. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  177. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  178. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  179. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  180. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  181. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  182. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  183. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  184. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  185. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  186. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  187. datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
  188. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  189. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  190. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
  191. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  192. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  193. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  194. datahub/metadata/schemas/__init__.py +3 -3
  195. datahub/sdk/__init__.py +2 -0
  196. datahub/sdk/_all_entities.py +7 -0
  197. datahub/sdk/_shared.py +116 -0
  198. datahub/sdk/chart.py +315 -0
  199. datahub/sdk/container.py +7 -0
  200. datahub/sdk/dashboard.py +432 -0
  201. datahub/sdk/dataflow.py +7 -0
  202. datahub/sdk/datajob.py +45 -13
  203. datahub/sdk/dataset.py +8 -2
  204. datahub/sdk/entity_client.py +82 -2
  205. datahub/sdk/lineage_client.py +683 -82
  206. datahub/sdk/main_client.py +46 -16
  207. datahub/sdk/mlmodel.py +101 -38
  208. datahub/sdk/mlmodelgroup.py +7 -0
  209. datahub/sdk/search_client.py +4 -3
  210. datahub/sdk/search_filters.py +95 -27
  211. datahub/specific/chart.py +1 -1
  212. datahub/specific/dataproduct.py +4 -0
  213. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  214. datahub/sql_parsing/sqlglot_lineage.py +62 -13
  215. datahub/telemetry/telemetry.py +17 -11
  216. datahub/testing/sdk_v2_helpers.py +7 -1
  217. datahub/upgrade/upgrade.py +56 -14
  218. datahub/utilities/server_config_util.py +8 -0
  219. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  220. datahub/utilities/stats_collections.py +4 -0
  221. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/WHEEL +0 -0
  222. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/licenses/LICENSE +0 -0
  223. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/ge_data_profiler.py
@@ -120,7 +120,6 @@ SNOWFLAKE = "snowflake"
 BIGQUERY = "bigquery"
 REDSHIFT = "redshift"
 DATABRICKS = "databricks"
-TRINO = "trino"
 
 # Type names for Databricks, to match Title Case types in sqlalchemy
 ProfilerTypeMapping.INT_TYPE_NAMES.append("Integer")
@@ -206,6 +205,17 @@ def get_column_unique_count_dh_patch(self: SqlAlchemyDataset, column: str) -> in
             )
         )
         return convert_to_json_serializable(element_values.fetchone()[0])
+    elif (
+        self.engine.dialect.name.lower() == GXSqlDialect.AWSATHENA
+        or self.engine.dialect.name.lower() == GXSqlDialect.TRINO
+    ):
+        return convert_to_json_serializable(
+            self.engine.execute(
+                sa.select(sa.func.approx_distinct(sa.column(column))).select_from(
+                    self._table
+                )
+            ).scalar()
+        )
     return convert_to_json_serializable(
         self.engine.execute(
             sa.select([sa.func.count(sa.func.distinct(sa.column(column)))]).select_from(
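
Note: the new branch routes Athena and Trino through their native approx_distinct aggregate (HyperLogLog-based) instead of an exact COUNT(DISTINCT ...), trading a small error bound for a much cheaper scan. A minimal standalone sketch of the same query, assuming a SQLAlchemy engine already connected to a Trino or Athena catalog (table and column names are illustrative):

    import sqlalchemy as sa

    def approximate_unique_count(engine: sa.engine.Engine, table: str, column: str) -> int:
        # approx_distinct() is the Trino/Athena approximate-distinct aggregate; it avoids
        # the full sort/shuffle that an exact COUNT(DISTINCT ...) would require.
        query = sa.select(sa.func.approx_distinct(sa.column(column))).select_from(
            sa.table(table)
        )
        with engine.connect() as conn:
            return conn.execute(query).scalar()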
@@ -734,11 +744,41 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
     def _get_dataset_column_distinct_value_frequencies(
         self, column_profile: DatasetFieldProfileClass, column: str
     ) -> None:
-        if self.config.include_field_distinct_value_frequencies:
+        if not self.config.include_field_distinct_value_frequencies:
+            return
+        try:
+            results = self.dataset.engine.execute(
+                sa.select(
+                    [
+                        sa.column(column),
+                        sa.func.count(sa.column(column)),
+                    ]
+                )
+                .select_from(self.dataset._table)
+                .where(sa.column(column).is_not(None))
+                .group_by(sa.column(column))
+            ).fetchall()
+
             column_profile.distinctValueFrequencies = [
-                ValueFrequencyClass(value=str(value), frequency=count)
-                for value, count in self.dataset.get_column_value_counts(column).items()
+                ValueFrequencyClass(value=str(value), frequency=int(count))
+                for value, count in results
             ]
+            # sort so output is deterministic. don't do it in SQL because not all column
+            # types are sortable in SQL (such as JSON data types on Athena/Trino).
+            column_profile.distinctValueFrequencies = sorted(
+                column_profile.distinctValueFrequencies, key=lambda x: x.value
+            )
+        except Exception as e:
+            logger.debug(
+                f"Caught exception while attempting to get distinct value frequencies for column {column}. {e}"
+            )
+
+            self.report.report_warning(
+                title="Profiling: Unable to Calculate Distinct Value Frequencies",
+                message="Distinct value frequencies for the column will not be accessible",
+                context=f"{self.dataset_name}.{column}",
+                exc=e,
+            )
 
     @_run_with_query_combiner
     def _get_dataset_column_histogram(
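
Note: the rewritten helper computes value frequencies with a single GROUP BY aggregation instead of Great Expectations' get_column_value_counts, and sorts the result in Python because some column types (for example JSON on Athena/Trino) are not sortable in SQL. Roughly, the statement it issues looks like the following sketch (table and column names are illustrative):

    import sqlalchemy as sa

    stmt = (
        sa.select(sa.column("status"), sa.func.count(sa.column("status")))
        .select_from(sa.table("orders"))
        .where(sa.column("status").is_not(None))
        .group_by(sa.column("status"))
    )
    # Compiles to roughly:
    #   SELECT status, count(status) FROM orders WHERE status IS NOT NULL GROUP BY status
    print(stmt)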
@@ -1173,26 +1213,34 @@ class DatahubGEProfiler:
             f"Will profile {len(requests)} table(s) with {max_workers} worker(s) - this may take a while"
         )
 
-        with PerfTimer() as timer, unittest.mock.patch(
-            "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_unique_count",
-            get_column_unique_count_dh_patch,
-        ), unittest.mock.patch(
-            "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_bigquery",
-            _get_column_quantiles_bigquery_patch,
-        ), unittest.mock.patch(
-            "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_awsathena",
-            _get_column_quantiles_awsathena_patch,
-        ), unittest.mock.patch(
-            "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_median",
-            _get_column_median_patch,
-        ), concurrent.futures.ThreadPoolExecutor(
-            max_workers=max_workers
-        ) as async_executor, SQLAlchemyQueryCombiner(
-            enabled=self.config.query_combiner_enabled,
-            catch_exceptions=self.config.catch_exceptions,
-            is_single_row_query_method=_is_single_row_query_method,
-            serial_execution_fallback_enabled=True,
-        ).activate() as query_combiner:
+        with (
+            PerfTimer() as timer,
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_unique_count",
+                get_column_unique_count_dh_patch,
+            ),
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_bigquery",
+                _get_column_quantiles_bigquery_patch,
+            ),
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_awsathena",
+                _get_column_quantiles_awsathena_patch,
+            ),
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_median",
+                _get_column_median_patch,
+            ),
+            concurrent.futures.ThreadPoolExecutor(
+                max_workers=max_workers
+            ) as async_executor,
+            SQLAlchemyQueryCombiner(
+                enabled=self.config.query_combiner_enabled,
+                catch_exceptions=self.config.catch_exceptions,
+                is_single_row_query_method=_is_single_row_query_method,
+                serial_execution_fallback_enabled=True,
+            ).activate() as query_combiner,
+        ):
             # Submit the profiling requests to the thread pool executor.
             async_profiles = collections.deque(
                 async_executor.submit(
@@ -1395,12 +1443,12 @@ class DatahubGEProfiler:
             )
             return None
         finally:
-            if batch is not None and self.base_engine.engine.name.upper() in [
-                "TRINO",
-                "AWSATHENA",
+            if batch is not None and self.base_engine.engine.name.lower() in [
+                GXSqlDialect.TRINO,
+                GXSqlDialect.AWSATHENA,
             ]:
                 if (
-                    self.base_engine.engine.name.upper() == "TRINO"
+                    self.base_engine.engine.name.lower() == GXSqlDialect.TRINO
                     or temp_view is not None
                 ):
                     self._drop_temp_table(batch)

datahub/ingestion/source/ge_profiling_config.py
@@ -125,6 +125,7 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         description="Profile table only if it has been updated since these many number of days. "
         "If set to `null`, no constraint of last modified time for tables to profile. "
         "Supported only in `snowflake` and `BigQuery`.",
+        schema_extra={"supported_sources": ["snowflake", "bigquery"]},
     )
 
     profile_table_size_limit: Optional[int] = Field(
@@ -132,6 +133,9 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         description="Profile tables only if their size is less than specified GBs. If set to `null`, "
         "no limit on the size of tables to profile. Supported only in `Snowflake`, `BigQuery` and "
         "`Databricks`. Supported for `Oracle` based on calculated size from gathered stats.",
+        schema_extra={
+            "supported_sources": ["snowflake", "bigquery", "unity-catalog", "oracle"]
+        },
     )
 
     profile_table_row_limit: Optional[int] = Field(
@@ -139,12 +143,14 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         description="Profile tables only if their row count is less than specified count. "
         "If set to `null`, no limit on the row count of tables to profile. Supported only in "
         "`Snowflake`, `BigQuery`. Supported for `Oracle` based on gathered stats.",
+        schema_extra={"supported_sources": ["snowflake", "bigquery", "oracle"]},
     )
 
     profile_table_row_count_estimate_only: bool = Field(
         default=False,
         description="Use an approximate query for row count. This will be much faster but slightly "
         "less accurate. Only supported for Postgres and MySQL. ",
+        schema_extra={"supported_sources": ["postgres", "mysql"]},
     )
 
     # The query combiner enables us to combine multiple queries into a single query,
@@ -161,27 +167,32 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         default=True,
         description="Whether to profile partitioned tables. Only BigQuery and Aws Athena supports this. "
         "If enabled, latest partition data is used for profiling.",
+        schema_extra={"supported_sources": ["athena", "bigquery"]},
     )
     partition_datetime: Optional[datetime.datetime] = Field(
         default=None,
         description="If specified, profile only the partition which matches this datetime. "
         "If not specified, profile the latest partition. Only Bigquery supports this.",
+        schema_extra={"supported_sources": ["bigquery"]},
     )
     use_sampling: bool = Field(
         default=True,
         description="Whether to profile column level stats on sample of table. Only BigQuery and Snowflake support this. "
         "If enabled, profiling is done on rows sampled from table. Sampling is not done for smaller tables. ",
+        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
     )
 
     sample_size: int = Field(
         default=10000,
         description="Number of rows to be sampled from table for column level profiling."
         "Applicable only if `use_sampling` is set to True.",
+        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
     )
 
     profile_external_tables: bool = Field(
         default=False,
         description="Whether to profile external tables. Only Snowflake and Redshift supports this.",
+        schema_extra={"supported_sources": ["redshift", "snowflake"]},
     )
 
     tags_to_ignore_sampling: Optional[List[str]] = pydantic.Field(
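
Note: the schema_extra entries tag each profiling option with the sources that actually honor it, so the support matrix can be read programmatically (for example by documentation tooling) instead of living only in the description text. A rough illustration of the idea in plain pydantic v2, where json_schema_extra is the equivalent spelling and the extra key surfaces in the exported JSON schema (the model and usage here are illustrative, not DataHub's own plumbing):

    from pydantic import BaseModel, Field

    class ProfilingOptions(BaseModel):
        profile_external_tables: bool = Field(
            default=False,
            description="Whether to profile external tables.",
            json_schema_extra={"supported_sources": ["redshift", "snowflake"]},
        )

    schema = ProfilingOptions.model_json_schema()
    # The extra key is merged into the field's schema entry.
    print(schema["properties"]["profile_external_tables"]["supported_sources"])
    # ['redshift', 'snowflake']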

datahub/ingestion/source/hex/api.py
@@ -5,7 +5,9 @@ from typing import Any, Dict, Generator, List, Optional, Union
 
 import requests
 from pydantic import BaseModel, Field, ValidationError, validator
+from requests.adapters import HTTPAdapter
 from typing_extensions import assert_never
+from urllib3.util.retry import Retry
 
 from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.source.hex.constants import (
@@ -220,6 +222,7 @@ class HexApi:
         self.base_url = base_url
         self.report = report
         self.page_size = page_size
+        self.session = self._create_retry_session()
 
     def _list_projects_url(self):
         return f"{self.base_url}/projects"
@@ -227,6 +230,28 @@ class HexApi:
     def _auth_header(self):
         return {"Authorization": f"Bearer {self.token}"}
 
+    def _create_retry_session(self) -> requests.Session:
+        """Create a requests session with retry logic for rate limiting.
+
+        Hex API rate limit: 60 requests per minute
+        https://learn.hex.tech/docs/api/api-overview#kernel-and-rate-limits
+        """
+        session = requests.Session()
+
+        # Configure retry strategy for 429 (Too Many Requests) with exponential backoff
+        retry_strategy = Retry(
+            total=5,  # Maximum number of retries
+            status_forcelist=[429],  # Only retry on 429 status code
+            backoff_factor=2,  # Exponential backoff: 2, 4, 8, 16, 32 seconds
+            raise_on_status=True,  # Raise exception after max retries
+        )
+
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        session.mount("http://", adapter)
+        session.mount("https://", adapter)
+
+        return session
+
     def fetch_projects(
         self,
         include_components: bool = True,
@@ -259,7 +284,7 @@ class HexApi:
             logger.debug(f"Fetching projects page with params: {params}")
             self.report.fetch_projects_page_calls += 1
             try:
-                response = requests.get(
+                response = self.session.get(
                     url=self._list_projects_url(),
                     headers=self._auth_header(),
                     params=params,

datahub/ingestion/source/iceberg/iceberg.py
@@ -134,7 +134,9 @@ logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(
     SourceCapability.OWNERSHIP,
     "Automatically ingests ownership information from table properties based on `user_ownership_property` and `group_ownership_property`",
 )
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
+@capability(
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
+)
 class IcebergSource(StatefulIngestionSourceBase):
     """
     ## Integration Details
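
Note: the capability wording here (and in the Azure AD and Okta sources below) changes from "optionally enabled" to "enabled by default", reflecting that deletion detection is now described as coming from stateful ingestion without extra opt-in. For reference, a minimal sketch of configuring stateful ingestion explicitly through the Python pipeline API; all recipe values are illustrative:

    import os
    from datahub.ingestion.run.pipeline import Pipeline

    # Stateful ingestion (which powers deletion detection) needs a pipeline_name and a
    # DataHub sink so checkpoint state can be stored between runs.
    pipeline = Pipeline.create(
        {
            "pipeline_name": "okta-ingestion",
            "source": {
                "type": "okta",
                "config": {
                    "okta_domain": "example.okta.com",
                    "okta_api_token": os.environ["OKTA_API_TOKEN"],
                    "stateful_ingestion": {"enabled": True, "remove_stale_metadata": True},
                },
            },
            "sink": {"type": "datahub-rest", "config": {"server": "http://localhost:8080"}},
        }
    )
    pipeline.run()
    pipeline.raise_from_status()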

datahub/ingestion/source/identity/azure_ad.py
@@ -167,7 +167,7 @@ class AzureADSourceReport(StaleEntityRemovalSourceReport):
 @config_class(AzureADConfig)
 @support_status(SupportStatus.CERTIFIED)
 @capability(
-    SourceCapability.DELETION_DETECTION, "Optionally enabled via stateful_ingestion"
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
 )
 class AzureADSource(StatefulIngestionSourceBase):
     """

datahub/ingestion/source/identity/okta.py
@@ -41,7 +41,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import (
 )
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import (
-    ChangeTypeClass,
     CorpGroupInfoClass,
     CorpUserInfoClass,
     GroupMembershipClass,
@@ -202,7 +201,7 @@ class OktaSourceReport(StaleEntityRemovalSourceReport):
 @support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.DESCRIPTIONS, "Optionally enabled via configuration")
 @capability(
-    SourceCapability.DELETION_DETECTION, "Optionally enabled via stateful_ingestion"
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
 )
 class OktaSource(StatefulIngestionSourceBase):
     """
@@ -332,18 +331,12 @@ class OktaSource(StatefulIngestionSourceBase):
             yield MetadataWorkUnit(id=wu_id, mce=mce)
 
             yield MetadataChangeProposalWrapper(
-                entityType="corpGroup",
                 entityUrn=datahub_corp_group_snapshot.urn,
-                changeType=ChangeTypeClass.UPSERT,
-                aspectName="origin",
                 aspect=OriginClass(OriginTypeClass.EXTERNAL, "OKTA"),
             ).as_workunit()
 
             yield MetadataChangeProposalWrapper(
-                entityType="corpGroup",
                 entityUrn=datahub_corp_group_snapshot.urn,
-                changeType=ChangeTypeClass.UPSERT,
-                aspectName="status",
                 aspect=StatusClass(removed=False),
             ).as_workunit()
 
@@ -418,18 +411,12 @@ class OktaSource(StatefulIngestionSourceBase):
             yield MetadataWorkUnit(id=wu_id, mce=mce)
 
             yield MetadataChangeProposalWrapper(
-                entityType="corpuser",
                 entityUrn=datahub_corp_user_snapshot.urn,
-                changeType=ChangeTypeClass.UPSERT,
-                aspectName="origin",
                 aspect=OriginClass(OriginTypeClass.EXTERNAL, "OKTA"),
             ).as_workunit()
 
             yield MetadataChangeProposalWrapper(
-                entityType="corpuser",
                 entityUrn=datahub_corp_user_snapshot.urn,
-                changeType=ChangeTypeClass.UPSERT,
-                aspectName="status",
                 aspect=StatusClass(removed=False),
             ).as_workunit()
 
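
Note: the dropped keyword arguments lean on MetadataChangeProposalWrapper's inference: entityType is derived from the URN, changeType defaults to UPSERT, and aspectName comes from the aspect class, so the trimmed calls emit the same proposal. A minimal sketch (the URN value is illustrative):

    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.metadata.schema_classes import StatusClass

    # entityType ("corpuser"), changeType (UPSERT), and aspectName ("status") are all
    # inferred from the urn and the aspect object.
    mcp = MetadataChangeProposalWrapper(
        entityUrn="urn:li:corpuser:jdoe",
        aspect=StatusClass(removed=False),
    )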

datahub/ingestion/source/kafka/kafka.py
@@ -189,6 +189,22 @@ class KafkaConnectionTest:
     SourceCapability.SCHEMA_METADATA,
     "Schemas associated with each topic are extracted from the schema registry. Avro and Protobuf (certified), JSON (incubating). Schema references are supported.",
 )
+@capability(
+    SourceCapability.DATA_PROFILING,
+    "Not supported",
+    supported=False,
+)
+@capability(
+    SourceCapability.LINEAGE_COARSE,
+    "Not supported. If you use Kafka Connect, the kafka-connect source can generate lineage.",
+    supported=False,
+)
+@capability(
+    SourceCapability.LINEAGE_FINE,
+    "Not supported",
+    supported=False,
+)
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class KafkaSource(StatefulIngestionSourceBase, TestableSource):
     """
     This plugin extracts the following: