acryl-datahub 0.15.0.6rc2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.

This release of acryl-datahub is flagged as potentially problematic.
Files changed (205)
  1. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2522 -2493
  2. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +205 -192
  3. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +4 -3
  7. datahub/api/entities/dataset/dataset.py +731 -42
  8. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  9. datahub/cli/check_cli.py +72 -19
  10. datahub/cli/docker_cli.py +3 -3
  11. datahub/cli/iceberg_cli.py +31 -7
  12. datahub/cli/ingest_cli.py +30 -93
  13. datahub/cli/lite_cli.py +4 -2
  14. datahub/cli/specific/dataproduct_cli.py +1 -1
  15. datahub/cli/specific/dataset_cli.py +128 -14
  16. datahub/configuration/common.py +10 -2
  17. datahub/configuration/git.py +1 -3
  18. datahub/configuration/kafka.py +1 -1
  19. datahub/emitter/mce_builder.py +28 -13
  20. datahub/emitter/mcp_builder.py +4 -1
  21. datahub/emitter/response_helper.py +145 -0
  22. datahub/emitter/rest_emitter.py +323 -10
  23. datahub/ingestion/api/decorators.py +1 -1
  24. datahub/ingestion/api/source_helpers.py +4 -0
  25. datahub/ingestion/fs/s3_fs.py +2 -2
  26. datahub/ingestion/glossary/classification_mixin.py +1 -5
  27. datahub/ingestion/graph/client.py +41 -22
  28. datahub/ingestion/graph/entity_versioning.py +3 -3
  29. datahub/ingestion/graph/filters.py +64 -37
  30. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  31. datahub/ingestion/run/pipeline.py +112 -148
  32. datahub/ingestion/run/sink_callback.py +77 -0
  33. datahub/ingestion/sink/datahub_rest.py +8 -0
  34. datahub/ingestion/source/abs/config.py +2 -4
  35. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  36. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
  37. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
  38. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  39. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  40. datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
  41. datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
  42. datahub/ingestion/source/common/subtypes.py +12 -0
  43. datahub/ingestion/source/csv_enricher.py +3 -3
  44. datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
  45. datahub/ingestion/source/dbt/dbt_common.py +8 -5
  46. datahub/ingestion/source/dbt/dbt_core.py +11 -9
  47. datahub/ingestion/source/dbt/dbt_tests.py +4 -8
  48. datahub/ingestion/source/delta_lake/config.py +8 -1
  49. datahub/ingestion/source/delta_lake/report.py +4 -2
  50. datahub/ingestion/source/delta_lake/source.py +20 -5
  51. datahub/ingestion/source/dremio/dremio_api.py +4 -8
  52. datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
  53. datahub/ingestion/source/dynamodb/dynamodb.py +6 -0
  54. datahub/ingestion/source/elastic_search.py +26 -6
  55. datahub/ingestion/source/feast.py +27 -8
  56. datahub/ingestion/source/file.py +6 -3
  57. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  58. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  59. datahub/ingestion/source/ge_data_profiler.py +12 -15
  60. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  61. datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
  62. datahub/ingestion/source/identity/okta.py +37 -7
  63. datahub/ingestion/source/kafka/kafka.py +1 -1
  64. datahub/ingestion/source/kafka_connect/common.py +2 -7
  65. datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
  66. datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
  67. datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
  68. datahub/ingestion/source/looker/looker_common.py +6 -5
  69. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  70. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  71. datahub/ingestion/source/looker/looker_source.py +1 -1
  72. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  73. datahub/ingestion/source/looker/lookml_source.py +3 -2
  74. datahub/ingestion/source/metabase.py +57 -35
  75. datahub/ingestion/source/metadata/business_glossary.py +45 -3
  76. datahub/ingestion/source/metadata/lineage.py +2 -2
  77. datahub/ingestion/source/mlflow.py +365 -35
  78. datahub/ingestion/source/mode.py +18 -8
  79. datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
  80. datahub/ingestion/source/nifi.py +37 -11
  81. datahub/ingestion/source/openapi.py +1 -1
  82. datahub/ingestion/source/openapi_parser.py +49 -17
  83. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  84. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  85. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  86. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  87. datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
  88. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
  89. datahub/ingestion/source/preset.py +7 -4
  90. datahub/ingestion/source/pulsar.py +3 -2
  91. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  92. datahub/ingestion/source/redash.py +31 -7
  93. datahub/ingestion/source/redshift/config.py +4 -0
  94. datahub/ingestion/source/redshift/datashares.py +236 -0
  95. datahub/ingestion/source/redshift/lineage.py +6 -2
  96. datahub/ingestion/source/redshift/lineage_v2.py +24 -9
  97. datahub/ingestion/source/redshift/profile.py +1 -1
  98. datahub/ingestion/source/redshift/query.py +133 -33
  99. datahub/ingestion/source/redshift/redshift.py +46 -73
  100. datahub/ingestion/source/redshift/redshift_schema.py +186 -6
  101. datahub/ingestion/source/redshift/report.py +3 -0
  102. datahub/ingestion/source/s3/config.py +5 -5
  103. datahub/ingestion/source/s3/source.py +20 -41
  104. datahub/ingestion/source/salesforce.py +550 -275
  105. datahub/ingestion/source/schema_inference/object.py +1 -1
  106. datahub/ingestion/source/sigma/sigma.py +1 -1
  107. datahub/ingestion/source/slack/slack.py +31 -10
  108. datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
  109. datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
  110. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  111. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  112. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  113. datahub/ingestion/source/sql/athena.py +10 -16
  114. datahub/ingestion/source/sql/druid.py +1 -5
  115. datahub/ingestion/source/sql/hive.py +15 -6
  116. datahub/ingestion/source/sql/hive_metastore.py +3 -2
  117. datahub/ingestion/source/sql/mssql/job_models.py +29 -0
  118. datahub/ingestion/source/sql/mssql/source.py +11 -5
  119. datahub/ingestion/source/sql/oracle.py +127 -63
  120. datahub/ingestion/source/sql/sql_common.py +16 -18
  121. datahub/ingestion/source/sql/sql_types.py +2 -2
  122. datahub/ingestion/source/sql/teradata.py +19 -5
  123. datahub/ingestion/source/sql/trino.py +2 -2
  124. datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
  125. datahub/ingestion/source/superset.py +222 -62
  126. datahub/ingestion/source/tableau/tableau.py +22 -6
  127. datahub/ingestion/source/tableau/tableau_common.py +3 -2
  128. datahub/ingestion/source/unity/ge_profiler.py +2 -1
  129. datahub/ingestion/source/unity/source.py +11 -1
  130. datahub/ingestion/source/vertexai.py +697 -0
  131. datahub/ingestion/source_config/pulsar.py +3 -1
  132. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  133. datahub/lite/duckdb_lite.py +3 -10
  134. datahub/lite/lite_local.py +1 -1
  135. datahub/lite/lite_util.py +4 -3
  136. datahub/metadata/_schema_classes.py +714 -417
  137. datahub/metadata/_urns/urn_defs.py +1673 -1649
  138. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  139. datahub/metadata/schema.avsc +16438 -16603
  140. datahub/metadata/schemas/AssertionInfo.avsc +3 -1
  141. datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
  142. datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
  143. datahub/metadata/schemas/ChartInfo.avsc +1 -0
  144. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  145. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  146. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  147. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  148. datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
  149. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
  150. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  151. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  152. datahub/metadata/schemas/DomainKey.avsc +2 -1
  153. datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
  154. datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
  155. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  156. datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
  157. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  158. datahub/metadata/schemas/InputFields.avsc +3 -1
  159. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  160. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  161. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
  162. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  163. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  164. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  165. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
  166. datahub/metadata/schemas/PostKey.avsc +2 -1
  167. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  168. datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
  169. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
  170. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  171. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  172. datahub/pydantic/__init__.py +0 -0
  173. datahub/pydantic/compat.py +58 -0
  174. datahub/sdk/__init__.py +30 -12
  175. datahub/sdk/_all_entities.py +1 -1
  176. datahub/sdk/_attribution.py +4 -0
  177. datahub/sdk/_shared.py +258 -16
  178. datahub/sdk/_utils.py +35 -0
  179. datahub/sdk/container.py +30 -6
  180. datahub/sdk/dataset.py +118 -20
  181. datahub/sdk/{_entity.py → entity.py} +24 -1
  182. datahub/sdk/entity_client.py +1 -1
  183. datahub/sdk/main_client.py +23 -0
  184. datahub/sdk/resolver_client.py +17 -29
  185. datahub/sdk/search_client.py +50 -0
  186. datahub/sdk/search_filters.py +374 -0
  187. datahub/specific/dataset.py +3 -4
  188. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  189. datahub/sql_parsing/schema_resolver.py +1 -1
  190. datahub/sql_parsing/split_statements.py +220 -126
  191. datahub/sql_parsing/sql_parsing_common.py +7 -0
  192. datahub/sql_parsing/sqlglot_lineage.py +1 -1
  193. datahub/sql_parsing/sqlglot_utils.py +1 -4
  194. datahub/testing/check_sql_parser_result.py +5 -6
  195. datahub/testing/compare_metadata_json.py +7 -6
  196. datahub/testing/pytest_hooks.py +56 -0
  197. datahub/upgrade/upgrade.py +2 -2
  198. datahub/utilities/file_backed_collections.py +3 -14
  199. datahub/utilities/ingest_utils.py +106 -0
  200. datahub/utilities/mapping.py +1 -1
  201. datahub/utilities/memory_footprint.py +3 -2
  202. datahub/utilities/sentinels.py +22 -0
  203. datahub/utilities/unified_diff.py +5 -1
  204. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
  205. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
@@ -149,7 +149,7 @@ def construct_schema(
 
     extended_schema: Dict[Tuple[str, ...], SchemaDescription] = {}
 
-    for field_path in schema.keys():
+    for field_path in schema:
         field_types = schema[field_path]["types"]
         field_type: Union[str, type] = "mixed"
 
@@ -124,7 +124,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
         try:
             self.sigma_api = SigmaAPI(self.config, self.reporter)
         except Exception as e:
-            raise ConfigurationError(f"Unable to connect sigma API. Exception: {e}")
+            raise ConfigurationError("Unable to connect sigma API") from e
 
     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:
@@ -9,7 +9,6 @@ from tenacity import retry, wait_exponential
 from tenacity.before_sleep import before_sleep_log
 
 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import ConfigModel
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -18,8 +17,19 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+    SourceReport,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.schema_classes import (
     CorpUserEditableInfoClass,
     DatasetPropertiesClass,
@@ -44,7 +54,9 @@ class CorpUser:
     slack_display_name: Optional[str] = None
 
 
-class SlackSourceConfig(ConfigModel):
+class SlackSourceConfig(
+    StatefulIngestionConfigBase,
+):
     bot_token: SecretStr = Field(
         description="Bot token for the Slack workspace. Needs `users:read`, `users:read.email` and `users.profile:read` scopes.",
     )
@@ -58,22 +70,22 @@ class SlackSourceConfig(ConfigModel):
         default=10,
         description="Number of API requests per minute. Low-level config. Do not tweak unless you are facing any issues.",
     )
-    ingest_public_channels = Field(
+    ingest_public_channels: bool = Field(
         type=bool,
         default=False,
         description="Whether to ingest public channels. If set to true needs `channels:read` scope.",
     )
-    channels_iteration_limit = Field(
+    channels_iteration_limit: int = Field(
         type=int,
         default=200,
         description="Limit the number of channels to be ingested in a iteration. Low-level config. Do not tweak unless you are facing any issues.",
     )
-    channel_min_members = Field(
+    channel_min_members: int = Field(
         type=int,
         default=2,
         description="Ingest channels with at least this many members.",
     )
-    should_ingest_archived_channels = Field(
+    should_ingest_archived_channels: bool = Field(
         type=bool,
         default=False,
         description="Whether to ingest archived channels.",
@@ -81,7 +93,7 @@ class SlackSourceConfig(ConfigModel):
 
 
 @dataclass
-class SlackSourceReport(SourceReport):
+class SlackSourceReport(StaleEntityRemovalSourceReport):
     channels_reported: int = 0
     archived_channels_reported: int = 0
 
@@ -92,11 +104,12 @@ PLATFORM_NAME = "slack"
 @platform_name("Slack")
 @config_class(SlackSourceConfig)
 @support_status(SupportStatus.TESTING)
-class SlackSource(Source):
+class SlackSource(StatefulIngestionSourceBase):
     def __init__(self, ctx: PipelineContext, config: SlackSourceConfig):
+        super().__init__(config, ctx)
         self.ctx = ctx
         self.config = config
-        self.report = SlackSourceReport()
+        self.report: SlackSourceReport = SlackSourceReport()
         self.workspace_base_url: Optional[str] = None
         self.rate_limiter = RateLimiter(
             max_calls=self.config.api_requests_per_min, period=60
@@ -111,6 +124,14 @@ class SlackSource(Source):
     def get_slack_client(self) -> WebClient:
         return WebClient(token=self.config.bot_token.get_secret_value())
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(
         self,
     ) -> Iterable[MetadataWorkUnit]:
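The Slack source now builds on the stateful ingestion framework, so entities that disappear between runs can be soft-deleted via stale-entity removal. Below is a minimal, hypothetical recipe sketch (as a Python dict) showing how that behavior is typically switched on; the token, server, and pipeline name are placeholders, and the `stateful_ingestion.enabled` key is assumed to follow the same convention as other stateful sources.

    # Hypothetical recipe sketch; values are placeholders.
    recipe = {
        "pipeline_name": "slack_to_datahub",  # stateful ingestion keys its run state by pipeline name
        "source": {
            "type": "slack",
            "config": {
                "bot_token": "xoxb-placeholder",
                "ingest_public_channels": True,
                "stateful_ingestion": {"enabled": True},  # assumption: standard stateful-ingestion switch
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},  # placeholder GMS endpoint
        },
    }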
@@ -125,7 +125,7 @@ class SnowflakeConnectionConfig(ConfigModel):
 
     @pydantic.validator("authentication_type", always=True)
     def authenticator_type_is_valid(cls, v, values):
-        if v not in _VALID_AUTH_TYPES.keys():
+        if v not in _VALID_AUTH_TYPES:
             raise ValueError(
                 f"unsupported authenticator type '{v}' was provided,"
                 f" use one of {list(_VALID_AUTH_TYPES.keys())}"
@@ -312,7 +312,7 @@ class SnowflakeConnectionConfig(ConfigModel):
             raise ValueError(
                 f"access_token not found in response {response}. "
                 "Please check your OAuth configuration."
-            )
+            ) from None
         connect_args = self.get_options()["connect_args"]
         return snowflake.connector.connect(
             user=self.username,
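The `from None` added to the OAuth error above suppresses implicit exception chaining, so users see the configuration message rather than the internal lookup failure that triggered it. A small, self-contained illustration of the idiom (not DataHub code):

    def require_access_token(response: dict) -> str:
        # "raise ... from None" hides the triggering KeyError from the traceback;
        # omitting "from None" would chain it as the original cause.
        try:
            return response["access_token"]
        except KeyError:
            raise ValueError(
                "access_token not found in response. Please check your OAuth configuration."
            ) from None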
@@ -403,6 +403,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 res["session_id"],
                 res["query_start_time"],
                 object_modified_by_ddl,
+                res["query_type"],
             )
             if known_ddl_entry:
                 return known_ddl_entry
@@ -537,40 +538,42 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
         session_id: str,
         timestamp: datetime,
         object_modified_by_ddl: dict,
+        query_type: str,
     ) -> Optional[Union[TableRename, TableSwap]]:
         timestamp = timestamp.astimezone(timezone.utc)
-        if object_modified_by_ddl[
-            "operationType"
-        ] == "ALTER" and object_modified_by_ddl["properties"].get("swapTargetName"):
-            urn1 = self.identifiers.gen_dataset_urn(
+        if (
+            object_modified_by_ddl["operationType"] == "ALTER"
+            and query_type == "RENAME_TABLE"
+            and object_modified_by_ddl["properties"].get("objectName")
+        ):
+            original_un = self.identifiers.gen_dataset_urn(
                 self.identifiers.get_dataset_identifier_from_qualified_name(
                     object_modified_by_ddl["objectName"]
                 )
             )
 
-            urn2 = self.identifiers.gen_dataset_urn(
+            new_urn = self.identifiers.gen_dataset_urn(
                 self.identifiers.get_dataset_identifier_from_qualified_name(
-                    object_modified_by_ddl["properties"]["swapTargetName"]["value"]
+                    object_modified_by_ddl["properties"]["objectName"]["value"]
                 )
             )
-
-            return TableSwap(urn1, urn2, query, session_id, timestamp)
+            return TableRename(original_un, new_urn, query, session_id, timestamp)
         elif object_modified_by_ddl[
             "operationType"
-        ] == "RENAME_TABLE" and object_modified_by_ddl["properties"].get("objectName"):
-            original_un = self.identifiers.gen_dataset_urn(
+        ] == "ALTER" and object_modified_by_ddl["properties"].get("swapTargetName"):
+            urn1 = self.identifiers.gen_dataset_urn(
                 self.identifiers.get_dataset_identifier_from_qualified_name(
                     object_modified_by_ddl["objectName"]
                 )
             )
 
-            new_urn = self.identifiers.gen_dataset_urn(
+            urn2 = self.identifiers.gen_dataset_urn(
                 self.identifiers.get_dataset_identifier_from_qualified_name(
-                    object_modified_by_ddl["properties"]["objectName"]["value"]
+                    object_modified_by_ddl["properties"]["swapTargetName"]["value"]
                 )
             )
 
-            return TableRename(original_un, new_urn, query, session_id, timestamp)
+            return TableSwap(urn1, urn2, query, session_id, timestamp)
         else:
             self.report.num_ddl_queries_dropped += 1
             return None
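The reordered branches above classify a rename by `query_type == "RENAME_TABLE"` together with `properties.objectName`, and a swap by `properties.swapTargetName`, since both arrive as `ALTER` operations in Snowflake's audit log. A hedged sketch of just that classification rule, using only the fields referenced in the diff (the overall dict shape is otherwise an assumption):

    from typing import Optional

    def classify_ddl(object_modified_by_ddl: dict, query_type: str) -> Optional[str]:
        # Mirrors the branch conditions above; returns a label instead of building URNs.
        props = object_modified_by_ddl["properties"]
        if (
            object_modified_by_ddl["operationType"] == "ALTER"
            and query_type == "RENAME_TABLE"
            and props.get("objectName")
        ):
            return "rename"
        if object_modified_by_ddl["operationType"] == "ALTER" and props.get("swapTargetName"):
            return "swap"
        return None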
@@ -731,6 +734,9 @@ fingerprinted_queries as (
     JOIN filtered_access_history a USING (query_id)
 )
 SELECT * FROM query_access_history
+-- Our query aggregator expects the queries to be added in chronological order.
+-- It's easier for us to push down the sorting to Snowflake/SQL instead of doing it in Python.
+ORDER BY QUERY_START_TIME ASC
 """
 
 
@@ -134,10 +134,11 @@ class SnowflakeQuery:
             clustering_key AS "CLUSTERING_KEY",
             auto_clustering_on AS "AUTO_CLUSTERING_ON",
             is_dynamic AS "IS_DYNAMIC",
-            is_iceberg AS "IS_ICEBERG"
+            is_iceberg AS "IS_ICEBERG",
+            is_hybrid AS "IS_HYBRID"
         FROM {db_clause}information_schema.tables t
         WHERE table_schema != 'INFORMATION_SCHEMA'
-        and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
+        and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE')
         order by table_schema, table_name"""
 
     @staticmethod
@@ -156,10 +157,11 @@ class SnowflakeQuery:
             clustering_key AS "CLUSTERING_KEY",
             auto_clustering_on AS "AUTO_CLUSTERING_ON",
             is_dynamic AS "IS_DYNAMIC",
-            is_iceberg AS "IS_ICEBERG"
+            is_iceberg AS "IS_ICEBERG",
+            is_hybrid AS "IS_HYBRID"
         FROM {db_clause}information_schema.tables t
         where table_schema='{schema_name}'
-        and table_type in ('BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
+        and table_type in ('BASE TABLE', 'EXTERNAL TABLE')
         order by table_schema, table_name"""
 
     @staticmethod
@@ -96,10 +96,7 @@ class SnowflakeTable(BaseTable):
     column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
     is_dynamic: bool = False
     is_iceberg: bool = False
-
-    @property
-    def is_hybrid(self) -> bool:
-        return self.type is not None and self.type == "HYBRID TABLE"
+    is_hybrid: bool = False
 
     def get_subtype(self) -> DatasetSubTypes:
         return DatasetSubTypes.TABLE
@@ -369,6 +366,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     clustering_key=table["CLUSTERING_KEY"],
                     is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
                     is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
+                    is_hybrid=table.get("IS_HYBRID", "NO").upper() == "YES",
                 )
             )
         return tables
@@ -395,6 +393,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     clustering_key=table["CLUSTERING_KEY"],
                     is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
                     is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
+                    is_hybrid=table.get("IS_HYBRID", "NO").upper() == "YES",
                 )
             )
         return tables
@@ -439,7 +439,7 @@ class SnowflakeV2Source(
                     failure_reason=failure_message,
                 )
 
-            if c in _report.keys():
+            if c in _report:
                 continue
 
             # If some capabilities are missing, then mark them as not capable
@@ -55,7 +55,7 @@ try:
 except ImportError:
     _F = typing.TypeVar("_F", bound=typing.Callable[..., typing.Any])
 
-    def override(f: _F, /) -> _F:  # noqa: F811
+    def override(f: _F, /) -> _F:
         return f
 
 
@@ -104,7 +104,7 @@ class CustomAthenaRestDialect(AthenaRestDialect):
         return "\n".join([r for r in res])
 
     @typing.no_type_check
-    def _get_column_type(self, type_: Union[str, Dict[str, Any]]) -> TypeEngine:  # noqa: C901
+    def _get_column_type(self, type_: Union[str, Dict[str, Any]]) -> TypeEngine:
         """Derives the data type of the Athena column.
 
         This method is overwritten to extend the behavior of PyAthena.
@@ -396,7 +396,7 @@ class AthenaSource(SQLAlchemySource):
             metadata.table_type if metadata.table_type else ""
         )
 
-        location: Optional[str] = custom_properties.get("location", None)
+        location: Optional[str] = custom_properties.get("location")
         if location is not None:
             if location.startswith("s3://"):
                 location = make_s3_urn(location, self.config.env)
@@ -538,21 +538,15 @@ class AthenaSource(SQLAlchemySource):
                 column_name=column["name"],
                 column_type=column["type"],
                 inspector=inspector,
-                description=column.get("comment", None),
+                description=column.get("comment"),
                 nullable=column.get("nullable", True),
-                is_part_of_key=(
-                    True
-                    if (
-                        pk_constraints is not None
-                        and isinstance(pk_constraints, dict)
-                        and column["name"] in pk_constraints.get("constrained_columns", [])
-                    )
-                    else False
+                is_part_of_key=bool(
+                    pk_constraints is not None
+                    and isinstance(pk_constraints, dict)
+                    and column["name"] in pk_constraints.get("constrained_columns", [])
                 ),
-                is_partitioning_key=(
-                    True
-                    if (partition_keys is not None and column["name"] in partition_keys)
-                    else False
+                is_partitioning_key=bool(
+                    partition_keys is not None and column["name"] in partition_keys
                 ),
             )
 
@@ -50,11 +50,7 @@ class DruidConfig(BasicSQLAlchemyConfig):
     """
 
     def get_identifier(self, schema: str, table: str) -> str:
-        return (
-            f"{self.platform_instance}.{table}"
-            if self.platform_instance
-            else f"{table}"
-        )
+        return f"{table}"
 
 
 @platform_name("Druid")
@@ -777,6 +777,7 @@ class HiveSource(TwoTierSQLAlchemySource):
                 column,
                 inspector,
                 pk_constraints,
+                partition_keys=partition_keys,
             )
 
             if self._COMPLEX_TYPE.match(fields[0].nativeDataType) and isinstance(
@@ -821,12 +822,8 @@ class HiveSource(TwoTierSQLAlchemySource):
 
         try:
             view_definition = inspector.get_view_definition(view, schema)
-            if view_definition is None:
-                view_definition = ""
-            else:
-                # Some dialects return a TextClause instead of a raw string,
-                # so we need to convert them to a string.
-                view_definition = str(view_definition)
+            # Some dialects return a TextClause instead of a raw string, so we need to convert them to a string.
+            view_definition = str(view_definition) if view_definition else ""
         except NotImplementedError:
             view_definition = ""
 
@@ -853,3 +850,15 @@ class HiveSource(TwoTierSQLAlchemySource):
             default_db=default_db,
             default_schema=default_schema,
         )
+
+    def get_partitions(
+        self, inspector: Inspector, schema: str, table: str
+    ) -> Optional[List[str]]:
+        partition_columns: List[dict] = inspector.get_indexes(
+            table_name=table, schema=schema
+        )
+        for partition_column in partition_columns:
+            if partition_column.get("column_names"):
+                return partition_column.get("column_names")
+
+        return []
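The new `get_partitions` helper reads partition columns out of SQLAlchemy index metadata. A standalone sketch of the same lookup against an arbitrary engine; the connection URL and table name are placeholders, and the assumption that the Hive dialect surfaces partition keys through `Inspector.get_indexes` comes from the helper above rather than anything else in this diff:

    from sqlalchemy import create_engine, inspect

    engine = create_engine("hive://localhost:10000/default")  # placeholder connection
    inspector = inspect(engine)
    # Assumption: the dialect exposes partition keys as an index entry whose
    # "column_names" lists the partition columns.
    partition_cols = next(
        (
            idx["column_names"]
            for idx in inspector.get_indexes("my_table", schema="default")
            if idx.get("column_names")
        ),
        [],
    )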
@@ -67,7 +67,7 @@ TableKey = namedtuple("TableKey", ["schema", "table"])
 
 
 class HiveMetastoreConfigMode(StrEnum):
-    hive: str = "hive"  # noqa: F811
+    hive: str = "hive"
     presto: str = "presto"
     presto_on_hive: str = "presto-on-hive"
     trino: str = "trino"
@@ -893,8 +893,9 @@ class HiveMetastoreSource(SQLAlchemySource):
         return get_schema_fields_for_hive_column(
             column["col_name"],
             column["col_type"],
+            # column is actually an sqlalchemy.engine.row.LegacyRow, not a Dict and we cannot make column.get("col_description", "")
             description=(
-                column["col_description"] if "col_description" in column else ""
+                column["col_description"] if "col_description" in column else ""  # noqa: SIM401
             ),
             default_nullable=True,
         )
@@ -11,12 +11,17 @@ from datahub.emitter.mcp_builder import (
     DatabaseKey,
     SchemaKey,
 )
+from datahub.ingestion.source.common.subtypes import (
+    FlowContainerSubTypes,
+    JobContainerSubTypes,
+)
 from datahub.metadata.schema_classes import (
     ContainerClass,
     DataFlowInfoClass,
     DataJobInfoClass,
     DataJobInputOutputClass,
     DataPlatformInstanceClass,
+    SubTypesClass,
 )
 
 
@@ -211,6 +216,18 @@ class MSSQLDataJob:
             status=self.status,
         )
 
+    @property
+    def as_subtypes_aspect(self) -> SubTypesClass:
+        assert isinstance(self.entity, (JobStep, StoredProcedure))
+        type = (
+            JobContainerSubTypes.MSSQL_JOBSTEP
+            if isinstance(self.entity, JobStep)
+            else JobContainerSubTypes.MSSQL_STORED_PROCEDURE
+        )
+        return SubTypesClass(
+            typeNames=[type],
+        )
+
     @property
     def as_maybe_platform_instance_aspect(self) -> Optional[DataPlatformInstanceClass]:
         if self.entity.flow.platform_instance:
@@ -276,6 +293,18 @@ class MSSQLDataFlow:
             externalUrl=self.external_url,
         )
 
+    @property
+    def as_subtypes_aspect(self) -> SubTypesClass:
+        assert isinstance(self.entity, (MSSQLJob, MSSQLProceduresContainer))
+        type = (
+            FlowContainerSubTypes.MSSQL_JOB
+            if isinstance(self.entity, MSSQLJob)
+            else FlowContainerSubTypes.MSSQL_PROCEDURE_CONTAINER
+        )
+        return SubTypesClass(
+            typeNames=[type],
+        )
+
     @property
     def as_maybe_platform_instance_aspect(self) -> Optional[DataPlatformInstanceClass]:
         if self.entity.platform_instance:
@@ -401,7 +401,7 @@ class SQLServerSource(SQLAlchemySource):
                 data_job.add_property(name=data_name, value=str(data_value))
             yield from self.construct_job_workunits(data_job)
 
-    def loop_stored_procedures(  # noqa: C901
+    def loop_stored_procedures(
         self,
         inspector: Inspector,
         schema: str,
@@ -638,6 +638,11 @@ class SQLServerSource(SQLAlchemySource):
             aspect=data_job.as_datajob_info_aspect,
         ).as_workunit()
 
+        yield MetadataChangeProposalWrapper(
+            entityUrn=data_job.urn,
+            aspect=data_job.as_subtypes_aspect,
+        ).as_workunit()
+
         data_platform_instance_aspect = data_job.as_maybe_platform_instance_aspect
         if data_platform_instance_aspect:
             yield MetadataChangeProposalWrapper(
@@ -676,8 +681,6 @@ class SQLServerSource(SQLAlchemySource):
             ),
         ).as_workunit()
 
-        # TODO: Add SubType when it appear
-
     def construct_flow_workunits(
         self,
         data_flow: MSSQLDataFlow,
@@ -687,6 +690,11 @@ class SQLServerSource(SQLAlchemySource):
             aspect=data_flow.as_dataflow_info_aspect,
         ).as_workunit()
 
+        yield MetadataChangeProposalWrapper(
+            entityUrn=data_flow.urn,
+            aspect=data_flow.as_subtypes_aspect,
+        ).as_workunit()
+
         data_platform_instance_aspect = data_flow.as_maybe_platform_instance_aspect
         if data_platform_instance_aspect:
             yield MetadataChangeProposalWrapper(
@@ -700,8 +708,6 @@ class SQLServerSource(SQLAlchemySource):
             aspect=data_flow.as_container_aspect,
         ).as_workunit()
 
-        # TODO: Add SubType when it appear
-
     def get_inspectors(self) -> Iterable[Inspector]:
         # This method can be overridden in the case that you want to dynamically
         # run on multiple databases.
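With the new `as_subtypes_aspect` properties, MSSQL jobs, stored-procedure containers, job steps, and stored procedures now carry explicit subtypes. A hedged, standalone sketch of emitting the same kind of aspect for an arbitrary data job; the URN and server address are placeholders:

    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.emitter.rest_emitter import DatahubRestEmitter
    from datahub.ingestion.source.common.subtypes import JobContainerSubTypes
    from datahub.metadata.schema_classes import SubTypesClass

    # Placeholder URN for an MSSQL job step; the flow and job ids are illustrative.
    job_urn = "urn:li:dataJob:(urn:li:dataFlow:(mssql,my_instance.my_job,PROD),step_1)"

    mcp = MetadataChangeProposalWrapper(
        entityUrn=job_urn,
        aspect=SubTypesClass(typeNames=[JobContainerSubTypes.MSSQL_JOBSTEP]),
    )
    DatahubRestEmitter("http://localhost:8080").emit(mcp)  # placeholder GMS endpoint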