acryl-datahub 1.0.0.2rc4__py3-none-any.whl → 1.0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (159)
  1. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/METADATA +2566 -2514
  2. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/RECORD +159 -149
  3. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  6. datahub/api/entities/datacontract/datacontract.py +35 -3
  7. datahub/api/entities/datajob/dataflow.py +3 -3
  8. datahub/api/entities/datajob/datajob.py +7 -4
  9. datahub/api/entities/dataset/dataset.py +9 -11
  10. datahub/api/entities/forms/forms.py +34 -34
  11. datahub/api/graphql/assertion.py +1 -1
  12. datahub/api/graphql/operation.py +4 -4
  13. datahub/cli/check_cli.py +3 -2
  14. datahub/cli/config_utils.py +2 -2
  15. datahub/cli/delete_cli.py +6 -5
  16. datahub/cli/docker_cli.py +2 -2
  17. datahub/cli/exists_cli.py +2 -1
  18. datahub/cli/get_cli.py +2 -1
  19. datahub/cli/iceberg_cli.py +6 -5
  20. datahub/cli/ingest_cli.py +9 -6
  21. datahub/cli/migrate.py +4 -3
  22. datahub/cli/migration_utils.py +4 -3
  23. datahub/cli/put_cli.py +3 -2
  24. datahub/cli/specific/assertions_cli.py +2 -1
  25. datahub/cli/specific/datacontract_cli.py +3 -2
  26. datahub/cli/specific/dataproduct_cli.py +10 -9
  27. datahub/cli/specific/dataset_cli.py +4 -3
  28. datahub/cli/specific/forms_cli.py +2 -1
  29. datahub/cli/specific/group_cli.py +2 -1
  30. datahub/cli/specific/structuredproperties_cli.py +4 -3
  31. datahub/cli/specific/user_cli.py +2 -1
  32. datahub/cli/state_cli.py +2 -1
  33. datahub/cli/timeline_cli.py +2 -1
  34. datahub/configuration/common.py +5 -0
  35. datahub/configuration/source_common.py +1 -1
  36. datahub/emitter/mcp.py +20 -5
  37. datahub/emitter/request_helper.py +116 -3
  38. datahub/emitter/rest_emitter.py +163 -93
  39. datahub/entrypoints.py +2 -1
  40. datahub/errors.py +4 -0
  41. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +2 -1
  42. datahub/ingestion/api/source.py +2 -5
  43. datahub/ingestion/api/source_helpers.py +1 -0
  44. datahub/ingestion/glossary/classification_mixin.py +4 -2
  45. datahub/ingestion/graph/client.py +33 -8
  46. datahub/ingestion/graph/config.py +14 -0
  47. datahub/ingestion/graph/filters.py +1 -1
  48. datahub/ingestion/graph/links.py +53 -0
  49. datahub/ingestion/run/pipeline.py +9 -6
  50. datahub/ingestion/run/pipeline_config.py +1 -1
  51. datahub/ingestion/sink/datahub_rest.py +5 -6
  52. datahub/ingestion/source/apply/datahub_apply.py +2 -1
  53. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  54. datahub/ingestion/source/bigquery_v2/bigquery.py +24 -23
  55. datahub/ingestion/source/bigquery_v2/bigquery_config.py +4 -62
  56. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +70 -0
  57. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -1
  58. datahub/ingestion/source/cassandra/cassandra_profiling.py +25 -24
  59. datahub/ingestion/source/common/subtypes.py +3 -0
  60. datahub/ingestion/source/datahub/datahub_database_reader.py +12 -11
  61. datahub/ingestion/source/dbt/dbt_cloud.py +2 -6
  62. datahub/ingestion/source/dbt/dbt_common.py +10 -2
  63. datahub/ingestion/source/dbt/dbt_core.py +82 -42
  64. datahub/ingestion/source/dynamodb/dynamodb.py +7 -4
  65. datahub/ingestion/source/feast.py +4 -4
  66. datahub/ingestion/source/fivetran/config.py +1 -1
  67. datahub/ingestion/source/fivetran/fivetran_log_api.py +7 -3
  68. datahub/ingestion/source/fivetran/fivetran_query.py +16 -16
  69. datahub/ingestion/source/ge_data_profiler.py +27 -1
  70. datahub/ingestion/source/hex/api.py +1 -20
  71. datahub/ingestion/source/hex/query_fetcher.py +4 -1
  72. datahub/ingestion/source/iceberg/iceberg.py +20 -4
  73. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  74. datahub/ingestion/source/ldap.py +1 -1
  75. datahub/ingestion/source/looker/looker_common.py +17 -2
  76. datahub/ingestion/source/looker/looker_lib_wrapper.py +1 -1
  77. datahub/ingestion/source/looker/looker_source.py +34 -5
  78. datahub/ingestion/source/looker/lookml_source.py +7 -1
  79. datahub/ingestion/source/metadata/lineage.py +2 -1
  80. datahub/ingestion/source/mlflow.py +19 -6
  81. datahub/ingestion/source/mode.py +74 -28
  82. datahub/ingestion/source/neo4j/neo4j_source.py +85 -55
  83. datahub/ingestion/source/powerbi/config.py +13 -1
  84. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  85. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  86. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +153 -0
  87. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  88. datahub/ingestion/source/redshift/usage.py +10 -9
  89. datahub/ingestion/source/sigma/config.py +74 -6
  90. datahub/ingestion/source/sigma/sigma.py +16 -1
  91. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  92. datahub/ingestion/source/slack/slack.py +4 -52
  93. datahub/ingestion/source/snowflake/snowflake_config.py +2 -12
  94. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -18
  95. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  96. datahub/ingestion/source/snowflake/snowflake_queries.py +18 -4
  97. datahub/ingestion/source/snowflake/snowflake_query.py +9 -63
  98. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  99. datahub/ingestion/source/sql/athena.py +2 -1
  100. datahub/ingestion/source/sql/clickhouse.py +5 -1
  101. datahub/ingestion/source/sql/druid.py +7 -2
  102. datahub/ingestion/source/sql/hive.py +7 -2
  103. datahub/ingestion/source/sql/hive_metastore.py +5 -5
  104. datahub/ingestion/source/sql/mssql/source.py +1 -1
  105. datahub/ingestion/source/sql/oracle.py +6 -2
  106. datahub/ingestion/source/sql/sql_config.py +1 -34
  107. datahub/ingestion/source/sql/sqlalchemy_uri.py +36 -0
  108. datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
  109. datahub/ingestion/source/sql/two_tier_sql_source.py +1 -1
  110. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  111. datahub/ingestion/source/tableau/tableau.py +31 -6
  112. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  113. datahub/ingestion/source/unity/config.py +2 -1
  114. datahub/ingestion/source/usage/clickhouse_usage.py +7 -3
  115. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -3
  116. datahub/ingestion/source/vertexai/vertexai.py +316 -4
  117. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +23 -2
  118. datahub/integrations/assertion/common.py +3 -2
  119. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +538 -493
  120. datahub/metadata/_urns/urn_defs.py +1819 -1763
  121. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  122. datahub/metadata/schema.avsc +17296 -16883
  123. datahub/metadata/schema_classes.py +3 -3
  124. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  125. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  126. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  127. datahub/metadata/schemas/FormInfo.avsc +5 -0
  128. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  129. datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
  130. datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
  131. datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
  132. datahub/metadata/schemas/QueryProperties.avsc +4 -2
  133. datahub/metadata/schemas/SystemMetadata.avsc +86 -0
  134. datahub/metadata/schemas/__init__.py +3 -3
  135. datahub/sdk/_all_entities.py +4 -0
  136. datahub/sdk/_shared.py +142 -4
  137. datahub/sdk/_utils.py +4 -0
  138. datahub/sdk/dataset.py +2 -2
  139. datahub/sdk/entity_client.py +8 -0
  140. datahub/sdk/lineage_client.py +235 -0
  141. datahub/sdk/main_client.py +6 -3
  142. datahub/sdk/mlmodel.py +301 -0
  143. datahub/sdk/mlmodelgroup.py +233 -0
  144. datahub/secret/datahub_secret_store.py +2 -1
  145. datahub/specific/dataset.py +12 -0
  146. datahub/sql_parsing/fingerprint_utils.py +6 -0
  147. datahub/sql_parsing/sql_parsing_aggregator.py +48 -34
  148. datahub/sql_parsing/sqlglot_utils.py +18 -14
  149. datahub/telemetry/telemetry.py +2 -2
  150. datahub/testing/check_imports.py +1 -1
  151. datahub/testing/mcp_diff.py +15 -2
  152. datahub/upgrade/upgrade.py +10 -12
  153. datahub/utilities/logging_manager.py +8 -1
  154. datahub/utilities/server_config_util.py +350 -10
  155. datahub/utilities/sqlalchemy_query_combiner.py +4 -5
  156. datahub/utilities/urn_encoder.py +1 -1
  157. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/entry_points.txt +0 -0
  158. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/licenses/LICENSE +0 -0
  159. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/dbt/dbt_core.py

@@ -1,3 +1,4 @@
+import dataclasses
 import json
 import logging
 import re
@@ -12,16 +13,15 @@ from pydantic import BaseModel, Field, validator

 from datahub.configuration.git import GitReference
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
+from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
-    capability,
     config_class,
     platform_name,
     support_status,
 )
 from datahub.ingestion.api.source import (
     CapabilityReport,
-    SourceCapability,
     TestableSource,
     TestConnectionReport,
 )
@@ -40,19 +40,28 @@ from datahub.ingestion.source.dbt.dbt_tests import DBTTest, DBTTestResult
 logger = logging.getLogger(__name__)


+@dataclasses.dataclass
+class DBTCoreReport(DBTSourceReport):
+    catalog_info: Optional[dict] = None
+    manifest_info: Optional[dict] = None
+
+
 class DBTCoreConfig(DBTCommonConfig):
     manifest_path: str = Field(
-        description="Path to dbt manifest JSON. See https://docs.getdbt.com/reference/artifacts/manifest-json Note "
-        "this can be a local file or a URI."
+        description="Path to dbt manifest JSON. See https://docs.getdbt.com/reference/artifacts/manifest-json. "
+        "This can be a local file or a URI."
     )
-    catalog_path: str = Field(
-        description="Path to dbt catalog JSON. See https://docs.getdbt.com/reference/artifacts/catalog-json Note this "
-        "can be a local file or a URI."
+    catalog_path: Optional[str] = Field(
+        None,
+        description="Path to dbt catalog JSON. See https://docs.getdbt.com/reference/artifacts/catalog-json. "
+        "This file is optional, but highly recommended. Without it, some metadata like column info will be incomplete or missing. "
+        "This can be a local file or a URI.",
     )
     sources_path: Optional[str] = Field(
         default=None,
-        description="Path to dbt sources JSON. See https://docs.getdbt.com/reference/artifacts/sources-json. If not "
-        "specified, last-modified fields will not be populated. Note this can be a local file or a URI.",
+        description="Path to dbt sources JSON. See https://docs.getdbt.com/reference/artifacts/sources-json. "
+        "If not specified, last-modified fields will not be populated. "
+        "This can be a local file or a URI.",
     )
     run_results_paths: List[str] = Field(
         default=[],
@@ -161,7 +170,7 @@ def get_columns(

 def extract_dbt_entities(
     all_manifest_entities: Dict[str, Dict[str, Any]],
-    all_catalog_entities: Dict[str, Dict[str, Any]],
+    all_catalog_entities: Optional[Dict[str, Dict[str, Any]]],
     sources_results: List[Dict[str, Any]],
     manifest_adapter: str,
     use_identifiers: bool,
@@ -186,15 +195,6 @@
         ):
             name = manifest_node["alias"]

-        # initialize comment to "" for consistency with descriptions
-        # (since dbt null/undefined descriptions as "")
-        comment = ""
-
-        if key in all_catalog_entities and all_catalog_entities[key]["metadata"].get(
-            "comment"
-        ):
-            comment = all_catalog_entities[key]["metadata"]["comment"]
-
         materialization = None
         if "materialized" in manifest_node.get("config", {}):
             # It's a model
@@ -204,8 +204,9 @@
         if "depends_on" in manifest_node and "nodes" in manifest_node["depends_on"]:
             upstream_nodes = manifest_node["depends_on"]["nodes"]

-        # It's a source
-        catalog_node = all_catalog_entities.get(key)
+        catalog_node = (
+            all_catalog_entities.get(key) if all_catalog_entities is not None else None
+        )
         missing_from_catalog = catalog_node is None
         catalog_type = None

@@ -214,16 +215,23 @@
                 # Test and ephemeral nodes will never show up in the catalog.
                 missing_from_catalog = False
             else:
-                if not only_include_if_in_catalog:
+                if all_catalog_entities is not None and not only_include_if_in_catalog:
+                    # If the catalog file is missing, we have already generated a general message.
                     report.warning(
                         title="Node missing from catalog",
                         message="Found a node in the manifest file but not in the catalog. "
                         "This usually means the catalog file was not generated by `dbt docs generate` and so is incomplete. "
-                        "Some metadata, such as column types and descriptions, will be impacted.",
+                        "Some metadata, particularly schema information, will be impacted.",
                         context=key,
                     )
         else:
-            catalog_type = all_catalog_entities[key]["metadata"]["type"]
+            catalog_type = catalog_node["metadata"]["type"]
+
+        # initialize comment to "" for consistency with descriptions
+        # (since dbt null/undefined descriptions as "")
+        comment = ""
+        if catalog_node is not None and catalog_node.get("metadata", {}).get("comment"):
+            comment = catalog_node["metadata"]["comment"]

         query_tag_props = manifest_node.get("query_tag", {})

@@ -231,12 +239,15 @@

         owner = meta.get("owner")
         if owner is None:
-            owner = manifest_node.get("config", {}).get("meta", {}).get("owner")
+            owner = (manifest_node.get("config", {}).get("meta") or {}).get("owner")
+
+        if not meta:
+            # On older versions of dbt, the meta field was nested under config
+            # for some node types.
+            meta = manifest_node.get("config", {}).get("meta") or {}

         tags = manifest_node.get("tags", [])
         tags = [tag_prefix + tag for tag in tags]
-        if not meta:
-            meta = manifest_node.get("config", {}).get("meta", {})

         max_loaded_at_str = sources_by_id.get(key, {}).get("max_loaded_at")
         max_loaded_at = None
@@ -453,15 +464,18 @@ def load_run_results(
 @platform_name("dbt")
 @config_class(DBTCoreConfig)
 @support_status(SupportStatus.CERTIFIED)
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
-@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 class DBTCoreSource(DBTSourceBase, TestableSource):
     config: DBTCoreConfig
+    report: DBTCoreReport
+
+    def __init__(self, config: DBTCommonConfig, ctx: PipelineContext):
+        super().__init__(config, ctx)
+        self.report = DBTCoreReport()

     @classmethod
     def create(cls, config_dict, ctx):
         config = DBTCoreConfig.parse_obj(config_dict)
-        return cls(config, ctx, "dbt")
+        return cls(config, ctx)

     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:
@@ -471,9 +485,10 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
             DBTCoreSource.load_file_as_json(
                 source_config.manifest_path, source_config.aws_connection
             )
-            DBTCoreSource.load_file_as_json(
-                source_config.catalog_path, source_config.aws_connection
-            )
+            if source_config.catalog_path is not None:
+                DBTCoreSource.load_file_as_json(
+                    source_config.catalog_path, source_config.aws_connection
+                )
             test_report.basic_connectivity = CapabilityReport(capable=True)
         except Exception as e:
             test_report.basic_connectivity = CapabilityReport(
@@ -511,11 +526,31 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
         dbt_manifest_json = self.load_file_as_json(
             self.config.manifest_path, self.config.aws_connection
         )
-
-        dbt_catalog_json = self.load_file_as_json(
-            self.config.catalog_path, self.config.aws_connection
+        dbt_manifest_metadata = dbt_manifest_json["metadata"]
+        self.report.manifest_info = dict(
+            generated_at=dbt_manifest_metadata.get("generated_at", "unknown"),
+            dbt_version=dbt_manifest_metadata.get("dbt_version", "unknown"),
+            project_name=dbt_manifest_metadata.get("project_name", "unknown"),
         )

+        dbt_catalog_json = None
+        dbt_catalog_metadata = None
+        if self.config.catalog_path is not None:
+            dbt_catalog_json = self.load_file_as_json(
+                self.config.catalog_path, self.config.aws_connection
+            )
+            dbt_catalog_metadata = dbt_catalog_json.get("metadata", {})
+            self.report.catalog_info = dict(
+                generated_at=dbt_catalog_metadata.get("generated_at", "unknown"),
+                dbt_version=dbt_catalog_metadata.get("dbt_version", "unknown"),
+                project_name=dbt_catalog_metadata.get("project_name", "unknown"),
+            )
+        else:
+            self.report.warning(
+                title="No catalog file configured",
+                message="Some metadata, particularly schema information, will be missing.",
+            )
+
         if self.config.sources_path is not None:
             dbt_sources_json = self.load_file_as_json(
                 self.config.sources_path, self.config.aws_connection
@@ -528,18 +563,23 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
         manifest_version = dbt_manifest_json["metadata"].get("dbt_version")
         manifest_adapter = dbt_manifest_json["metadata"].get("adapter_type")

-        catalog_schema = dbt_catalog_json.get("metadata", {}).get("dbt_schema_version")
-        catalog_version = dbt_catalog_json.get("metadata", {}).get("dbt_version")
+        catalog_schema = None
+        catalog_version = None
+        if dbt_catalog_metadata is not None:
+            catalog_schema = dbt_catalog_metadata.get("dbt_schema_version")
+            catalog_version = dbt_catalog_metadata.get("dbt_version")

         manifest_nodes = dbt_manifest_json["nodes"]
         manifest_sources = dbt_manifest_json["sources"]

         all_manifest_entities = {**manifest_nodes, **manifest_sources}

-        catalog_nodes = dbt_catalog_json["nodes"]
-        catalog_sources = dbt_catalog_json["sources"]
+        all_catalog_entities = None
+        if dbt_catalog_json is not None:
+            catalog_nodes = dbt_catalog_json["nodes"]
+            catalog_sources = dbt_catalog_json["sources"]

-        all_catalog_entities = {**catalog_nodes, **catalog_sources}
+            all_catalog_entities = {**catalog_nodes, **catalog_sources}

         nodes = extract_dbt_entities(
             all_manifest_entities=all_manifest_entities,
@@ -590,7 +630,7 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
             )
         except Exception as e:
             self.report.info(
-                title="Dbt Catalog Version",
+                title="dbt Catalog Version",
                 message="Failed to determine the catalog version",
                 exc=e,
             )
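Taken together, the dbt_core.py changes make the catalog file optional and surface manifest/catalog metadata in a new DBTCoreReport. A minimal sketch of a config without a catalog, under the assumption that DBTCommonConfig still requires target_platform (paths and platform are placeholder values):

from datahub.ingestion.source.dbt.dbt_core import DBTCoreConfig

# catalog_path may now be omitted; the source emits a "No catalog file configured"
# warning at load time and schema details will be incomplete.
config = DBTCoreConfig.parse_obj(
    {
        "manifest_path": "target/manifest.json",  # placeholder path
        "target_platform": "snowflake",           # assumed still required by DBTCommonConfig
    }
)
assert config.catalog_path is None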
datahub/ingestion/source/dynamodb/dynamodb.py

@@ -474,6 +474,7 @@ class DynamoDBSource(StatefulIngestionSourceBase):
             dataset_properties.customProperties["schema.downsampled"] = "True"
             dataset_properties.customProperties["schema.totalFields"] = f"{schema_size}"
         # append each schema field, schema will be sorted by count descending and delimited_name ascending and sliced to only include MAX_SCHEMA_SIZE items
+        primary_keys = []
         for schema_field in sorted(
             table_fields,
             key=lambda x: (
@@ -484,22 +485,23 @@ class DynamoDBSource(StatefulIngestionSourceBase):
             field_path = schema_field["delimited_name"]
             native_data_type = self.get_native_type(schema_field["type"], table_name)
             type = self.get_field_type(schema_field["type"], table_name)
-            description = None
             nullable = True
             if field_path in primary_key_dict:
-                description = (
+                # primary key should not be nullable
+                type_key = (
                     "Partition Key"
                     if primary_key_dict.get(field_path) == "HASH"
                     else "Sort Key"
                 )
-                # primary key should not be nullable
+                dataset_properties.customProperties[type_key] = field_path
                 nullable = False
+                primary_keys.append(field_path)

             field = SchemaField(
                 fieldPath=field_path,
                 nativeDataType=native_data_type,
                 type=type,
-                description=description,
+                description=None,
                 nullable=nullable,
                 recursive=False,
             )
@@ -513,6 +515,7 @@ class DynamoDBSource(StatefulIngestionSourceBase):
             hash="",
             platformSchema=SchemalessClass(),
             fields=canonical_schema,
+            primaryKeys=primary_keys,
         )
         return schema_metadata

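The DynamoDB change records partition and sort keys both as dataset custom properties and in the schema aspect's primaryKeys field. A rough sketch of the resulting aspect shape, with hypothetical key names "pk" and "sk" and the field list elided:

from datahub.metadata.schema_classes import SchemalessClass, SchemaMetadataClass

# Hypothetical table whose partition key is "pk" and sort key is "sk".
schema_metadata = SchemaMetadataClass(
    schemaName="my_table",                       # placeholder table name
    platform="urn:li:dataPlatform:dynamodb",
    version=0,
    hash="",
    platformSchema=SchemalessClass(),
    fields=[],                                   # per-field SchemaField entries omitted
    primaryKeys=["pk", "sk"],
)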
datahub/ingestion/source/feast.py

@@ -135,10 +135,10 @@ class FeastRepositorySource(StatefulIngestionSourceBase):
     """
     This plugin extracts:

-    - Entities as [`MLPrimaryKey`](https://datahubproject.io/docs/graphql/objects#mlprimarykey)
-    - Fields as [`MLFeature`](https://datahubproject.io/docs/graphql/objects#mlfeature)
-    - Feature views and on-demand feature views as [`MLFeatureTable`](https://datahubproject.io/docs/graphql/objects#mlfeaturetable)
-    - Batch and stream source details as [`Dataset`](https://datahubproject.io/docs/graphql/objects#dataset)
+    - Entities as [`MLPrimaryKey`](https://docs.datahub.com/docs/graphql/objects#mlprimarykey)
+    - Fields as [`MLFeature`](https://docs.datahub.com/docs/graphql/objects#mlfeature)
+    - Feature views and on-demand feature views as [`MLFeatureTable`](https://docs.datahub.com/docs/graphql/objects#mlfeaturetable)
+    - Batch and stream source details as [`Dataset`](https://docs.datahub.com/docs/graphql/objects#dataset)
     - Column types associated with each entity and feature
     """

datahub/ingestion/source/fivetran/config.py

@@ -16,7 +16,7 @@ from datahub.configuration.source_common import DatasetSourceConfigMixin
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.emitter.mce_builder import DEFAULT_ENV
 from datahub.ingestion.api.report import Report
-from datahub.ingestion.source.bigquery_v2.bigquery_config import (
+from datahub.ingestion.source.bigquery_v2.bigquery_connection import (
     BigQueryConnectionConfig,
 )
 from datahub.ingestion.source.snowflake.snowflake_connection import (
datahub/ingestion/source/fivetran/fivetran_log_api.py

@@ -54,7 +54,7 @@ class FivetranLogAPI:
                         snowflake_destination_config.database,
                     )
                 )
-                fivetran_log_query.set_db(
+                fivetran_log_query.set_schema(
                     snowflake_destination_config.log_schema,
                 )
                 fivetran_log_database = snowflake_destination_config.database
@@ -66,8 +66,12 @@ class FivetranLogAPI:
                 engine = create_engine(
                     bigquery_destination_config.get_sql_alchemy_url(),
                 )
-                fivetran_log_query.set_db(bigquery_destination_config.dataset)
-                fivetran_log_database = bigquery_destination_config.dataset
+                fivetran_log_query.set_schema(bigquery_destination_config.dataset)
+
+                # The "database" should be the BigQuery project name.
+                fivetran_log_database = engine.execute(
+                    "SELECT @@project_id"
+                ).fetchone()[0]
         else:
             raise ConfigurationError(
                 f"Destination platform '{destination_platform}' is not yet supported."
datahub/ingestion/source/fivetran/fivetran_query.py

@@ -12,14 +12,14 @@ class FivetranLogQuery:

     def __init__(self) -> None:
         # Select query db clause
-        self.db_clause: str = ""
-
-    def set_db(self, db_name: str) -> None:
-        self.db_clause = f"{db_name}."
+        self.schema_clause: str = ""

     def use_database(self, db_name: str) -> str:
         return f"use database {db_name}"

+    def set_schema(self, schema_name: str) -> None:
+        self.schema_clause = f"{schema_name}."
+
     def get_connectors_query(self) -> str:
         return f"""\
 SELECT
@@ -30,7 +30,7 @@ SELECT
     paused,
     sync_frequency,
     destination_id
-FROM {self.db_clause}connector
+FROM {self.schema_clause}connector
 WHERE
     _fivetran_deleted = FALSE
 QUALIFY ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY _fivetran_synced DESC) = 1
@@ -42,7 +42,7 @@ SELECT id as user_id,
     given_name,
     family_name,
     email
-FROM {self.db_clause}user
+FROM {self.schema_clause}user
 """

     def get_sync_logs_query(
@@ -62,7 +62,7 @@ WITH ranked_syncs AS (
         MAX(CASE WHEN message_event = 'sync_end' THEN time_stamp END) as end_time,
         MAX(CASE WHEN message_event = 'sync_end' THEN message_data END) as end_message_data,
         ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY MAX(time_stamp) DESC) as rn
-    FROM {self.db_clause}log
+    FROM {self.schema_clause}log
     WHERE message_event in ('sync_start', 'sync_end')
     AND time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days'
     AND connector_id IN ({formatted_connector_ids})
@@ -99,11 +99,11 @@ FROM (
         dsm.name as destination_schema_name,
         tl.created_at as created_at,
         ROW_NUMBER() OVER (PARTITION BY stm.connector_id, stm.id, dtm.id ORDER BY tl.created_at DESC) as table_combo_rn
-    FROM {self.db_clause}table_lineage as tl
-    JOIN {self.db_clause}source_table_metadata as stm on tl.source_table_id = stm.id
-    JOIN {self.db_clause}destination_table_metadata as dtm on tl.destination_table_id = dtm.id
-    JOIN {self.db_clause}source_schema_metadata as ssm on stm.schema_id = ssm.id
-    JOIN {self.db_clause}destination_schema_metadata as dsm on dtm.schema_id = dsm.id
+    FROM {self.schema_clause}table_lineage as tl
+    JOIN {self.schema_clause}source_table_metadata as stm on tl.source_table_id = stm.id
+    JOIN {self.schema_clause}destination_table_metadata as dtm on tl.destination_table_id = dtm.id
+    JOIN {self.schema_clause}source_schema_metadata as ssm on stm.schema_id = ssm.id
+    JOIN {self.schema_clause}destination_schema_metadata as dsm on dtm.schema_id = dsm.id
     WHERE stm.connector_id IN ({formatted_connector_ids})
 )
 -- Ensure that we only get back one entry per source and destination pair.
@@ -131,13 +131,13 @@ FROM (
         dcm.name as destination_column_name,
         cl.created_at as created_at,
         ROW_NUMBER() OVER (PARTITION BY stm.connector_id, cl.source_column_id, cl.destination_column_id ORDER BY cl.created_at DESC) as column_combo_rn
-    FROM {self.db_clause}column_lineage as cl
-    JOIN {self.db_clause}source_column_metadata as scm
+    FROM {self.schema_clause}column_lineage as cl
+    JOIN {self.schema_clause}source_column_metadata as scm
         ON cl.source_column_id = scm.id
-    JOIN {self.db_clause}destination_column_metadata as dcm
+    JOIN {self.schema_clause}destination_column_metadata as dcm
         ON cl.destination_column_id = dcm.id
     -- Only joining source_table_metadata to get the connector_id.
-    JOIN {self.db_clause}source_table_metadata as stm
+    JOIN {self.schema_clause}source_table_metadata as stm
         ON scm.table_id = stm.id
     WHERE stm.connector_id IN ({formatted_connector_ids})
 )
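The rename from set_db to set_schema reflects that the clause prefixes a schema (for BigQuery, a dataset) rather than a database; the database is selected separately (use_database for Snowflake, the project-id lookup above for BigQuery). A small usage sketch with a placeholder schema name:

from datahub.ingestion.source.fivetran.fivetran_query import FivetranLogQuery

query = FivetranLogQuery()
query.set_schema("fivetran_log")  # placeholder schema / dataset name

# Table references are now schema-qualified rather than database-qualified.
assert "FROM fivetran_log.connector" in query.get_connectors_query()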
datahub/ingestion/source/ge_data_profiler.py

@@ -5,6 +5,7 @@ import concurrent.futures
 import contextlib
 import dataclasses
 import functools
+import importlib.metadata
 import json
 import logging
 import re
@@ -51,6 +52,7 @@ from typing_extensions import Concatenate, ParamSpec
 from datahub.emitter import mce_builder
 from datahub.emitter.mce_builder import get_sys_time
 from datahub.ingestion.graph.client import get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
 from datahub.ingestion.source.profiling.common import (
     Cardinality,
@@ -83,6 +85,30 @@ if TYPE_CHECKING:
     from pyathena.cursor import Cursor

 assert MARKUPSAFE_PATCHED
+
+# We need to ensure that acryl-great-expectations is installed
+# and great-expectations is not installed.
+try:
+    acryl_gx_version = bool(importlib.metadata.distribution("acryl-great-expectations"))
+except importlib.metadata.PackageNotFoundError:
+    acryl_gx_version = False
+
+try:
+    original_gx_version = bool(importlib.metadata.distribution("great-expectations"))
+except importlib.metadata.PackageNotFoundError:
+    original_gx_version = False
+
+if acryl_gx_version and original_gx_version:
+    raise RuntimeError(
+        "acryl-great-expectations and great-expectations cannot both be installed because their files will conflict. "
+        "You will need to (1) uninstall great-expectations and (2) re-install acryl-great-expectations. "
+        "See https://github.com/pypa/pip/issues/4625."
+    )
+elif original_gx_version:
+    raise RuntimeError(
+        "We expect acryl-great-expectations to be installed, but great-expectations is installed instead."
+    )
+
 logger: logging.Logger = logging.getLogger(__name__)

 _original_get_column_median = SqlAlchemyDataset.get_column_median
@@ -1569,7 +1595,7 @@ def _get_columns_to_ignore_sampling(
         name=dataset_name, platform=platform, env=env
     )

-    datahub_graph = get_default_graph()
+    datahub_graph = get_default_graph(ClientMode.INGESTION)

     dataset_tags = datahub_graph.get_tags(dataset_urn)
     if dataset_tags:
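The new module-level guard refuses to run when upstream great-expectations is installed alongside (or instead of) the acryl fork. The same detection can be reproduced in isolation; this is only a sketch of the importlib.metadata behavior the guard relies on, not part of the package:

import importlib.metadata

def is_installed(dist_name: str) -> bool:
    # distribution() raises PackageNotFoundError when the package is absent.
    try:
        importlib.metadata.distribution(dist_name)
        return True
    except importlib.metadata.PackageNotFoundError:
        return False

print(is_installed("great-expectations"), is_installed("acryl-great-expectations"))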
datahub/ingestion/source/hex/api.py

@@ -27,6 +27,7 @@ logger = logging.getLogger(__name__)

 # The following models were Claude-generated from Hex API OpenAPI definition https://static.hex.site/openapi.json
 # To be exclusively used internally for the deserialization of the API response
+# Model is incomplete and fields may have not been mapped if not used in the ingestion


 class HexApiAppViewStats(BaseModel):
@@ -83,20 +84,10 @@ class HexApiUser(BaseModel):
     email: str


-class HexApiAccessType(StrEnum):
-    """Access type enum."""
-
-    NONE = "NONE"
-    VIEW = "VIEW"
-    EDIT = "EDIT"
-    FULL_ACCESS = "FULL_ACCESS"
-
-
 class HexApiUserAccess(BaseModel):
     """User access model."""

     user: HexApiUser
-    access: Optional[HexApiAccessType] = None


 class HexApiCollectionData(BaseModel):
@@ -109,13 +100,6 @@ class HexApiCollectionAccess(BaseModel):
     """Collection access model."""

     collection: HexApiCollectionData
-    access: Optional[HexApiAccessType] = None
-
-
-class HexApiAccessSettings(BaseModel):
-    """Access settings model."""
-
-    access: Optional[HexApiAccessType] = None


 class HexApiWeeklySchedule(BaseModel):
@@ -145,9 +129,6 @@ class HexApiSharing(BaseModel):
     users: Optional[List[HexApiUserAccess]] = []
     collections: Optional[List[HexApiCollectionAccess]] = []
     groups: Optional[List[Any]] = []
-    workspace: Optional[HexApiAccessSettings] = None
-    public_web: Optional[HexApiAccessSettings] = Field(default=None, alias="publicWeb")
-    support: Optional[HexApiAccessSettings] = None

     class Config:
         extra = "ignore"  # Allow extra fields in the JSON
datahub/ingestion/source/hex/query_fetcher.py

@@ -18,7 +18,8 @@ from datahub.utilities.time import datetime_to_ts_millis
 logger = logging.getLogger(__name__)

 # Pattern to extract both project_id and workspace_name from Hex metadata in SQL comments
-HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'
+# Only match metadata with "context": "SCHEDULED_RUN" to filter out non-scheduled runs
+HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"context": "SCHEDULED_RUN".*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'


 @dataclass
@@ -39,6 +40,7 @@ class HexQueryFetcherReport(SourceReport):
     fetched_query_objects: int = 0
     filtered_out_queries_missing_metadata: int = 0
     filtered_out_queries_different_workspace: int = 0
+    filtered_out_queries_no_match: int = 0
     filtered_out_queries_no_subjects: int = 0
     total_queries: int = 0
     total_dataset_subjects: int = 0
@@ -210,6 +212,7 @@ class HexQueryFetcher:
         match = re.search(HEX_METADATA_PATTERN, sql_statement)

         if not match:
+            self.report.filtered_out_queries_no_match += 1
             return None

         try:
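The tightened HEX_METADATA_PATTERN only matches Hex metadata comments whose JSON carries "context": "SCHEDULED_RUN", and any non-matching query now increments filtered_out_queries_no_match. A quick sketch with hypothetical comment values (the non-scheduled context string below is invented for illustration):

import re

from datahub.ingestion.source.hex.query_fetcher import HEX_METADATA_PATTERN

scheduled = (
    '-- Hex query metadata: {"context": "SCHEDULED_RUN", "project_id": "abc123", '
    '"project_url": "https://app.hex.tech/my-workspace/hex/abc123"}'
)
adhoc = scheduled.replace("SCHEDULED_RUN", "MANUAL_RUN")  # hypothetical context value

assert re.search(HEX_METADATA_PATTERN, scheduled) is not None
assert re.search(HEX_METADATA_PATTERN, adhoc) is None  # counted in filtered_out_queries_no_match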
datahub/ingestion/source/iceberg/iceberg.py

@@ -16,7 +16,7 @@ from pyiceberg.exceptions import (
 )
 from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit
 from pyiceberg.table import Table
-from pyiceberg.typedef import Identifier
+from pyiceberg.typedef import Identifier, Properties
 from pyiceberg.types import (
     BinaryType,
     BooleanType,
@@ -387,8 +387,13 @@ class IcebergSource(StatefulIngestionSourceBase):
                         env=self.config.env,
                     )
                 )
+                namespace_properties: Properties = catalog.load_namespace_properties(
+                    namespace
+                )
                 namespaces.append((namespace, namespace_urn))
-                for aspect in self._create_iceberg_namespace_aspects(namespace):
+                for aspect in self._create_iceberg_namespace_aspects(
+                    namespace, namespace_properties
+                ):
                     yield stamping_processor.stamp_wu(
                         MetadataChangeProposalWrapper(
                             entityUrn=namespace_urn, aspect=aspect
@@ -608,12 +613,23 @@ class IcebergSource(StatefulIngestionSourceBase):
         return self.report

     def _create_iceberg_namespace_aspects(
-        self, namespace: Identifier
+        self, namespace: Identifier, properties: Properties
     ) -> Iterable[_Aspect]:
         namespace_repr = ".".join(namespace)
+        custom_properties: Dict[str, str] = {}
+        for k, v in properties.items():
+            try:
+                custom_properties[str(k)] = str(v)
+            except Exception as e:
+                LOGGER.warning(
+                    f"Exception when trying to parse namespace properties for {namespace_repr}. Exception: {e}"
+                )
         yield Status(removed=False)
         yield ContainerProperties(
-            name=namespace_repr, qualifiedName=namespace_repr, env=self.config.env
+            name=namespace_repr,
+            qualifiedName=namespace_repr,
+            env=self.config.env,
+            customProperties=custom_properties,
         )
         yield SubTypes(typeNames=[DatasetContainerSubTypes.NAMESPACE])
         dpi = self._get_dataplatform_instance_aspect()
datahub/ingestion/source/iceberg/iceberg_common.py

@@ -40,11 +40,11 @@ class TimeoutHTTPAdapter(HTTPAdapter):
             del kwargs["timeout"]
         super().__init__(*args, **kwargs)

-    def send(self, request, **kwargs):
+    def send(self, request, *args, **kwargs):
         timeout = kwargs.get("timeout")
         if timeout is None and hasattr(self, "timeout"):
             kwargs["timeout"] = self.timeout
-        return super().send(request, **kwargs)
+        return super().send(request, *args, **kwargs)


 class IcebergProfilingConfig(ConfigModel):
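The send() fix forwards positional arguments so the adapter keeps working when requests calls send() positionally. A usage sketch, assuming the adapter's __init__ accepts a timeout keyword as the constructor above suggests:

import requests

from datahub.ingestion.source.iceberg.iceberg_common import TimeoutHTTPAdapter

session = requests.Session()
# Every request through this session gets the default timeout unless one is passed explicitly.
adapter = TimeoutHTTPAdapter(timeout=30)  # assumption: timeout kwarg handled in __init__
session.mount("https://", adapter)
session.mount("http://", adapter)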
datahub/ingestion/source/ldap.py

@@ -515,5 +515,5 @@ def parse_ldap_dn(input_clean: bytes) -> str:

 def get_attr_or_none(
     attrs: Dict[str, Any], key: str, default: Optional[str] = None
-) -> str:
+) -> Optional[str]:
     return attrs[key][0].decode() if attrs.get(key) else default