acryl-datahub 1.0.0.2rc4__py3-none-any.whl → 1.0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/METADATA +2566 -2514
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/RECORD +159 -149
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +3 -3
- datahub/api/entities/datajob/datajob.py +7 -4
- datahub/api/entities/dataset/dataset.py +9 -11
- datahub/api/entities/forms/forms.py +34 -34
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/operation.py +4 -4
- datahub/cli/check_cli.py +3 -2
- datahub/cli/config_utils.py +2 -2
- datahub/cli/delete_cli.py +6 -5
- datahub/cli/docker_cli.py +2 -2
- datahub/cli/exists_cli.py +2 -1
- datahub/cli/get_cli.py +2 -1
- datahub/cli/iceberg_cli.py +6 -5
- datahub/cli/ingest_cli.py +9 -6
- datahub/cli/migrate.py +4 -3
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +3 -2
- datahub/cli/specific/assertions_cli.py +2 -1
- datahub/cli/specific/datacontract_cli.py +3 -2
- datahub/cli/specific/dataproduct_cli.py +10 -9
- datahub/cli/specific/dataset_cli.py +4 -3
- datahub/cli/specific/forms_cli.py +2 -1
- datahub/cli/specific/group_cli.py +2 -1
- datahub/cli/specific/structuredproperties_cli.py +4 -3
- datahub/cli/specific/user_cli.py +2 -1
- datahub/cli/state_cli.py +2 -1
- datahub/cli/timeline_cli.py +2 -1
- datahub/configuration/common.py +5 -0
- datahub/configuration/source_common.py +1 -1
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/request_helper.py +116 -3
- datahub/emitter/rest_emitter.py +163 -93
- datahub/entrypoints.py +2 -1
- datahub/errors.py +4 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +2 -1
- datahub/ingestion/api/source.py +2 -5
- datahub/ingestion/api/source_helpers.py +1 -0
- datahub/ingestion/glossary/classification_mixin.py +4 -2
- datahub/ingestion/graph/client.py +33 -8
- datahub/ingestion/graph/config.py +14 -0
- datahub/ingestion/graph/filters.py +1 -1
- datahub/ingestion/graph/links.py +53 -0
- datahub/ingestion/run/pipeline.py +9 -6
- datahub/ingestion/run/pipeline_config.py +1 -1
- datahub/ingestion/sink/datahub_rest.py +5 -6
- datahub/ingestion/source/apply/datahub_apply.py +2 -1
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery.py +24 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +4 -62
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +70 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +25 -24
- datahub/ingestion/source/common/subtypes.py +3 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +12 -11
- datahub/ingestion/source/dbt/dbt_cloud.py +2 -6
- datahub/ingestion/source/dbt/dbt_common.py +10 -2
- datahub/ingestion/source/dbt/dbt_core.py +82 -42
- datahub/ingestion/source/dynamodb/dynamodb.py +7 -4
- datahub/ingestion/source/feast.py +4 -4
- datahub/ingestion/source/fivetran/config.py +1 -1
- datahub/ingestion/source/fivetran/fivetran_log_api.py +7 -3
- datahub/ingestion/source/fivetran/fivetran_query.py +16 -16
- datahub/ingestion/source/ge_data_profiler.py +27 -1
- datahub/ingestion/source/hex/api.py +1 -20
- datahub/ingestion/source/hex/query_fetcher.py +4 -1
- datahub/ingestion/source/iceberg/iceberg.py +20 -4
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +17 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +1 -1
- datahub/ingestion/source/looker/looker_source.py +34 -5
- datahub/ingestion/source/looker/lookml_source.py +7 -1
- datahub/ingestion/source/metadata/lineage.py +2 -1
- datahub/ingestion/source/mlflow.py +19 -6
- datahub/ingestion/source/mode.py +74 -28
- datahub/ingestion/source/neo4j/neo4j_source.py +85 -55
- datahub/ingestion/source/powerbi/config.py +13 -1
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +153 -0
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/redshift/usage.py +10 -9
- datahub/ingestion/source/sigma/config.py +74 -6
- datahub/ingestion/source/sigma/sigma.py +16 -1
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +4 -52
- datahub/ingestion/source/snowflake/snowflake_config.py +2 -12
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -18
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +18 -4
- datahub/ingestion/source/snowflake/snowflake_query.py +9 -63
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/sql/athena.py +2 -1
- datahub/ingestion/source/sql/clickhouse.py +5 -1
- datahub/ingestion/source/sql/druid.py +7 -2
- datahub/ingestion/source/sql/hive.py +7 -2
- datahub/ingestion/source/sql/hive_metastore.py +5 -5
- datahub/ingestion/source/sql/mssql/source.py +1 -1
- datahub/ingestion/source/sql/oracle.py +6 -2
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sqlalchemy_uri.py +36 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +1 -1
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/tableau/tableau.py +31 -6
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +2 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +7 -3
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -3
- datahub/ingestion/source/vertexai/vertexai.py +316 -4
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +23 -2
- datahub/integrations/assertion/common.py +3 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +538 -493
- datahub/metadata/_urns/urn_defs.py +1819 -1763
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/schema.avsc +17296 -16883
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
- datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
- datahub/metadata/schemas/QueryProperties.avsc +4 -2
- datahub/metadata/schemas/SystemMetadata.avsc +86 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/_all_entities.py +4 -0
- datahub/sdk/_shared.py +142 -4
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/dataset.py +2 -2
- datahub/sdk/entity_client.py +8 -0
- datahub/sdk/lineage_client.py +235 -0
- datahub/sdk/main_client.py +6 -3
- datahub/sdk/mlmodel.py +301 -0
- datahub/sdk/mlmodelgroup.py +233 -0
- datahub/secret/datahub_secret_store.py +2 -1
- datahub/specific/dataset.py +12 -0
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +48 -34
- datahub/sql_parsing/sqlglot_utils.py +18 -14
- datahub/telemetry/telemetry.py +2 -2
- datahub/testing/check_imports.py +1 -1
- datahub/testing/mcp_diff.py +15 -2
- datahub/upgrade/upgrade.py +10 -12
- datahub/utilities/logging_manager.py +8 -1
- datahub/utilities/server_config_util.py +350 -10
- datahub/utilities/sqlalchemy_query_combiner.py +4 -5
- datahub/utilities/urn_encoder.py +1 -1
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/dbt/dbt_core.py CHANGED

@@ -1,3 +1,4 @@
+import dataclasses
 import json
 import logging
 import re
@@ -12,16 +13,15 @@ from pydantic import BaseModel, Field, validator
 
 from datahub.configuration.git import GitReference
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
+from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
-    capability,
     config_class,
     platform_name,
     support_status,
 )
 from datahub.ingestion.api.source import (
     CapabilityReport,
-    SourceCapability,
     TestableSource,
     TestConnectionReport,
 )
@@ -40,19 +40,28 @@ from datahub.ingestion.source.dbt.dbt_tests import DBTTest, DBTTestResult
 
 logger = logging.getLogger(__name__)
 
 
+@dataclasses.dataclass
+class DBTCoreReport(DBTSourceReport):
+    catalog_info: Optional[dict] = None
+    manifest_info: Optional[dict] = None
+
+
 class DBTCoreConfig(DBTCommonConfig):
     manifest_path: str = Field(
-        description="Path to dbt manifest JSON. See https://docs.getdbt.com/reference/artifacts/manifest-json …"
-        "…"
+        description="Path to dbt manifest JSON. See https://docs.getdbt.com/reference/artifacts/manifest-json. "
+        "This can be a local file or a URI."
     )
-    catalog_path: str = Field(
-        …
-        "…"
+    catalog_path: Optional[str] = Field(
+        None,
+        description="Path to dbt catalog JSON. See https://docs.getdbt.com/reference/artifacts/catalog-json. "
+        "This file is optional, but highly recommended. Without it, some metadata like column info will be incomplete or missing. "
+        "This can be a local file or a URI.",
     )
     sources_path: Optional[str] = Field(
         default=None,
-        description="Path to dbt sources JSON. See https://docs.getdbt.com/reference/artifacts/sources-json. …"
-        "specified, last-modified fields will not be populated. …"
+        description="Path to dbt sources JSON. See https://docs.getdbt.com/reference/artifacts/sources-json. "
+        "If not specified, last-modified fields will not be populated. "
+        "This can be a local file or a URI.",
     )
     run_results_paths: List[str] = Field(
         default=[],
@@ -161,7 +170,7 @@ def get_columns(
 
 def extract_dbt_entities(
     all_manifest_entities: Dict[str, Dict[str, Any]],
-    all_catalog_entities: Dict[str, Dict[str, Any]],
+    all_catalog_entities: Optional[Dict[str, Dict[str, Any]]],
     sources_results: List[Dict[str, Any]],
     manifest_adapter: str,
     use_identifiers: bool,
@@ -186,15 +195,6 @@ def extract_dbt_entities(
 ):
     name = manifest_node["alias"]
 
-    # initialize comment to "" for consistency with descriptions
-    # (since dbt null/undefined descriptions as "")
-    comment = ""
-
-    if key in all_catalog_entities and all_catalog_entities[key]["metadata"].get(
-        "comment"
-    ):
-        comment = all_catalog_entities[key]["metadata"]["comment"]
-
     materialization = None
     if "materialized" in manifest_node.get("config", {}):
         # It's a model
@@ -204,8 +204,9 @@ def extract_dbt_entities(
     if "depends_on" in manifest_node and "nodes" in manifest_node["depends_on"]:
         upstream_nodes = manifest_node["depends_on"]["nodes"]
 
-    catalog_node = …
-    …
+    catalog_node = (
+        all_catalog_entities.get(key) if all_catalog_entities is not None else None
+    )
     missing_from_catalog = catalog_node is None
     catalog_type = None
 
@@ -214,16 +215,23 @@ def extract_dbt_entities(
         # Test and ephemeral nodes will never show up in the catalog.
         missing_from_catalog = False
     else:
-        if not only_include_if_in_catalog:
+        if all_catalog_entities is not None and not only_include_if_in_catalog:
+            # If the catalog file is missing, we have already generated a general message.
             report.warning(
                 title="Node missing from catalog",
                 message="Found a node in the manifest file but not in the catalog. "
                 "This usually means the catalog file was not generated by `dbt docs generate` and so is incomplete. "
-                "Some metadata, …"
+                "Some metadata, particularly schema information, will be impacted.",
                 context=key,
             )
         else:
-            catalog_type = …
+            catalog_type = catalog_node["metadata"]["type"]
+
+    # initialize comment to "" for consistency with descriptions
+    # (since dbt null/undefined descriptions as "")
+    comment = ""
+    if catalog_node is not None and catalog_node.get("metadata", {}).get("comment"):
+        comment = catalog_node["metadata"]["comment"]
 
     query_tag_props = manifest_node.get("query_tag", {})
 
@@ -231,12 +239,15 @@ def extract_dbt_entities(
 
     owner = meta.get("owner")
     if owner is None:
-        owner = manifest_node.get("config", {}).get("meta"…
+        owner = (manifest_node.get("config", {}).get("meta") or {}).get("owner")
+
+    if not meta:
+        # On older versions of dbt, the meta field was nested under config
+        # for some node types.
+        meta = manifest_node.get("config", {}).get("meta") or {}
 
     tags = manifest_node.get("tags", [])
     tags = [tag_prefix + tag for tag in tags]
-    if not meta:
-        meta = manifest_node.get("config", {}).get("meta", {})
 
     max_loaded_at_str = sources_by_id.get(key, {}).get("max_loaded_at")
     max_loaded_at = None
@@ -453,15 +464,18 @@ def load_run_results(
 @platform_name("dbt")
 @config_class(DBTCoreConfig)
 @support_status(SupportStatus.CERTIFIED)
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
-@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 class DBTCoreSource(DBTSourceBase, TestableSource):
     config: DBTCoreConfig
+    report: DBTCoreReport
+
+    def __init__(self, config: DBTCommonConfig, ctx: PipelineContext):
+        super().__init__(config, ctx)
+        self.report = DBTCoreReport()
 
     @classmethod
     def create(cls, config_dict, ctx):
         config = DBTCoreConfig.parse_obj(config_dict)
-        return cls(config, ctx…
+        return cls(config, ctx)
 
     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:
@@ -471,9 +485,10 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
             DBTCoreSource.load_file_as_json(
                 source_config.manifest_path, source_config.aws_connection
            )
-            DBTCoreSource.load_file_as_json(
-                source_config.catalog_path, source_config.aws_connection
-            )
+            if source_config.catalog_path is not None:
+                DBTCoreSource.load_file_as_json(
+                    source_config.catalog_path, source_config.aws_connection
+                )
             test_report.basic_connectivity = CapabilityReport(capable=True)
         except Exception as e:
             test_report.basic_connectivity = CapabilityReport(
@@ -511,11 +526,31 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
         dbt_manifest_json = self.load_file_as_json(
             self.config.manifest_path, self.config.aws_connection
         )
-        …
-        …
-        …
+        dbt_manifest_metadata = dbt_manifest_json["metadata"]
+        self.report.manifest_info = dict(
+            generated_at=dbt_manifest_metadata.get("generated_at", "unknown"),
+            dbt_version=dbt_manifest_metadata.get("dbt_version", "unknown"),
+            project_name=dbt_manifest_metadata.get("project_name", "unknown"),
         )
 
+        dbt_catalog_json = None
+        dbt_catalog_metadata = None
+        if self.config.catalog_path is not None:
+            dbt_catalog_json = self.load_file_as_json(
+                self.config.catalog_path, self.config.aws_connection
+            )
+            dbt_catalog_metadata = dbt_catalog_json.get("metadata", {})
+            self.report.catalog_info = dict(
+                generated_at=dbt_catalog_metadata.get("generated_at", "unknown"),
+                dbt_version=dbt_catalog_metadata.get("dbt_version", "unknown"),
+                project_name=dbt_catalog_metadata.get("project_name", "unknown"),
+            )
+        else:
+            self.report.warning(
+                title="No catalog file configured",
+                message="Some metadata, particularly schema information, will be missing.",
+            )
+
         if self.config.sources_path is not None:
             dbt_sources_json = self.load_file_as_json(
                 self.config.sources_path, self.config.aws_connection
@@ -528,18 +563,23 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
         manifest_version = dbt_manifest_json["metadata"].get("dbt_version")
         manifest_adapter = dbt_manifest_json["metadata"].get("adapter_type")
 
-        catalog_schema = …
-        catalog_version = …
+        catalog_schema = None
+        catalog_version = None
+        if dbt_catalog_metadata is not None:
+            catalog_schema = dbt_catalog_metadata.get("dbt_schema_version")
+            catalog_version = dbt_catalog_metadata.get("dbt_version")
 
         manifest_nodes = dbt_manifest_json["nodes"]
         manifest_sources = dbt_manifest_json["sources"]
 
         all_manifest_entities = {**manifest_nodes, **manifest_sources}
 
-        catalog_nodes = dbt_catalog_json["nodes"]
-        catalog_sources = dbt_catalog_json["sources"]
+        all_catalog_entities = None
+        if dbt_catalog_json is not None:
+            catalog_nodes = dbt_catalog_json["nodes"]
+            catalog_sources = dbt_catalog_json["sources"]
 
-        all_catalog_entities = {**catalog_nodes, **catalog_sources}
+            all_catalog_entities = {**catalog_nodes, **catalog_sources}
 
         nodes = extract_dbt_entities(
             all_manifest_entities=all_manifest_entities,
@@ -590,7 +630,7 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
             )
         except Exception as e:
             self.report.info(
-                title="…"
+                title="dbt Catalog Version",
                 message="Failed to determine the catalog version",
                 exc=e,
             )
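The practical upshot of the `catalog_path` change is that a dbt-core recipe can now omit the catalog file entirely. A minimal sketch of the new behavior, assuming `manifest_path` and `target_platform` are the only required fields in your setup:

import datahub.ingestion.source.dbt.dbt_core as dbt_core

# catalog_path is now Optional[str] with a default of None, so this parses
# cleanly on 1.0.0.3 (it would fail validation on versions where the field
# was required).
config = dbt_core.DBTCoreConfig.parse_obj(
    {
        "manifest_path": "target/manifest.json",
        "target_platform": "snowflake",
    }
)
assert config.catalog_path is None
# At ingestion time, the source emits a "No catalog file configured" warning
# and proceeds with incomplete schema metadata.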
datahub/ingestion/source/dynamodb/dynamodb.py CHANGED

@@ -474,6 +474,7 @@ class DynamoDBSource(StatefulIngestionSourceBase):
         dataset_properties.customProperties["schema.downsampled"] = "True"
         dataset_properties.customProperties["schema.totalFields"] = f"{schema_size}"
         # append each schema field, schema will be sorted by count descending and delimited_name ascending and sliced to only include MAX_SCHEMA_SIZE items
+        primary_keys = []
         for schema_field in sorted(
             table_fields,
             key=lambda x: (
@@ -484,22 +485,23 @@ class DynamoDBSource(StatefulIngestionSourceBase):
             field_path = schema_field["delimited_name"]
             native_data_type = self.get_native_type(schema_field["type"], table_name)
             type = self.get_field_type(schema_field["type"], table_name)
-            description = None
             nullable = True
             if field_path in primary_key_dict:
-                …
+                # primary key should not be nullable
+                type_key = (
                     "Partition Key"
                     if primary_key_dict.get(field_path) == "HASH"
                     else "Sort Key"
                 )
-                …
+                dataset_properties.customProperties[type_key] = field_path
                 nullable = False
+                primary_keys.append(field_path)
 
             field = SchemaField(
                 fieldPath=field_path,
                 nativeDataType=native_data_type,
                 type=type,
-                description=…
+                description=None,
                 nullable=nullable,
                 recursive=False,
             )
@@ -513,6 +515,7 @@ class DynamoDBSource(StatefulIngestionSourceBase):
             hash="",
             platformSchema=SchemalessClass(),
             fields=canonical_schema,
+            primaryKeys=primary_keys,
         )
         return schema_metadata
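For reference, here is a hedged sketch of where the collected keys end up: `SchemaMetadata.primaryKeys` is a plain list of field paths, alongside the per-field nullability flag. The table and field names below are made up.

from datahub.metadata.schema_classes import (
    SchemaFieldClass,
    SchemaFieldDataTypeClass,
    SchemalessClass,
    SchemaMetadataClass,
    StringTypeClass,
)

# Hypothetical single-field table with a partition key named "order_id".
schema_metadata = SchemaMetadataClass(
    schemaName="orders",
    platform="urn:li:dataPlatform:dynamodb",
    version=0,
    hash="",
    platformSchema=SchemalessClass(),
    fields=[
        SchemaFieldClass(
            fieldPath="order_id",
            nativeDataType="S",
            type=SchemaFieldDataTypeClass(type=StringTypeClass()),
            nullable=False,  # primary keys are marked non-nullable
        )
    ],
    primaryKeys=["order_id"],  # the list the new code accumulates
)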
datahub/ingestion/source/feast.py CHANGED

@@ -135,10 +135,10 @@ class FeastRepositorySource(StatefulIngestionSourceBase):
     """
     This plugin extracts:
 
-    - Entities as [`MLPrimaryKey`](https://…)
-    - Fields as [`MLFeature`](https://…)
-    - Feature views and on-demand feature views as [`MLFeatureTable`](https://…)
-    - Batch and stream source details as [`Dataset`](https://…)
+    - Entities as [`MLPrimaryKey`](https://docs.datahub.com/docs/graphql/objects#mlprimarykey)
+    - Fields as [`MLFeature`](https://docs.datahub.com/docs/graphql/objects#mlfeature)
+    - Feature views and on-demand feature views as [`MLFeatureTable`](https://docs.datahub.com/docs/graphql/objects#mlfeaturetable)
+    - Batch and stream source details as [`Dataset`](https://docs.datahub.com/docs/graphql/objects#dataset)
     - Column types associated with each entity and feature
     """
datahub/ingestion/source/fivetran/config.py CHANGED

@@ -16,7 +16,7 @@ from datahub.configuration.source_common import DatasetSourceConfigMixin
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.emitter.mce_builder import DEFAULT_ENV
 from datahub.ingestion.api.report import Report
-from datahub.ingestion.source.bigquery_v2.…
+from datahub.ingestion.source.bigquery_v2.bigquery_connection import (
     BigQueryConnectionConfig,
 )
 from datahub.ingestion.source.snowflake.snowflake_connection import (
datahub/ingestion/source/fivetran/fivetran_log_api.py CHANGED

@@ -54,7 +54,7 @@ class FivetranLogAPI:
                     snowflake_destination_config.database,
                 )
             )
-            fivetran_log_query.set_db(
+            fivetran_log_query.set_schema(
                 snowflake_destination_config.log_schema,
             )
             fivetran_log_database = snowflake_destination_config.database
@@ -66,8 +66,12 @@ class FivetranLogAPI:
             engine = create_engine(
                 bigquery_destination_config.get_sql_alchemy_url(),
             )
-            fivetran_log_query.set_db(…
-            …
+            fivetran_log_query.set_schema(bigquery_destination_config.dataset)
+
+            # The "database" should be the BigQuery project name.
+            fivetran_log_database = engine.execute(
+                "SELECT @@project_id"
+            ).fetchone()[0]
         else:
             raise ConfigurationError(
                 f"Destination platform '{destination_platform}' is not yet supported."
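The BigQuery branch now derives the log "database" from the running project. A standalone sketch of that lookup, assuming SQLAlchemy 1.x (where `Engine.execute` still exists) and a hypothetical project URL:

from sqlalchemy import create_engine

# Hypothetical BigQuery connection URL.
engine = create_engine("bigquery://my-gcp-project")

# @@project_id is a BigQuery system variable; the first column of the first
# row is the project name that stands in for the database.
fivetran_log_database = engine.execute("SELECT @@project_id").fetchone()[0]
print(fivetran_log_database)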
datahub/ingestion/source/fivetran/fivetran_query.py CHANGED

@@ -12,14 +12,14 @@ class FivetranLogQuery:
 
     def __init__(self) -> None:
         # Select query db clause
-        self.db_clause: str = ""
-
-    def set_db(self, db_name: str) -> None:
-        self.db_clause = f"{db_name}."
+        self.schema_clause: str = ""
 
     def use_database(self, db_name: str) -> str:
         return f"use database {db_name}"
 
+    def set_schema(self, schema_name: str) -> None:
+        self.schema_clause = f"{schema_name}."
+
     def get_connectors_query(self) -> str:
         return f"""\
 SELECT
@@ -30,7 +30,7 @@ SELECT
   paused,
   sync_frequency,
   destination_id
-FROM {self.db_clause}connector
+FROM {self.schema_clause}connector
 WHERE
   _fivetran_deleted = FALSE
 QUALIFY ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY _fivetran_synced DESC) = 1
@@ -42,7 +42,7 @@ SELECT id as user_id,
   given_name,
   family_name,
   email
-FROM {self.db_clause}user
+FROM {self.schema_clause}user
 """
 
     def get_sync_logs_query(
@@ -62,7 +62,7 @@ WITH ranked_syncs AS (
         MAX(CASE WHEN message_event = 'sync_end' THEN time_stamp END) as end_time,
         MAX(CASE WHEN message_event = 'sync_end' THEN message_data END) as end_message_data,
         ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY MAX(time_stamp) DESC) as rn
-    FROM {self.db_clause}log
+    FROM {self.schema_clause}log
     WHERE message_event in ('sync_start', 'sync_end')
     AND time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days'
     AND connector_id IN ({formatted_connector_ids})
@@ -99,11 +99,11 @@ FROM (
         dsm.name as destination_schema_name,
         tl.created_at as created_at,
         ROW_NUMBER() OVER (PARTITION BY stm.connector_id, stm.id, dtm.id ORDER BY tl.created_at DESC) as table_combo_rn
-    FROM {self.db_clause}table_lineage as tl
-    JOIN {self.db_clause}source_table_metadata as stm on tl.source_table_id = stm.id
-    JOIN {self.db_clause}destination_table_metadata as dtm on tl.destination_table_id = dtm.id
-    JOIN {self.db_clause}source_schema_metadata as ssm on stm.schema_id = ssm.id
-    JOIN {self.db_clause}destination_schema_metadata as dsm on dtm.schema_id = dsm.id
+    FROM {self.schema_clause}table_lineage as tl
+    JOIN {self.schema_clause}source_table_metadata as stm on tl.source_table_id = stm.id
+    JOIN {self.schema_clause}destination_table_metadata as dtm on tl.destination_table_id = dtm.id
+    JOIN {self.schema_clause}source_schema_metadata as ssm on stm.schema_id = ssm.id
+    JOIN {self.schema_clause}destination_schema_metadata as dsm on dtm.schema_id = dsm.id
     WHERE stm.connector_id IN ({formatted_connector_ids})
 )
 -- Ensure that we only get back one entry per source and destination pair.
@@ -131,13 +131,13 @@ FROM (
         dcm.name as destination_column_name,
         cl.created_at as created_at,
         ROW_NUMBER() OVER (PARTITION BY stm.connector_id, cl.source_column_id, cl.destination_column_id ORDER BY cl.created_at DESC) as column_combo_rn
-    FROM {self.db_clause}column_lineage as cl
-    JOIN {self.db_clause}source_column_metadata as scm
+    FROM {self.schema_clause}column_lineage as cl
+    JOIN {self.schema_clause}source_column_metadata as scm
         ON cl.source_column_id = scm.id
-    JOIN {self.db_clause}destination_column_metadata as dcm
+    JOIN {self.schema_clause}destination_column_metadata as dcm
         ON cl.destination_column_id = dcm.id
     -- Only joining source_table_metadata to get the connector_id.
-    JOIN {self.db_clause}source_table_metadata as stm
+    JOIN {self.schema_clause}source_table_metadata as stm
         ON scm.table_id = stm.id
     WHERE stm.connector_id IN ({formatted_connector_ids})
 )
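Since `set_db` is gone, callers qualify tables with a schema clause instead. A short usage sketch of the renamed API:

from datahub.ingestion.source.fivetran.fivetran_query import FivetranLogQuery

query_builder = FivetranLogQuery()
query_builder.set_schema("fivetran_log")

# Every generated query now reads "FROM fivetran_log.<table> ...".
print(query_builder.get_connectors_query())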
datahub/ingestion/source/ge_data_profiler.py CHANGED

@@ -5,6 +5,7 @@ import concurrent.futures
 import contextlib
 import dataclasses
 import functools
+import importlib.metadata
 import json
 import logging
 import re
@@ -51,6 +52,7 @@ from typing_extensions import Concatenate, ParamSpec
 from datahub.emitter import mce_builder
 from datahub.emitter.mce_builder import get_sys_time
 from datahub.ingestion.graph.client import get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
 from datahub.ingestion.source.profiling.common import (
     Cardinality,
@@ -83,6 +85,30 @@ if TYPE_CHECKING:
     from pyathena.cursor import Cursor
 
 assert MARKUPSAFE_PATCHED
+
+# We need to ensure that acryl-great-expectations is installed
+# and great-expectations is not installed.
+try:
+    acryl_gx_version = bool(importlib.metadata.distribution("acryl-great-expectations"))
+except importlib.metadata.PackageNotFoundError:
+    acryl_gx_version = False
+
+try:
+    original_gx_version = bool(importlib.metadata.distribution("great-expectations"))
+except importlib.metadata.PackageNotFoundError:
+    original_gx_version = False
+
+if acryl_gx_version and original_gx_version:
+    raise RuntimeError(
+        "acryl-great-expectations and great-expectations cannot both be installed because their files will conflict. "
+        "You will need to (1) uninstall great-expectations and (2) re-install acryl-great-expectations. "
+        "See https://github.com/pypa/pip/issues/4625."
+    )
+elif original_gx_version:
+    raise RuntimeError(
+        "We expect acryl-great-expectations to be installed, but great-expectations is installed instead."
+    )
+
 logger: logging.Logger = logging.getLogger(__name__)
 
 _original_get_column_median = SqlAlchemyDataset.get_column_median
@@ -1569,7 +1595,7 @@ def _get_columns_to_ignore_sampling(
         name=dataset_name, platform=platform, env=env
     )
 
-    datahub_graph = get_default_graph()
+    datahub_graph = get_default_graph(ClientMode.INGESTION)
 
     dataset_tags = datahub_graph.get_tags(dataset_urn)
     if dataset_tags:
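The guard above relies on `importlib.metadata.distribution` raising `PackageNotFoundError` for absent distributions. A self-contained sketch of the same probe:

import importlib.metadata


def is_installed(dist_name: str) -> bool:
    # distribution() raises PackageNotFoundError if dist_name is not installed.
    try:
        return bool(importlib.metadata.distribution(dist_name))
    except importlib.metadata.PackageNotFoundError:
        return False


print(is_installed("acryl-great-expectations"))
print(is_installed("great-expectations"))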
datahub/ingestion/source/hex/api.py CHANGED

@@ -27,6 +27,7 @@ logger = logging.getLogger(__name__)
 
 # The following models were Claude-generated from Hex API OpenAPI definition https://static.hex.site/openapi.json
 # To be exclusively used internally for the deserialization of the API response
+# Model is incomplete and fields may have not been mapped if not used in the ingestion
 
 
 class HexApiAppViewStats(BaseModel):
@@ -83,20 +84,10 @@ class HexApiUser(BaseModel):
     email: str
 
 
-class HexApiAccessType(StrEnum):
-    """Access type enum."""
-
-    NONE = "NONE"
-    VIEW = "VIEW"
-    EDIT = "EDIT"
-    FULL_ACCESS = "FULL_ACCESS"
-
-
 class HexApiUserAccess(BaseModel):
     """User access model."""
 
     user: HexApiUser
-    access: Optional[HexApiAccessType] = None
 
 
 class HexApiCollectionData(BaseModel):
@@ -109,13 +100,6 @@ class HexApiCollectionAccess(BaseModel):
     """Collection access model."""
 
     collection: HexApiCollectionData
-    access: Optional[HexApiAccessType] = None
-
-
-class HexApiAccessSettings(BaseModel):
-    """Access settings model."""
-
-    access: Optional[HexApiAccessType] = None
 
 
 class HexApiWeeklySchedule(BaseModel):
@@ -145,9 +129,6 @@ class HexApiSharing(BaseModel):
     users: Optional[List[HexApiUserAccess]] = []
     collections: Optional[List[HexApiCollectionAccess]] = []
     groups: Optional[List[Any]] = []
-    workspace: Optional[HexApiAccessSettings] = None
-    public_web: Optional[HexApiAccessSettings] = Field(default=None, alias="publicWeb")
-    support: Optional[HexApiAccessSettings] = None
 
     class Config:
         extra = "ignore"  # Allow extra fields in the JSON
datahub/ingestion/source/hex/query_fetcher.py CHANGED

@@ -18,7 +18,8 @@ from datahub.utilities.time import datetime_to_ts_millis
 logger = logging.getLogger(__name__)
 
 # Pattern to extract both project_id and workspace_name from Hex metadata in SQL comments
-HEX_METADATA_PATTERN = …
+# Only match metadata with "context": "SCHEDULED_RUN" to filter out non-scheduled runs
+HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"context": "SCHEDULED_RUN".*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'
 
 
 @dataclass
@@ -39,6 +40,7 @@ class HexQueryFetcherReport(SourceReport):
     fetched_query_objects: int = 0
     filtered_out_queries_missing_metadata: int = 0
     filtered_out_queries_different_workspace: int = 0
+    filtered_out_queries_no_match: int = 0
     filtered_out_queries_no_subjects: int = 0
     total_queries: int = 0
     total_dataset_subjects: int = 0
@@ -210,6 +212,7 @@ class HexQueryFetcher:
         match = re.search(HEX_METADATA_PATTERN, sql_statement)
 
         if not match:
+            self.report.filtered_out_queries_no_match += 1
             return None
 
         try:
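To see what the tightened pattern accepts, here is a quick check against a made-up Hex metadata comment (the JSON payload below is hypothetical, but shaped the way the regex expects):

import re

HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"context": "SCHEDULED_RUN".*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'

sql = (
    'SELECT 1 -- Hex query metadata: {"context": "SCHEDULED_RUN", '
    '"project_id": "abc-123", '
    '"project_url": "https://app.hex.tech/my-workspace/hex/abc-123"}'
)

match = re.search(HEX_METADATA_PATTERN, sql)
assert match is not None
print(match.group(1))  # abc-123 (project_id)
print(match.group(2))  # my-workspace (workspace name from the URL)

# A comment with a different (hypothetical) context string no longer matches,
# which is what the new filtered_out_queries_no_match counter records.
assert re.search(HEX_METADATA_PATTERN, sql.replace("SCHEDULED_RUN", "LOGIC_VIEW")) is None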
datahub/ingestion/source/iceberg/iceberg.py CHANGED

@@ -16,7 +16,7 @@ from pyiceberg.exceptions import (
 )
 from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit
 from pyiceberg.table import Table
-from pyiceberg.typedef import Identifier
+from pyiceberg.typedef import Identifier, Properties
 from pyiceberg.types import (
     BinaryType,
     BooleanType,
@@ -387,8 +387,13 @@ class IcebergSource(StatefulIngestionSourceBase):
                 env=self.config.env,
             )
         )
+        namespace_properties: Properties = catalog.load_namespace_properties(
+            namespace
+        )
         namespaces.append((namespace, namespace_urn))
-        for aspect in self._create_iceberg_namespace_aspects(namespace):
+        for aspect in self._create_iceberg_namespace_aspects(
+            namespace, namespace_properties
+        ):
             yield stamping_processor.stamp_wu(
                 MetadataChangeProposalWrapper(
                     entityUrn=namespace_urn, aspect=aspect
@@ -608,12 +613,23 @@ class IcebergSource(StatefulIngestionSourceBase):
         return self.report
 
     def _create_iceberg_namespace_aspects(
-        self, namespace: Identifier
+        self, namespace: Identifier, properties: Properties
     ) -> Iterable[_Aspect]:
         namespace_repr = ".".join(namespace)
+        custom_properties: Dict[str, str] = {}
+        for k, v in properties.items():
+            try:
+                custom_properties[str(k)] = str(v)
+            except Exception as e:
+                LOGGER.warning(
+                    f"Exception when trying to parse namespace properties for {namespace_repr}. Exception: {e}"
+                )
         yield Status(removed=False)
         yield ContainerProperties(
-            name=namespace_repr,
+            name=namespace_repr,
+            qualifiedName=namespace_repr,
+            env=self.config.env,
+            customProperties=custom_properties,
         )
         yield SubTypes(typeNames=[DatasetContainerSubTypes.NAMESPACE])
         dpi = self._get_dataplatform_instance_aspect()
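`load_namespace_properties` is part of pyiceberg's catalog API. A brief sketch of the call and the stringification applied before the values reach `customProperties` (the catalog name and URI below are hypothetical):

from pyiceberg.catalog import load_catalog

# Hypothetical REST catalog.
catalog = load_catalog("demo", uri="http://localhost:8181")

props = catalog.load_namespace_properties(("analytics",))
# Keys and values are coerced to str, mirroring the loop in the diff above.
custom_properties = {str(k): str(v) for k, v in props.items()}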
datahub/ingestion/source/iceberg/iceberg_common.py CHANGED

@@ -40,11 +40,11 @@ class TimeoutHTTPAdapter(HTTPAdapter):
             del kwargs["timeout"]
         super().__init__(*args, **kwargs)
 
-    def send(self, request, **kwargs):
+    def send(self, request, *args, **kwargs):
         timeout = kwargs.get("timeout")
         if timeout is None and hasattr(self, "timeout"):
             kwargs["timeout"] = self.timeout
-        return super().send(request, **kwargs)
+        return super().send(request, *args, **kwargs)
 
 
 class IcebergProfilingConfig(ConfigModel):
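A usage sketch for the adapter: the `*args` pass-through keeps `send()` compatible with callers that pass positional arguments, while the stored default timeout still applies when none is given (the URL below is a placeholder):

import requests

from datahub.ingestion.source.iceberg.iceberg_common import TimeoutHTTPAdapter

session = requests.Session()
# TimeoutHTTPAdapter stores the "timeout" kwarg and injects it into send().
session.mount("https://", TimeoutHTTPAdapter(timeout=30))

# No explicit timeout here, so the adapter's 30s default is used.
response = session.get("https://example.com")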
datahub/ingestion/source/ldap.py CHANGED