acryl-datahub 1.0.0.3rc9__py3-none-any.whl → 1.0.0.3rc11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/METADATA +2524 -2471
- {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/RECORD +87 -87
- {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/datajob/dataflow.py +3 -3
- datahub/api/entities/forms/forms.py +34 -34
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/operation.py +4 -4
- datahub/cli/check_cli.py +3 -2
- datahub/cli/config_utils.py +2 -2
- datahub/cli/delete_cli.py +6 -5
- datahub/cli/docker_cli.py +2 -2
- datahub/cli/exists_cli.py +2 -1
- datahub/cli/get_cli.py +2 -1
- datahub/cli/iceberg_cli.py +6 -5
- datahub/cli/ingest_cli.py +9 -6
- datahub/cli/migrate.py +4 -3
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +3 -2
- datahub/cli/specific/assertions_cli.py +2 -1
- datahub/cli/specific/datacontract_cli.py +3 -2
- datahub/cli/specific/dataproduct_cli.py +10 -9
- datahub/cli/specific/dataset_cli.py +4 -3
- datahub/cli/specific/forms_cli.py +2 -1
- datahub/cli/specific/group_cli.py +2 -1
- datahub/cli/specific/structuredproperties_cli.py +4 -3
- datahub/cli/specific/user_cli.py +2 -1
- datahub/cli/state_cli.py +2 -1
- datahub/cli/timeline_cli.py +2 -1
- datahub/configuration/source_common.py +1 -1
- datahub/emitter/request_helper.py +116 -3
- datahub/emitter/rest_emitter.py +163 -93
- datahub/entrypoints.py +2 -1
- datahub/ingestion/api/source.py +2 -5
- datahub/ingestion/glossary/classification_mixin.py +4 -2
- datahub/ingestion/graph/client.py +16 -7
- datahub/ingestion/graph/config.py +14 -0
- datahub/ingestion/graph/filters.py +1 -1
- datahub/ingestion/run/pipeline.py +3 -2
- datahub/ingestion/run/pipeline_config.py +1 -1
- datahub/ingestion/sink/datahub_rest.py +5 -6
- datahub/ingestion/source/apply/datahub_apply.py +2 -1
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery.py +24 -23
- datahub/ingestion/source/cassandra/cassandra_profiling.py +25 -24
- datahub/ingestion/source/datahub/datahub_database_reader.py +12 -11
- datahub/ingestion/source/dbt/dbt_cloud.py +2 -6
- datahub/ingestion/source/dbt/dbt_common.py +10 -2
- datahub/ingestion/source/dbt/dbt_core.py +82 -42
- datahub/ingestion/source/feast.py +4 -4
- datahub/ingestion/source/ge_data_profiler.py +2 -1
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_lib_wrapper.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +7 -1
- datahub/ingestion/source/metadata/lineage.py +2 -1
- datahub/ingestion/source/mode.py +74 -28
- datahub/ingestion/source/neo4j/neo4j_source.py +85 -55
- datahub/ingestion/source/powerbi/config.py +1 -1
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/redshift/usage.py +10 -9
- datahub/ingestion/source/sql/clickhouse.py +5 -1
- datahub/ingestion/source/sql/druid.py +7 -2
- datahub/ingestion/source/sql/oracle.py +6 -2
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +7 -3
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -3
- datahub/integrations/assertion/common.py +3 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +490 -490
- datahub/metadata/_urns/urn_defs.py +1786 -1786
- datahub/metadata/schema.avsc +17364 -16988
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/main_client.py +2 -2
- datahub/secret/datahub_secret_store.py +2 -1
- datahub/telemetry/telemetry.py +2 -2
- datahub/testing/check_imports.py +1 -1
- datahub/upgrade/upgrade.py +10 -12
- datahub/utilities/logging_manager.py +8 -1
- datahub/utilities/server_config_util.py +378 -10
- datahub/utilities/sqlalchemy_query_combiner.py +4 -5
- datahub/utilities/urn_encoder.py +1 -1
- {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/apply/datahub_apply.py CHANGED

@@ -18,6 +18,7 @@ from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source, Sour
 from datahub.ingestion.api.source_helpers import auto_workunit_reporter
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.metadata.schema_classes import (
     DomainsClass,
     GlossaryTermAssociationClass,
@@ -48,7 +49,7 @@ def apply_association_to_container(
     """
     urns: List[str] = [container_urn]
     if not graph:
-        graph = get_default_graph()
+        graph = get_default_graph(ClientMode.INGESTION)
     logger.info(f"Using {graph}")
     urns.extend(
         graph.get_urns_by_filter(
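A recurring change in this release is that call sites which previously called get_default_graph() now pass an explicit ClientMode, so the DataHub server can attribute API traffic to ingestion rather than generic SDK or CLI usage. A minimal sketch of the new call shape, assuming connection details are already configured (e.g. via `datahub init` or environment variables):

from datahub.ingestion.graph.client import get_default_graph
from datahub.ingestion.graph.config import ClientMode

# ClientMode tags outgoing requests so the server can distinguish
# ingestion traffic from ad-hoc SDK or CLI calls.
graph = get_default_graph(ClientMode.INGESTION)
print(f"Using {graph}")  # mirrors the logger.info call in the hunk above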
datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py CHANGED

@@ -205,7 +205,7 @@ class FeatureGroupProcessor:
             textwrap.dedent(
                 f"""Note: table {full_table_name} is an AWS Glue object. This source does not ingest all metadata for Glue tables.
                 To view full table metadata, run Glue ingestion
-                (see https://
+                (see https://docs.datahub.com/docs/generated/ingestion/sources/glue)"""
             )
         )

datahub/ingestion/source/bigquery_v2/bigquery.py CHANGED

@@ -270,29 +270,30 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
            ):
                return

-            with self.report.new_stage(
+            with self.report.new_stage(
+                f"*: {QUERIES_EXTRACTION}"
+            ), BigQueryQueriesExtractor(
+                connection=self.config.get_bigquery_client(),
+                schema_api=self.bq_schema_extractor.schema_api,
+                config=BigQueryQueriesExtractorConfig(
+                    window=self.config,
+                    user_email_pattern=self.config.usage.user_email_pattern,
+                    include_lineage=self.config.include_table_lineage,
+                    include_usage_statistics=self.config.include_usage_statistics,
+                    include_operations=self.config.usage.include_operational_stats,
+                    include_queries=self.config.include_queries,
+                    include_query_usage_statistics=self.config.include_query_usage_statistics,
+                    top_n_queries=self.config.usage.top_n_queries,
+                    region_qualifiers=self.config.region_qualifiers,
+                ),
+                structured_report=self.report,
+                filters=self.filters,
+                identifiers=self.identifiers,
+                schema_resolver=self.sql_parser_schema_resolver,
+                discovered_tables=self.bq_schema_extractor.table_refs,
+            ) as queries_extractor:
+                self.report.queries_extractor = queries_extractor.report
+                yield from queries_extractor.get_workunits_internal()
        else:
            if self.config.include_usage_statistics:
                yield from self.usage_extractor.get_usage_workunits(
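The rewrite above folds what used to be separate context managers into a single with statement; Python enters the comma-separated managers left to right and exits them in reverse order, so the extractor is always closed before the report stage ends. A generic sketch of the same shape, where stage() and Extractor are stand-ins for report.new_stage and BigQueryQueriesExtractor:

from contextlib import contextmanager

@contextmanager
def stage(name: str):
    # stand-in for self.report.new_stage(...): times a named phase
    print(f"enter {name}")
    try:
        yield
    finally:
        print(f"exit {name}")

class Extractor:
    # stand-in for BigQueryQueriesExtractor: a context manager that
    # owns resources and yields itself
    def __enter__(self):
        return self

    def __exit__(self, *exc):
        return False

    def get_workunits_internal(self):
        yield "workunit"

# entered left to right, exited right to left
with stage("*: queries extraction"), Extractor() as extractor:
    for wu in extractor.get_workunits_internal():
        print(wu)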
datahub/ingestion/source/cassandra/cassandra_profiling.py CHANGED

@@ -70,30 +70,31 @@ class CassandraProfiler:
     ) -> Iterable[MetadataWorkUnit]:
         for keyspace_name in cassandra_data.keyspaces:
             tables = cassandra_data.tables.get(keyspace_name, [])
-            with self.report.new_stage(
-            )
+            with self.report.new_stage(
+                f"{keyspace_name}: {PROFILING}"
+            ), ThreadPoolExecutor(
+                max_workers=self.config.profiling.max_workers
+            ) as executor:
+                future_to_dataset = {
+                    executor.submit(
+                        self.generate_profile,
+                        keyspace_name,
+                        table_name,
+                        cassandra_data.columns.get(table_name, []),
+                    ): table_name
+                    for table_name in tables
+                }
+                for future in as_completed(future_to_dataset):
+                    table_name = future_to_dataset[future]
+                    try:
+                        yield from future.result()
+                    except Exception as exc:
+                        self.report.profiling_skipped_other[table_name] += 1
+                        self.report.failure(
+                            message="Failed to profile for table",
+                            context=f"{keyspace_name}.{table_name}",
+                            exc=exc,
+                        )

     def generate_profile(
         self,
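The profiling loop now fans each table out to a thread pool and yields results in completion order, recording failures per table instead of aborting the whole keyspace. A self-contained sketch of the submit/as_completed pattern used above, with profile() standing in for generate_profile:

from concurrent.futures import ThreadPoolExecutor, as_completed

def profile(keyspace: str, table: str) -> str:
    # stand-in for generate_profile(); the real method yields workunits
    return f"profiled {keyspace}.{table}"

tables = ["t1", "t2", "t3"]
with ThreadPoolExecutor(max_workers=2) as executor:
    future_to_table = {
        executor.submit(profile, "ks", table): table for table in tables
    }
    for future in as_completed(future_to_table):
        table = future_to_table[future]
        try:
            print(future.result())
        except Exception as exc:
            # mirrors report.failure(...): record the error and keep going
            print(f"failed to profile ks.{table}: {exc}")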
datahub/ingestion/source/datahub/datahub_database_reader.py CHANGED

@@ -195,17 +195,18 @@ class DataHubDatabaseReader:
         Yields:
             Row objects containing URNs of soft-deleted entities
         """
-        with self.engine.connect() as conn
+        with self.engine.connect() as conn, contextlib.closing(
+            conn.connection.cursor()
+        ) as cursor:
+            logger.debug("Polling soft-deleted urns from database")
+            cursor.execute(self.soft_deleted_urns_query)
+            columns = [desc[0] for desc in cursor.description]
+            while True:
+                rows = cursor.fetchmany(self.config.database_query_batch_size)
+                if not rows:
+                    return
+                for row in rows:
+                    yield dict(zip(columns, row))

     def _parse_row(
         self, row: Dict[str, Any]
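The reader now streams soft-deleted URNs through a raw DBAPI cursor in fixed-size batches rather than materializing the full result set, and contextlib.closing guarantees the cursor is released even on error. A runnable sketch of the same pattern against sqlite3 (the table name and query are illustrative only):

import contextlib
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE metadata_aspect_v2 (urn TEXT, removed INT)")
conn.executemany(
    "INSERT INTO metadata_aspect_v2 VALUES (?, ?)",
    [("urn:li:dataset:1", 1), ("urn:li:dataset:2", 1)],
)

def iter_rows(batch_size: int = 1000):
    # contextlib.closing ensures the cursor is closed when iteration stops
    with contextlib.closing(conn.cursor()) as cursor:
        cursor.execute("SELECT urn FROM metadata_aspect_v2 WHERE removed = 1")
        columns = [desc[0] for desc in cursor.description]
        while True:
            rows = cursor.fetchmany(batch_size)
            if not rows:
                return
            for row in rows:
                yield dict(zip(columns, row))

for row in iter_rows():
    print(row)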
datahub/ingestion/source/dbt/dbt_cloud.py CHANGED

@@ -10,14 +10,12 @@ from pydantic import Field, root_validator

 from datahub.ingestion.api.decorators import (
     SupportStatus,
-    capability,
     config_class,
     platform_name,
     support_status,
 )
 from datahub.ingestion.api.source import (
     CapabilityReport,
-    SourceCapability,
     TestableSource,
     TestConnectionReport,
 )
@@ -262,16 +260,14 @@ query DatahubMetadataQuery_{type}($jobId: BigInt!, $runId: BigInt) {{

 @platform_name("dbt")
 @config_class(DBTCloudConfig)
-@support_status(SupportStatus.
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
-@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
+@support_status(SupportStatus.CERTIFIED)
 class DBTCloudSource(DBTSourceBase, TestableSource):
     config: DBTCloudConfig

     @classmethod
     def create(cls, config_dict, ctx):
         config = DBTCloudConfig.parse_obj(config_dict)
-        return cls(config, ctx
+        return cls(config, ctx)

     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:
datahub/ingestion/source/dbt/dbt_common.py CHANGED

@@ -125,6 +125,7 @@ _DEFAULT_ACTOR = mce_builder.make_user_urn("unknown")
 @dataclass
 class DBTSourceReport(StaleEntityRemovalSourceReport):
     sql_parser_skipped_missing_code: LossyList[str] = field(default_factory=LossyList)
+    sql_parser_skipped_non_sql_model: LossyList[str] = field(default_factory=LossyList)
     sql_parser_parse_failures: int = 0
     sql_parser_detach_ctes_failures: int = 0
     sql_parser_table_errors: int = 0
@@ -829,11 +830,13 @@ def get_column_type(
     "Enabled by default, configure using `include_column_lineage`",
 )
 class DBTSourceBase(StatefulIngestionSourceBase):
-    def __init__(self, config: DBTCommonConfig, ctx: PipelineContext
+    def __init__(self, config: DBTCommonConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
+        self.platform: str = "dbt"
+
         self.config = config
-        self.platform: str = platform
         self.report: DBTSourceReport = DBTSourceReport()
+
         self.compiled_owner_extraction_pattern: Optional[Any] = None
         if self.config.owner_extraction_pattern:
             self.compiled_owner_extraction_pattern = re.compile(
@@ -1177,6 +1180,11 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             logger.debug(
                 f"Not generating CLL for {node.dbt_name} because we don't need it."
             )
+        elif node.language != "sql":
+            logger.debug(
+                f"Not generating CLL for {node.dbt_name} because it is not a SQL model."
+            )
+            self.report.sql_parser_skipped_non_sql_model.append(node.dbt_name)
         elif node.compiled_code:
             # Add CTE stops based on the upstreams list.
             cte_mapping = {
datahub/ingestion/source/dbt/dbt_core.py CHANGED

@@ -1,3 +1,4 @@
+import dataclasses
 import json
 import logging
 import re
@@ -12,16 +13,15 @@ from pydantic import BaseModel, Field, validator

 from datahub.configuration.git import GitReference
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
+from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
-    capability,
     config_class,
     platform_name,
     support_status,
 )
 from datahub.ingestion.api.source import (
     CapabilityReport,
-    SourceCapability,
     TestableSource,
     TestConnectionReport,
 )
@@ -40,19 +40,28 @@ from datahub.ingestion.source.dbt.dbt_tests import DBTTest, DBTTestResult

 logger = logging.getLogger(__name__)


+@dataclasses.dataclass
+class DBTCoreReport(DBTSourceReport):
+    catalog_info: Optional[dict] = None
+    manifest_info: Optional[dict] = None
+
+
 class DBTCoreConfig(DBTCommonConfig):
     manifest_path: str = Field(
-        description="Path to dbt manifest JSON. See https://docs.getdbt.com/reference/artifacts/manifest-json
-        "
+        description="Path to dbt manifest JSON. See https://docs.getdbt.com/reference/artifacts/manifest-json. "
+        "This can be a local file or a URI."
     )
-    catalog_path: str = Field(
-        "
+    catalog_path: Optional[str] = Field(
+        None,
+        description="Path to dbt catalog JSON. See https://docs.getdbt.com/reference/artifacts/catalog-json. "
+        "This file is optional, but highly recommended. Without it, some metadata like column info will be incomplete or missing. "
+        "This can be a local file or a URI.",
     )
     sources_path: Optional[str] = Field(
         default=None,
-        description="Path to dbt sources JSON. See https://docs.getdbt.com/reference/artifacts/sources-json.
-        "specified, last-modified fields will not be populated.
+        description="Path to dbt sources JSON. See https://docs.getdbt.com/reference/artifacts/sources-json. "
+        "If not specified, last-modified fields will not be populated. "
+        "This can be a local file or a URI.",
     )
     run_results_paths: List[str] = Field(
         default=[],
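Making catalog_path an Optional[str] with a None default is the standard pydantic pattern for a formerly required field: omitting the key now parses cleanly, and every consumer must branch on None, which is exactly what the later hunks in this file do. A minimal standalone model showing the behavior, using the pydantic v1 API this codebase targets (DemoConfig is illustrative, not the real DBTCoreConfig):

from typing import Optional
from pydantic import BaseModel, Field

class DemoConfig(BaseModel):
    manifest_path: str = Field(description="Required; parsing fails without it.")
    catalog_path: Optional[str] = Field(
        None, description="Optional; defaults to None when omitted."
    )

cfg = DemoConfig.parse_obj({"manifest_path": "target/manifest.json"})
assert cfg.catalog_path is None  # callers must branch on this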
@@ -161,7 +170,7 @@ def get_columns(

 def extract_dbt_entities(
     all_manifest_entities: Dict[str, Dict[str, Any]],
-    all_catalog_entities: Dict[str, Dict[str, Any]],
+    all_catalog_entities: Optional[Dict[str, Dict[str, Any]]],
     sources_results: List[Dict[str, Any]],
     manifest_adapter: str,
     use_identifiers: bool,
@@ -186,15 +195,6 @@ def extract_dbt_entities(
         ):
             name = manifest_node["alias"]

-        # initialize comment to "" for consistency with descriptions
-        # (since dbt null/undefined descriptions as "")
-        comment = ""
-
-        if key in all_catalog_entities and all_catalog_entities[key]["metadata"].get(
-            "comment"
-        ):
-            comment = all_catalog_entities[key]["metadata"]["comment"]
-
         materialization = None
         if "materialized" in manifest_node.get("config", {}):
             # It's a model
@@ -204,8 +204,9 @@ def extract_dbt_entities(
         if "depends_on" in manifest_node and "nodes" in manifest_node["depends_on"]:
             upstream_nodes = manifest_node["depends_on"]["nodes"]

+        catalog_node = (
+            all_catalog_entities.get(key) if all_catalog_entities is not None else None
+        )
         missing_from_catalog = catalog_node is None
         catalog_type = None
@@ -214,16 +215,23 @@ def extract_dbt_entities(
             # Test and ephemeral nodes will never show up in the catalog.
             missing_from_catalog = False
         else:
-            if not only_include_if_in_catalog:
+            if all_catalog_entities is not None and not only_include_if_in_catalog:
+                # If the catalog file is missing, we have already generated a general message.
                 report.warning(
                     title="Node missing from catalog",
                     message="Found a node in the manifest file but not in the catalog. "
                     "This usually means the catalog file was not generated by `dbt docs generate` and so is incomplete. "
-                    "Some metadata,
+                    "Some metadata, particularly schema information, will be impacted.",
                     context=key,
                 )
             else:
-                catalog_type =
+                catalog_type = catalog_node["metadata"]["type"]
+
+        # initialize comment to "" for consistency with descriptions
+        # (since dbt null/undefined descriptions as "")
+        comment = ""
+        if catalog_node is not None and catalog_node.get("metadata", {}).get("comment"):
+            comment = catalog_node["metadata"]["comment"]

         query_tag_props = manifest_node.get("query_tag", {})
@@ -231,12 +239,15 @@ def extract_dbt_entities(

         owner = meta.get("owner")
         if owner is None:
-            owner = manifest_node.get("config", {}).get("meta"
+            owner = (manifest_node.get("config", {}).get("meta") or {}).get("owner")
+
+        if not meta:
+            # On older versions of dbt, the meta field was nested under config
+            # for some node types.
+            meta = manifest_node.get("config", {}).get("meta") or {}

         tags = manifest_node.get("tags", [])
         tags = [tag_prefix + tag for tag in tags]
-        if not meta:
-            meta = manifest_node.get("config", {}).get("meta", {})

         max_loaded_at_str = sources_by_id.get(key, {}).get("max_loaded_at")
         max_loaded_at = None
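The new owner lookup wraps the nested get in `(... or {})` instead of relying on `.get("meta", {})`. The difference matters when the manifest contains `"meta": null`: a dict default only applies when the key is absent, while `or {}` also normalizes an explicit null. A quick illustration:

node = {"config": {"meta": None}}  # key present, value null

# the default only applies when the key is missing
broken = node["config"].get("meta", {})
assert broken is None  # a chained .get("owner") here would raise AttributeError

# `or {}` normalizes both a missing key and a stored None to an empty dict
safe = (node["config"].get("meta") or {}).get("owner")
assert safe is None  # no exception, owner is simply absent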
@@ -453,15 +464,18 @@ def load_run_results(
 @platform_name("dbt")
 @config_class(DBTCoreConfig)
 @support_status(SupportStatus.CERTIFIED)
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
-@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 class DBTCoreSource(DBTSourceBase, TestableSource):
     config: DBTCoreConfig
+    report: DBTCoreReport
+
+    def __init__(self, config: DBTCommonConfig, ctx: PipelineContext):
+        super().__init__(config, ctx)
+        self.report = DBTCoreReport()

     @classmethod
     def create(cls, config_dict, ctx):
         config = DBTCoreConfig.parse_obj(config_dict)
-        return cls(config, ctx
+        return cls(config, ctx)

     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:
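DBTCoreSource now narrows its report: the class-level `report: DBTCoreReport` annotation tells type checkers about the extra fields, and `__init__` swaps in the richer instance after calling super(). A minimal sketch of that pattern with illustrative BaseReport/CoreReport classes (not the real report types):

import dataclasses
from typing import Optional

@dataclasses.dataclass
class BaseReport:
    # stand-in for DBTSourceReport
    warnings: int = 0

@dataclasses.dataclass
class CoreReport(BaseReport):
    # extra diagnostics that only the dbt-core flavor records
    manifest_info: Optional[dict] = None

class BaseSource:
    report: BaseReport

    def __init__(self) -> None:
        self.report = BaseReport()

class CoreSource(BaseSource):
    # the narrowed annotation lets type checkers accept CoreReport-only fields
    report: CoreReport

    def __init__(self) -> None:
        super().__init__()
        self.report = CoreReport()  # override with the richer report

source = CoreSource()
source.report.manifest_info = {"dbt_version": "unknown"}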
@@ -471,9 +485,10 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
             DBTCoreSource.load_file_as_json(
                 source_config.manifest_path, source_config.aws_connection
             )
+            if source_config.catalog_path is not None:
+                DBTCoreSource.load_file_as_json(
+                    source_config.catalog_path, source_config.aws_connection
+                )
             test_report.basic_connectivity = CapabilityReport(capable=True)
         except Exception as e:
             test_report.basic_connectivity = CapabilityReport(
@@ -511,11 +526,31 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
         dbt_manifest_json = self.load_file_as_json(
             self.config.manifest_path, self.config.aws_connection
         )
+        dbt_manifest_metadata = dbt_manifest_json["metadata"]
+        self.report.manifest_info = dict(
+            generated_at=dbt_manifest_metadata.get("generated_at", "unknown"),
+            dbt_version=dbt_manifest_metadata.get("dbt_version", "unknown"),
+            project_name=dbt_manifest_metadata.get("project_name", "unknown"),
         )

+        dbt_catalog_json = None
+        dbt_catalog_metadata = None
+        if self.config.catalog_path is not None:
+            dbt_catalog_json = self.load_file_as_json(
+                self.config.catalog_path, self.config.aws_connection
+            )
+            dbt_catalog_metadata = dbt_catalog_json.get("metadata", {})
+            self.report.catalog_info = dict(
+                generated_at=dbt_catalog_metadata.get("generated_at", "unknown"),
+                dbt_version=dbt_catalog_metadata.get("dbt_version", "unknown"),
+                project_name=dbt_catalog_metadata.get("project_name", "unknown"),
+            )
+        else:
+            self.report.warning(
+                title="No catalog file configured",
+                message="Some metadata, particularly schema information, will be missing.",
+            )
+
         if self.config.sources_path is not None:
             dbt_sources_json = self.load_file_as_json(
                 self.config.sources_path, self.config.aws_connection
@@ -528,18 +563,23 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
         manifest_version = dbt_manifest_json["metadata"].get("dbt_version")
         manifest_adapter = dbt_manifest_json["metadata"].get("adapter_type")

-        catalog_schema =
-        catalog_version =
+        catalog_schema = None
+        catalog_version = None
+        if dbt_catalog_metadata is not None:
+            catalog_schema = dbt_catalog_metadata.get("dbt_schema_version")
+            catalog_version = dbt_catalog_metadata.get("dbt_version")

         manifest_nodes = dbt_manifest_json["nodes"]
         manifest_sources = dbt_manifest_json["sources"]

         all_manifest_entities = {**manifest_nodes, **manifest_sources}

+        all_catalog_entities = None
+        if dbt_catalog_json is not None:
+            catalog_nodes = dbt_catalog_json["nodes"]
+            catalog_sources = dbt_catalog_json["sources"]

+            all_catalog_entities = {**catalog_nodes, **catalog_sources}

         nodes = extract_dbt_entities(
             all_manifest_entities=all_manifest_entities,
@@ -590,7 +630,7 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
             )
         except Exception as e:
             self.report.info(
-                title="
+                title="dbt Catalog Version",
                 message="Failed to determine the catalog version",
                 exc=e,
             )
datahub/ingestion/source/feast.py CHANGED

@@ -135,10 +135,10 @@ class FeastRepositorySource(StatefulIngestionSourceBase):
     """
     This plugin extracts:

-    - Entities as [`MLPrimaryKey`](https://
-    - Fields as [`MLFeature`](https://
-    - Feature views and on-demand feature views as [`MLFeatureTable`](https://
-    - Batch and stream source details as [`Dataset`](https://
+    - Entities as [`MLPrimaryKey`](https://docs.datahub.com/docs/graphql/objects#mlprimarykey)
+    - Fields as [`MLFeature`](https://docs.datahub.com/docs/graphql/objects#mlfeature)
+    - Feature views and on-demand feature views as [`MLFeatureTable`](https://docs.datahub.com/docs/graphql/objects#mlfeaturetable)
+    - Batch and stream source details as [`Dataset`](https://docs.datahub.com/docs/graphql/objects#dataset)
     - Column types associated with each entity and feature
     """

datahub/ingestion/source/ge_data_profiler.py CHANGED

@@ -51,6 +51,7 @@ from typing_extensions import Concatenate, ParamSpec
 from datahub.emitter import mce_builder
 from datahub.emitter.mce_builder import get_sys_time
 from datahub.ingestion.graph.client import get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
 from datahub.ingestion.source.profiling.common import (
     Cardinality,
@@ -1569,7 +1570,7 @@ def _get_columns_to_ignore_sampling(
         name=dataset_name, platform=platform, env=env
     )

-    datahub_graph = get_default_graph()
+    datahub_graph = get_default_graph(ClientMode.INGESTION)

     dataset_tags = datahub_graph.get_tags(dataset_urn)
     if dataset_tags:
datahub/ingestion/source/iceberg/iceberg_common.py CHANGED

@@ -40,11 +40,11 @@ class TimeoutHTTPAdapter(HTTPAdapter):
             del kwargs["timeout"]
         super().__init__(*args, **kwargs)

-    def send(self, request, **kwargs):
+    def send(self, request, *args, **kwargs):
         timeout = kwargs.get("timeout")
         if timeout is None and hasattr(self, "timeout"):
             kwargs["timeout"] = self.timeout
-        return super().send(request, **kwargs)
+        return super().send(request, *args, **kwargs)


 class IcebergProfilingConfig(ConfigModel):
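Accepting *args in send keeps the override signature-compatible with requests, whose callers may pass the adapter's other parameters (stream, timeout, verify, ...) positionally; previously any positional argument raised a TypeError. A usage sketch for an adapter of this shape (the class body mirrors the hunk above; mounting it on a session is the assumed usage):

import requests
from requests.adapters import HTTPAdapter

class TimeoutHTTPAdapter(HTTPAdapter):
    def __init__(self, *args, **kwargs):
        if "timeout" in kwargs:
            self.timeout = kwargs["timeout"]
            del kwargs["timeout"]
        super().__init__(*args, **kwargs)

    # *args forwards any positional arguments requests passes through
    def send(self, request, *args, **kwargs):
        if kwargs.get("timeout") is None and hasattr(self, "timeout"):
            kwargs["timeout"] = self.timeout
        return super().send(request, *args, **kwargs)

session = requests.Session()
# every request on this session inherits the 30-second default timeout
session.mount("https://", TimeoutHTTPAdapter(timeout=30))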
datahub/ingestion/source/ldap.py CHANGED
datahub/ingestion/source/looker/looker_lib_wrapper.py CHANGED

@@ -113,7 +113,7 @@ class LookerAPI:
             )
         except SDKError as e:
             raise ConfigurationError(
+                "Failed to connect/authenticate with looker - check your configuration"
             ) from e

         self.client_stats = LookerAPIStats()
datahub/ingestion/source/looker/lookml_source.py CHANGED

@@ -497,7 +497,13 @@ class LookMLSource(StatefulIngestionSourceBase):
                     f"Failed to find a project name for model {model_name}"
                 )
             return model.project_name
-        except SDKError:
+        except SDKError as e:
+            self.reporter.failure(
+                title="Failed to find a project name for model",
+                message="Consider configuring a static project name in your config file",
+                context=str(dict(model_name=model_name)),
+                exc=e,
+            )
             raise ValueError(
                 f"Could not locate a project name for model {model_name}. Consider configuring a static project name "
                 f"in your config file"
datahub/ingestion/source/metadata/lineage.py CHANGED

@@ -36,6 +36,7 @@ from datahub.ingestion.api.source_helpers import (
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.graph.client import get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     FineGrainedLineageDownstreamType,
     FineGrainedLineageUpstreamType,
@@ -210,7 +211,7 @@ def _get_lineage_mcp(

     # extract the old lineage and save it for the new mcp
     if preserve_upstream:
-        client = get_default_graph()
+        client = get_default_graph(ClientMode.INGESTION)

         old_upstream_lineage = get_aspects_for_entity(
             client._session,
|