acryl-datahub 1.0.0.2rc4__py3-none-any.whl → 1.0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/METADATA +2566 -2514
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/RECORD +159 -149
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +3 -3
- datahub/api/entities/datajob/datajob.py +7 -4
- datahub/api/entities/dataset/dataset.py +9 -11
- datahub/api/entities/forms/forms.py +34 -34
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/operation.py +4 -4
- datahub/cli/check_cli.py +3 -2
- datahub/cli/config_utils.py +2 -2
- datahub/cli/delete_cli.py +6 -5
- datahub/cli/docker_cli.py +2 -2
- datahub/cli/exists_cli.py +2 -1
- datahub/cli/get_cli.py +2 -1
- datahub/cli/iceberg_cli.py +6 -5
- datahub/cli/ingest_cli.py +9 -6
- datahub/cli/migrate.py +4 -3
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +3 -2
- datahub/cli/specific/assertions_cli.py +2 -1
- datahub/cli/specific/datacontract_cli.py +3 -2
- datahub/cli/specific/dataproduct_cli.py +10 -9
- datahub/cli/specific/dataset_cli.py +4 -3
- datahub/cli/specific/forms_cli.py +2 -1
- datahub/cli/specific/group_cli.py +2 -1
- datahub/cli/specific/structuredproperties_cli.py +4 -3
- datahub/cli/specific/user_cli.py +2 -1
- datahub/cli/state_cli.py +2 -1
- datahub/cli/timeline_cli.py +2 -1
- datahub/configuration/common.py +5 -0
- datahub/configuration/source_common.py +1 -1
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/request_helper.py +116 -3
- datahub/emitter/rest_emitter.py +163 -93
- datahub/entrypoints.py +2 -1
- datahub/errors.py +4 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +2 -1
- datahub/ingestion/api/source.py +2 -5
- datahub/ingestion/api/source_helpers.py +1 -0
- datahub/ingestion/glossary/classification_mixin.py +4 -2
- datahub/ingestion/graph/client.py +33 -8
- datahub/ingestion/graph/config.py +14 -0
- datahub/ingestion/graph/filters.py +1 -1
- datahub/ingestion/graph/links.py +53 -0
- datahub/ingestion/run/pipeline.py +9 -6
- datahub/ingestion/run/pipeline_config.py +1 -1
- datahub/ingestion/sink/datahub_rest.py +5 -6
- datahub/ingestion/source/apply/datahub_apply.py +2 -1
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery.py +24 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +4 -62
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +70 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +25 -24
- datahub/ingestion/source/common/subtypes.py +3 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +12 -11
- datahub/ingestion/source/dbt/dbt_cloud.py +2 -6
- datahub/ingestion/source/dbt/dbt_common.py +10 -2
- datahub/ingestion/source/dbt/dbt_core.py +82 -42
- datahub/ingestion/source/dynamodb/dynamodb.py +7 -4
- datahub/ingestion/source/feast.py +4 -4
- datahub/ingestion/source/fivetran/config.py +1 -1
- datahub/ingestion/source/fivetran/fivetran_log_api.py +7 -3
- datahub/ingestion/source/fivetran/fivetran_query.py +16 -16
- datahub/ingestion/source/ge_data_profiler.py +27 -1
- datahub/ingestion/source/hex/api.py +1 -20
- datahub/ingestion/source/hex/query_fetcher.py +4 -1
- datahub/ingestion/source/iceberg/iceberg.py +20 -4
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +17 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +1 -1
- datahub/ingestion/source/looker/looker_source.py +34 -5
- datahub/ingestion/source/looker/lookml_source.py +7 -1
- datahub/ingestion/source/metadata/lineage.py +2 -1
- datahub/ingestion/source/mlflow.py +19 -6
- datahub/ingestion/source/mode.py +74 -28
- datahub/ingestion/source/neo4j/neo4j_source.py +85 -55
- datahub/ingestion/source/powerbi/config.py +13 -1
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +153 -0
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/redshift/usage.py +10 -9
- datahub/ingestion/source/sigma/config.py +74 -6
- datahub/ingestion/source/sigma/sigma.py +16 -1
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +4 -52
- datahub/ingestion/source/snowflake/snowflake_config.py +2 -12
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -18
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +18 -4
- datahub/ingestion/source/snowflake/snowflake_query.py +9 -63
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/sql/athena.py +2 -1
- datahub/ingestion/source/sql/clickhouse.py +5 -1
- datahub/ingestion/source/sql/druid.py +7 -2
- datahub/ingestion/source/sql/hive.py +7 -2
- datahub/ingestion/source/sql/hive_metastore.py +5 -5
- datahub/ingestion/source/sql/mssql/source.py +1 -1
- datahub/ingestion/source/sql/oracle.py +6 -2
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sqlalchemy_uri.py +36 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +1 -1
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/tableau/tableau.py +31 -6
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +2 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +7 -3
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -3
- datahub/ingestion/source/vertexai/vertexai.py +316 -4
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +23 -2
- datahub/integrations/assertion/common.py +3 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +538 -493
- datahub/metadata/_urns/urn_defs.py +1819 -1763
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/schema.avsc +17296 -16883
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
- datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
- datahub/metadata/schemas/QueryProperties.avsc +4 -2
- datahub/metadata/schemas/SystemMetadata.avsc +86 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/_all_entities.py +4 -0
- datahub/sdk/_shared.py +142 -4
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/dataset.py +2 -2
- datahub/sdk/entity_client.py +8 -0
- datahub/sdk/lineage_client.py +235 -0
- datahub/sdk/main_client.py +6 -3
- datahub/sdk/mlmodel.py +301 -0
- datahub/sdk/mlmodelgroup.py +233 -0
- datahub/secret/datahub_secret_store.py +2 -1
- datahub/specific/dataset.py +12 -0
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +48 -34
- datahub/sql_parsing/sqlglot_utils.py +18 -14
- datahub/telemetry/telemetry.py +2 -2
- datahub/testing/check_imports.py +1 -1
- datahub/testing/mcp_diff.py +15 -2
- datahub/upgrade/upgrade.py +10 -12
- datahub/utilities/logging_manager.py +8 -1
- datahub/utilities/server_config_util.py +350 -10
- datahub/utilities/sqlalchemy_query_combiner.py +4 -5
- datahub/utilities/urn_encoder.py +1 -1
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/top_level.txt +0 -0
datahub/ingestion/graph/links.py (new file)
@@ -0,0 +1,53 @@
+from typing import Optional
+
+import datahub.metadata.urns as urns
+from datahub.utilities.urns.urn import guess_entity_type
+
+_url_prefixes = {
+    # Atypical mappings.
+    urns.DataJobUrn.ENTITY_TYPE: "tasks",
+    urns.DataFlowUrn.ENTITY_TYPE: "pipelines",
+    urns.CorpUserUrn.ENTITY_TYPE: "user",
+    urns.CorpGroupUrn.ENTITY_TYPE: "group",
+    # Normal mappings - matches the entity type.
+    urns.ChartUrn.ENTITY_TYPE: "chart",
+    urns.ContainerUrn.ENTITY_TYPE: "container",
+    urns.DataProductUrn.ENTITY_TYPE: "dataProduct",
+    urns.DatasetUrn.ENTITY_TYPE: "dataset",
+    urns.DashboardUrn.ENTITY_TYPE: "dashboard",
+    urns.DomainUrn.ENTITY_TYPE: "domain",
+    urns.GlossaryNodeUrn.ENTITY_TYPE: "glossaryNode",
+    urns.GlossaryTermUrn.ENTITY_TYPE: "glossaryTerm",
+    urns.TagUrn.ENTITY_TYPE: "tag",
+}
+
+
+def make_url_for_urn(
+    frontend_base_url: str,
+    entity_urn: str,
+    *,
+    tab: Optional[str] = None,
+) -> str:
+    """Build the public-facing URL for an entity urn.
+
+    Args:
+        frontend_url: The public-facing base url of the frontend.
+        entity_urn: The urn of the entity to get the url for.
+        tab: The tab to deep link into. If not provided, the default tab for the entity will be shown.
+
+    Returns:
+        The public-facing url for the entity.
+
+    Examples:
+        >>> make_url_for_urn("https://demo.datahub.com", "urn:li:container:b41c14bc5cb3ccfbb0433c8cbdef2992", tab="Contents")
+        'https://demo.datahub.com/container/urn:li:container:b41c14bc5cb3ccfbb0433c8cbdef2992/Contents'
+        >>> make_url_for_urn("https://demo.datahub.com", "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.adoption.actuating,PROD)")
+        'https://demo.datahub.com/dataset/urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.adoption.actuating,PROD)/'
+    """
+    entity_type = guess_entity_type(entity_urn)
+
+    url_prefix = _url_prefixes.get(entity_type, entity_type)
+    url = f"{frontend_base_url}/{url_prefix}/{entity_urn}/"
+    if tab:
+        url += f"{tab}"
+    return url
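
A quick usage sketch (not part of the diff), using the values from the docstring examples above:

from datahub.ingestion.graph.links import make_url_for_urn

# Deep-link to the "Contents" tab of a container entity.
url = make_url_for_urn(
    "https://demo.datahub.com",
    "urn:li:container:b41c14bc5cb3ccfbb0433c8cbdef2992",
    tab="Contents",
)
assert url == (
    "https://demo.datahub.com/container/"
    "urn:li:container:b41c14bc5cb3ccfbb0433c8cbdef2992/Contents"
)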
datahub/ingestion/run/pipeline.py
@@ -31,6 +31,7 @@ from datahub.ingestion.api.source import Extractor, Source
 from datahub.ingestion.api.transform import Transformer
 from datahub.ingestion.extractor.extractor_registry import extractor_registry
 from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.ingestion.reporting.reporting_provider_registry import (
     reporting_provider_registry,
 )
@@ -136,9 +137,8 @@ class CliReport(Report):
 
 
 def _make_default_rest_sink(ctx: PipelineContext) -> DatahubRestSink:
-    graph = get_default_graph()
+    graph = get_default_graph(ClientMode.INGESTION)
     sink_config = graph._make_rest_sink_config()
-
     return DatahubRestSink(ctx, sink_config)
@@ -175,6 +175,7 @@ class Pipeline:
         self.graph: Optional[DataHubGraph] = None
         with _add_init_error_context("connect to DataHub"):
             if self.config.datahub_api:
+                self.config.datahub_api.client_mode = ClientMode.INGESTION
                 self.graph = exit_stack.enter_context(
                     DataHubGraph(self.config.datahub_api)
                 )
@@ -555,18 +556,20 @@ class Pipeline:
     def raise_from_status(self, raise_warnings: bool = False) -> None:
         if self.source.get_report().failures:
             raise PipelineExecutionError(
-                "Source reported errors", self.source.get_report()
+                "Source reported errors", self.source.get_report().failures
             )
         if self.sink.get_report().failures:
-            raise PipelineExecutionError("Sink reported errors", self.sink.get_report())
+            raise PipelineExecutionError(
+                "Sink reported errors", self.sink.get_report().failures
+            )
         if raise_warnings:
             if self.source.get_report().warnings:
                 raise PipelineExecutionError(
-                    "Source reported warnings", self.source.get_report()
+                    "Source reported warnings", self.source.get_report().warnings
                 )
             if self.sink.get_report().warnings:
                 raise PipelineExecutionError(
-                    "Sink reported warnings", self.sink.get_report()
+                    "Sink reported warnings", self.sink.get_report().warnings
                 )
 
     def log_ingestion_stats(self) -> None:
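
Illustration (not from the diff): after this change, raise_from_status attaches only the failure/warning lists to the exception rather than the whole report object. A minimal sketch, assuming a recipe built on the bundled demo-data source and console sink:

from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {"type": "demo-data", "config": {}},
        "sink": {"type": "console"},
    }
)
pipeline.run()
try:
    pipeline.raise_from_status(raise_warnings=True)
except Exception as e:
    # e.args now carries the failures/warnings list, not the full report.
    print(e.args)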
datahub/ingestion/run/pipeline_config.py
@@ -7,7 +7,7 @@ from typing import Any, Dict, List, Optional
 from pydantic import Field, validator
 
 from datahub.configuration.common import ConfigModel, DynamicTypedConfig
-from datahub.ingestion.graph.client import DatahubClientConfig
+from datahub.ingestion.graph.config import DatahubClientConfig
 from datahub.ingestion.sink.file import FileSinkConfig
 
 logger = logging.getLogger(__name__)
datahub/ingestion/sink/datahub_rest.py
@@ -34,7 +34,7 @@ from datahub.ingestion.api.sink import (
     WriteCallback,
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.graph.client import DatahubClientConfig
+from datahub.ingestion.graph.config import ClientMode, DatahubClientConfig
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
     MetadataChangeEvent,
     MetadataChangeProposal,
@@ -140,11 +140,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
                 f"💥 Failed to connect to DataHub with {repr(self.emitter)}"
             ) from exc
 
-        self.report.gms_version = (
-            gms_config.get("versions", {})
-            .get("acryldata/datahub", {})
-            .get("version", None)
-        )
+        self.report.gms_version = gms_config.service_version
         self.report.mode = self.config.mode
         self.report.max_threads = self.config.max_threads
         logger.debug("Setting env variables to override config")
@@ -180,6 +176,8 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
             disable_ssl_verification=config.disable_ssl_verification,
             openapi_ingestion=config.endpoint == RestSinkEndpoint.OPENAPI,
             default_trace_mode=config.default_trace_mode == RestTraceMode.ENABLED,
+            client_mode=config.client_mode,
+            datahub_component=config.datahub_component,
         )
 
     @property
@@ -190,6 +188,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
         # https://github.com/psf/requests/issues/1871#issuecomment-32751346
         thread_local = self._emitter_thread_local
         if not hasattr(thread_local, "emitter"):
+            self.config.client_mode = ClientMode.INGESTION
             thread_local.emitter = DatahubRestSink._make_emitter(self.config)
         return thread_local.emitter
 
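
A stand-alone sketch of the thread-local pattern used for the emitter above (one REST emitter per worker thread, per the linked requests issue):

import threading

_local = threading.local()

def _make_emitter() -> object:
    return object()  # stand-in for the sink's DataHub REST emitter

def get_emitter() -> object:
    # Each thread lazily builds and caches its own emitter instance.
    if not hasattr(_local, "emitter"):
        _local.emitter = _make_emitter()
    return _local.emitter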
datahub/ingestion/source/apply/datahub_apply.py
@@ -18,6 +18,7 @@ from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source, SourceReport
 from datahub.ingestion.api.source_helpers import auto_workunit_reporter
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.metadata.schema_classes import (
     DomainsClass,
     GlossaryTermAssociationClass,
@@ -48,7 +49,7 @@ def apply_association_to_container(
     """
     urns: List[str] = [container_urn]
    if not graph:
-        graph = get_default_graph()
+        graph = get_default_graph(ClientMode.INGESTION)
     logger.info(f"Using {graph}")
     urns.extend(
         graph.get_urns_by_filter(
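
For downstream scripts (not part of the diff): get_default_graph now takes a ClientMode, which appears to tag requests by caller type. A hedged sketch:

from datahub.ingestion.graph.client import get_default_graph
from datahub.ingestion.graph.config import ClientMode

# Connection details still come from ~/.datahubenv or the
# DATAHUB_GMS_URL / DATAHUB_GMS_TOKEN environment variables.
graph = get_default_graph(ClientMode.INGESTION)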
datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py
@@ -205,7 +205,7 @@ class FeatureGroupProcessor:
             textwrap.dedent(
                 f"""Note: table {full_table_name} is an AWS Glue object. This source does not ingest all metadata for Glue tables.
                 To view full table metadata, run Glue ingestion
-                (see https://datahubproject.io/docs/generated/ingestion/sources/glue)"""
+                (see https://docs.datahub.com/docs/generated/ingestion/sources/glue)"""
             )
         )
 
datahub/ingestion/source/bigquery_v2/bigquery.py
@@ -270,29 +270,30 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
         ):
             return
 
-        with self.report.new_stage(
-        [… 22 further removed lines were not captured by this diff view …]
+        with self.report.new_stage(
+            f"*: {QUERIES_EXTRACTION}"
+        ), BigQueryQueriesExtractor(
+            connection=self.config.get_bigquery_client(),
+            schema_api=self.bq_schema_extractor.schema_api,
+            config=BigQueryQueriesExtractorConfig(
+                window=self.config,
+                user_email_pattern=self.config.usage.user_email_pattern,
+                include_lineage=self.config.include_table_lineage,
+                include_usage_statistics=self.config.include_usage_statistics,
+                include_operations=self.config.usage.include_operational_stats,
+                include_queries=self.config.include_queries,
+                include_query_usage_statistics=self.config.include_query_usage_statistics,
+                top_n_queries=self.config.usage.top_n_queries,
+                region_qualifiers=self.config.region_qualifiers,
+            ),
+            structured_report=self.report,
+            filters=self.filters,
+            identifiers=self.identifiers,
+            schema_resolver=self.sql_parser_schema_resolver,
+            discovered_tables=self.bq_schema_extractor.table_refs,
+        ) as queries_extractor:
+            self.report.queries_extractor = queries_extractor.report
+            yield from queries_extractor.get_workunits_internal()
         else:
             if self.config.include_usage_statistics:
                 yield from self.usage_extractor.get_usage_workunits(
datahub/ingestion/source/bigquery_v2/bigquery_config.py
@@ -2,10 +2,8 @@ import logging
 import os
 import re
 from datetime import timedelta
-from typing import Any, Dict, List, Optional, Union
+from typing import Dict, List, Optional, Union
 
-from google.cloud import bigquery, datacatalog_v1, resourcemanager_v3
-from google.cloud.logging_v2.client import Client as GCPLoggingClient
 from pydantic import Field, PositiveInt, PrivateAttr, root_validator, validator
 
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
@@ -18,7 +16,9 @@ from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.ingestion.glossary.classification_mixin import (
     ClassificationSourceConfigMixin,
 )
-from datahub.ingestion.source.common.gcp_credentials_config import GCPCredential
+from datahub.ingestion.source.bigquery_v2.bigquery_connection import (
+    BigQueryConnectionConfig,
+)
 from datahub.ingestion.source.data_lake_common.path_spec import PathSpec
 from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, SQLFilterConfig
 from datahub.ingestion.source.state.stateful_ingestion_base import (
@@ -105,64 +105,6 @@ class BigQueryUsageConfig(BaseUsageConfig):
     )
 
 
-class BigQueryConnectionConfig(ConfigModel):
-    credential: Optional[GCPCredential] = Field(
-        default=None, description="BigQuery credential informations"
-    )
-
-    _credentials_path: Optional[str] = PrivateAttr(None)
-
-    extra_client_options: Dict[str, Any] = Field(
-        default={},
-        description="Additional options to pass to google.cloud.logging_v2.client.Client.",
-    )
-
-    project_on_behalf: Optional[str] = Field(
-        default=None,
-        description="[Advanced] The BigQuery project in which queries are executed. Will be passed when creating a job. If not passed, falls back to the project associated with the service account.",
-    )
-
-    def __init__(self, **data: Any):
-        super().__init__(**data)
-
-        if self.credential:
-            self._credentials_path = self.credential.create_credential_temp_file()
-            logger.debug(
-                f"Creating temporary credential file at {self._credentials_path}"
-            )
-            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._credentials_path
-
-    def get_bigquery_client(self) -> bigquery.Client:
-        client_options = self.extra_client_options
-        return bigquery.Client(self.project_on_behalf, **client_options)
-
-    def get_projects_client(self) -> resourcemanager_v3.ProjectsClient:
-        return resourcemanager_v3.ProjectsClient()
-
-    def get_policy_tag_manager_client(self) -> datacatalog_v1.PolicyTagManagerClient:
-        return datacatalog_v1.PolicyTagManagerClient()
-
-    def make_gcp_logging_client(
-        self, project_id: Optional[str] = None
-    ) -> GCPLoggingClient:
-        # See https://github.com/googleapis/google-cloud-python/issues/2674 for
-        # why we disable gRPC here.
-        client_options = self.extra_client_options.copy()
-        client_options["_use_grpc"] = False
-        if project_id is not None:
-            return GCPLoggingClient(**client_options, project=project_id)
-        else:
-            return GCPLoggingClient(**client_options)
-
-    def get_sql_alchemy_url(self) -> str:
-        if self.project_on_behalf:
-            return f"bigquery://{self.project_on_behalf}"
-        # When project_id is not set, we will attempt to detect the project ID
-        # based on the credentials or environment variables.
-        # See https://github.com/mxmzdlv/pybigquery#authentication.
-        return "bigquery://"
-
-
 class GcsLineageProviderConfig(ConfigModel):
     """
     Any source that produces gcs lineage from/to Datasets should inherit this class.
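
Migration note for code that imported the moved class: only the module path changes.

# Before (1.0.0.2rc4):
#   from datahub.ingestion.source.bigquery_v2.bigquery_config import (
#       BigQueryConnectionConfig,
#   )
# After (1.0.0.3):
from datahub.ingestion.source.bigquery_v2.bigquery_connection import (
    BigQueryConnectionConfig,
)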
datahub/ingestion/source/bigquery_v2/bigquery_connection.py (new file)
@@ -0,0 +1,70 @@
+import logging
+import os
+from typing import Any, Dict, Optional
+
+from google.cloud import bigquery, datacatalog_v1, resourcemanager_v3
+from google.cloud.logging_v2.client import Client as GCPLoggingClient
+from pydantic import Field, PrivateAttr
+
+from datahub.configuration.common import ConfigModel
+from datahub.ingestion.source.common.gcp_credentials_config import GCPCredential
+
+logger = logging.getLogger(__name__)
+
+
+class BigQueryConnectionConfig(ConfigModel):
+    credential: Optional[GCPCredential] = Field(
+        default=None, description="BigQuery credential informations"
+    )
+
+    _credentials_path: Optional[str] = PrivateAttr(None)
+
+    extra_client_options: Dict[str, Any] = Field(
+        default={},
+        description="Additional options to pass to google.cloud.logging_v2.client.Client.",
+    )
+
+    project_on_behalf: Optional[str] = Field(
+        default=None,
+        description="[Advanced] The BigQuery project in which queries are executed. Will be passed when creating a job. If not passed, falls back to the project associated with the service account.",
+    )
+
+    def __init__(self, **data: Any):
+        super().__init__(**data)
+
+        if self.credential:
+            self._credentials_path = self.credential.create_credential_temp_file()
+            logger.debug(
+                f"Creating temporary credential file at {self._credentials_path}"
+            )
+            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._credentials_path
+
+    def get_bigquery_client(self) -> bigquery.Client:
+        client_options = self.extra_client_options
+        return bigquery.Client(self.project_on_behalf, **client_options)
+
+    def get_projects_client(self) -> resourcemanager_v3.ProjectsClient:
+        return resourcemanager_v3.ProjectsClient()
+
+    def get_policy_tag_manager_client(self) -> datacatalog_v1.PolicyTagManagerClient:
+        return datacatalog_v1.PolicyTagManagerClient()
+
+    def make_gcp_logging_client(
+        self, project_id: Optional[str] = None
+    ) -> GCPLoggingClient:
+        # See https://github.com/googleapis/google-cloud-python/issues/2674 for
+        # why we disable gRPC here.
+        client_options = self.extra_client_options.copy()
+        client_options["_use_grpc"] = False
+        if project_id is not None:
+            return GCPLoggingClient(**client_options, project=project_id)
+        else:
+            return GCPLoggingClient(**client_options)
+
+    def get_sql_alchemy_url(self) -> str:
+        if self.project_on_behalf:
+            return f"bigquery://{self.project_on_behalf}"
+        # When project_id is not set, we will attempt to detect the project ID
+        # based on the credentials or environment variables.
+        # See https://github.com/mxmzdlv/pybigquery#authentication.
+        return "bigquery://"
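
A minimal usage sketch of the relocated class; "my-billing-project" is a hypothetical project ID, and ambient gcloud credentials are assumed when no credential block is given.

from datahub.ingestion.source.bigquery_v2.bigquery_connection import (
    BigQueryConnectionConfig,
)

conn_config = BigQueryConnectionConfig(project_on_behalf="my-billing-project")
print(conn_config.get_sql_alchemy_url())  # bigquery://my-billing-project

client = conn_config.get_bigquery_client()  # google.cloud.bigquery.Client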
datahub/ingestion/source/bigquery_v2/bigquery_queries.py
@@ -10,10 +10,12 @@ from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.source import Source, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.bigquery_v2.bigquery_config import (
-    BigQueryConnectionConfig,
     BigQueryFilterConfig,
     BigQueryIdentifierConfig,
 )
+from datahub.ingestion.source.bigquery_v2.bigquery_connection import (
+    BigQueryConnectionConfig,
+)
 from datahub.ingestion.source.bigquery_v2.bigquery_report import (
     BigQueryQueriesExtractorReport,
     BigQuerySchemaApiPerfReport,
datahub/ingestion/source/cassandra/cassandra_profiling.py
@@ -70,30 +70,31 @@ class CassandraProfiler:
     ) -> Iterable[MetadataWorkUnit]:
         for keyspace_name in cassandra_data.keyspaces:
             tables = cassandra_data.tables.get(keyspace_name, [])
-            with self.report.new_stage(
-            [… 23 further removed lines were not captured by this diff view …]
+            with self.report.new_stage(
+                f"{keyspace_name}: {PROFILING}"
+            ), ThreadPoolExecutor(
+                max_workers=self.config.profiling.max_workers
+            ) as executor:
+                future_to_dataset = {
+                    executor.submit(
+                        self.generate_profile,
+                        keyspace_name,
+                        table_name,
+                        cassandra_data.columns.get(table_name, []),
+                    ): table_name
+                    for table_name in tables
+                }
+                for future in as_completed(future_to_dataset):
+                    table_name = future_to_dataset[future]
+                    try:
+                        yield from future.result()
+                    except Exception as exc:
+                        self.report.profiling_skipped_other[table_name] += 1
+                        self.report.failure(
+                            message="Failed to profile for table",
+                            context=f"{keyspace_name}.{table_name}",
+                            exc=exc,
+                        )
 
     def generate_profile(
         self,
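
The profiler above fans work out with the standard submit/as_completed idiom; a self-contained sketch of the same shape:

from concurrent.futures import ThreadPoolExecutor, as_completed

def profile_table(table: str) -> str:
    return f"profiled {table}"

with ThreadPoolExecutor(max_workers=4) as executor:
    # Map each future back to its table name so failures can be attributed.
    futures = {executor.submit(profile_table, t): t for t in ["a", "b", "c"]}
    for future in as_completed(futures):
        table = futures[future]
        try:
            print(future.result())
        except Exception as exc:
            # Mirrors the report.failure(...) handling above.
            print(f"profiling failed for {table}: {exc}")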
datahub/ingestion/source/datahub/datahub_database_reader.py
@@ -195,17 +195,18 @@ class DataHubDatabaseReader:
         Yields:
             Row objects containing URNs of soft-deleted entities
         """
-        with self.engine.connect() as conn
-        [… 10 further removed lines were not captured by this diff view …]
+        with self.engine.connect() as conn, contextlib.closing(
+            conn.connection.cursor()
+        ) as cursor:
+            logger.debug("Polling soft-deleted urns from database")
+            cursor.execute(self.soft_deleted_urns_query)
+            columns = [desc[0] for desc in cursor.description]
+            while True:
+                rows = cursor.fetchmany(self.config.database_query_batch_size)
+                if not rows:
+                    return
+                for row in rows:
+                    yield dict(zip(columns, row))
 
     def _parse_row(
         self, row: Dict[str, Any]
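
The new reader streams rows in batches via cursor.fetchmany instead of materializing the full result; the same shape, illustrated with sqlite3 standing in for the DataHub database:

import contextlib
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE urns (urn TEXT)")
conn.executemany("INSERT INTO urns VALUES (?)", [("urn:li:a",), ("urn:li:b",)])

with contextlib.closing(conn.cursor()) as cursor:
    cursor.execute("SELECT urn FROM urns")
    columns = [desc[0] for desc in cursor.description]
    while True:
        rows = cursor.fetchmany(1)  # batch size; the reader uses a config value
        if not rows:
            break
        for row in rows:
            print(dict(zip(columns, row)))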
datahub/ingestion/source/dbt/dbt_cloud.py
@@ -10,14 +10,12 @@ from pydantic import Field, root_validator
 
 from datahub.ingestion.api.decorators import (
     SupportStatus,
-    capability,
     config_class,
     platform_name,
     support_status,
 )
 from datahub.ingestion.api.source import (
     CapabilityReport,
-    SourceCapability,
     TestableSource,
     TestConnectionReport,
 )
@@ -262,16 +260,14 @@ query DatahubMetadataQuery_{type}($jobId: BigInt!, $runId: BigInt) {{
 
 @platform_name("dbt")
 @config_class(DBTCloudConfig)
-@support_status(SupportStatus.INCUBATING)
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
-@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
+@support_status(SupportStatus.CERTIFIED)
 class DBTCloudSource(DBTSourceBase, TestableSource):
     config: DBTCloudConfig
 
     @classmethod
     def create(cls, config_dict, ctx):
         config = DBTCloudConfig.parse_obj(config_dict)
-        return cls(config, ctx, "dbt")
+        return cls(config, ctx)
 
     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:
@@ -125,6 +125,7 @@ _DEFAULT_ACTOR = mce_builder.make_user_urn("unknown")
|
|
|
125
125
|
@dataclass
|
|
126
126
|
class DBTSourceReport(StaleEntityRemovalSourceReport):
|
|
127
127
|
sql_parser_skipped_missing_code: LossyList[str] = field(default_factory=LossyList)
|
|
128
|
+
sql_parser_skipped_non_sql_model: LossyList[str] = field(default_factory=LossyList)
|
|
128
129
|
sql_parser_parse_failures: int = 0
|
|
129
130
|
sql_parser_detach_ctes_failures: int = 0
|
|
130
131
|
sql_parser_table_errors: int = 0
|
|
@@ -829,11 +830,13 @@ def get_column_type(
|
|
|
829
830
|
"Enabled by default, configure using `include_column_lineage`",
|
|
830
831
|
)
|
|
831
832
|
class DBTSourceBase(StatefulIngestionSourceBase):
|
|
832
|
-
def __init__(self, config: DBTCommonConfig, ctx: PipelineContext
|
|
833
|
+
def __init__(self, config: DBTCommonConfig, ctx: PipelineContext):
|
|
833
834
|
super().__init__(config, ctx)
|
|
835
|
+
self.platform: str = "dbt"
|
|
836
|
+
|
|
834
837
|
self.config = config
|
|
835
|
-
self.platform: str = platform
|
|
836
838
|
self.report: DBTSourceReport = DBTSourceReport()
|
|
839
|
+
|
|
837
840
|
self.compiled_owner_extraction_pattern: Optional[Any] = None
|
|
838
841
|
if self.config.owner_extraction_pattern:
|
|
839
842
|
self.compiled_owner_extraction_pattern = re.compile(
|
|
@@ -1177,6 +1180,11 @@ class DBTSourceBase(StatefulIngestionSourceBase):
|
|
|
1177
1180
|
logger.debug(
|
|
1178
1181
|
f"Not generating CLL for {node.dbt_name} because we don't need it."
|
|
1179
1182
|
)
|
|
1183
|
+
elif node.language != "sql":
|
|
1184
|
+
logger.debug(
|
|
1185
|
+
f"Not generating CLL for {node.dbt_name} because it is not a SQL model."
|
|
1186
|
+
)
|
|
1187
|
+
self.report.sql_parser_skipped_non_sql_model.append(node.dbt_name)
|
|
1180
1188
|
elif node.compiled_code:
|
|
1181
1189
|
# Add CTE stops based on the upstreams list.
|
|
1182
1190
|
cte_mapping = {
|