acryl-datahub 1.0.0.3rc12__py3-none-any.whl → 1.0.0.4rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.0.0.3rc12.dist-info → acryl_datahub-1.0.0.4rc2.dist-info}/METADATA +2529 -2527
- {acryl_datahub-1.0.0.3rc12.dist-info → acryl_datahub-1.0.0.4rc2.dist-info}/RECORD +37 -34
- {acryl_datahub-1.0.0.3rc12.dist-info → acryl_datahub-1.0.0.4rc2.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/emitter/request_helper.py +10 -5
- datahub/emitter/rest_emitter.py +183 -106
- datahub/ingestion/extractor/schema_util.py +17 -1
- datahub/ingestion/graph/client.py +17 -4
- datahub/ingestion/graph/links.py +53 -0
- datahub/ingestion/sink/datahub_rest.py +11 -10
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +4 -62
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +70 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -1
- datahub/ingestion/source/fivetran/config.py +1 -1
- datahub/ingestion/source/ge_data_profiler.py +25 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +1 -12
- datahub/ingestion/source/snowflake/snowflake_connection.py +5 -17
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/sql/athena.py +2 -1
- datahub/ingestion/source/sql/hive_metastore.py +1 -1
- datahub/ingestion/source/sql/mssql/source.py +1 -1
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sqlalchemy_uri.py +36 -0
- datahub/ingestion/source/sql/stored_procedures/lineage.py +1 -0
- datahub/ingestion/source/sql/two_tier_sql_source.py +1 -1
- datahub/ingestion/source/tableau/tableau.py +4 -2
- datahub/ingestion/source/unity/config.py +2 -1
- datahub/metadata/_internal_schema_classes.py +13 -0
- datahub/metadata/schema.avsc +17 -0
- datahub/metadata/schemas/Operation.avsc +17 -0
- datahub/sdk/main_client.py +15 -0
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/sql_parsing_aggregator.py +3 -2
- datahub/utilities/server_config_util.py +14 -75
- {acryl_datahub-1.0.0.3rc12.dist-info → acryl_datahub-1.0.0.4rc2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.3rc12.dist-info → acryl_datahub-1.0.0.4rc2.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.3rc12.dist-info → acryl_datahub-1.0.0.4rc2.dist-info}/top_level.txt +0 -0
datahub/ingestion/sink/datahub_rest.py
CHANGED

@@ -21,10 +21,9 @@ from datahub.emitter.mcp_builder import mcps_from_mce
 from datahub.emitter.rest_emitter import (
     BATCH_INGEST_MAX_PAYLOAD_LENGTH,
     DEFAULT_REST_EMITTER_ENDPOINT,
-    DEFAULT_REST_TRACE_MODE,
     DataHubRestEmitter,
+    EmitMode,
     RestSinkEndpoint,
-    RestTraceMode,
 )
 from datahub.ingestion.api.common import RecordEnvelope, WorkUnit
 from datahub.ingestion.api.sink import (
@@ -71,7 +70,6 @@ _DEFAULT_REST_SINK_MODE = pydantic.parse_obj_as(
 class DatahubRestSinkConfig(DatahubClientConfig):
     mode: RestSinkMode = _DEFAULT_REST_SINK_MODE
     endpoint: RestSinkEndpoint = DEFAULT_REST_EMITTER_ENDPOINT
-    default_trace_mode: RestTraceMode = DEFAULT_REST_TRACE_MODE
 
     # These only apply in async modes.
     max_threads: pydantic.PositiveInt = _DEFAULT_REST_SINK_MAX_THREADS
@@ -134,7 +132,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
         self._emitter_thread_local = threading.local()
 
         try:
-            gms_config = self.emitter.
+            gms_config = self.emitter.server_config
         except Exception as exc:
             raise ConfigurationError(
                 f"💥 Failed to connect to DataHub with {repr(self.emitter)}"
@@ -175,7 +173,6 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
             client_certificate_path=config.client_certificate_path,
             disable_ssl_verification=config.disable_ssl_verification,
             openapi_ingestion=config.endpoint == RestSinkEndpoint.OPENAPI,
-            default_trace_mode=config.default_trace_mode == RestTraceMode.ENABLED,
             client_mode=config.client_mode,
             datahub_component=config.datahub_component,
         )
@@ -252,9 +249,10 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
             MetadataChangeProposal,
             MetadataChangeProposalWrapper,
         ],
+        emit_mode: EmitMode,
     ) -> None:
         # TODO: Add timing metrics
-        self.emitter.emit(record)
+        self.emitter.emit(record, emit_mode=emit_mode)
 
     def _emit_batch_wrapper(
         self,
@@ -269,8 +267,10 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
         ],
     ) -> None:
         events: List[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]] = []
+
         for record in records:
             event = record[0]
+
             if isinstance(event, MetadataChangeEvent):
                 # Unpack MCEs into MCPs.
                 mcps = mcps_from_mce(event)
@@ -278,7 +278,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
             else:
                 events.append(event)
 
-        chunks = self.emitter.emit_mcps(events)
+        chunks = self.emitter.emit_mcps(events, emit_mode=EmitMode.ASYNC)
         self.report.async_batches_prepared += 1
         if chunks > 1:
             self.report.async_batches_split += chunks
@@ -309,6 +309,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
                 partition_key,
                 self._emit_wrapper,
                 record,
+                EmitMode.ASYNC,
                 done_callback=functools.partial(
                     self._write_done_callback, record_envelope, write_callback
                 ),
@@ -320,6 +321,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
             self.executor.submit(
                 partition_key,
                 record,
+                EmitMode.ASYNC,
                 done_callback=functools.partial(
                     self._write_done_callback, record_envelope, write_callback
                 ),
@@ -328,7 +330,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
         else:
             # execute synchronously
             try:
-                self._emit_wrapper(record)
+                self._emit_wrapper(record, emit_mode=EmitMode.SYNC_PRIMARY)
                 write_callback.on_success(record_envelope, success_metadata={})
             except Exception as e:
                 write_callback.on_failure(record_envelope, e, failure_metadata={})
@@ -340,8 +342,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
         ],
     ) -> None:
         return self.write_record_async(
-            RecordEnvelope(item, metadata={}),
-            NoopWriteCallback(),
+            RecordEnvelope(item, metadata={}), NoopWriteCallback()
        )
 
     def close(self):
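The sink now threads an explicit EmitMode through its emit wrappers in place of the removed trace-mode flag: queued and batched records go out as EmitMode.ASYNC, while the synchronous path uses EmitMode.SYNC_PRIMARY. A minimal sketch of calling the emitter directly with the new keyword, assuming the emit_mode parameter behaves as the hunks above suggest; the server URL and dataset URN are placeholders.

    # Hedged sketch of the new emit_mode keyword; values come from the diff above.
    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.emitter.rest_emitter import DataHubRestEmitter, EmitMode
    from datahub.metadata.schema_classes import StatusClass

    emitter = DataHubRestEmitter("http://localhost:8080")  # placeholder endpoint

    mcp = MetadataChangeProposalWrapper(
        entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)",
        aspect=StatusClass(removed=False),
    )

    # Fire-and-forget, matching the sink's async batch path.
    emitter.emit(mcp, emit_mode=EmitMode.ASYNC)

    # Wait for the write on the primary, matching the synchronous path.
    emitter.emit(mcp, emit_mode=EmitMode.SYNC_PRIMARY)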
datahub/ingestion/source/bigquery_v2/bigquery_config.py
CHANGED

@@ -2,10 +2,8 @@ import logging
 import os
 import re
 from datetime import timedelta
-from typing import
+from typing import Dict, List, Optional, Union
 
-from google.cloud import bigquery, datacatalog_v1, resourcemanager_v3
-from google.cloud.logging_v2.client import Client as GCPLoggingClient
 from pydantic import Field, PositiveInt, PrivateAttr, root_validator, validator
 
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
@@ -18,7 +16,9 @@ from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.ingestion.glossary.classification_mixin import (
     ClassificationSourceConfigMixin,
 )
-from datahub.ingestion.source.
+from datahub.ingestion.source.bigquery_v2.bigquery_connection import (
+    BigQueryConnectionConfig,
+)
 from datahub.ingestion.source.data_lake_common.path_spec import PathSpec
 from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, SQLFilterConfig
 from datahub.ingestion.source.state.stateful_ingestion_base import (
@@ -105,64 +105,6 @@ class BigQueryUsageConfig(BaseUsageConfig):
     )
 
 
-class BigQueryConnectionConfig(ConfigModel):
-    credential: Optional[GCPCredential] = Field(
-        default=None, description="BigQuery credential informations"
-    )
-
-    _credentials_path: Optional[str] = PrivateAttr(None)
-
-    extra_client_options: Dict[str, Any] = Field(
-        default={},
-        description="Additional options to pass to google.cloud.logging_v2.client.Client.",
-    )
-
-    project_on_behalf: Optional[str] = Field(
-        default=None,
-        description="[Advanced] The BigQuery project in which queries are executed. Will be passed when creating a job. If not passed, falls back to the project associated with the service account.",
-    )
-
-    def __init__(self, **data: Any):
-        super().__init__(**data)
-
-        if self.credential:
-            self._credentials_path = self.credential.create_credential_temp_file()
-            logger.debug(
-                f"Creating temporary credential file at {self._credentials_path}"
-            )
-            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._credentials_path
-
-    def get_bigquery_client(self) -> bigquery.Client:
-        client_options = self.extra_client_options
-        return bigquery.Client(self.project_on_behalf, **client_options)
-
-    def get_projects_client(self) -> resourcemanager_v3.ProjectsClient:
-        return resourcemanager_v3.ProjectsClient()
-
-    def get_policy_tag_manager_client(self) -> datacatalog_v1.PolicyTagManagerClient:
-        return datacatalog_v1.PolicyTagManagerClient()
-
-    def make_gcp_logging_client(
-        self, project_id: Optional[str] = None
-    ) -> GCPLoggingClient:
-        # See https://github.com/googleapis/google-cloud-python/issues/2674 for
-        # why we disable gRPC here.
-        client_options = self.extra_client_options.copy()
-        client_options["_use_grpc"] = False
-        if project_id is not None:
-            return GCPLoggingClient(**client_options, project=project_id)
-        else:
-            return GCPLoggingClient(**client_options)
-
-    def get_sql_alchemy_url(self) -> str:
-        if self.project_on_behalf:
-            return f"bigquery://{self.project_on_behalf}"
-        # When project_id is not set, we will attempt to detect the project ID
-        # based on the credentials or environment variables.
-        # See https://github.com/mxmzdlv/pybigquery#authentication.
-        return "bigquery://"
-
-
 class GcsLineageProviderConfig(ConfigModel):
     """
     Any source that produces gcs lineage from/to Datasets should inherit this class.
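BigQueryConnectionConfig itself is unchanged; it simply moves out of bigquery_config.py into the new bigquery_connection.py module shown next, so downstream imports change along these lines (both paths appear verbatim in the diff):

    # Before (1.0.0.3rc12)
    from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryConnectionConfig

    # After (1.0.0.4rc2)
    from datahub.ingestion.source.bigquery_v2.bigquery_connection import BigQueryConnectionConfig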
datahub/ingestion/source/bigquery_v2/bigquery_connection.py
ADDED

@@ -0,0 +1,70 @@
+import logging
+import os
+from typing import Any, Dict, Optional
+
+from google.cloud import bigquery, datacatalog_v1, resourcemanager_v3
+from google.cloud.logging_v2.client import Client as GCPLoggingClient
+from pydantic import Field, PrivateAttr
+
+from datahub.configuration.common import ConfigModel
+from datahub.ingestion.source.common.gcp_credentials_config import GCPCredential
+
+logger = logging.getLogger(__name__)
+
+
+class BigQueryConnectionConfig(ConfigModel):
+    credential: Optional[GCPCredential] = Field(
+        default=None, description="BigQuery credential informations"
+    )
+
+    _credentials_path: Optional[str] = PrivateAttr(None)
+
+    extra_client_options: Dict[str, Any] = Field(
+        default={},
+        description="Additional options to pass to google.cloud.logging_v2.client.Client.",
+    )
+
+    project_on_behalf: Optional[str] = Field(
+        default=None,
+        description="[Advanced] The BigQuery project in which queries are executed. Will be passed when creating a job. If not passed, falls back to the project associated with the service account.",
+    )
+
+    def __init__(self, **data: Any):
+        super().__init__(**data)
+
+        if self.credential:
+            self._credentials_path = self.credential.create_credential_temp_file()
+            logger.debug(
+                f"Creating temporary credential file at {self._credentials_path}"
+            )
+            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._credentials_path
+
+    def get_bigquery_client(self) -> bigquery.Client:
+        client_options = self.extra_client_options
+        return bigquery.Client(self.project_on_behalf, **client_options)
+
+    def get_projects_client(self) -> resourcemanager_v3.ProjectsClient:
+        return resourcemanager_v3.ProjectsClient()
+
+    def get_policy_tag_manager_client(self) -> datacatalog_v1.PolicyTagManagerClient:
+        return datacatalog_v1.PolicyTagManagerClient()
+
+    def make_gcp_logging_client(
+        self, project_id: Optional[str] = None
+    ) -> GCPLoggingClient:
+        # See https://github.com/googleapis/google-cloud-python/issues/2674 for
+        # why we disable gRPC here.
+        client_options = self.extra_client_options.copy()
+        client_options["_use_grpc"] = False
+        if project_id is not None:
+            return GCPLoggingClient(**client_options, project=project_id)
+        else:
+            return GCPLoggingClient(**client_options)
+
+    def get_sql_alchemy_url(self) -> str:
+        if self.project_on_behalf:
+            return f"bigquery://{self.project_on_behalf}"
+        # When project_id is not set, we will attempt to detect the project ID
+        # based on the credentials or environment variables.
+        # See https://github.com/mxmzdlv/pybigquery#authentication.
+        return "bigquery://"
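A minimal sketch of how this relocated connection config is typically used, based only on the methods visible in the new file; the project ID is a placeholder, and it assumes application-default Google credentials are available in the environment since no credential block is supplied.

    from datahub.ingestion.source.bigquery_v2.bigquery_connection import (
        BigQueryConnectionConfig,
    )

    # "my-gcp-project" is a placeholder; with no explicit `credential`,
    # the google-cloud clients fall back to application-default credentials.
    config = BigQueryConnectionConfig(project_on_behalf="my-gcp-project")

    client = config.get_bigquery_client()                     # google.cloud.bigquery.Client
    logging_client = config.make_gcp_logging_client("my-gcp-project")
    print(config.get_sql_alchemy_url())                       # "bigquery://my-gcp-project"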
datahub/ingestion/source/bigquery_v2/bigquery_queries.py
CHANGED

@@ -10,10 +10,12 @@ from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.source import Source, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.bigquery_v2.bigquery_config import (
-    BigQueryConnectionConfig,
     BigQueryFilterConfig,
     BigQueryIdentifierConfig,
 )
+from datahub.ingestion.source.bigquery_v2.bigquery_connection import (
+    BigQueryConnectionConfig,
+)
 from datahub.ingestion.source.bigquery_v2.bigquery_report import (
     BigQueryQueriesExtractorReport,
     BigQuerySchemaApiPerfReport,
datahub/ingestion/source/fivetran/config.py
CHANGED

@@ -16,7 +16,7 @@ from datahub.configuration.source_common import DatasetSourceConfigMixin
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.emitter.mce_builder import DEFAULT_ENV
 from datahub.ingestion.api.report import Report
-from datahub.ingestion.source.bigquery_v2.
+from datahub.ingestion.source.bigquery_v2.bigquery_connection import (
     BigQueryConnectionConfig,
 )
 from datahub.ingestion.source.snowflake.snowflake_connection import (
datahub/ingestion/source/ge_data_profiler.py
CHANGED

@@ -5,6 +5,7 @@ import concurrent.futures
 import contextlib
 import dataclasses
 import functools
+import importlib.metadata
 import json
 import logging
 import re
@@ -84,6 +85,30 @@ if TYPE_CHECKING:
     from pyathena.cursor import Cursor
 
 assert MARKUPSAFE_PATCHED
+
+# We need to ensure that acryl-great-expectations is installed
+# and great-expectations is not installed.
+try:
+    acryl_gx_version = bool(importlib.metadata.distribution("acryl-great-expectations"))
+except importlib.metadata.PackageNotFoundError:
+    acryl_gx_version = False
+
+try:
+    original_gx_version = bool(importlib.metadata.distribution("great-expectations"))
+except importlib.metadata.PackageNotFoundError:
+    original_gx_version = False
+
+if acryl_gx_version and original_gx_version:
+    raise RuntimeError(
+        "acryl-great-expectations and great-expectations cannot both be installed because their files will conflict. "
+        "You will need to (1) uninstall great-expectations and (2) re-install acryl-great-expectations. "
+        "See https://github.com/pypa/pip/issues/4625."
+    )
+elif original_gx_version:
+    raise RuntimeError(
+        "We expect acryl-great-expectations to be installed, but great-expectations is installed instead."
+    )
+
 logger: logging.Logger = logging.getLogger(__name__)
 
 _original_get_column_median = SqlAlchemyDataset.get_column_median
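The new import-time guard uses importlib.metadata.distribution() to detect which Great Expectations fork is present and fails fast when the forks conflict. A tiny standalone sketch of the same detection pattern, with an illustrative helper name:

    import importlib.metadata


    def is_installed(dist_name: str) -> bool:
        """Return True if a distribution with this name is installed."""
        try:
            importlib.metadata.distribution(dist_name)
            return True
        except importlib.metadata.PackageNotFoundError:
            return False


    # Mirrors the check above: exactly one of the two forks should be present.
    print(is_installed("acryl-great-expectations"), is_installed("great-expectations"))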
datahub/ingestion/source/snowflake/snowflake_config.py
CHANGED

@@ -4,7 +4,7 @@ from dataclasses import dataclass
 from typing import Dict, List, Optional, Set
 
 import pydantic
-from pydantic import Field,
+from pydantic import Field, root_validator, validator
 
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.configuration.pattern_utils import UUID_REGEX
@@ -385,17 +385,6 @@ class SnowflakeV2Config(
 
         return values
 
-    def get_sql_alchemy_url(
-        self,
-        database: Optional[str] = None,
-        username: Optional[str] = None,
-        password: Optional[SecretStr] = None,
-        role: Optional[str] = None,
-    ) -> str:
-        return SnowflakeConnectionConfig.get_sql_alchemy_url(
-            self, database=database, username=username, password=password, role=role
-        )
-
     @validator("shares")
     def validate_shares(
         cls, shares: Optional[Dict[str, SnowflakeShareConfig]], values: Dict
datahub/ingestion/source/snowflake/snowflake_connection.py
CHANGED

@@ -28,7 +28,7 @@ from datahub.ingestion.source.snowflake.oauth_config import (
     OAuthIdentityProvider,
 )
 from datahub.ingestion.source.snowflake.oauth_generator import OAuthTokenGenerator
-from datahub.ingestion.source.sql.
+from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 from datahub.utilities.config_clean import (
     remove_protocol,
     remove_suffix,
@@ -193,23 +193,11 @@ class SnowflakeConnectionConfig(ConfigModel):
                 "but should be set when using use_certificate false for oauth_config"
             )
 
-    def get_sql_alchemy_url(
-        self,
-        database: Optional[str] = None,
-        username: Optional[str] = None,
-        password: Optional[pydantic.SecretStr] = None,
-        role: Optional[str] = None,
-    ) -> str:
-        if username is None:
-            username = self.username
-        if password is None:
-            password = self.password
-        if role is None:
-            role = self.role
+    def get_sql_alchemy_url(self, database: Optional[str] = None) -> str:
         return make_sqlalchemy_uri(
             self.scheme,
-            username,
-            password.get_secret_value() if password else None,
+            self.username,
+            self.password.get_secret_value() if self.password else None,
             self.account_id,
             f'"{database}"' if database is not None else database,
             uri_opts={
@@ -218,7 +206,7 @@ class SnowflakeConnectionConfig(ConfigModel):
                 for (key, value) in {
                     "authenticator": _VALID_AUTH_TYPES.get(self.authentication_type),
                     "warehouse": self.warehouse,
-                    "role": role,
+                    "role": self.role,
                     "application": _APPLICATION_NAME,
                 }.items()
                 if value
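get_sql_alchemy_url no longer accepts username, password, or role overrides; callers rely entirely on the values already stored on the connection config. A hedged sketch of the new call shape, assuming password-based authentication and placeholder credentials:

    from datahub.ingestion.source.snowflake.snowflake_connection import (
        SnowflakeConnectionConfig,
    )

    # Placeholder values; key-pair and OAuth auth are configured differently.
    config = SnowflakeConnectionConfig(
        account_id="myorg-myaccount",
        username="datahub_reader",
        password="example-password",
        role="datahub_role",
        warehouse="COMPUTE_WH",
    )

    # User, password, and role now always come from the config itself.
    url = config.get_sql_alchemy_url(database="ANALYTICS")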
datahub/ingestion/source/snowflake/snowflake_profiler.py
CHANGED

@@ -135,12 +135,7 @@ class SnowflakeProfiler(GenericProfiler, SnowflakeCommonMixin):
     ) -> "DatahubGEProfiler":
         assert db_name
 
-        url = self.config.get_sql_alchemy_url(
-            database=db_name,
-            username=self.config.username,
-            password=self.config.password,
-            role=self.config.role,
-        )
+        url = self.config.get_sql_alchemy_url(database=db_name)
 
         logger.debug(f"sql_alchemy_url={url}")
 
datahub/ingestion/source/sql/athena.py
CHANGED

@@ -35,13 +35,14 @@ from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     register_custom_type,
 )
-from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
+from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
     gen_database_container,
     gen_database_key,
 )
+from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
 from datahub.metadata.schema_classes import MapTypeClass, RecordTypeClass
 from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column
datahub/ingestion/source/sql/hive_metastore.py
CHANGED

@@ -36,7 +36,6 @@ from datahub.ingestion.source.sql.sql_common import (
 from datahub.ingestion.source.sql.sql_config import (
     BasicSQLAlchemyConfig,
     SQLCommonConfig,
-    make_sqlalchemy_uri,
 )
 from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
@@ -46,6 +45,7 @@ from datahub.ingestion.source.sql.sql_utils import (
     gen_schema_key,
     get_domain_wu,
 )
+from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 from datahub.ingestion.source.state.stateful_ingestion_base import JobId
 from datahub.metadata.com.linkedin.pegasus2avro.common import StatusClass
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
datahub/ingestion/source/sql/mssql/source.py
CHANGED

@@ -44,9 +44,9 @@ from datahub.ingestion.source.sql.sql_common import (
 )
 from datahub.ingestion.source.sql.sql_config import (
     BasicSQLAlchemyConfig,
-    make_sqlalchemy_uri,
 )
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
+from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 from datahub.ingestion.source.sql.stored_procedures.base import (
     generate_procedure_lineage,
 )
datahub/ingestion/source/sql/sql_config.py
CHANGED

@@ -4,7 +4,6 @@ from typing import Any, Dict, Optional
 
 import pydantic
 from pydantic import Field
-from sqlalchemy.engine import URL
 
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.configuration.source_common import (
@@ -20,6 +19,7 @@ from datahub.ingestion.glossary.classification_mixin import (
     ClassificationSourceConfigMixin,
 )
 from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
+from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StatefulStaleMetadataRemovalConfig,
 )
@@ -184,36 +184,3 @@ class SQLAlchemyConnectionConfig(ConfigModel):
 
 class BasicSQLAlchemyConfig(SQLAlchemyConnectionConfig, SQLCommonConfig):
     pass
-
-
-def make_sqlalchemy_uri(
-    scheme: str,
-    username: Optional[str],
-    password: Optional[str],
-    at: Optional[str],
-    db: Optional[str],
-    uri_opts: Optional[Dict[str, Any]] = None,
-) -> str:
-    host: Optional[str] = None
-    port: Optional[int] = None
-    if at:
-        try:
-            host, port_str = at.rsplit(":", 1)
-            port = int(port_str)
-        except ValueError:
-            host = at
-            port = None
-    if uri_opts:
-        uri_opts = {k: v for k, v in uri_opts.items() if v is not None}
-
-    return str(
-        URL.create(
-            drivername=scheme,
-            username=username,
-            password=password,
-            host=host,
-            port=port,
-            database=db,
-            query=uri_opts or {},
-        )
-    )
datahub/ingestion/source/sql/sqlalchemy_uri.py
ADDED

@@ -0,0 +1,36 @@
+from typing import Any, Dict, Optional
+
+from sqlalchemy.engine import URL
+
+
+def make_sqlalchemy_uri(
+    scheme: str,
+    username: Optional[str],
+    password: Optional[str],
+    at: Optional[str],
+    db: Optional[str],
+    uri_opts: Optional[Dict[str, Any]] = None,
+) -> str:
+    host: Optional[str] = None
+    port: Optional[int] = None
+    if at:
+        try:
+            host, port_str = at.rsplit(":", 1)
+            port = int(port_str)
+        except ValueError:
+            host = at
+            port = None
+    if uri_opts:
+        uri_opts = {k: v for k, v in uri_opts.items() if v is not None}
+
+    return str(
+        URL.create(
+            drivername=scheme,
+            username=username,
+            password=password,
+            host=host,
+            port=port,
+            database=db,
+            query=uri_opts or {},
+        )
+    )
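The relocated helper is a thin wrapper around sqlalchemy.engine.URL.create that splits a host:port string and drops None-valued query options. A quick sketch of the expected behaviour; all values are illustrative placeholders:

    from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri

    uri = make_sqlalchemy_uri(
        scheme="mysql+pymysql",
        username="reader",
        password="example-password",
        at="db.internal:3306",  # host:port; a bare host without a port also works
        db="metrics",
        uri_opts={"charset": "utf8mb4", "ssl_ca": None},  # None values are dropped
    )
    # Expected shape:
    # mysql+pymysql://reader:example-password@db.internal:3306/metrics?charset=utf8mb4
    print(uri)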
datahub/ingestion/source/sql/stored_procedures/lineage.py
CHANGED

@@ -24,6 +24,7 @@ def parse_procedure_code(
 ) -> Optional[DataJobInputOutputClass]:
     aggregator = SqlParsingAggregator(
         platform=schema_resolver.platform,
+        platform_instance=schema_resolver.platform_instance,
         env=schema_resolver.env,
         schema_resolver=schema_resolver,
         generate_lineage=True,
datahub/ingestion/source/sql/two_tier_sql_source.py
CHANGED

@@ -14,12 +14,12 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.sql.sql_common import SQLAlchemySource, logger
 from datahub.ingestion.source.sql.sql_config import (
     BasicSQLAlchemyConfig,
-    make_sqlalchemy_uri,
 )
 from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
     gen_database_key,
 )
+from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 
 
 class TwoTierSQLAlchemyConfig(BasicSQLAlchemyConfig):
datahub/ingestion/source/tableau/tableau.py
CHANGED

@@ -1394,7 +1394,9 @@ class TableauSiteSource:
         `fetch_size:` The number of records to retrieve from Tableau
         Server in a single API call, starting from the current cursor position on Tableau Server.
         """
-        retries_remaining =
+        retries_remaining = (
+            self.config.max_retries if retries_remaining is None else retries_remaining
+        )
 
         logger.debug(
             f"Query {connection_type} to get {fetch_size} objects with cursor {current_cursor}"
@@ -1565,7 +1567,7 @@ class TableauSiteSource:
                 fetch_size=fetch_size,
                 current_cursor=current_cursor,
                 retry_on_auth_error=True,
-                retries_remaining=retries_remaining,
+                retries_remaining=retries_remaining - 1,
             )
         raise RuntimeError(f"Query {connection_type} error: {errors}")
 
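The fix seeds retries_remaining from config.max_retries on the first call and decrements it each time the query is re-issued, so the auth-retry recursion is now bounded. A generic sketch of the same pattern; the function and helper names are illustrative, not the Tableau source's actual signatures:

    from typing import Optional

    MAX_RETRIES = 3  # stand-in for self.config.max_retries


    def run_query(query: str, retries_remaining: Optional[int] = None) -> str:
        # Seed the retry budget on the first call, mirroring the change above.
        retries_remaining = MAX_RETRIES if retries_remaining is None else retries_remaining
        try:
            return do_request(query)  # hypothetical transport call
        except PermissionError:
            if retries_remaining <= 0:
                raise
            reauthenticate()  # hypothetical re-login
            # Pass a decremented budget so the recursion terminates.
            return run_query(query, retries_remaining=retries_remaining - 1)


    def do_request(query: str) -> str:  # placeholder transport
        return "ok"


    def reauthenticate() -> None:  # placeholder re-login
        pass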
datahub/ingestion/source/unity/config.py
CHANGED

@@ -17,7 +17,8 @@ from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.ingestion.source.ge_data_profiler import DATABRICKS
 from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
-from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
+from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
+from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StatefulStaleMetadataRemovalConfig,
 )
datahub/metadata/_internal_schema_classes.py
CHANGED

@@ -5457,6 +5457,7 @@ class OperationClass(_Aspect):
         affectedDatasets: Union[None, List[str]]=None,
         sourceType: Union[None, Union[str, "OperationSourceTypeClass"]]=None,
         customProperties: Union[None, Dict[str, str]]=None,
+        queries: Union[None, List[str]]=None,
     ):
         super().__init__()
 
@@ -5476,6 +5477,7 @@ class OperationClass(_Aspect):
         self.sourceType = sourceType
         self.customProperties = customProperties
         self.lastUpdatedTimestamp = lastUpdatedTimestamp
+        self.queries = queries
 
     def _restore_defaults(self) -> None:
         self.timestampMillis = int()
@@ -5490,6 +5492,7 @@ class OperationClass(_Aspect):
         self.sourceType = self.RECORD_SCHEMA.fields_dict["sourceType"].default
         self.customProperties = self.RECORD_SCHEMA.fields_dict["customProperties"].default
         self.lastUpdatedTimestamp = int()
+        self.queries = self.RECORD_SCHEMA.fields_dict["queries"].default
 
 
     @property
@@ -5612,6 +5615,16 @@ class OperationClass(_Aspect):
         self._inner_dict['lastUpdatedTimestamp'] = value
 
 
+    @property
+    def queries(self) -> Union[None, List[str]]:
+        """Which queries were used in this operation."""
+        return self._inner_dict.get('queries')  # type: ignore
+
+    @queries.setter
+    def queries(self, value: Union[None, List[str]]) -> None:
+        self._inner_dict['queries'] = value
+
+
 class OperationSourceTypeClass(object):
     """The source of an operation"""
 
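The Operation aspect gains an optional queries list, an array of query URNs per the Avro schema change below. A minimal sketch of populating it; the URN and timestamp values are illustrative, and it assumes the rest of the OperationClass constructor (timestampMillis, lastUpdatedTimestamp, operationType) keeps its existing signature:

    import time

    from datahub.metadata.schema_classes import OperationClass, OperationTypeClass

    now_ms = int(time.time() * 1000)

    # Illustrative values; `queries` carries query URNs per the schema's doc string.
    operation = OperationClass(
        timestampMillis=now_ms,
        lastUpdatedTimestamp=now_ms,
        operationType=OperationTypeClass.INSERT,
        queries=["urn:li:query:example-query-id"],
    )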
datahub/metadata/schema.avsc
CHANGED

@@ -16424,6 +16424,23 @@
       "type": "long",
       "name": "lastUpdatedTimestamp",
       "doc": "The time at which the operation occurred. Would be better named 'operationTime'"
+    },
+    {
+      "TimeseriesFieldCollection": {
+        "key": "query"
+      },
+      "Urn": "Urn",
+      "urn_is_array": true,
+      "type": [
+        "null",
+        {
+          "type": "array",
+          "items": "string"
+        }
+      ],
+      "name": "queries",
+      "default": null,
+      "doc": "Which queries were used in this operation."
     }
   ],
   "doc": "Operational info for an entity."