acryl-datahub 1.0.0.3rc12__py3-none-any.whl → 1.0.0.4rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (37)
  1. {acryl_datahub-1.0.0.3rc12.dist-info → acryl_datahub-1.0.0.4rc2.dist-info}/METADATA +2529 -2527
  2. {acryl_datahub-1.0.0.3rc12.dist-info → acryl_datahub-1.0.0.4rc2.dist-info}/RECORD +37 -34
  3. {acryl_datahub-1.0.0.3rc12.dist-info → acryl_datahub-1.0.0.4rc2.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/emitter/request_helper.py +10 -5
  6. datahub/emitter/rest_emitter.py +183 -106
  7. datahub/ingestion/extractor/schema_util.py +17 -1
  8. datahub/ingestion/graph/client.py +17 -4
  9. datahub/ingestion/graph/links.py +53 -0
  10. datahub/ingestion/sink/datahub_rest.py +11 -10
  11. datahub/ingestion/source/bigquery_v2/bigquery_config.py +4 -62
  12. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +70 -0
  13. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -1
  14. datahub/ingestion/source/fivetran/config.py +1 -1
  15. datahub/ingestion/source/ge_data_profiler.py +25 -0
  16. datahub/ingestion/source/snowflake/snowflake_config.py +1 -12
  17. datahub/ingestion/source/snowflake/snowflake_connection.py +5 -17
  18. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  19. datahub/ingestion/source/sql/athena.py +2 -1
  20. datahub/ingestion/source/sql/hive_metastore.py +1 -1
  21. datahub/ingestion/source/sql/mssql/source.py +1 -1
  22. datahub/ingestion/source/sql/sql_config.py +1 -34
  23. datahub/ingestion/source/sql/sqlalchemy_uri.py +36 -0
  24. datahub/ingestion/source/sql/stored_procedures/lineage.py +1 -0
  25. datahub/ingestion/source/sql/two_tier_sql_source.py +1 -1
  26. datahub/ingestion/source/tableau/tableau.py +4 -2
  27. datahub/ingestion/source/unity/config.py +2 -1
  28. datahub/metadata/_internal_schema_classes.py +13 -0
  29. datahub/metadata/schema.avsc +17 -0
  30. datahub/metadata/schemas/Operation.avsc +17 -0
  31. datahub/sdk/main_client.py +15 -0
  32. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  33. datahub/sql_parsing/sql_parsing_aggregator.py +3 -2
  34. datahub/utilities/server_config_util.py +14 -75
  35. {acryl_datahub-1.0.0.3rc12.dist-info → acryl_datahub-1.0.0.4rc2.dist-info}/entry_points.txt +0 -0
  36. {acryl_datahub-1.0.0.3rc12.dist-info → acryl_datahub-1.0.0.4rc2.dist-info}/licenses/LICENSE +0 -0
  37. {acryl_datahub-1.0.0.3rc12.dist-info → acryl_datahub-1.0.0.4rc2.dist-info}/top_level.txt +0 -0

datahub/ingestion/sink/datahub_rest.py
@@ -21,10 +21,9 @@ from datahub.emitter.mcp_builder import mcps_from_mce
 from datahub.emitter.rest_emitter import (
     BATCH_INGEST_MAX_PAYLOAD_LENGTH,
     DEFAULT_REST_EMITTER_ENDPOINT,
-    DEFAULT_REST_TRACE_MODE,
     DataHubRestEmitter,
+    EmitMode,
     RestSinkEndpoint,
-    RestTraceMode,
 )
 from datahub.ingestion.api.common import RecordEnvelope, WorkUnit
 from datahub.ingestion.api.sink import (
@@ -71,7 +70,6 @@ _DEFAULT_REST_SINK_MODE = pydantic.parse_obj_as(
 class DatahubRestSinkConfig(DatahubClientConfig):
     mode: RestSinkMode = _DEFAULT_REST_SINK_MODE
     endpoint: RestSinkEndpoint = DEFAULT_REST_EMITTER_ENDPOINT
-    default_trace_mode: RestTraceMode = DEFAULT_REST_TRACE_MODE

     # These only apply in async modes.
     max_threads: pydantic.PositiveInt = _DEFAULT_REST_SINK_MAX_THREADS
@@ -134,7 +132,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
         self._emitter_thread_local = threading.local()

         try:
-            gms_config = self.emitter.get_server_config()
+            gms_config = self.emitter.server_config
         except Exception as exc:
             raise ConfigurationError(
                 f"💥 Failed to connect to DataHub with {repr(self.emitter)}"
@@ -175,7 +173,6 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
             client_certificate_path=config.client_certificate_path,
             disable_ssl_verification=config.disable_ssl_verification,
             openapi_ingestion=config.endpoint == RestSinkEndpoint.OPENAPI,
-            default_trace_mode=config.default_trace_mode == RestTraceMode.ENABLED,
             client_mode=config.client_mode,
             datahub_component=config.datahub_component,
         )
@@ -252,9 +249,10 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
             MetadataChangeProposal,
             MetadataChangeProposalWrapper,
         ],
+        emit_mode: EmitMode,
     ) -> None:
         # TODO: Add timing metrics
-        self.emitter.emit(record)
+        self.emitter.emit(record, emit_mode=emit_mode)

     def _emit_batch_wrapper(
         self,
@@ -269,8 +267,10 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
         ],
     ) -> None:
         events: List[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]] = []
+
         for record in records:
             event = record[0]
+
             if isinstance(event, MetadataChangeEvent):
                 # Unpack MCEs into MCPs.
                 mcps = mcps_from_mce(event)
@@ -278,7 +278,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
             else:
                 events.append(event)

-        chunks = self.emitter.emit_mcps(events)
+        chunks = self.emitter.emit_mcps(events, emit_mode=EmitMode.ASYNC)
         self.report.async_batches_prepared += 1
         if chunks > 1:
             self.report.async_batches_split += chunks
@@ -309,6 +309,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
                 partition_key,
                 self._emit_wrapper,
                 record,
+                EmitMode.ASYNC,
                 done_callback=functools.partial(
                     self._write_done_callback, record_envelope, write_callback
                 ),
@@ -320,6 +321,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
             self.executor.submit(
                 partition_key,
                 record,
+                EmitMode.ASYNC,
                 done_callback=functools.partial(
                     self._write_done_callback, record_envelope, write_callback
                 ),
@@ -328,7 +330,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
         else:
             # execute synchronously
             try:
-                self._emit_wrapper(record)
+                self._emit_wrapper(record, emit_mode=EmitMode.SYNC_PRIMARY)
                 write_callback.on_success(record_envelope, success_metadata={})
             except Exception as e:
                 write_callback.on_failure(record_envelope, e, failure_metadata={})
@@ -340,8 +342,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
         ],
     ) -> None:
         return self.write_record_async(
-            RecordEnvelope(item, metadata={}),
-            NoopWriteCallback(),
+            RecordEnvelope(item, metadata={}), NoopWriteCallback()
         )

     def close(self):
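
The sink now threads an explicit EmitMode through every emit call instead of the removed RestTraceMode/default_trace_mode setting. A minimal sketch of the new call shape, using only names visible in the hunks above; the server URL and the example aspect are illustrative placeholders, not part of this release:

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DataHubRestEmitter, EmitMode
from datahub.metadata.schema_classes import StatusClass

# Placeholder GMS endpoint.
emitter = DataHubRestEmitter(gms_server="http://localhost:8080")

mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,example.table,PROD)",
    aspect=StatusClass(removed=False),
)

# Synchronous path, as the sink now uses when async mode is off.
emitter.emit(mcp, emit_mode=EmitMode.SYNC_PRIMARY)

# Batched async path, mirroring _emit_batch_wrapper.
emitter.emit_mcps([mcp], emit_mode=EmitMode.ASYNC)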

datahub/ingestion/source/bigquery_v2/bigquery_config.py
@@ -2,10 +2,8 @@ import logging
 import os
 import re
 from datetime import timedelta
-from typing import Any, Dict, List, Optional, Union
+from typing import Dict, List, Optional, Union

-from google.cloud import bigquery, datacatalog_v1, resourcemanager_v3
-from google.cloud.logging_v2.client import Client as GCPLoggingClient
 from pydantic import Field, PositiveInt, PrivateAttr, root_validator, validator

 from datahub.configuration.common import AllowDenyPattern, ConfigModel
@@ -18,7 +16,9 @@ from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.ingestion.glossary.classification_mixin import (
     ClassificationSourceConfigMixin,
 )
-from datahub.ingestion.source.common.gcp_credentials_config import GCPCredential
+from datahub.ingestion.source.bigquery_v2.bigquery_connection import (
+    BigQueryConnectionConfig,
+)
 from datahub.ingestion.source.data_lake_common.path_spec import PathSpec
 from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, SQLFilterConfig
 from datahub.ingestion.source.state.stateful_ingestion_base import (
@@ -105,64 +105,6 @@ class BigQueryUsageConfig(BaseUsageConfig):
     )


-class BigQueryConnectionConfig(ConfigModel):
-    credential: Optional[GCPCredential] = Field(
-        default=None, description="BigQuery credential informations"
-    )
-
-    _credentials_path: Optional[str] = PrivateAttr(None)
-
-    extra_client_options: Dict[str, Any] = Field(
-        default={},
-        description="Additional options to pass to google.cloud.logging_v2.client.Client.",
-    )
-
-    project_on_behalf: Optional[str] = Field(
-        default=None,
-        description="[Advanced] The BigQuery project in which queries are executed. Will be passed when creating a job. If not passed, falls back to the project associated with the service account.",
-    )
-
-    def __init__(self, **data: Any):
-        super().__init__(**data)
-
-        if self.credential:
-            self._credentials_path = self.credential.create_credential_temp_file()
-            logger.debug(
-                f"Creating temporary credential file at {self._credentials_path}"
-            )
-            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._credentials_path
-
-    def get_bigquery_client(self) -> bigquery.Client:
-        client_options = self.extra_client_options
-        return bigquery.Client(self.project_on_behalf, **client_options)
-
-    def get_projects_client(self) -> resourcemanager_v3.ProjectsClient:
-        return resourcemanager_v3.ProjectsClient()
-
-    def get_policy_tag_manager_client(self) -> datacatalog_v1.PolicyTagManagerClient:
-        return datacatalog_v1.PolicyTagManagerClient()
-
-    def make_gcp_logging_client(
-        self, project_id: Optional[str] = None
-    ) -> GCPLoggingClient:
-        # See https://github.com/googleapis/google-cloud-python/issues/2674 for
-        # why we disable gRPC here.
-        client_options = self.extra_client_options.copy()
-        client_options["_use_grpc"] = False
-        if project_id is not None:
-            return GCPLoggingClient(**client_options, project=project_id)
-        else:
-            return GCPLoggingClient(**client_options)
-
-    def get_sql_alchemy_url(self) -> str:
-        if self.project_on_behalf:
-            return f"bigquery://{self.project_on_behalf}"
-        # When project_id is not set, we will attempt to detect the project ID
-        # based on the credentials or environment variables.
-        # See https://github.com/mxmzdlv/pybigquery#authentication.
-        return "bigquery://"
-
-
 class GcsLineageProviderConfig(ConfigModel):
     """
     Any source that produces gcs lineage from/to Datasets should inherit this class.

datahub/ingestion/source/bigquery_v2/bigquery_connection.py (new file)
@@ -0,0 +1,70 @@
+import logging
+import os
+from typing import Any, Dict, Optional
+
+from google.cloud import bigquery, datacatalog_v1, resourcemanager_v3
+from google.cloud.logging_v2.client import Client as GCPLoggingClient
+from pydantic import Field, PrivateAttr
+
+from datahub.configuration.common import ConfigModel
+from datahub.ingestion.source.common.gcp_credentials_config import GCPCredential
+
+logger = logging.getLogger(__name__)
+
+
+class BigQueryConnectionConfig(ConfigModel):
+    credential: Optional[GCPCredential] = Field(
+        default=None, description="BigQuery credential informations"
+    )
+
+    _credentials_path: Optional[str] = PrivateAttr(None)
+
+    extra_client_options: Dict[str, Any] = Field(
+        default={},
+        description="Additional options to pass to google.cloud.logging_v2.client.Client.",
+    )
+
+    project_on_behalf: Optional[str] = Field(
+        default=None,
+        description="[Advanced] The BigQuery project in which queries are executed. Will be passed when creating a job. If not passed, falls back to the project associated with the service account.",
+    )
+
+    def __init__(self, **data: Any):
+        super().__init__(**data)
+
+        if self.credential:
+            self._credentials_path = self.credential.create_credential_temp_file()
+            logger.debug(
+                f"Creating temporary credential file at {self._credentials_path}"
+            )
+            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._credentials_path
+
+    def get_bigquery_client(self) -> bigquery.Client:
+        client_options = self.extra_client_options
+        return bigquery.Client(self.project_on_behalf, **client_options)
+
+    def get_projects_client(self) -> resourcemanager_v3.ProjectsClient:
+        return resourcemanager_v3.ProjectsClient()
+
+    def get_policy_tag_manager_client(self) -> datacatalog_v1.PolicyTagManagerClient:
+        return datacatalog_v1.PolicyTagManagerClient()
+
+    def make_gcp_logging_client(
+        self, project_id: Optional[str] = None
+    ) -> GCPLoggingClient:
+        # See https://github.com/googleapis/google-cloud-python/issues/2674 for
+        # why we disable gRPC here.
+        client_options = self.extra_client_options.copy()
+        client_options["_use_grpc"] = False
+        if project_id is not None:
+            return GCPLoggingClient(**client_options, project=project_id)
+        else:
+            return GCPLoggingClient(**client_options)
+
+    def get_sql_alchemy_url(self) -> str:
+        if self.project_on_behalf:
+            return f"bigquery://{self.project_on_behalf}"
+        # When project_id is not set, we will attempt to detect the project ID
+        # based on the credentials or environment variables.
+        # See https://github.com/mxmzdlv/pybigquery#authentication.
+        return "bigquery://"
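
BigQueryConnectionConfig now has its own module; bigquery_config.py imports it from there (see the hunk above), so the class definition is unchanged and only its canonical import path moves. A short migration sketch; the project name is a placeholder and get_bigquery_client still needs Google credentials available at runtime:

# Old location (pre-1.0.0.4rc2):
# from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryConnectionConfig

# New canonical location:
from datahub.ingestion.source.bigquery_v2.bigquery_connection import (
    BigQueryConnectionConfig,
)

connection = BigQueryConnectionConfig(project_on_behalf="my-gcp-project")
print(connection.get_sql_alchemy_url())  # -> "bigquery://my-gcp-project"
client = connection.get_bigquery_client()  # requires GCP credentials in the environment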

datahub/ingestion/source/bigquery_v2/bigquery_queries.py
@@ -10,10 +10,12 @@ from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.source import Source, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.bigquery_v2.bigquery_config import (
-    BigQueryConnectionConfig,
     BigQueryFilterConfig,
     BigQueryIdentifierConfig,
 )
+from datahub.ingestion.source.bigquery_v2.bigquery_connection import (
+    BigQueryConnectionConfig,
+)
 from datahub.ingestion.source.bigquery_v2.bigquery_report import (
     BigQueryQueriesExtractorReport,
     BigQuerySchemaApiPerfReport,

datahub/ingestion/source/fivetran/config.py
@@ -16,7 +16,7 @@ from datahub.configuration.source_common import DatasetSourceConfigMixin
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.emitter.mce_builder import DEFAULT_ENV
 from datahub.ingestion.api.report import Report
-from datahub.ingestion.source.bigquery_v2.bigquery_config import (
+from datahub.ingestion.source.bigquery_v2.bigquery_connection import (
     BigQueryConnectionConfig,
 )
 from datahub.ingestion.source.snowflake.snowflake_connection import (

datahub/ingestion/source/ge_data_profiler.py
@@ -5,6 +5,7 @@ import concurrent.futures
 import contextlib
 import dataclasses
 import functools
+import importlib.metadata
 import json
 import logging
 import re
@@ -84,6 +85,30 @@ if TYPE_CHECKING:
     from pyathena.cursor import Cursor

 assert MARKUPSAFE_PATCHED
+
+# We need to ensure that acryl-great-expectations is installed
+# and great-expectations is not installed.
+try:
+    acryl_gx_version = bool(importlib.metadata.distribution("acryl-great-expectations"))
+except importlib.metadata.PackageNotFoundError:
+    acryl_gx_version = False
+
+try:
+    original_gx_version = bool(importlib.metadata.distribution("great-expectations"))
+except importlib.metadata.PackageNotFoundError:
+    original_gx_version = False
+
+if acryl_gx_version and original_gx_version:
+    raise RuntimeError(
+        "acryl-great-expectations and great-expectations cannot both be installed because their files will conflict. "
+        "You will need to (1) uninstall great-expectations and (2) re-install acryl-great-expectations. "
+        "See https://github.com/pypa/pip/issues/4625."
+    )
+elif original_gx_version:
+    raise RuntimeError(
+        "We expect acryl-great-expectations to be installed, but great-expectations is installed instead."
+    )
+
 logger: logging.Logger = logging.getLogger(__name__)

 _original_get_column_median = SqlAlchemyDataset.get_column_median
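
The new import-time guard refuses to run when the upstream great-expectations distribution is present alongside (or instead of) acryl-great-expectations. A small standalone sketch of the same check, usable to inspect an environment before upgrading; the remediation comment mirrors the error message above:

import importlib.metadata

def installed(dist_name: str) -> bool:
    try:
        importlib.metadata.distribution(dist_name)
        return True
    except importlib.metadata.PackageNotFoundError:
        return False

if installed("great-expectations"):
    # Per the guard above: uninstall great-expectations, then re-install acryl-great-expectations.
    print("Conflict: great-expectations is installed.")
elif installed("acryl-great-expectations"):
    print("OK: acryl-great-expectations is the active distribution.")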

datahub/ingestion/source/snowflake/snowflake_config.py
@@ -4,7 +4,7 @@ from dataclasses import dataclass
 from typing import Dict, List, Optional, Set

 import pydantic
-from pydantic import Field, SecretStr, root_validator, validator
+from pydantic import Field, root_validator, validator

 from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.configuration.pattern_utils import UUID_REGEX
@@ -385,17 +385,6 @@ class SnowflakeV2Config(

         return values

-    def get_sql_alchemy_url(
-        self,
-        database: Optional[str] = None,
-        username: Optional[str] = None,
-        password: Optional[SecretStr] = None,
-        role: Optional[str] = None,
-    ) -> str:
-        return SnowflakeConnectionConfig.get_sql_alchemy_url(
-            self, database=database, username=username, password=password, role=role
-        )
-
     @validator("shares")
     def validate_shares(
         cls, shares: Optional[Dict[str, SnowflakeShareConfig]], values: Dict

datahub/ingestion/source/snowflake/snowflake_connection.py
@@ -28,7 +28,7 @@ from datahub.ingestion.source.snowflake.oauth_config import (
     OAuthIdentityProvider,
 )
 from datahub.ingestion.source.snowflake.oauth_generator import OAuthTokenGenerator
-from datahub.ingestion.source.sql.sql_config import make_sqlalchemy_uri
+from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 from datahub.utilities.config_clean import (
     remove_protocol,
     remove_suffix,
@@ -193,23 +193,11 @@ class SnowflakeConnectionConfig(ConfigModel):
                 "but should be set when using use_certificate false for oauth_config"
             )

-    def get_sql_alchemy_url(
-        self,
-        database: Optional[str] = None,
-        username: Optional[str] = None,
-        password: Optional[pydantic.SecretStr] = None,
-        role: Optional[str] = None,
-    ) -> str:
-        if username is None:
-            username = self.username
-        if password is None:
-            password = self.password
-        if role is None:
-            role = self.role
+    def get_sql_alchemy_url(self, database: Optional[str] = None) -> str:
         return make_sqlalchemy_uri(
             self.scheme,
-            username,
-            password.get_secret_value() if password else None,
+            self.username,
+            self.password.get_secret_value() if self.password else None,
             self.account_id,
             f'"{database}"' if database is not None else database,
             uri_opts={
@@ -218,7 +206,7 @@ class SnowflakeConnectionConfig(ConfigModel):
                 for (key, value) in {
                     "authenticator": _VALID_AUTH_TYPES.get(self.authentication_type),
                     "warehouse": self.warehouse,
-                    "role": role,
+                    "role": self.role,
                     "application": _APPLICATION_NAME,
                 }.items()
                 if value
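
get_sql_alchemy_url no longer accepts username/password/role overrides; it always reads them from the connection config itself, which is why the profiler hunk below shrinks to a single database argument. A hedged sketch with placeholder values (field names follow SnowflakeConnectionConfig as it appears in this file):

from datahub.ingestion.source.snowflake.snowflake_connection import (
    SnowflakeConnectionConfig,
)

config = SnowflakeConnectionConfig(
    account_id="my_account",        # placeholder
    username="ingest_user",         # placeholder
    password="example-password",    # placeholder
    role="ANALYST",
    warehouse="COMPUTE_WH",
)

# Old call sites passed username/password/role explicitly; now only the database.
url = config.get_sql_alchemy_url(database="ANALYTICS_DB")  # snowflake:// SQLAlchemy URI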

datahub/ingestion/source/snowflake/snowflake_profiler.py
@@ -135,12 +135,7 @@ class SnowflakeProfiler(GenericProfiler, SnowflakeCommonMixin):
     ) -> "DatahubGEProfiler":
         assert db_name

-        url = self.config.get_sql_alchemy_url(
-            database=db_name,
-            username=self.config.username,
-            password=self.config.password,
-            role=self.config.role,
-        )
+        url = self.config.get_sql_alchemy_url(database=db_name)

         logger.debug(f"sql_alchemy_url={url}")


datahub/ingestion/source/sql/athena.py
@@ -35,13 +35,14 @@ from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     register_custom_type,
 )
-from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri
+from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
     gen_database_container,
     gen_database_key,
 )
+from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
 from datahub.metadata.schema_classes import MapTypeClass, RecordTypeClass
 from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column

datahub/ingestion/source/sql/hive_metastore.py
@@ -36,7 +36,6 @@ from datahub.ingestion.source.sql.sql_common import (
 from datahub.ingestion.source.sql.sql_config import (
     BasicSQLAlchemyConfig,
     SQLCommonConfig,
-    make_sqlalchemy_uri,
 )
 from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
@@ -46,6 +45,7 @@ from datahub.ingestion.source.sql.sql_utils import (
     gen_schema_key,
     get_domain_wu,
 )
+from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 from datahub.ingestion.source.state.stateful_ingestion_base import JobId
 from datahub.metadata.com.linkedin.pegasus2avro.common import StatusClass
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot

datahub/ingestion/source/sql/mssql/source.py
@@ -44,9 +44,9 @@ from datahub.ingestion.source.sql.sql_common import (
 )
 from datahub.ingestion.source.sql.sql_config import (
     BasicSQLAlchemyConfig,
-    make_sqlalchemy_uri,
 )
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
+from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 from datahub.ingestion.source.sql.stored_procedures.base import (
     generate_procedure_lineage,
 )

datahub/ingestion/source/sql/sql_config.py
@@ -4,7 +4,6 @@ from typing import Any, Dict, Optional

 import pydantic
 from pydantic import Field
-from sqlalchemy.engine import URL

 from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.configuration.source_common import (
@@ -20,6 +19,7 @@ from datahub.ingestion.glossary.classification_mixin import (
     ClassificationSourceConfigMixin,
 )
 from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
+from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StatefulStaleMetadataRemovalConfig,
 )
@@ -184,36 +184,3 @@ class SQLAlchemyConnectionConfig(ConfigModel):

 class BasicSQLAlchemyConfig(SQLAlchemyConnectionConfig, SQLCommonConfig):
     pass
-
-
-def make_sqlalchemy_uri(
-    scheme: str,
-    username: Optional[str],
-    password: Optional[str],
-    at: Optional[str],
-    db: Optional[str],
-    uri_opts: Optional[Dict[str, Any]] = None,
-) -> str:
-    host: Optional[str] = None
-    port: Optional[int] = None
-    if at:
-        try:
-            host, port_str = at.rsplit(":", 1)
-            port = int(port_str)
-        except ValueError:
-            host = at
-            port = None
-    if uri_opts:
-        uri_opts = {k: v for k, v in uri_opts.items() if v is not None}
-
-    return str(
-        URL.create(
-            drivername=scheme,
-            username=username,
-            password=password,
-            host=host,
-            port=port,
-            database=db,
-            query=uri_opts or {},
-        )
-    )

datahub/ingestion/source/sql/sqlalchemy_uri.py (new file)
@@ -0,0 +1,36 @@
+from typing import Any, Dict, Optional
+
+from sqlalchemy.engine import URL
+
+
+def make_sqlalchemy_uri(
+    scheme: str,
+    username: Optional[str],
+    password: Optional[str],
+    at: Optional[str],
+    db: Optional[str],
+    uri_opts: Optional[Dict[str, Any]] = None,
+) -> str:
+    host: Optional[str] = None
+    port: Optional[int] = None
+    if at:
+        try:
+            host, port_str = at.rsplit(":", 1)
+            port = int(port_str)
+        except ValueError:
+            host = at
+            port = None
+    if uri_opts:
+        uri_opts = {k: v for k, v in uri_opts.items() if v is not None}
+
+    return str(
+        URL.create(
+            drivername=scheme,
+            username=username,
+            password=password,
+            host=host,
+            port=port,
+            database=db,
+            query=uri_opts or {},
+        )
+    )
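
make_sqlalchemy_uri is unchanged, only relocated; every caller in this release switches to the new import path. A small usage sketch with placeholder values; the rendered URI is what SQLAlchemy's URL.create is expected to produce:

from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri

uri = make_sqlalchemy_uri(
    scheme="postgresql+psycopg2",
    username="reader",
    password="example-password",
    at="db.internal:5432",  # "host:port"; a bare host without a port also works
    db="warehouse",
    uri_opts={"sslmode": "require", "connect_timeout": None},  # None values are dropped
)
# Expected to render roughly as:
# postgresql+psycopg2://reader:example-password@db.internal:5432/warehouse?sslmode=require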

datahub/ingestion/source/sql/stored_procedures/lineage.py
@@ -24,6 +24,7 @@ def parse_procedure_code(
 ) -> Optional[DataJobInputOutputClass]:
     aggregator = SqlParsingAggregator(
         platform=schema_resolver.platform,
+        platform_instance=schema_resolver.platform_instance,
         env=schema_resolver.env,
         schema_resolver=schema_resolver,
         generate_lineage=True,

datahub/ingestion/source/sql/two_tier_sql_source.py
@@ -14,12 +14,12 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.sql.sql_common import SQLAlchemySource, logger
 from datahub.ingestion.source.sql.sql_config import (
     BasicSQLAlchemyConfig,
-    make_sqlalchemy_uri,
 )
 from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
     gen_database_key,
 )
+from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri


 class TwoTierSQLAlchemyConfig(BasicSQLAlchemyConfig):

datahub/ingestion/source/tableau/tableau.py
@@ -1394,7 +1394,9 @@ class TableauSiteSource:
         `fetch_size:` The number of records to retrieve from Tableau
         Server in a single API call, starting from the current cursor position on Tableau Server.
         """
-        retries_remaining = retries_remaining or self.config.max_retries
+        retries_remaining = (
+            self.config.max_retries if retries_remaining is None else retries_remaining
+        )

         logger.debug(
             f"Query {connection_type} to get {fetch_size} objects with cursor {current_cursor}"
@@ -1565,7 +1567,7 @@ class TableauSiteSource:
                 fetch_size=fetch_size,
                 current_cursor=current_cursor,
                 retry_on_auth_error=True,
-                retries_remaining=retries_remaining,
+                retries_remaining=retries_remaining - 1,
             )
             raise RuntimeError(f"Query {connection_type} error: {errors}")

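
The old expression `retries_remaining or self.config.max_retries` treated a legitimate value of 0 as falsy and reset it to the full budget, and the recursive retry never decremented the counter, so an auth failure could retry indefinitely. A standalone sketch of the behavioral difference (max_retries=3 is just an example value):

max_retries = 3

def old_normalize(retries_remaining):
    # 0 is falsy, so an exhausted budget silently becomes a full budget again.
    return retries_remaining or max_retries

def new_normalize(retries_remaining):
    # Only a missing value falls back to the default; 0 stays 0.
    return max_retries if retries_remaining is None else retries_remaining

assert old_normalize(0) == 3     # bug: exhausted budget resets
assert new_normalize(0) == 0     # fix: exhausted budget stays exhausted
assert new_normalize(None) == 3  # default still applies on the first call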

datahub/ingestion/source/unity/config.py
@@ -17,7 +17,8 @@ from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.ingestion.source.ge_data_profiler import DATABRICKS
 from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
-from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri
+from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
+from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StatefulStaleMetadataRemovalConfig,
 )

datahub/metadata/_internal_schema_classes.py
@@ -5457,6 +5457,7 @@ class OperationClass(_Aspect):
         affectedDatasets: Union[None, List[str]]=None,
         sourceType: Union[None, Union[str, "OperationSourceTypeClass"]]=None,
         customProperties: Union[None, Dict[str, str]]=None,
+        queries: Union[None, List[str]]=None,
     ):
         super().__init__()

@@ -5476,6 +5477,7 @@ class OperationClass(_Aspect):
         self.sourceType = sourceType
         self.customProperties = customProperties
         self.lastUpdatedTimestamp = lastUpdatedTimestamp
+        self.queries = queries

     def _restore_defaults(self) -> None:
         self.timestampMillis = int()
@@ -5490,6 +5492,7 @@ class OperationClass(_Aspect):
         self.sourceType = self.RECORD_SCHEMA.fields_dict["sourceType"].default
         self.customProperties = self.RECORD_SCHEMA.fields_dict["customProperties"].default
         self.lastUpdatedTimestamp = int()
+        self.queries = self.RECORD_SCHEMA.fields_dict["queries"].default


     @property
@@ -5612,6 +5615,16 @@ class OperationClass(_Aspect):
         self._inner_dict['lastUpdatedTimestamp'] = value


+    @property
+    def queries(self) -> Union[None, List[str]]:
+        """Which queries were used in this operation."""
+        return self._inner_dict.get('queries')  # type: ignore
+
+    @queries.setter
+    def queries(self, value: Union[None, List[str]]) -> None:
+        self._inner_dict['queries'] = value
+
+
 class OperationSourceTypeClass(object):
     """The source of an operation"""

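
The generated Operation aspect gains an optional `queries` list of query URNs, mirrored in the Avro schemas below. A hedged construction sketch; the required fields and the OperationTypeClass constant follow the existing generated API, and the query URN is a placeholder:

import time

from datahub.metadata.schema_classes import OperationClass, OperationTypeClass

now_ms = int(time.time() * 1000)
operation = OperationClass(
    timestampMillis=now_ms,
    lastUpdatedTimestamp=now_ms,
    operationType=OperationTypeClass.INSERT,
    queries=["urn:li:query:example-query-id"],  # placeholder query urn
)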

datahub/metadata/schema.avsc
@@ -16424,6 +16424,23 @@
           "type": "long",
           "name": "lastUpdatedTimestamp",
           "doc": "The time at which the operation occurred. Would be better named 'operationTime'"
+        },
+        {
+          "TimeseriesFieldCollection": {
+            "key": "query"
+          },
+          "Urn": "Urn",
+          "urn_is_array": true,
+          "type": [
+            "null",
+            {
+              "type": "array",
+              "items": "string"
+            }
+          ],
+          "name": "queries",
+          "default": null,
+          "doc": "Which queries were used in this operation."
         }
       ],
       "doc": "Operational info for an entity."