acryl-datahub 1.0.0.3rc11__py3-none-any.whl → 1.0.0.4rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (37)
  1. {acryl_datahub-1.0.0.3rc11.dist-info → acryl_datahub-1.0.0.4rc1.dist-info}/METADATA +2545 -2548
  2. {acryl_datahub-1.0.0.3rc11.dist-info → acryl_datahub-1.0.0.4rc1.dist-info}/RECORD +37 -34
  3. {acryl_datahub-1.0.0.3rc11.dist-info → acryl_datahub-1.0.0.4rc1.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/emitter/request_helper.py +10 -5
  6. datahub/emitter/rest_emitter.py +183 -106
  7. datahub/ingestion/extractor/schema_util.py +17 -1
  8. datahub/ingestion/graph/client.py +17 -4
  9. datahub/ingestion/graph/links.py +53 -0
  10. datahub/ingestion/sink/datahub_rest.py +11 -10
  11. datahub/ingestion/source/bigquery_v2/bigquery_config.py +4 -62
  12. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +70 -0
  13. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -1
  14. datahub/ingestion/source/dynamodb/dynamodb.py +7 -4
  15. datahub/ingestion/source/fivetran/config.py +1 -1
  16. datahub/ingestion/source/ge_data_profiler.py +25 -0
  17. datahub/ingestion/source/snowflake/snowflake_config.py +1 -12
  18. datahub/ingestion/source/snowflake/snowflake_connection.py +5 -17
  19. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  20. datahub/ingestion/source/sql/athena.py +2 -1
  21. datahub/ingestion/source/sql/hive_metastore.py +5 -5
  22. datahub/ingestion/source/sql/mssql/source.py +1 -1
  23. datahub/ingestion/source/sql/sql_config.py +1 -34
  24. datahub/ingestion/source/sql/sqlalchemy_uri.py +36 -0
  25. datahub/ingestion/source/sql/two_tier_sql_source.py +1 -1
  26. datahub/ingestion/source/unity/config.py +2 -1
  27. datahub/metadata/_internal_schema_classes.py +503 -490
  28. datahub/metadata/_urns/urn_defs.py +1528 -1528
  29. datahub/metadata/schema.avsc +15431 -15414
  30. datahub/metadata/schemas/Operation.avsc +17 -0
  31. datahub/sdk/main_client.py +15 -0
  32. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  33. datahub/sql_parsing/sql_parsing_aggregator.py +3 -2
  34. datahub/utilities/server_config_util.py +37 -126
  35. {acryl_datahub-1.0.0.3rc11.dist-info → acryl_datahub-1.0.0.4rc1.dist-info}/entry_points.txt +0 -0
  36. {acryl_datahub-1.0.0.3rc11.dist-info → acryl_datahub-1.0.0.4rc1.dist-info}/licenses/LICENSE +0 -0
  37. {acryl_datahub-1.0.0.3rc11.dist-info → acryl_datahub-1.0.0.4rc1.dist-info}/top_level.txt +0 -0
File: datahub/ingestion/sink/datahub_rest.py
@@ -21,10 +21,9 @@ from datahub.emitter.mcp_builder import mcps_from_mce
 from datahub.emitter.rest_emitter import (
     BATCH_INGEST_MAX_PAYLOAD_LENGTH,
     DEFAULT_REST_EMITTER_ENDPOINT,
-    DEFAULT_REST_TRACE_MODE,
     DataHubRestEmitter,
+    EmitMode,
     RestSinkEndpoint,
-    RestTraceMode,
 )
 from datahub.ingestion.api.common import RecordEnvelope, WorkUnit
 from datahub.ingestion.api.sink import (
@@ -71,7 +70,6 @@ _DEFAULT_REST_SINK_MODE = pydantic.parse_obj_as(
 class DatahubRestSinkConfig(DatahubClientConfig):
     mode: RestSinkMode = _DEFAULT_REST_SINK_MODE
     endpoint: RestSinkEndpoint = DEFAULT_REST_EMITTER_ENDPOINT
-    default_trace_mode: RestTraceMode = DEFAULT_REST_TRACE_MODE

     # These only apply in async modes.
     max_threads: pydantic.PositiveInt = _DEFAULT_REST_SINK_MAX_THREADS
@@ -134,7 +132,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
         self._emitter_thread_local = threading.local()

         try:
-            gms_config = self.emitter.get_server_config()
+            gms_config = self.emitter.server_config
         except Exception as exc:
             raise ConfigurationError(
                 f"💥 Failed to connect to DataHub with {repr(self.emitter)}"
@@ -175,7 +173,6 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
             client_certificate_path=config.client_certificate_path,
             disable_ssl_verification=config.disable_ssl_verification,
             openapi_ingestion=config.endpoint == RestSinkEndpoint.OPENAPI,
-            default_trace_mode=config.default_trace_mode == RestTraceMode.ENABLED,
             client_mode=config.client_mode,
             datahub_component=config.datahub_component,
         )
@@ -252,9 +249,10 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
             MetadataChangeProposal,
             MetadataChangeProposalWrapper,
         ],
+        emit_mode: EmitMode,
     ) -> None:
         # TODO: Add timing metrics
-        self.emitter.emit(record)
+        self.emitter.emit(record, emit_mode=emit_mode)

     def _emit_batch_wrapper(
         self,
@@ -269,8 +267,10 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
         ],
     ) -> None:
         events: List[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]] = []
+
         for record in records:
             event = record[0]
+
             if isinstance(event, MetadataChangeEvent):
                 # Unpack MCEs into MCPs.
                 mcps = mcps_from_mce(event)
@@ -278,7 +278,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
             else:
                 events.append(event)

-        chunks = self.emitter.emit_mcps(events)
+        chunks = self.emitter.emit_mcps(events, emit_mode=EmitMode.ASYNC)
         self.report.async_batches_prepared += 1
         if chunks > 1:
             self.report.async_batches_split += chunks
@@ -309,6 +309,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
                 partition_key,
                 self._emit_wrapper,
                 record,
+                EmitMode.ASYNC,
                 done_callback=functools.partial(
                     self._write_done_callback, record_envelope, write_callback
                 ),
@@ -320,6 +321,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
             self.executor.submit(
                 partition_key,
                 record,
+                EmitMode.ASYNC,
                 done_callback=functools.partial(
                     self._write_done_callback, record_envelope, write_callback
                 ),
@@ -328,7 +330,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
         else:
             # execute synchronously
             try:
-                self._emit_wrapper(record)
+                self._emit_wrapper(record, emit_mode=EmitMode.SYNC_PRIMARY)
                 write_callback.on_success(record_envelope, success_metadata={})
             except Exception as e:
                 write_callback.on_failure(record_envelope, e, failure_metadata={})
@@ -340,8 +342,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
         ],
     ) -> None:
         return self.write_record_async(
-            RecordEnvelope(item, metadata={}),
-            NoopWriteCallback(),
+            RecordEnvelope(item, metadata={}), NoopWriteCallback()
        )

     def close(self):

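Taken together, the datahub_rest.py hunks replace the removed RestTraceMode/default_trace_mode knob with an explicit EmitMode argument that is threaded through every emit call. A minimal sketch of the resulting call shape, assuming a local GMS at http://localhost:8080 and a placeholder dataset URN (both illustrative, not values from this diff):

    # Illustrative sketch only: DataHubRestEmitter, EmitMode, and emit(..., emit_mode=...)
    # come from the hunks above; the server URL and URN below are placeholders.
    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.emitter.rest_emitter import DataHubRestEmitter, EmitMode
    from datahub.metadata.schema_classes import StatusClass

    emitter = DataHubRestEmitter("http://localhost:8080")  # assumed local endpoint

    mcp = MetadataChangeProposalWrapper(
        entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,example.table,PROD)",
        aspect=StatusClass(removed=False),
    )

    # The sink's async paths pass EmitMode.ASYNC; its synchronous fallback uses
    # EmitMode.SYNC_PRIMARY, mirroring the calls in the hunks above.
    emitter.emit(mcp, emit_mode=EmitMode.ASYNC)
    emitter.emit(mcp, emit_mode=EmitMode.SYNC_PRIMARY)
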
File: datahub/ingestion/source/bigquery_v2/bigquery_config.py
@@ -2,10 +2,8 @@ import logging
 import os
 import re
 from datetime import timedelta
-from typing import Any, Dict, List, Optional, Union
+from typing import Dict, List, Optional, Union

-from google.cloud import bigquery, datacatalog_v1, resourcemanager_v3
-from google.cloud.logging_v2.client import Client as GCPLoggingClient
 from pydantic import Field, PositiveInt, PrivateAttr, root_validator, validator

 from datahub.configuration.common import AllowDenyPattern, ConfigModel
@@ -18,7 +16,9 @@ from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.ingestion.glossary.classification_mixin import (
     ClassificationSourceConfigMixin,
 )
-from datahub.ingestion.source.common.gcp_credentials_config import GCPCredential
+from datahub.ingestion.source.bigquery_v2.bigquery_connection import (
+    BigQueryConnectionConfig,
+)
 from datahub.ingestion.source.data_lake_common.path_spec import PathSpec
 from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, SQLFilterConfig
 from datahub.ingestion.source.state.stateful_ingestion_base import (
@@ -105,64 +105,6 @@ class BigQueryUsageConfig(BaseUsageConfig):
     )


-class BigQueryConnectionConfig(ConfigModel):
-    credential: Optional[GCPCredential] = Field(
-        default=None, description="BigQuery credential informations"
-    )
-
-    _credentials_path: Optional[str] = PrivateAttr(None)
-
-    extra_client_options: Dict[str, Any] = Field(
-        default={},
-        description="Additional options to pass to google.cloud.logging_v2.client.Client.",
-    )
-
-    project_on_behalf: Optional[str] = Field(
-        default=None,
-        description="[Advanced] The BigQuery project in which queries are executed. Will be passed when creating a job. If not passed, falls back to the project associated with the service account.",
-    )
-
-    def __init__(self, **data: Any):
-        super().__init__(**data)
-
-        if self.credential:
-            self._credentials_path = self.credential.create_credential_temp_file()
-            logger.debug(
-                f"Creating temporary credential file at {self._credentials_path}"
-            )
-            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._credentials_path
-
-    def get_bigquery_client(self) -> bigquery.Client:
-        client_options = self.extra_client_options
-        return bigquery.Client(self.project_on_behalf, **client_options)
-
-    def get_projects_client(self) -> resourcemanager_v3.ProjectsClient:
-        return resourcemanager_v3.ProjectsClient()
-
-    def get_policy_tag_manager_client(self) -> datacatalog_v1.PolicyTagManagerClient:
-        return datacatalog_v1.PolicyTagManagerClient()
-
-    def make_gcp_logging_client(
-        self, project_id: Optional[str] = None
-    ) -> GCPLoggingClient:
-        # See https://github.com/googleapis/google-cloud-python/issues/2674 for
-        # why we disable gRPC here.
-        client_options = self.extra_client_options.copy()
-        client_options["_use_grpc"] = False
-        if project_id is not None:
-            return GCPLoggingClient(**client_options, project=project_id)
-        else:
-            return GCPLoggingClient(**client_options)
-
-    def get_sql_alchemy_url(self) -> str:
-        if self.project_on_behalf:
-            return f"bigquery://{self.project_on_behalf}"
-        # When project_id is not set, we will attempt to detect the project ID
-        # based on the credentials or environment variables.
-        # See https://github.com/mxmzdlv/pybigquery#authentication.
-        return "bigquery://"
-
-
 class GcsLineageProviderConfig(ConfigModel):
     """
     Any source that produces gcs lineage from/to Datasets should inherit this class.

File: datahub/ingestion/source/bigquery_v2/bigquery_connection.py (new file)
@@ -0,0 +1,70 @@
+import logging
+import os
+from typing import Any, Dict, Optional
+
+from google.cloud import bigquery, datacatalog_v1, resourcemanager_v3
+from google.cloud.logging_v2.client import Client as GCPLoggingClient
+from pydantic import Field, PrivateAttr
+
+from datahub.configuration.common import ConfigModel
+from datahub.ingestion.source.common.gcp_credentials_config import GCPCredential
+
+logger = logging.getLogger(__name__)
+
+
+class BigQueryConnectionConfig(ConfigModel):
+    credential: Optional[GCPCredential] = Field(
+        default=None, description="BigQuery credential informations"
+    )
+
+    _credentials_path: Optional[str] = PrivateAttr(None)
+
+    extra_client_options: Dict[str, Any] = Field(
+        default={},
+        description="Additional options to pass to google.cloud.logging_v2.client.Client.",
+    )
+
+    project_on_behalf: Optional[str] = Field(
+        default=None,
+        description="[Advanced] The BigQuery project in which queries are executed. Will be passed when creating a job. If not passed, falls back to the project associated with the service account.",
+    )
+
+    def __init__(self, **data: Any):
+        super().__init__(**data)
+
+        if self.credential:
+            self._credentials_path = self.credential.create_credential_temp_file()
+            logger.debug(
+                f"Creating temporary credential file at {self._credentials_path}"
+            )
+            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._credentials_path
+
+    def get_bigquery_client(self) -> bigquery.Client:
+        client_options = self.extra_client_options
+        return bigquery.Client(self.project_on_behalf, **client_options)
+
+    def get_projects_client(self) -> resourcemanager_v3.ProjectsClient:
+        return resourcemanager_v3.ProjectsClient()
+
+    def get_policy_tag_manager_client(self) -> datacatalog_v1.PolicyTagManagerClient:
+        return datacatalog_v1.PolicyTagManagerClient()
+
+    def make_gcp_logging_client(
+        self, project_id: Optional[str] = None
+    ) -> GCPLoggingClient:
+        # See https://github.com/googleapis/google-cloud-python/issues/2674 for
+        # why we disable gRPC here.
+        client_options = self.extra_client_options.copy()
+        client_options["_use_grpc"] = False
+        if project_id is not None:
+            return GCPLoggingClient(**client_options, project=project_id)
+        else:
+            return GCPLoggingClient(**client_options)
+
+    def get_sql_alchemy_url(self) -> str:
+        if self.project_on_behalf:
+            return f"bigquery://{self.project_on_behalf}"
+        # When project_id is not set, we will attempt to detect the project ID
+        # based on the credentials or environment variables.
+        # See https://github.com/mxmzdlv/pybigquery#authentication.
+        return "bigquery://"

File: datahub/ingestion/source/bigquery_v2/bigquery_queries.py
@@ -10,10 +10,12 @@ from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.source import Source, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.bigquery_v2.bigquery_config import (
-    BigQueryConnectionConfig,
     BigQueryFilterConfig,
     BigQueryIdentifierConfig,
 )
+from datahub.ingestion.source.bigquery_v2.bigquery_connection import (
+    BigQueryConnectionConfig,
+)
 from datahub.ingestion.source.bigquery_v2.bigquery_report import (
     BigQueryQueriesExtractorReport,
     BigQuerySchemaApiPerfReport,

File: datahub/ingestion/source/dynamodb/dynamodb.py
@@ -474,6 +474,7 @@ class DynamoDBSource(StatefulIngestionSourceBase):
        dataset_properties.customProperties["schema.downsampled"] = "True"
        dataset_properties.customProperties["schema.totalFields"] = f"{schema_size}"
        # append each schema field, schema will be sorted by count descending and delimited_name ascending and sliced to only include MAX_SCHEMA_SIZE items
+        primary_keys = []
        for schema_field in sorted(
            table_fields,
            key=lambda x: (
@@ -484,22 +485,23 @@ class DynamoDBSource(StatefulIngestionSourceBase):
            field_path = schema_field["delimited_name"]
            native_data_type = self.get_native_type(schema_field["type"], table_name)
            type = self.get_field_type(schema_field["type"], table_name)
-            description = None
            nullable = True
            if field_path in primary_key_dict:
-                description = (
+                # primary key should not be nullable
+                type_key = (
                    "Partition Key"
                    if primary_key_dict.get(field_path) == "HASH"
                    else "Sort Key"
                )
-                # primary key should not be nullable
+                dataset_properties.customProperties[type_key] = field_path
                nullable = False
+                primary_keys.append(field_path)

            field = SchemaField(
                fieldPath=field_path,
                nativeDataType=native_data_type,
                type=type,
-                description=description,
+                description=None,
                nullable=nullable,
                recursive=False,
            )
@@ -513,6 +515,7 @@ class DynamoDBSource(StatefulIngestionSourceBase):
            hash="",
            platformSchema=SchemalessClass(),
            fields=canonical_schema,
+            primaryKeys=primary_keys,
        )
        return schema_metadata

File: datahub/ingestion/source/fivetran/config.py
@@ -16,7 +16,7 @@ from datahub.configuration.source_common import DatasetSourceConfigMixin
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.emitter.mce_builder import DEFAULT_ENV
 from datahub.ingestion.api.report import Report
-from datahub.ingestion.source.bigquery_v2.bigquery_config import (
+from datahub.ingestion.source.bigquery_v2.bigquery_connection import (
     BigQueryConnectionConfig,
 )
 from datahub.ingestion.source.snowflake.snowflake_connection import (

File: datahub/ingestion/source/ge_data_profiler.py
@@ -5,6 +5,7 @@ import concurrent.futures
 import contextlib
 import dataclasses
 import functools
+import importlib.metadata
 import json
 import logging
 import re
@@ -84,6 +85,30 @@ if TYPE_CHECKING:
     from pyathena.cursor import Cursor

 assert MARKUPSAFE_PATCHED
+
+# We need to ensure that acryl-great-expectations is installed
+# and great-expectations is not installed.
+try:
+    acryl_gx_version = bool(importlib.metadata.distribution("acryl-great-expectations"))
+except importlib.metadata.PackageNotFoundError:
+    acryl_gx_version = False
+
+try:
+    original_gx_version = bool(importlib.metadata.distribution("great-expectations"))
+except importlib.metadata.PackageNotFoundError:
+    original_gx_version = False
+
+if acryl_gx_version and original_gx_version:
+    raise RuntimeError(
+        "acryl-great-expectations and great-expectations cannot both be installed because their files will conflict. "
+        "You will need to (1) uninstall great-expectations and (2) re-install acryl-great-expectations. "
+        "See https://github.com/pypa/pip/issues/4625."
+    )
+elif original_gx_version:
+    raise RuntimeError(
+        "We expect acryl-great-expectations to be installed, but great-expectations is installed instead."
+    )
+
 logger: logging.Logger = logging.getLogger(__name__)

 _original_get_column_median = SqlAlchemyDataset.get_column_median

File: datahub/ingestion/source/snowflake/snowflake_config.py
@@ -4,7 +4,7 @@ from dataclasses import dataclass
 from typing import Dict, List, Optional, Set

 import pydantic
-from pydantic import Field, SecretStr, root_validator, validator
+from pydantic import Field, root_validator, validator

 from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.configuration.pattern_utils import UUID_REGEX
@@ -385,17 +385,6 @@ class SnowflakeV2Config(

         return values

-    def get_sql_alchemy_url(
-        self,
-        database: Optional[str] = None,
-        username: Optional[str] = None,
-        password: Optional[SecretStr] = None,
-        role: Optional[str] = None,
-    ) -> str:
-        return SnowflakeConnectionConfig.get_sql_alchemy_url(
-            self, database=database, username=username, password=password, role=role
-        )
-
     @validator("shares")
     def validate_shares(
         cls, shares: Optional[Dict[str, SnowflakeShareConfig]], values: Dict

File: datahub/ingestion/source/snowflake/snowflake_connection.py
@@ -28,7 +28,7 @@ from datahub.ingestion.source.snowflake.oauth_config import (
     OAuthIdentityProvider,
 )
 from datahub.ingestion.source.snowflake.oauth_generator import OAuthTokenGenerator
-from datahub.ingestion.source.sql.sql_config import make_sqlalchemy_uri
+from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 from datahub.utilities.config_clean import (
     remove_protocol,
     remove_suffix,
@@ -193,23 +193,11 @@ class SnowflakeConnectionConfig(ConfigModel):
                "but should be set when using use_certificate false for oauth_config"
            )

-    def get_sql_alchemy_url(
-        self,
-        database: Optional[str] = None,
-        username: Optional[str] = None,
-        password: Optional[pydantic.SecretStr] = None,
-        role: Optional[str] = None,
-    ) -> str:
-        if username is None:
-            username = self.username
-        if password is None:
-            password = self.password
-        if role is None:
-            role = self.role
+    def get_sql_alchemy_url(self, database: Optional[str] = None) -> str:
         return make_sqlalchemy_uri(
             self.scheme,
-            username,
-            password.get_secret_value() if password else None,
+            self.username,
+            self.password.get_secret_value() if self.password else None,
             self.account_id,
             f'"{database}"' if database is not None else database,
             uri_opts={
@@ -218,7 +206,7 @@ class SnowflakeConnectionConfig(ConfigModel):
                for (key, value) in {
                    "authenticator": _VALID_AUTH_TYPES.get(self.authentication_type),
                    "warehouse": self.warehouse,
-                    "role": role,
+                    "role": self.role,
                    "application": _APPLICATION_NAME,
                }.items()
                if value

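With the username/password/role overrides removed, SnowflakeConnectionConfig.get_sql_alchemy_url now accepts only an optional database and always reads credentials and role from the config itself. A hedged sketch of the narrowed call, using placeholder account and credential values and only fields referenced in the hunks above:

    from datahub.ingestion.source.snowflake.snowflake_connection import (
        SnowflakeConnectionConfig,
    )

    conn = SnowflakeConnectionConfig(
        account_id="my_account",          # placeholder
        username="datahub_user",          # placeholder
        password="example-password",      # placeholder
        warehouse="COMPUTE_WH",
        role="datahub_role",
    )

    # Only the database can be supplied now; username, password, and role come
    # from the config itself (see the snowflake_profiler.py hunk below).
    url = conn.get_sql_alchemy_url(database="ANALYTICS")
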
File: datahub/ingestion/source/snowflake/snowflake_profiler.py
@@ -135,12 +135,7 @@ class SnowflakeProfiler(GenericProfiler, SnowflakeCommonMixin):
     ) -> "DatahubGEProfiler":
         assert db_name

-        url = self.config.get_sql_alchemy_url(
-            database=db_name,
-            username=self.config.username,
-            password=self.config.password,
-            role=self.config.role,
-        )
+        url = self.config.get_sql_alchemy_url(database=db_name)

         logger.debug(f"sql_alchemy_url={url}")

File: datahub/ingestion/source/sql/athena.py
@@ -35,13 +35,14 @@ from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     register_custom_type,
 )
-from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri
+from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
     gen_database_container,
     gen_database_key,
 )
+from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
 from datahub.metadata.schema_classes import MapTypeClass, RecordTypeClass
 from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column

File: datahub/ingestion/source/sql/hive_metastore.py
@@ -36,7 +36,6 @@ from datahub.ingestion.source.sql.sql_common import (
 from datahub.ingestion.source.sql.sql_config import (
     BasicSQLAlchemyConfig,
     SQLCommonConfig,
-    make_sqlalchemy_uri,
 )
 from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
@@ -46,6 +45,7 @@ from datahub.ingestion.source.sql.sql_utils import (
     gen_schema_key,
     get_domain_wu,
 )
+from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 from datahub.ingestion.source.state.stateful_ingestion_base import JobId
 from datahub.metadata.com.linkedin.pegasus2avro.common import StatusClass
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
@@ -67,10 +67,10 @@ TableKey = namedtuple("TableKey", ["schema", "table"])


 class HiveMetastoreConfigMode(StrEnum):
-    hive: str = "hive"
-    presto: str = "presto"
-    presto_on_hive: str = "presto-on-hive"
-    trino: str = "trino"
+    hive = "hive"
+    presto = "presto"
+    presto_on_hive = "presto-on-hive"
+    trino = "trino"


 @dataclass

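The HiveMetastoreConfigMode members drop their `: str` annotations but keep the same string values, so the StrEnum still behaves as plain strings. A small sanity-check sketch, assuming the class is importable from datahub.ingestion.source.sql.hive_metastore where this hunk defines it:

    # StrEnum members compare equal to the string values listed in the hunk above.
    from datahub.ingestion.source.sql.hive_metastore import HiveMetastoreConfigMode

    assert HiveMetastoreConfigMode.presto_on_hive == "presto-on-hive"
    assert HiveMetastoreConfigMode("trino") is HiveMetastoreConfigMode.trino
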
File: datahub/ingestion/source/sql/mssql/source.py
@@ -44,9 +44,9 @@ from datahub.ingestion.source.sql.sql_common import (
 )
 from datahub.ingestion.source.sql.sql_config import (
     BasicSQLAlchemyConfig,
-    make_sqlalchemy_uri,
 )
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
+from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 from datahub.ingestion.source.sql.stored_procedures.base import (
     generate_procedure_lineage,
 )

File: datahub/ingestion/source/sql/sql_config.py
@@ -4,7 +4,6 @@ from typing import Any, Dict, Optional

 import pydantic
 from pydantic import Field
-from sqlalchemy.engine import URL

 from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.configuration.source_common import (
@@ -20,6 +19,7 @@ from datahub.ingestion.glossary.classification_mixin import (
     ClassificationSourceConfigMixin,
 )
 from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
+from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StatefulStaleMetadataRemovalConfig,
 )
@@ -184,36 +184,3 @@ class SQLAlchemyConnectionConfig(ConfigModel):

 class BasicSQLAlchemyConfig(SQLAlchemyConnectionConfig, SQLCommonConfig):
     pass
-
-
-def make_sqlalchemy_uri(
-    scheme: str,
-    username: Optional[str],
-    password: Optional[str],
-    at: Optional[str],
-    db: Optional[str],
-    uri_opts: Optional[Dict[str, Any]] = None,
-) -> str:
-    host: Optional[str] = None
-    port: Optional[int] = None
-    if at:
-        try:
-            host, port_str = at.rsplit(":", 1)
-            port = int(port_str)
-        except ValueError:
-            host = at
-            port = None
-    if uri_opts:
-        uri_opts = {k: v for k, v in uri_opts.items() if v is not None}
-
-    return str(
-        URL.create(
-            drivername=scheme,
-            username=username,
-            password=password,
-            host=host,
-            port=port,
-            database=db,
-            query=uri_opts or {},
-        )
-    )

File: datahub/ingestion/source/sql/sqlalchemy_uri.py (new file)
@@ -0,0 +1,36 @@
+from typing import Any, Dict, Optional
+
+from sqlalchemy.engine import URL
+
+
+def make_sqlalchemy_uri(
+    scheme: str,
+    username: Optional[str],
+    password: Optional[str],
+    at: Optional[str],
+    db: Optional[str],
+    uri_opts: Optional[Dict[str, Any]] = None,
+) -> str:
+    host: Optional[str] = None
+    port: Optional[int] = None
+    if at:
+        try:
+            host, port_str = at.rsplit(":", 1)
+            port = int(port_str)
+        except ValueError:
+            host = at
+            port = None
+    if uri_opts:
+        uri_opts = {k: v for k, v in uri_opts.items() if v is not None}
+
+    return str(
+        URL.create(
+            drivername=scheme,
+            username=username,
+            password=password,
+            host=host,
+            port=port,
+            database=db,
+            query=uri_opts or {},
+        )
+    )

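make_sqlalchemy_uri is now a standalone helper in its own module, and the Snowflake, Athena, Hive Metastore, MSSQL, two-tier, and Unity sources switch their imports to it. A usage sketch with placeholder connection values, showing the host:port split and the filtering of None uri_opts:

    # Every connection value below is a placeholder, not a default shipped with the package.
    from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri

    uri = make_sqlalchemy_uri(
        scheme="postgresql+psycopg2",
        username="reader",
        password="example-password",
        at="db.internal:5432",  # split into host "db.internal" and port 5432
        db="warehouse",
        uri_opts={"sslmode": "require", "application_name": None},  # None values are dropped
    )
    # -> postgresql+psycopg2://reader:example-password@db.internal:5432/warehouse?sslmode=require
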
File: datahub/ingestion/source/sql/two_tier_sql_source.py
@@ -14,12 +14,12 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.sql.sql_common import SQLAlchemySource, logger
 from datahub.ingestion.source.sql.sql_config import (
     BasicSQLAlchemyConfig,
-    make_sqlalchemy_uri,
 )
 from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
     gen_database_key,
 )
+from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri


 class TwoTierSQLAlchemyConfig(BasicSQLAlchemyConfig):

File: datahub/ingestion/source/unity/config.py
@@ -17,7 +17,8 @@ from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.ingestion.source.ge_data_profiler import DATABRICKS
 from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
-from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri
+from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
+from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StatefulStaleMetadataRemovalConfig,
 )