acryl-datahub 1.2.0.10rc3__py3-none-any.whl → 1.2.0.10rc4__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/METADATA +2668 -2752
- {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/RECORD +82 -82
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +6 -3
- datahub/api/entities/dataset/dataset.py +9 -18
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/docker_check.py +2 -2
- datahub/configuration/common.py +29 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/pydantic_migration_helpers.py +0 -9
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +5 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/source/azure/azure_common.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +28 -14
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +4 -5
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/data_lake_common/path_spec.py +16 -16
- datahub/ingestion/source/datahub/config.py +8 -9
- datahub/ingestion/source/delta_lake/config.py +1 -1
- datahub/ingestion/source/dremio/dremio_config.py +3 -4
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/fivetran/config.py +1 -1
- datahub/ingestion/source/ge_profiling_config.py +26 -22
- datahub/ingestion/source/grafana/grafana_config.py +2 -2
- datahub/ingestion/source/grafana/models.py +12 -14
- datahub/ingestion/source/hex/hex.py +6 -1
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/looker/looker_common.py +1 -1
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/lookml_config.py +1 -1
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +1 -1
- datahub/ingestion/source/mode.py +13 -5
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +14 -21
- datahub/ingestion/source/preset.py +1 -1
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/redshift/config.py +6 -3
- datahub/ingestion/source/salesforce.py +13 -9
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +12 -15
- datahub/ingestion/source/snowflake/snowflake_connection.py +8 -3
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +15 -2
- datahub/ingestion/source/snowflake/snowflake_queries.py +4 -5
- datahub/ingestion/source/sql/athena.py +2 -1
- datahub/ingestion/source/sql/clickhouse.py +12 -7
- datahub/ingestion/source/sql/cockroachdb.py +5 -3
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +7 -9
- datahub/ingestion/source/sql/mssql/source.py +2 -2
- datahub/ingestion/source/sql/mysql.py +2 -2
- datahub/ingestion/source/sql/oracle.py +3 -3
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/teradata.py +4 -4
- datahub/ingestion/source/sql/trino.py +2 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +1 -1
- datahub/ingestion/source/sql_queries.py +6 -6
- datahub/ingestion/source/state/checkpoint.py +5 -1
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +5 -8
- datahub/ingestion/source/superset.py +1 -2
- datahub/ingestion/source/tableau/tableau.py +17 -3
- datahub/ingestion/source/unity/config.py +7 -3
- datahub/ingestion/source/usage/usage_common.py +3 -3
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/sdk/search_filters.py +1 -7
- {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/mode.py
CHANGED

@@ -7,7 +7,16 @@ from dataclasses import dataclass
 from datetime import datetime, timezone
 from functools import lru_cache
 from json import JSONDecodeError
-from typing import
+from typing import (
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Set,
+    Tuple,
+    Union,
+)
 
 import dateutil.parser as dp
 import psutil
@@ -24,7 +33,7 @@ from requests.models import HTTPBasicAuth, HTTPError
 from tenacity import retry_if_exception_type, stop_after_attempt, wait_exponential
 
 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.source_common import (
     DatasetLineageProviderConfigBase,
 )
@@ -200,10 +209,9 @@ class ModeConfig(
         default=True, description="Tag measures and dimensions in the schema"
     )
 
-    items_per_page: int = Field(
-
+    items_per_page: HiddenFromDocs[int] = Field(
+        DEFAULT_API_ITEMS_PER_PAGE,
         description="Number of items per page for paginated API requests.",
-        hidden_from_docs=True,
     )
 
     @validator("connect_uri")
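
Note: the recurring change in this release replaces the per-field `hidden_from_docs=True` kwarg with a `HiddenFromDocs[...]` annotation imported from `datahub.configuration.common`. A minimal sketch of how such a marker can be built on `typing.Annotated` (illustrative only; this is not necessarily the implementation shipped in `datahub.configuration.common`, and `ExampleConfig` is a hypothetical model):

from typing import Annotated, TypeVar, get_args

from pydantic import BaseModel, Field

T = TypeVar("T")


class _HiddenFromDocs:
    """Assumed marker object carried in the annotation metadata."""


# HiddenFromDocs[int] resolves to Annotated[int, _HiddenFromDocs()]: the field
# behaves like a plain int at runtime, but a doc generator can spot the marker.
HiddenFromDocs = Annotated[T, _HiddenFromDocs()]


class ExampleConfig(BaseModel):
    items_per_page: HiddenFromDocs[int] = Field(
        100, description="Number of items per page for paginated API requests."
    )
    connect_uri: str = Field("https://example.invalid", description="Host URL.")


def hidden_fields(model: type) -> set:
    """Names of fields declared on `model` whose annotation carries the marker."""
    return {
        name
        for name, hint in model.__annotations__.items()
        if any(isinstance(meta, _HiddenFromDocs) for meta in get_args(hint)[1:])
    }


print(hidden_fields(ExampleConfig))  # {'items_per_page'}

Compared with the old kwarg, the annotation keeps the hidden flag in the type itself, so it composes with wrappers such as `HiddenFromDocs[Optional[str]]` without extra Field arguments.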
datahub/ingestion/source/nifi.py
CHANGED

@@ -166,7 +166,7 @@ class NifiSourceConfig(StatefulIngestionConfigBase, EnvConfigMixin):
     )
 
     @root_validator(skip_on_failure=True)
-    def validate_auth_params(
+    def validate_auth_params(cls, values):
         if values.get("auth") is NifiAuthType.CLIENT_CERT and not values.get(
             "client_cert_file"
         ):

datahub/ingestion/source/powerbi/config.py
CHANGED

@@ -4,11 +4,10 @@ from enum import Enum
 from typing import Dict, List, Literal, Optional, Union
 
 import pydantic
-from pydantic import validator
-from pydantic.class_validators import root_validator
+from pydantic import root_validator, validator
 
 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.source_common import DatasetSourceConfigMixin, PlatformDetail
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
 from datahub.ingestion.api.incremental_lineage_helper import (
@@ -291,22 +290,18 @@ class PowerBiProfilingConfig(ConfigModel):
 class PowerBiDashboardSourceConfig(
     StatefulIngestionConfigBase, DatasetSourceConfigMixin, IncrementalLineageConfigMixin
 ):
-    platform_name: str = pydantic.Field(
-        default=Constant.PLATFORM_NAME, hidden_from_docs=True
-    )
+    platform_name: HiddenFromDocs[str] = pydantic.Field(default=Constant.PLATFORM_NAME)
 
-    platform_urn: str = pydantic.Field(
+    platform_urn: HiddenFromDocs[str] = pydantic.Field(
         default=builder.make_data_platform_urn(platform=Constant.PLATFORM_NAME),
-        hidden_from_docs=True,
     )
 
     # Organization Identifier
     tenant_id: str = pydantic.Field(description="PowerBI tenant identifier")
     # PowerBi workspace identifier
-    workspace_id: Optional[str] = pydantic.Field(
+    workspace_id: HiddenFromDocs[Optional[str]] = pydantic.Field(
         default=None,
         description="[deprecated] Use workspace_id_pattern instead",
-        hidden_from_docs=True,
     )
     # PowerBi workspace identifier
     workspace_id_pattern: AllowDenyPattern = pydantic.Field(
@@ -326,15 +321,14 @@ class PowerBiDashboardSourceConfig(
     # Dataset type mapping PowerBI support many type of data-sources. Here user needs to define what type of PowerBI
     # DataSource needs to be mapped to corresponding DataHub Platform DataSource. For example, PowerBI `Snowflake` is
     # mapped to DataHub `snowflake` PowerBI `PostgreSQL` is mapped to DataHub `postgres` and so on.
-    dataset_type_mapping:
-
-
-
-
-
-
-
-    )
+    dataset_type_mapping: HiddenFromDocs[
+        Union[Dict[str, str], Dict[str, PlatformDetail]]
+    ] = pydantic.Field(
+        default_factory=default_for_dataset_type_mapping,
+        description="[deprecated] Use server_to_platform_instance instead. Mapping of PowerBI datasource type to "
+        "DataHub supported datasources."
+        "You can configured platform instance for dataset lineage. "
+        "See Quickstart Recipe for mapping",
     )
     # PowerBI datasource's server to platform instance mapping
     server_to_platform_instance: Dict[
@@ -541,10 +535,9 @@ class PowerBiDashboardSourceConfig(
         "Increase this value if you encounter the 'M-Query Parsing Timeout' message in the connector report.",
     )
 
-    metadata_api_timeout: int = pydantic.Field(
+    metadata_api_timeout: HiddenFromDocs[int] = pydantic.Field(
         default=30,
         description="timeout in seconds for Metadata Rest Api.",
-        hidden_from_docs=True,
    )
 
     @root_validator(skip_on_failure=True)

datahub/ingestion/source/preset.py
CHANGED

@@ -2,7 +2,7 @@ import logging
 from typing import Dict, Optional
 
 import requests
-from pydantic
+from pydantic import root_validator, validator
 from pydantic.fields import Field
 
 from datahub.emitter.mce_builder import DEFAULT_ENV

datahub/ingestion/source/qlik_sense/data_classes.py
CHANGED

@@ -1,8 +1,9 @@
+from copy import deepcopy
 from datetime import datetime
 from enum import Enum
 from typing import Dict, List, Optional, Type, Union
 
-from pydantic import BaseModel, Field, root_validator
+from pydantic import BaseModel, ConfigDict, Field, root_validator
 
 from datahub.emitter.mcp_builder import ContainerKey
 from datahub.ingestion.source.qlik_sense.config import QLIK_DATETIME_FORMAT, Constant
@@ -78,7 +79,11 @@ PERSONAL_SPACE_DICT = {
 }
 
 
-class
+class _QlikBaseModel(BaseModel):
+    model_config = ConfigDict(coerce_numbers_to_str=True)
+
+
+class Space(_QlikBaseModel):
     id: str
     name: str
     description: str
@@ -89,6 +94,9 @@ class Space(BaseModel):
 
     @root_validator(pre=True)
     def update_values(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
+
         values[Constant.CREATEDAT] = datetime.strptime(
             values[Constant.CREATEDAT], QLIK_DATETIME_FORMAT
         )
@@ -98,7 +106,7 @@ class Space(BaseModel):
         return values
 
 
-class Item(
+class Item(_QlikBaseModel):
     id: str
     description: str = ""
     ownerId: str
@@ -107,7 +115,7 @@ class Item(BaseModel):
     updatedAt: datetime
 
 
-class SchemaField(
+class SchemaField(_QlikBaseModel):
     name: str
     dataType: Optional[str] = None
     primaryKey: Optional[bool] = None
@@ -115,6 +123,8 @@ class SchemaField(BaseModel):
 
     @root_validator(pre=True)
     def update_values(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         values[Constant.DATATYPE] = values.get(Constant.DATATYPE, {}).get(Constant.TYPE)
         return values
 
@@ -130,6 +140,8 @@ class QlikDataset(Item):
 
     @root_validator(pre=True)
     def update_values(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         # Update str time to datetime
         values[Constant.CREATEDAT] = datetime.strptime(
             values[Constant.CREATEDTIME], QLIK_DATETIME_FORMAT
@@ -148,13 +160,13 @@ class QlikDataset(Item):
         return values
 
 
-class AxisProperty(
+class AxisProperty(_QlikBaseModel):
     Title: str = Field(alias="qFallbackTitle")
     Min: str = Field(alias="qMin")
     Max: str = Field(alias="qMax")
 
 
-class Chart(
+class Chart(_QlikBaseModel):
     qId: str
     visualization: str
     title: str
@@ -164,13 +176,15 @@ class Chart(BaseModel):
 
     @root_validator(pre=True)
     def update_values(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         values[Constant.QID] = values[Constant.QINFO][Constant.QID]
         values["qDimension"] = values[Constant.HYPERCUBE]["qDimensionInfo"]
         values["qMeasure"] = values[Constant.HYPERCUBE]["qMeasureInfo"]
         return values
 
 
-class Sheet(
+class Sheet(_QlikBaseModel):
     id: str
     title: str
     description: str
@@ -181,6 +195,8 @@ class Sheet(BaseModel):
 
     @root_validator(pre=True)
     def update_values(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         values[Constant.CREATEDAT] = datetime.strptime(
             values[Constant.CREATEDDATE], QLIK_DATETIME_FORMAT
         )
@@ -190,7 +206,7 @@ class Sheet(BaseModel):
         return values
 
 
-class QlikTable(
+class QlikTable(_QlikBaseModel):
     tableName: str
     type: BoxType = Field(alias="boxType")
     tableAlias: str
@@ -206,6 +222,8 @@ class QlikTable(BaseModel):
 
     @root_validator(pre=True)
     def update_values(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         values[Constant.DATACONNECTORID] = values[Constant.CONNECTIONINFO][Constant.ID]
         values[Constant.DATACONNECTORPLATFORM] = values[Constant.CONNECTIONINFO][
             Constant.SOURCECONNECTORID
@@ -223,6 +241,8 @@ class App(Item):
 
     @root_validator(pre=True)
     def update_values(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         values[Constant.CREATEDAT] = datetime.strptime(
             values[Constant.CREATEDDATE], QLIK_DATETIME_FORMAT
         )
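
Note: every `pre=True` root validator in the qlik_sense models now deep-copies the incoming payload before rewriting keys, so the caller's dictionary is never mutated in place (the source comments call this out as preventing state contamination in tests). A standalone sketch of the pattern, with an illustrative model and an assumed timestamp format rather than the real Qlik classes, assuming pydantic v2 semantics where the raw mapping is passed through to pre-validators:

from copy import deepcopy
from datetime import datetime
from typing import Dict

from pydantic import BaseModel, root_validator

# Assumed format string; the real value lives in qlik_sense/config.py.
QLIK_DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"


class Space(BaseModel):
    id: str
    createdAt: datetime

    @root_validator(pre=True)
    def update_values(cls, values: Dict) -> Dict:
        # Copy first: rewriting keys in place would otherwise leak parsed
        # values back into shared payloads and test fixtures.
        values = deepcopy(values)
        values["createdAt"] = datetime.strptime(
            values["createdAt"], QLIK_DATETIME_FORMAT
        )
        return values


raw = {"id": "space-1", "createdAt": "2024-05-01T10:30:00.000Z"}
Space.model_validate(raw)
assert isinstance(raw["createdAt"], str)  # the input payload is left untouched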

datahub/ingestion/source/redshift/config.py
CHANGED

@@ -1,4 +1,5 @@
 import logging
+from copy import deepcopy
 from enum import Enum
 from typing import Any, Dict, List, Optional
 
@@ -6,7 +7,7 @@ from pydantic import root_validator
 from pydantic.fields import Field
 
 from datahub.configuration import ConfigModel
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.source_common import DatasetLineageProviderConfigBase
 from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
@@ -95,10 +96,9 @@ class RedshiftConfig(
     # Because of this behavior, it uses dramatically fewer round trips for
     # large Redshift warehouses. As an example, see this query for the columns:
     # https://github.com/sqlalchemy-redshift/sqlalchemy-redshift/blob/60b4db04c1d26071c291aeea52f1dcb5dd8b0eb0/sqlalchemy_redshift/dialect.py#L745.
-    scheme: str = Field(
+    scheme: HiddenFromDocs[str] = Field(
         default="redshift+redshift_connector",
         description="",
-        hidden_from_docs=True,
     )
 
     _database_alias_removed = pydantic_removed_field("database_alias")
@@ -216,6 +216,9 @@ class RedshiftConfig(
 
     @root_validator(skip_on_failure=True)
     def connection_config_compatibility_set(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
+
         if (
             ("options" in values and "connect_args" in values["options"])
             and "extra_client_options" in values

datahub/ingestion/source/salesforce.py
CHANGED

@@ -110,30 +110,33 @@ class SalesforceConfig(
     auth: SalesforceAuthType = SalesforceAuthType.USERNAME_PASSWORD
 
     # Username, Password Auth
-    username: Optional[str] = Field(description="Salesforce username")
-    password: Optional[str] = Field(description="Password for Salesforce user")
+    username: Optional[str] = Field(None, description="Salesforce username")
+    password: Optional[str] = Field(None, description="Password for Salesforce user")
     consumer_key: Optional[str] = Field(
-        description="Consumer key for Salesforce JSON web token access"
+        None, description="Consumer key for Salesforce JSON web token access"
     )
     private_key: Optional[str] = Field(
-        description="Private key as a string for Salesforce JSON web token access"
+        None, description="Private key as a string for Salesforce JSON web token access"
     )
     security_token: Optional[str] = Field(
-        description="Security token for Salesforce username"
+        None, description="Security token for Salesforce username"
     )
     # client_id, client_secret not required
 
     # Direct - Instance URL, Access Token Auth
     instance_url: Optional[str] = Field(
-
+        None,
+        description="Salesforce instance url. e.g. https://MyDomainName.my.salesforce.com",
     )
     # Flag to indicate whether the instance is production or sandbox
     is_sandbox: bool = Field(
         default=False, description="Connect to Sandbox instance of your Salesforce"
     )
-    access_token: Optional[str] = Field(
+    access_token: Optional[str] = Field(
+        None, description="Access token for instance url"
+    )
 
-    ingest_tags:
+    ingest_tags: bool = Field(
         default=False,
         description="Ingest Tags from source. This will override Tags entered from UI",
     )
@@ -147,7 +150,8 @@ class SalesforceConfig(
         description='Regex patterns for tables/schemas to describe domain_key domain key (domain_key can be any string like "sales".) There can be multiple domain keys specified.',
     )
     api_version: Optional[str] = Field(
-
+        None,
+        description="If specified, overrides default version used by the Salesforce package. Example value: '59.0'",
     )
 
     profiling: SalesforceProfilingConfig = SalesforceProfilingConfig()
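
Note: the Salesforce config now passes an explicit `None` default to `Field(...)` for every optional credential. Under pydantic v2 an `Optional[...]` annotation no longer implies a default of `None`, so a field declared with only a description would become required. A minimal illustration of the difference (hypothetical model, not the Salesforce config itself):

from typing import Optional

from pydantic import BaseModel, Field, ValidationError


class Before(BaseModel):
    # No default: under pydantic v2 this field is required despite being Optional.
    username: Optional[str] = Field(description="Salesforce username")


class After(BaseModel):
    # Explicit None default: the field may be omitted.
    username: Optional[str] = Field(None, description="Salesforce username")


try:
    Before()
except ValidationError as e:
    print(e.errors()[0]["type"])  # the field is reported as missing

print(After().username)  # None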

datahub/ingestion/source/schema/json_schema.py
CHANGED

@@ -4,7 +4,6 @@ import logging
 import os
 import tempfile
 import unittest
-import urllib.request
 from dataclasses import dataclass
 from os.path import basename, dirname
 from pathlib import Path
@@ -12,6 +11,7 @@ from typing import Any, Iterable, List, Optional, Union
 from urllib.parse import urlparse
 
 import jsonref
+import requests
 from pydantic import AnyHttpUrl, DirectoryPath, FilePath, validator
 from pydantic.fields import Field
 
@@ -91,19 +91,18 @@ class JsonSchemaSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMix
     )
 
     @validator("path")
-    def download_http_url_to_temp_file(v):
+    def download_http_url_to_temp_file(cls, v):
         if isinstance(v, AnyHttpUrl):
             try:
-
-
-
-
-
-
-                )
-
-
-                return tmp_file.name
+                response = requests.get(str(v))
+                response.raise_for_status()
+                schema_dict = response.json()
+                if not JsonSchemaTranslator._get_id_from_any_schema(schema_dict):
+                    schema_dict["$id"] = str(v)
+                with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file:
+                    tmp_file.write(json.dumps(schema_dict))
+                    tmp_file.flush()
+                return tmp_file.name
             except Exception as e:
                 logger.error(
                     f"Failed to localize url {v} due to {e}. Run with --debug to get full stacktrace"
@@ -353,7 +352,7 @@ class JsonSchemaSource(StatefulIngestionSourceBase):
         if self.config.platform_instance:
             browse_prefix = f"/{self.config.env.lower()}/{self.config.platform}/{self.config.platform_instance}"
 
-        if
+        if isinstance(self.config.path, Path) and self.config.path.is_dir():
             for root, _, files in os.walk(self.config.path, topdown=False):
                 for file_name in [f for f in files if f.endswith(".json")]:
                     try:
@@ -373,10 +372,11 @@ class JsonSchemaSource(StatefulIngestionSourceBase):
 
         else:
            try:
+                assert isinstance(self.config.path, Path)
                 yield from self._load_one_file(
                     ref_loader,
                     browse_prefix=browse_prefix,
-                    root_dir=
+                    root_dir=self.config.path.parent,
                     file_name=str(self.config.path),
                 )
             except Exception as e:
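
Note: json_schema.py now fetches remote schemas with `requests` instead of `urllib.request`, injects an `$id` when the document lacks one, and writes the result to a named temporary file whose path then stands in for the URL. A standalone sketch of that flow (simplified: no DataHub imports, `$id` detection reduced to a plain dict lookup, and the timeout value is an assumption):

import json
import tempfile

import requests


def download_schema_to_temp_file(url: str) -> str:
    """Fetch a JSON schema and return a local file path that can stand in for the URL."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    schema_dict = response.json()
    # Remote schemas without an $id get the source URL, so $ref resolution
    # and dataset naming still have a stable identifier.
    if not schema_dict.get("$id"):
        schema_dict["$id"] = url
    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp_file:
        tmp_file.write(json.dumps(schema_dict))
        tmp_file.flush()
    return tmp_file.name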

datahub/ingestion/source/sigma/data_classes.py
CHANGED

@@ -1,3 +1,4 @@
+from copy import deepcopy
 from datetime import datetime
 from typing import Dict, List, Optional
 
@@ -23,6 +24,8 @@ class Workspace(BaseModel):
 
     @root_validator(pre=True)
     def update_values(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         # Update name if presonal workspace
         if values["name"] == "User Folder":
             values["name"] = "My documents"

datahub/ingestion/source/snowflake/snowflake_config.py
CHANGED

@@ -7,7 +7,7 @@ from typing import Dict, List, Optional, Set
 import pydantic
 from pydantic import Field, root_validator, validator
 
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.pattern_utils import UUID_REGEX
 from datahub.configuration.source_common import (
     EnvConfigMixin,
@@ -67,13 +67,10 @@ class TagOption(StrEnum):
 
 @dataclass(frozen=True)
 class DatabaseId:
-
-
-
-    platform_instance: Optional[str] =
-        default=None,
-        description="Platform instance of consumer snowflake account.",
-    )
+    # Database created from share in consumer account
+    database: str
+    # Platform instance of consumer snowflake account
+    platform_instance: Optional[str] = None
 
 
 class SnowflakeShareConfig(ConfigModel):
@@ -282,10 +279,11 @@ class SnowflakeV2Config(
         description="If enabled along with `extract_tags`, extracts snowflake's key-value tags as DataHub structured properties instead of DataHub tags.",
     )
 
-    structured_properties_template_cache_invalidation_interval: int =
-
-
-
+    structured_properties_template_cache_invalidation_interval: HiddenFromDocs[int] = (
+        Field(
+            default=60,
+            description="Interval in seconds to invalidate the structured properties template cache.",
+        )
     )
 
     include_external_url: bool = Field(
@@ -334,7 +332,7 @@ class SnowflakeV2Config(
         "to ignore the temporary staging tables created by known ETL tools.",
     )
 
-    rename_upstreams_deny_pattern_to_temporary_table_pattern = pydantic_renamed_field(
+    rename_upstreams_deny_pattern_to_temporary_table_pattern = pydantic_renamed_field(  # type: ignore[pydantic-field]
         "upstreams_deny_pattern", "temporary_tables_pattern"
     )
 
@@ -352,8 +350,7 @@ class SnowflakeV2Config(
     )
 
     # Allows empty containers to be ingested before datasets are added, avoiding permission errors
-    warn_no_datasets: bool = Field(
-        hidden_from_docs=True,
+    warn_no_datasets: HiddenFromDocs[bool] = Field(
         default=False,
         description="If True, warns when no datasets are found during ingestion. If False, ingestion fails when no datasets are found.",
     )

datahub/ingestion/source/snowflake/snowflake_connection.py
CHANGED

@@ -15,7 +15,12 @@ from snowflake.connector.network import (
     OAUTH_AUTHENTICATOR,
 )
 
-from datahub.configuration.common import
+from datahub.configuration.common import (
+    ConfigModel,
+    ConfigurationError,
+    HiddenFromDocs,
+    MetaError,
+)
 from datahub.configuration.connection_resolver import auto_connection_resolver
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.ingestion.api.closeable import Closeable
@@ -63,7 +68,7 @@ class SnowflakeConnectionConfig(ConfigModel):
         description="Any options specified here will be passed to [SQLAlchemy.create_engine](https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine) as kwargs.",
     )
 
-    scheme: str = "snowflake"
+    scheme: HiddenFromDocs[str] = "snowflake"
     username: Optional[str] = pydantic.Field(
         default=None, description="Snowflake username."
     )
@@ -118,7 +123,7 @@ class SnowflakeConnectionConfig(ConfigModel):
         assert self.account_id
         return self.account_id
 
-    rename_host_port_to_account_id = pydantic_renamed_field("host_port", "account_id")
+    rename_host_port_to_account_id = pydantic_renamed_field("host_port", "account_id")  # type: ignore[pydantic-field]
 
     @pydantic.validator("account_id")
     def validate_account_id(cls, account_id: str, values: Dict) -> str:

datahub/ingestion/source/snowflake/snowflake_lineage_v2.py
CHANGED

@@ -2,7 +2,17 @@ import json
 import logging
 from dataclasses import dataclass
 from datetime import datetime
-from typing import
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Collection,
+    Iterable,
+    List,
+    Optional,
+    Set,
+    Tuple,
+    Type,
+)
 
 from pydantic import BaseModel, Field, validator
 
@@ -44,6 +54,9 @@ from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.time import ts_millis_to_datetime
 
+if TYPE_CHECKING:
+    from pydantic.deprecated.class_validators import V1Validator
+
 logger: logging.Logger = logging.getLogger(__name__)
 
 EXTERNAL_LINEAGE = "external_lineage"
@@ -51,7 +64,7 @@ TABLE_LINEAGE = "table_lineage"
 VIEW_LINEAGE = "view_lineage"
 
 
-def pydantic_parse_json(field: str) ->
+def pydantic_parse_json(field: str) -> "V1Validator":
     def _parse_from_json(cls: Type, v: Any) -> dict:
         if isinstance(v, str):
             return json.loads(v)
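
Note: the `pydantic_parse_json` helper is now annotated with pydantic's `V1Validator` type, imported only under `TYPE_CHECKING` so there is no runtime dependency on the `pydantic.deprecated` module. A sketch of the same reuse pattern, a factory returning a pre-validator that parses JSON strings into Python values (`UpstreamRow` is an illustrative model, and the v1-compat `validator` API is assumed to be available):

import json
from typing import TYPE_CHECKING, Any, Type

from pydantic import BaseModel, validator

if TYPE_CHECKING:
    # Imported purely for type checking; avoids a runtime import of the module.
    from pydantic.deprecated.class_validators import V1Validator


def pydantic_parse_json(field: str) -> "V1Validator":
    def _parse_from_json(cls: Type, v: Any) -> Any:
        if isinstance(v, str):
            return json.loads(v)
        return v

    # pre=True so the JSON string is decoded before normal field validation.
    return validator(field, pre=True, allow_reuse=True)(_parse_from_json)


class UpstreamRow(BaseModel):
    upstream_tables: list

    parse_upstream_tables = pydantic_parse_json("upstream_tables")


row = UpstreamRow(upstream_tables='["db.schema.t1", "db.schema.t2"]')
print(row.upstream_tables)  # ['db.schema.t1', 'db.schema.t2']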

datahub/ingestion/source/snowflake/snowflake_queries.py
CHANGED

@@ -13,7 +13,7 @@ from typing import Any, Dict, Iterable, List, Optional, Union
 import pydantic
 from typing_extensions import Self
 
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.time_window_config import (
     BaseTimeWindowConfig,
     BucketDuration,
@@ -112,12 +112,11 @@ class SnowflakeQueriesExtractorConfig(ConfigModel):
         "to ignore the temporary staging tables created by known ETL tools.",
     )
 
-    local_temp_path: Optional[pathlib.Path] = pydantic.Field(
-        default=None,
-        description="Local path to store the audit log.",
+    local_temp_path: HiddenFromDocs[Optional[pathlib.Path]] = pydantic.Field(
         # TODO: For now, this is simply an advanced config to make local testing easier.
         # Eventually, we will want to store date-specific files in the directory and use it as a cache.
-
+        default=None,
+        description="Local path to store the audit log.",
     )
 
     include_lineage: bool = True

datahub/ingestion/source/sql/athena.py
CHANGED

@@ -16,6 +16,7 @@ from sqlalchemy.engine.reflection import Inspector
 from sqlalchemy.types import TypeEngine
 from sqlalchemy_bigquery import STRUCT
 
+from datahub.configuration.common import HiddenFromDocs
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.emitter.mcp_builder import ContainerKey, DatabaseKey
 from datahub.ingestion.api.decorators import (
@@ -251,7 +252,7 @@ class CustomAthenaRestDialect(AthenaRestDialect):
 
 
 class AthenaConfig(SQLCommonConfig):
-    scheme: str = "awsathena+rest"
+    scheme: HiddenFromDocs[str] = "awsathena+rest"
     username: Optional[str] = pydantic.Field(
         default=None,
         description="Username credential. If not specified, detected with boto3 rules. See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html",

datahub/ingestion/source/sql/clickhouse.py
CHANGED

@@ -18,6 +18,7 @@ from sqlalchemy.sql import sqltypes
 from sqlalchemy.types import BOOLEAN, DATE, DATETIME, INTEGER
 
 import datahub.emitter.mce_builder as builder
+from datahub.configuration.common import HiddenFromDocs, LaxStr
 from datahub.configuration.source_common import DatasetLineageProviderConfigBase
 from datahub.configuration.time_window_config import BaseTimeWindowConfig
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
@@ -128,16 +129,20 @@ class ClickHouseConfig(
 ):
     # defaults
     host_port: str = Field(default="localhost:8123", description="ClickHouse host URL.")
-    scheme: str = Field(default="clickhouse"
+    scheme: HiddenFromDocs[str] = Field(default="clickhouse")
     password: pydantic.SecretStr = Field(
         default=pydantic.SecretStr(""), description="password"
     )
-    secure: Optional[bool] = Field(
-
+    secure: Optional[bool] = Field(
+        default=None, description="[deprecated] Use uri_opts instead."
+    )
+    protocol: Optional[str] = Field(
+        default=None, description="[deprecated] Use uri_opts instead."
+    )
     _deprecate_secure = pydantic_field_deprecated("secure")
     _deprecate_protocol = pydantic_field_deprecated("protocol")
 
-    uri_opts: Dict[str,
+    uri_opts: Dict[str, LaxStr] = Field(
         default={},
         description="The part of the URI and it's used to provide additional configuration options or parameters for the database connection.",
     )
@@ -185,9 +190,9 @@ class ClickHouseConfig(
                 "Initializing uri_opts from deprecated secure or protocol options"
             )
             values["uri_opts"] = {}
-            if secure:
-                values["uri_opts"]["secure"] = secure
-            if protocol:
+            if secure is not None:
+                values["uri_opts"]["secure"] = str(secure)
+            if protocol is not None:
                 values["uri_opts"]["protocol"] = protocol
             logger.debug(f"uri_opts: {uri_opts}")
         elif (secure or protocol) and uri_opts: