acryl-datahub 1.2.0.10rc3__py3-none-any.whl → 1.2.0.10rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (82)
  1. {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/METADATA +2668 -2752
  2. {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/RECORD +82 -82
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/assertion/assertion.py +1 -1
  5. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  6. datahub/api/entities/dataproduct/dataproduct.py +6 -3
  7. datahub/api/entities/dataset/dataset.py +9 -18
  8. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  9. datahub/api/graphql/operation.py +10 -6
  10. datahub/cli/docker_check.py +2 -2
  11. datahub/configuration/common.py +29 -1
  12. datahub/configuration/connection_resolver.py +5 -2
  13. datahub/configuration/import_resolver.py +7 -4
  14. datahub/configuration/pydantic_migration_helpers.py +0 -9
  15. datahub/configuration/source_common.py +3 -2
  16. datahub/configuration/validate_field_deprecation.py +5 -2
  17. datahub/configuration/validate_field_removal.py +5 -2
  18. datahub/configuration/validate_field_rename.py +6 -5
  19. datahub/configuration/validate_multiline_string.py +5 -2
  20. datahub/ingestion/run/pipeline_config.py +2 -2
  21. datahub/ingestion/source/azure/azure_common.py +1 -1
  22. datahub/ingestion/source/bigquery_v2/bigquery_config.py +28 -14
  23. datahub/ingestion/source/bigquery_v2/queries_extractor.py +4 -5
  24. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  25. datahub/ingestion/source/data_lake_common/path_spec.py +16 -16
  26. datahub/ingestion/source/datahub/config.py +8 -9
  27. datahub/ingestion/source/delta_lake/config.py +1 -1
  28. datahub/ingestion/source/dremio/dremio_config.py +3 -4
  29. datahub/ingestion/source/feast.py +8 -10
  30. datahub/ingestion/source/fivetran/config.py +1 -1
  31. datahub/ingestion/source/ge_profiling_config.py +26 -22
  32. datahub/ingestion/source/grafana/grafana_config.py +2 -2
  33. datahub/ingestion/source/grafana/models.py +12 -14
  34. datahub/ingestion/source/hex/hex.py +6 -1
  35. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  36. datahub/ingestion/source/kafka_connect/common.py +2 -2
  37. datahub/ingestion/source/looker/looker_common.py +1 -1
  38. datahub/ingestion/source/looker/looker_config.py +15 -4
  39. datahub/ingestion/source/looker/lookml_config.py +1 -1
  40. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  41. datahub/ingestion/source/metadata/lineage.py +1 -1
  42. datahub/ingestion/source/mode.py +13 -5
  43. datahub/ingestion/source/nifi.py +1 -1
  44. datahub/ingestion/source/powerbi/config.py +14 -21
  45. datahub/ingestion/source/preset.py +1 -1
  46. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  47. datahub/ingestion/source/redshift/config.py +6 -3
  48. datahub/ingestion/source/salesforce.py +13 -9
  49. datahub/ingestion/source/schema/json_schema.py +14 -14
  50. datahub/ingestion/source/sigma/data_classes.py +3 -0
  51. datahub/ingestion/source/snowflake/snowflake_config.py +12 -15
  52. datahub/ingestion/source/snowflake/snowflake_connection.py +8 -3
  53. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +15 -2
  54. datahub/ingestion/source/snowflake/snowflake_queries.py +4 -5
  55. datahub/ingestion/source/sql/athena.py +2 -1
  56. datahub/ingestion/source/sql/clickhouse.py +12 -7
  57. datahub/ingestion/source/sql/cockroachdb.py +5 -3
  58. datahub/ingestion/source/sql/druid.py +2 -2
  59. datahub/ingestion/source/sql/hive.py +4 -3
  60. datahub/ingestion/source/sql/hive_metastore.py +7 -9
  61. datahub/ingestion/source/sql/mssql/source.py +2 -2
  62. datahub/ingestion/source/sql/mysql.py +2 -2
  63. datahub/ingestion/source/sql/oracle.py +3 -3
  64. datahub/ingestion/source/sql/presto.py +2 -1
  65. datahub/ingestion/source/sql/teradata.py +4 -4
  66. datahub/ingestion/source/sql/trino.py +2 -1
  67. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  68. datahub/ingestion/source/sql/vertica.py +1 -1
  69. datahub/ingestion/source/sql_queries.py +6 -6
  70. datahub/ingestion/source/state/checkpoint.py +5 -1
  71. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  72. datahub/ingestion/source/state/stateful_ingestion_base.py +5 -8
  73. datahub/ingestion/source/superset.py +1 -2
  74. datahub/ingestion/source/tableau/tableau.py +17 -3
  75. datahub/ingestion/source/unity/config.py +7 -3
  76. datahub/ingestion/source/usage/usage_common.py +3 -3
  77. datahub/ingestion/source_config/pulsar.py +3 -1
  78. datahub/sdk/search_filters.py +1 -7
  79. {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/WHEEL +0 -0
  80. {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/entry_points.txt +0 -0
  81. {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/licenses/LICENSE +0 -0
  82. {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/top_level.txt +0 -0
@@ -7,7 +7,16 @@ from dataclasses import dataclass
  from datetime import datetime, timezone
  from functools import lru_cache
  from json import JSONDecodeError
- from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple, Union
+ from typing import (
+     Dict,
+     Iterable,
+     Iterator,
+     List,
+     Optional,
+     Set,
+     Tuple,
+     Union,
+ )

  import dateutil.parser as dp
  import psutil
@@ -24,7 +33,7 @@ from requests.models import HTTPBasicAuth, HTTPError
  from tenacity import retry_if_exception_type, stop_after_attempt, wait_exponential

  import datahub.emitter.mce_builder as builder
- from datahub.configuration.common import AllowDenyPattern, ConfigModel
+ from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
  from datahub.configuration.source_common import (
      DatasetLineageProviderConfigBase,
  )
@@ -200,10 +209,9 @@ class ModeConfig(
          default=True, description="Tag measures and dimensions in the schema"
      )

-     items_per_page: int = Field(
-         default=DEFAULT_API_ITEMS_PER_PAGE,
+     items_per_page: HiddenFromDocs[int] = Field(
+         DEFAULT_API_ITEMS_PER_PAGE,
          description="Number of items per page for paginated API requests.",
-         hidden_from_docs=True,
      )

      @validator("connect_uri")
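
A recurring change in this release is replacing the `hidden_from_docs=True` keyword argument on `Field(...)` with a `HiddenFromDocs[...]` wrapper in the type annotation, imported from `datahub.configuration.common`. The wrapper's definition is not part of this diff; as a rough, assumed sketch, a marker like this can be built on `typing.Annotated` so that validation behaves exactly like the plain type while documentation tooling can detect the marker:

    # Sketch only: this HiddenFromDocs is a stand-in, not DataHub's real definition.
    from typing import Annotated

    from pydantic import BaseModel, Field


    class _HiddenFromDocsMarker:
        """Marker that docs generators can look for in a field's annotation metadata."""


    class HiddenFromDocs:
        def __class_getitem__(cls, item):
            # Annotated keeps runtime validation identical to the plain type;
            # the marker instance is just metadata attached to the field.
            return Annotated[item, _HiddenFromDocsMarker()]


    class ExampleConfig(BaseModel):
        items_per_page: HiddenFromDocs[int] = Field(
            100, description="Number of items per page for paginated API requests."
        )


    print(ExampleConfig().items_per_page)  # 100 - behaves like a normal int field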
@@ -166,7 +166,7 @@ class NifiSourceConfig(StatefulIngestionConfigBase, EnvConfigMixin):
      )

      @root_validator(skip_on_failure=True)
-     def validate_auth_params(cla, values):
+     def validate_auth_params(cls, values):
          if values.get("auth") is NifiAuthType.CLIENT_CERT and not values.get(
              "client_cert_file"
          ):
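
The one-line change above renames the validator's first parameter from `cla` to `cls`. Pydantic passes the model class itself as the first argument of a `root_validator`, so the old name worked but was misleading; `cls` is the conventional spelling. A minimal, standalone illustration (not the actual NiFi config):

    # Illustration only; NifiSourceConfig's real fields and auth types are not reproduced.
    from pydantic import BaseModel, root_validator


    class ExampleAuthConfig(BaseModel):
        auth: str = "NO_AUTH"
        client_cert_file: str = ""

        @root_validator(skip_on_failure=True)
        def validate_auth_params(cls, values):
            # `cls` is the model class (ExampleAuthConfig), supplied by pydantic.
            if values.get("auth") == "CLIENT_CERT" and not values.get("client_cert_file"):
                raise ValueError("client_cert_file is required when auth is CLIENT_CERT")
            return values


    ExampleAuthConfig(auth="NO_AUTH")  # passes validation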
@@ -4,11 +4,10 @@ from enum import Enum
  from typing import Dict, List, Literal, Optional, Union

  import pydantic
- from pydantic import validator
- from pydantic.class_validators import root_validator
+ from pydantic import root_validator, validator

  import datahub.emitter.mce_builder as builder
- from datahub.configuration.common import AllowDenyPattern, ConfigModel
+ from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
  from datahub.configuration.source_common import DatasetSourceConfigMixin, PlatformDetail
  from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
  from datahub.ingestion.api.incremental_lineage_helper import (
@@ -291,22 +290,18 @@ class PowerBiProfilingConfig(ConfigModel):
  class PowerBiDashboardSourceConfig(
      StatefulIngestionConfigBase, DatasetSourceConfigMixin, IncrementalLineageConfigMixin
  ):
-     platform_name: str = pydantic.Field(
-         default=Constant.PLATFORM_NAME, hidden_from_docs=True
-     )
+     platform_name: HiddenFromDocs[str] = pydantic.Field(default=Constant.PLATFORM_NAME)

-     platform_urn: str = pydantic.Field(
+     platform_urn: HiddenFromDocs[str] = pydantic.Field(
          default=builder.make_data_platform_urn(platform=Constant.PLATFORM_NAME),
-         hidden_from_docs=True,
      )

      # Organization Identifier
      tenant_id: str = pydantic.Field(description="PowerBI tenant identifier")
      # PowerBi workspace identifier
-     workspace_id: Optional[str] = pydantic.Field(
+     workspace_id: HiddenFromDocs[Optional[str]] = pydantic.Field(
          default=None,
          description="[deprecated] Use workspace_id_pattern instead",
-         hidden_from_docs=True,
      )
      # PowerBi workspace identifier
      workspace_id_pattern: AllowDenyPattern = pydantic.Field(
@@ -326,15 +321,14 @@ class PowerBiDashboardSourceConfig(
      # Dataset type mapping PowerBI support many type of data-sources. Here user needs to define what type of PowerBI
      # DataSource needs to be mapped to corresponding DataHub Platform DataSource. For example, PowerBI `Snowflake` is
      # mapped to DataHub `snowflake` PowerBI `PostgreSQL` is mapped to DataHub `postgres` and so on.
-     dataset_type_mapping: Union[Dict[str, str], Dict[str, PlatformDetail]] = (
-         pydantic.Field(
-             default_factory=default_for_dataset_type_mapping,
-             description="[deprecated] Use server_to_platform_instance instead. Mapping of PowerBI datasource type to "
-             "DataHub supported datasources."
-             "You can configured platform instance for dataset lineage. "
-             "See Quickstart Recipe for mapping",
-             hidden_from_docs=True,
-         )
+     dataset_type_mapping: HiddenFromDocs[
+         Union[Dict[str, str], Dict[str, PlatformDetail]]
+     ] = pydantic.Field(
+         default_factory=default_for_dataset_type_mapping,
+         description="[deprecated] Use server_to_platform_instance instead. Mapping of PowerBI datasource type to "
+         "DataHub supported datasources."
+         "You can configured platform instance for dataset lineage. "
+         "See Quickstart Recipe for mapping",
      )
      # PowerBI datasource's server to platform instance mapping
      server_to_platform_instance: Dict[
@@ -541,10 +535,9 @@ class PowerBiDashboardSourceConfig(
          "Increase this value if you encounter the 'M-Query Parsing Timeout' message in the connector report.",
      )

-     metadata_api_timeout: int = pydantic.Field(
+     metadata_api_timeout: HiddenFromDocs[int] = pydantic.Field(
          default=30,
          description="timeout in seconds for Metadata Rest Api.",
-         hidden_from_docs=True,
      )

      @root_validator(skip_on_failure=True)
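
In the PowerBI config, `dataset_type_mapping` keeps its `Union[Dict[str, str], Dict[str, PlatformDetail]]` shape; only the hidden-from-docs marker moves from a `Field` keyword into the annotation. A rough sketch of how such a union of mapping shapes validates, using a simplified stand-in for `PlatformDetail` (whose real fields are not shown in this diff):

    # PlatformDetailStub is a hypothetical simplification, not DataHub's PlatformDetail.
    from typing import Dict, Optional, Union

    from pydantic import BaseModel


    class PlatformDetailStub(BaseModel):
        platform_instance: Optional[str] = None
        env: str = "PROD"


    class MappingExample(BaseModel):
        dataset_type_mapping: Union[Dict[str, str], Dict[str, PlatformDetailStub]] = {}


    # A plain string mapping matches the first union member...
    MappingExample(dataset_type_mapping={"Snowflake": "snowflake"})
    # ...and nested objects match the second.
    MappingExample(
        dataset_type_mapping={"PostgreSQL": {"platform_instance": "pg1", "env": "PROD"}}
    )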
@@ -2,7 +2,7 @@ import logging
  from typing import Dict, Optional

  import requests
- from pydantic.class_validators import root_validator, validator
+ from pydantic import root_validator, validator
  from pydantic.fields import Field

  from datahub.emitter.mce_builder import DEFAULT_ENV
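
Several sources in this release switch from `pydantic.class_validators` imports to the top-level `pydantic` namespace, which exposes `validator` and `root_validator` on both the 1.x line and (as deprecated aliases) the 2.x line. A minimal sketch of a config model using those top-level imports:

    # Minimal sketch using the top-level pydantic imports adopted in this release.
    from pydantic import BaseModel, root_validator, validator


    class ConnectionExample(BaseModel):
        connect_uri: str = "http://localhost:8088"
        api_key: str = ""

        @validator("connect_uri")
        def remove_trailing_slash(cls, v):
            return v.rstrip("/")

        @root_validator(skip_on_failure=True)
        def check_auth(cls, values):
            # Illustrative rule only; the real sources apply their own checks.
            if not values.get("api_key"):
                raise ValueError("api_key must be set")
            return values


    ConnectionExample(connect_uri="http://localhost:8088/", api_key="abc")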
@@ -1,8 +1,9 @@
+ from copy import deepcopy
  from datetime import datetime
  from enum import Enum
  from typing import Dict, List, Optional, Type, Union

- from pydantic import BaseModel, Field, root_validator
+ from pydantic import BaseModel, ConfigDict, Field, root_validator

  from datahub.emitter.mcp_builder import ContainerKey
  from datahub.ingestion.source.qlik_sense.config import QLIK_DATETIME_FORMAT, Constant
@@ -78,7 +79,11 @@ PERSONAL_SPACE_DICT = {
  }


- class Space(BaseModel):
+ class _QlikBaseModel(BaseModel):
+     model_config = ConfigDict(coerce_numbers_to_str=True)
+
+
+ class Space(_QlikBaseModel):
      id: str
      name: str
      description: str
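
The new `_QlikBaseModel` turns on `coerce_numbers_to_str`, so numeric identifiers returned by the Qlik APIs can populate `str` fields instead of failing validation. Assuming Pydantic 2.1 or newer (where this ConfigDict option exists), the behavior looks like this:

    # Requires pydantic >= 2.1, where coerce_numbers_to_str is available.
    from pydantic import BaseModel, ConfigDict


    class LenientModel(BaseModel):
        model_config = ConfigDict(coerce_numbers_to_str=True)

        id: str
        name: str


    # An integer id from an API payload is coerced to "123" instead of raising
    # a string_type validation error.
    record = LenientModel(id=123, name="My space")
    print(record.id, type(record.id))  # 123 <class 'str'>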
@@ -89,6 +94,9 @@ class Space(BaseModel):

      @root_validator(pre=True)
      def update_values(cls, values: Dict) -> Dict:
+         # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+         values = deepcopy(values)
+
          values[Constant.CREATEDAT] = datetime.strptime(
              values[Constant.CREATEDAT], QLIK_DATETIME_FORMAT
          )
@@ -98,7 +106,7 @@ class Space(BaseModel):
          return values


- class Item(BaseModel):
+ class Item(_QlikBaseModel):
      id: str
      description: str = ""
      ownerId: str
@@ -107,7 +115,7 @@ class Item(BaseModel):
      updatedAt: datetime


- class SchemaField(BaseModel):
+ class SchemaField(_QlikBaseModel):
      name: str
      dataType: Optional[str] = None
      primaryKey: Optional[bool] = None
@@ -115,6 +123,8 @@ class SchemaField(BaseModel):

      @root_validator(pre=True)
      def update_values(cls, values: Dict) -> Dict:
+         # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+         values = deepcopy(values)
          values[Constant.DATATYPE] = values.get(Constant.DATATYPE, {}).get(Constant.TYPE)
          return values

@@ -130,6 +140,8 @@ class QlikDataset(Item):

      @root_validator(pre=True)
      def update_values(cls, values: Dict) -> Dict:
+         # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+         values = deepcopy(values)
          # Update str time to datetime
          values[Constant.CREATEDAT] = datetime.strptime(
              values[Constant.CREATEDTIME], QLIK_DATETIME_FORMAT
@@ -148,13 +160,13 @@ class QlikDataset(Item):
          return values


- class AxisProperty(BaseModel):
+ class AxisProperty(_QlikBaseModel):
      Title: str = Field(alias="qFallbackTitle")
      Min: str = Field(alias="qMin")
      Max: str = Field(alias="qMax")


- class Chart(BaseModel):
+ class Chart(_QlikBaseModel):
      qId: str
      visualization: str
      title: str
@@ -164,13 +176,15 @@ class Chart(BaseModel):

      @root_validator(pre=True)
      def update_values(cls, values: Dict) -> Dict:
+         # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+         values = deepcopy(values)
          values[Constant.QID] = values[Constant.QINFO][Constant.QID]
          values["qDimension"] = values[Constant.HYPERCUBE]["qDimensionInfo"]
          values["qMeasure"] = values[Constant.HYPERCUBE]["qMeasureInfo"]
          return values


- class Sheet(BaseModel):
+ class Sheet(_QlikBaseModel):
      id: str
      title: str
      description: str
@@ -181,6 +195,8 @@ class Sheet(BaseModel):

      @root_validator(pre=True)
      def update_values(cls, values: Dict) -> Dict:
+         # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+         values = deepcopy(values)
          values[Constant.CREATEDAT] = datetime.strptime(
              values[Constant.CREATEDDATE], QLIK_DATETIME_FORMAT
          )
@@ -190,7 +206,7 @@ class Sheet(BaseModel):
          return values


- class QlikTable(BaseModel):
+ class QlikTable(_QlikBaseModel):
      tableName: str
      type: BoxType = Field(alias="boxType")
      tableAlias: str
@@ -206,6 +222,8 @@ class QlikTable(BaseModel):

      @root_validator(pre=True)
      def update_values(cls, values: Dict) -> Dict:
+         # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+         values = deepcopy(values)
          values[Constant.DATACONNECTORID] = values[Constant.CONNECTIONINFO][Constant.ID]
          values[Constant.DATACONNECTORPLATFORM] = values[Constant.CONNECTIONINFO][
              Constant.SOURCECONNECTORID
@@ -223,6 +241,8 @@ class App(Item):

      @root_validator(pre=True)
      def update_values(cls, values: Dict) -> Dict:
+         # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+         values = deepcopy(values)
          values[Constant.CREATEDAT] = datetime.strptime(
              values[Constant.CREATEDDATE], QLIK_DATETIME_FORMAT
          )
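
Every `pre=True` root validator in these models now starts with `values = deepcopy(values)`. Depending on the pydantic version and on how a model is constructed, the dict handed to a pre-validator can be the caller's own object, so in-place key assignments leak back into the input payload; the copy makes the behavior predictable, which is what the added comments about test-state contamination refer to. A standalone sketch of the pattern:

    # Standalone sketch of the deepcopy-before-mutation pattern; not the Qlik models.
    from copy import deepcopy
    from typing import Dict

    from pydantic import BaseModel, root_validator


    class RenamingModel(BaseModel):
        name: str

        @root_validator(pre=True)
        def update_values(cls, values: Dict) -> Dict:
            # Work on a copy so the caller's dict is guaranteed to stay untouched,
            # no matter how pydantic passed it in.
            values = deepcopy(values)
            values["name"] = str(values["name"]).upper()
            return values


    payload = {"name": "sales"}
    print(RenamingModel(**payload).name)  # SALES
    print(payload)                        # {'name': 'sales'} - input left unchanged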
@@ -1,4 +1,5 @@
  import logging
+ from copy import deepcopy
  from enum import Enum
  from typing import Any, Dict, List, Optional

@@ -6,7 +7,7 @@ from pydantic import root_validator
  from pydantic.fields import Field

  from datahub.configuration import ConfigModel
- from datahub.configuration.common import AllowDenyPattern
+ from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
  from datahub.configuration.source_common import DatasetLineageProviderConfigBase
  from datahub.configuration.validate_field_removal import pydantic_removed_field
  from datahub.configuration.validate_field_rename import pydantic_renamed_field
@@ -95,10 +96,9 @@ class RedshiftConfig(
      # Because of this behavior, it uses dramatically fewer round trips for
      # large Redshift warehouses. As an example, see this query for the columns:
      # https://github.com/sqlalchemy-redshift/sqlalchemy-redshift/blob/60b4db04c1d26071c291aeea52f1dcb5dd8b0eb0/sqlalchemy_redshift/dialect.py#L745.
-     scheme: str = Field(
+     scheme: HiddenFromDocs[str] = Field(
          default="redshift+redshift_connector",
          description="",
-         hidden_from_docs=True,
      )

      _database_alias_removed = pydantic_removed_field("database_alias")
@@ -216,6 +216,9 @@ class RedshiftConfig(

      @root_validator(skip_on_failure=True)
      def connection_config_compatibility_set(cls, values: Dict) -> Dict:
+         # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+         values = deepcopy(values)
+
          if (
              ("options" in values and "connect_args" in values["options"])
              and "extra_client_options" in values
@@ -110,30 +110,33 @@ class SalesforceConfig(
      auth: SalesforceAuthType = SalesforceAuthType.USERNAME_PASSWORD

      # Username, Password Auth
-     username: Optional[str] = Field(description="Salesforce username")
-     password: Optional[str] = Field(description="Password for Salesforce user")
+     username: Optional[str] = Field(None, description="Salesforce username")
+     password: Optional[str] = Field(None, description="Password for Salesforce user")
      consumer_key: Optional[str] = Field(
-         description="Consumer key for Salesforce JSON web token access"
+         None, description="Consumer key for Salesforce JSON web token access"
      )
      private_key: Optional[str] = Field(
-         description="Private key as a string for Salesforce JSON web token access"
+         None, description="Private key as a string for Salesforce JSON web token access"
      )
      security_token: Optional[str] = Field(
-         description="Security token for Salesforce username"
+         None, description="Security token for Salesforce username"
      )
      # client_id, client_secret not required

      # Direct - Instance URL, Access Token Auth
      instance_url: Optional[str] = Field(
-         description="Salesforce instance url. e.g. https://MyDomainName.my.salesforce.com"
+         None,
+         description="Salesforce instance url. e.g. https://MyDomainName.my.salesforce.com",
      )
      # Flag to indicate whether the instance is production or sandbox
      is_sandbox: bool = Field(
          default=False, description="Connect to Sandbox instance of your Salesforce"
      )
-     access_token: Optional[str] = Field(description="Access token for instance url")
+     access_token: Optional[str] = Field(
+         None, description="Access token for instance url"
+     )

-     ingest_tags: Optional[bool] = Field(
+     ingest_tags: bool = Field(
          default=False,
          description="Ingest Tags from source. This will override Tags entered from UI",
      )
@@ -147,7 +150,8 @@ class SalesforceConfig(
          description='Regex patterns for tables/schemas to describe domain_key domain key (domain_key can be any string like "sales".) There can be multiple domain keys specified.',
      )
      api_version: Optional[str] = Field(
-         description="If specified, overrides default version used by the Salesforce package. Example value: '59.0'"
+         None,
+         description="If specified, overrides default version used by the Salesforce package. Example value: '59.0'",
      )

      profiling: SalesforceProfilingConfig = SalesforceProfilingConfig()
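
The Salesforce hunks give every `Optional[...]` credential field an explicit `None` default (passed positionally to `Field`). Pydantic 1.x treated an un-defaulted `Optional` field as implicitly defaulting to `None`, while 2.x treats it as required, so the explicit default keeps existing recipes that omit these fields working. A short sketch of the pattern:

    # Sketch: explicit None defaults keep Optional fields genuinely optional.
    from typing import Optional

    from pydantic import BaseModel, Field


    class AuthExample(BaseModel):
        username: Optional[str] = Field(None, description="Salesforce username")
        password: Optional[str] = Field(None, description="Password for Salesforce user")
        is_sandbox: bool = Field(default=False, description="Connect to a sandbox instance")


    # Validates even when the optional credentials are omitted.
    print(AuthExample(is_sandbox=True))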
@@ -4,7 +4,6 @@ import logging
  import os
  import tempfile
  import unittest
- import urllib.request
  from dataclasses import dataclass
  from os.path import basename, dirname
  from pathlib import Path
@@ -12,6 +11,7 @@ from typing import Any, Iterable, List, Optional, Union
  from urllib.parse import urlparse

  import jsonref
+ import requests
  from pydantic import AnyHttpUrl, DirectoryPath, FilePath, validator
  from pydantic.fields import Field

@@ -91,19 +91,18 @@ class JsonSchemaSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMix
      )

      @validator("path")
-     def download_http_url_to_temp_file(v):
+     def download_http_url_to_temp_file(cls, v):
          if isinstance(v, AnyHttpUrl):
              try:
-                 with urllib.request.urlopen(v) as response:
-                     schema_dict = json.load(response)
-                     if not JsonSchemaTranslator._get_id_from_any_schema(schema_dict):
-                         schema_dict["$id"] = str(v)
-                     with tempfile.NamedTemporaryFile(
-                         mode="w", delete=False
-                     ) as tmp_file:
-                         tmp_file.write(json.dumps(schema_dict))
-                         tmp_file.flush()
-                         return tmp_file.name
+                 response = requests.get(str(v))
+                 response.raise_for_status()
+                 schema_dict = response.json()
+                 if not JsonSchemaTranslator._get_id_from_any_schema(schema_dict):
+                     schema_dict["$id"] = str(v)
+                 with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file:
+                     tmp_file.write(json.dumps(schema_dict))
+                     tmp_file.flush()
+                     return tmp_file.name
              except Exception as e:
                  logger.error(
                      f"Failed to localize url {v} due to {e}. Run with --debug to get full stacktrace"
@@ -353,7 +352,7 @@ class JsonSchemaSource(StatefulIngestionSourceBase):
          if self.config.platform_instance:
              browse_prefix = f"/{self.config.env.lower()}/{self.config.platform}/{self.config.platform_instance}"

-         if os.path.isdir(self.config.path):
+         if isinstance(self.config.path, Path) and self.config.path.is_dir():
              for root, _, files in os.walk(self.config.path, topdown=False):
                  for file_name in [f for f in files if f.endswith(".json")]:
                      try:
@@ -373,10 +372,11 @@ class JsonSchemaSource(StatefulIngestionSourceBase):

          else:
              try:
+                 assert isinstance(self.config.path, Path)
                  yield from self._load_one_file(
                      ref_loader,
                      browse_prefix=browse_prefix,
-                     root_dir=Path(os.path.dirname(Path(self.config.path))),
+                     root_dir=self.config.path.parent,
                      file_name=str(self.config.path),
                  )
              except Exception as e:
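
The JSON schema source now fetches remote schemas with `requests` instead of `urllib`, and switches directory handling to `pathlib.Path` methods (`is_dir()`, `.parent`). A simplified, standalone sketch of the download-and-localize step (the placeholder URL and the bare `$id` check stand in for the real `JsonSchemaTranslator` helper):

    # Simplified sketch; the URL is a placeholder and the $id check is a stand-in
    # for JsonSchemaTranslator._get_id_from_any_schema.
    import json
    import tempfile

    import requests


    def download_json_schema_to_temp_file(url: str) -> str:
        response = requests.get(url)
        response.raise_for_status()  # surface HTTP errors instead of parsing an error page
        schema_dict = response.json()
        if "$id" not in schema_dict:
            schema_dict["$id"] = url  # keep a stable identifier for the localized copy
        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as tmp:
            json.dump(schema_dict, tmp)
            return tmp.name


    # local_path = download_json_schema_to_temp_file("https://example.com/schema.json")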
@@ -1,3 +1,4 @@
+ from copy import deepcopy
  from datetime import datetime
  from typing import Dict, List, Optional

@@ -23,6 +24,8 @@ class Workspace(BaseModel):

      @root_validator(pre=True)
      def update_values(cls, values: Dict) -> Dict:
+         # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+         values = deepcopy(values)
          # Update name if presonal workspace
          if values["name"] == "User Folder":
              values["name"] = "My documents"
@@ -7,7 +7,7 @@ from typing import Dict, List, Optional, Set
  import pydantic
  from pydantic import Field, root_validator, validator

- from datahub.configuration.common import AllowDenyPattern, ConfigModel
+ from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
  from datahub.configuration.pattern_utils import UUID_REGEX
  from datahub.configuration.source_common import (
      EnvConfigMixin,
@@ -67,13 +67,10 @@ class TagOption(StrEnum):

  @dataclass(frozen=True)
  class DatabaseId:
-     database: str = Field(
-         description="Database created from share in consumer account."
-     )
-     platform_instance: Optional[str] = Field(
-         default=None,
-         description="Platform instance of consumer snowflake account.",
-     )
+     # Database created from share in consumer account
+     database: str
+     # Platform instance of consumer snowflake account
+     platform_instance: Optional[str] = None


  class SnowflakeShareConfig(ConfigModel):
@@ -282,10 +279,11 @@ class SnowflakeV2Config(
          description="If enabled along with `extract_tags`, extracts snowflake's key-value tags as DataHub structured properties instead of DataHub tags.",
      )

-     structured_properties_template_cache_invalidation_interval: int = Field(
-         hidden_from_docs=True,
-         default=60,
-         description="Interval in seconds to invalidate the structured properties template cache.",
+     structured_properties_template_cache_invalidation_interval: HiddenFromDocs[int] = (
+         Field(
+             default=60,
+             description="Interval in seconds to invalidate the structured properties template cache.",
+         )
      )

      include_external_url: bool = Field(
@@ -334,7 +332,7 @@ class SnowflakeV2Config(
          "to ignore the temporary staging tables created by known ETL tools.",
      )

-     rename_upstreams_deny_pattern_to_temporary_table_pattern = pydantic_renamed_field(
+     rename_upstreams_deny_pattern_to_temporary_table_pattern = pydantic_renamed_field(  # type: ignore[pydantic-field]
          "upstreams_deny_pattern", "temporary_tables_pattern"
      )

@@ -352,8 +350,7 @@ class SnowflakeV2Config(
      )

      # Allows empty containers to be ingested before datasets are added, avoiding permission errors
-     warn_no_datasets: bool = Field(
-         hidden_from_docs=True,
+     warn_no_datasets: HiddenFromDocs[bool] = Field(
          default=False,
          description="If True, warns when no datasets are found during ingestion. If False, ingestion fails when no datasets are found.",
      )
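
`DatabaseId` is a plain `@dataclass(frozen=True)`, so its attributes are now declared as ordinary dataclass fields with comments; `pydantic.Field(...)` only carries meaning on pydantic models, and on a stdlib dataclass it would simply become the field's default value. A minimal sketch of a frozen dataclass in that cleaned-up shape (the name here is illustrative, not DataHub's):

    # ShareTarget is an illustrative name; it mirrors the cleaned-up DatabaseId shape.
    from dataclasses import dataclass
    from typing import Optional


    @dataclass(frozen=True)
    class ShareTarget:
        # Database created from share in consumer account
        database: str
        # Platform instance of consumer snowflake account
        platform_instance: Optional[str] = None


    # frozen=True makes instances hashable, so they work as dict keys / set members.
    targets = {ShareTarget("analytics", "aws-eu-1"), ShareTarget("analytics", "aws-eu-1")}
    print(len(targets))  # 1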
@@ -15,7 +15,12 @@ from snowflake.connector.network import (
      OAUTH_AUTHENTICATOR,
  )

- from datahub.configuration.common import ConfigModel, ConfigurationError, MetaError
+ from datahub.configuration.common import (
+     ConfigModel,
+     ConfigurationError,
+     HiddenFromDocs,
+     MetaError,
+ )
  from datahub.configuration.connection_resolver import auto_connection_resolver
  from datahub.configuration.validate_field_rename import pydantic_renamed_field
  from datahub.ingestion.api.closeable import Closeable
@@ -63,7 +68,7 @@ class SnowflakeConnectionConfig(ConfigModel):
          description="Any options specified here will be passed to [SQLAlchemy.create_engine](https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine) as kwargs.",
      )

-     scheme: str = "snowflake"
+     scheme: HiddenFromDocs[str] = "snowflake"
      username: Optional[str] = pydantic.Field(
          default=None, description="Snowflake username."
      )
@@ -118,7 +123,7 @@ class SnowflakeConnectionConfig(ConfigModel):
          assert self.account_id
          return self.account_id

-     rename_host_port_to_account_id = pydantic_renamed_field("host_port", "account_id")
+     rename_host_port_to_account_id = pydantic_renamed_field("host_port", "account_id")  # type: ignore[pydantic-field]

      @pydantic.validator("account_id")
      def validate_account_id(cls, account_id: str, values: Dict) -> str:
@@ -2,7 +2,17 @@ import json
  import logging
  from dataclasses import dataclass
  from datetime import datetime
- from typing import Any, Collection, Iterable, List, Optional, Set, Tuple, Type
+ from typing import (
+     TYPE_CHECKING,
+     Any,
+     Collection,
+     Iterable,
+     List,
+     Optional,
+     Set,
+     Tuple,
+     Type,
+ )

  from pydantic import BaseModel, Field, validator

@@ -44,6 +54,9 @@ from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
  from datahub.utilities.perf_timer import PerfTimer
  from datahub.utilities.time import ts_millis_to_datetime

+ if TYPE_CHECKING:
+     from pydantic.deprecated.class_validators import V1Validator
+
  logger: logging.Logger = logging.getLogger(__name__)

  EXTERNAL_LINEAGE = "external_lineage"
@@ -51,7 +64,7 @@ TABLE_LINEAGE = "table_lineage"
  VIEW_LINEAGE = "view_lineage"


- def pydantic_parse_json(field: str) -> classmethod:
+ def pydantic_parse_json(field: str) -> "V1Validator":
      def _parse_from_json(cls: Type, v: Any) -> dict:
          if isinstance(v, str):
              return json.loads(v)
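
`pydantic_parse_json` now annotates its return type with the typing-only `V1Validator` name, imported under `TYPE_CHECKING` so the deprecated pydantic module is never imported at runtime. A small sketch of the same pattern applied to a reusable validator factory:

    # Sketch of the TYPE_CHECKING pattern; the guarded import is only seen by type
    # checkers such as mypy and never executes at runtime.
    from typing import TYPE_CHECKING, Any, Type

    from pydantic import validator

    if TYPE_CHECKING:
        from pydantic.deprecated.class_validators import V1Validator


    def pydantic_lowercase(field: str) -> "V1Validator":
        """Reusable validator factory, shaped like pydantic_parse_json above."""

        def _lower(cls: Type, v: Any) -> Any:
            return v.lower() if isinstance(v, str) else v

        return validator(field, pre=True, allow_reuse=True)(_lower)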
@@ -13,7 +13,7 @@ from typing import Any, Dict, Iterable, List, Optional, Union
  import pydantic
  from typing_extensions import Self

- from datahub.configuration.common import AllowDenyPattern, ConfigModel
+ from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
  from datahub.configuration.time_window_config import (
      BaseTimeWindowConfig,
      BucketDuration,
@@ -112,12 +112,11 @@ class SnowflakeQueriesExtractorConfig(ConfigModel):
          "to ignore the temporary staging tables created by known ETL tools.",
      )

-     local_temp_path: Optional[pathlib.Path] = pydantic.Field(
-         default=None,
-         description="Local path to store the audit log.",
+     local_temp_path: HiddenFromDocs[Optional[pathlib.Path]] = pydantic.Field(
          # TODO: For now, this is simply an advanced config to make local testing easier.
          # Eventually, we will want to store date-specific files in the directory and use it as a cache.
-         hidden_from_docs=True,
+         default=None,
+         description="Local path to store the audit log.",
      )

      include_lineage: bool = True
@@ -16,6 +16,7 @@ from sqlalchemy.engine.reflection import Inspector
  from sqlalchemy.types import TypeEngine
  from sqlalchemy_bigquery import STRUCT

+ from datahub.configuration.common import HiddenFromDocs
  from datahub.configuration.validate_field_rename import pydantic_renamed_field
  from datahub.emitter.mcp_builder import ContainerKey, DatabaseKey
  from datahub.ingestion.api.decorators import (
@@ -251,7 +252,7 @@ class CustomAthenaRestDialect(AthenaRestDialect):


  class AthenaConfig(SQLCommonConfig):
-     scheme: str = "awsathena+rest"
+     scheme: HiddenFromDocs[str] = "awsathena+rest"
      username: Optional[str] = pydantic.Field(
          default=None,
          description="Username credential. If not specified, detected with boto3 rules. See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html",
@@ -18,6 +18,7 @@ from sqlalchemy.sql import sqltypes
  from sqlalchemy.types import BOOLEAN, DATE, DATETIME, INTEGER

  import datahub.emitter.mce_builder as builder
+ from datahub.configuration.common import HiddenFromDocs, LaxStr
  from datahub.configuration.source_common import DatasetLineageProviderConfigBase
  from datahub.configuration.time_window_config import BaseTimeWindowConfig
  from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
@@ -128,16 +129,20 @@ class ClickHouseConfig(
  ):
      # defaults
      host_port: str = Field(default="localhost:8123", description="ClickHouse host URL.")
-     scheme: str = Field(default="clickhouse", description="", hidden_from_docs=True)
+     scheme: HiddenFromDocs[str] = Field(default="clickhouse")
      password: pydantic.SecretStr = Field(
          default=pydantic.SecretStr(""), description="password"
      )
-     secure: Optional[bool] = Field(default=None, description="")
-     protocol: Optional[str] = Field(default=None, description="")
+     secure: Optional[bool] = Field(
+         default=None, description="[deprecated] Use uri_opts instead."
+     )
+     protocol: Optional[str] = Field(
+         default=None, description="[deprecated] Use uri_opts instead."
+     )
      _deprecate_secure = pydantic_field_deprecated("secure")
      _deprecate_protocol = pydantic_field_deprecated("protocol")

-     uri_opts: Dict[str, str] = Field(
+     uri_opts: Dict[str, LaxStr] = Field(
          default={},
          description="The part of the URI and it's used to provide additional configuration options or parameters for the database connection.",
      )
@@ -185,9 +190,9 @@ class ClickHouseConfig(
                  "Initializing uri_opts from deprecated secure or protocol options"
              )
              values["uri_opts"] = {}
-             if secure:
-                 values["uri_opts"]["secure"] = secure
-             if protocol:
+             if secure is not None:
+                 values["uri_opts"]["secure"] = str(secure)
+             if protocol is not None:
                  values["uri_opts"]["protocol"] = protocol
              logger.debug(f"uri_opts: {uri_opts}")
          elif (secure or protocol) and uri_opts:
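
`uri_opts` becomes `Dict[str, LaxStr]`, and the deprecated `secure`/`protocol` values are explicitly converted with `str(...)` before being copied into it. `LaxStr` is not defined in this diff; assuming it is a string type that tolerates non-string recipe values, a Pydantic v2 sketch using `Annotated` with a `BeforeValidator` could look like this:

    # Sketch only: LaxStrSketch is an assumption about what a "lax" string type does,
    # not DataHub's actual LaxStr definition. The option values are illustrative.
    from typing import Annotated, Dict

    from pydantic import BaseModel, BeforeValidator

    LaxStrSketch = Annotated[
        str, BeforeValidator(lambda v: v if isinstance(v, str) else str(v))
    ]


    class UriOptsExample(BaseModel):
        uri_opts: Dict[str, LaxStrSketch] = {}


    # Booleans and numbers coming from a YAML recipe are coerced instead of rejected.
    print(UriOptsExample(uri_opts={"secure": True, "send_receive_timeout": 300}).uri_opts)
    # {'secure': 'True', 'send_receive_timeout': '300'}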