acryl-datahub 1.2.0.10rc3__py3-none-any.whl → 1.2.0.10rc5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (94)
  1. {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc5.dist-info}/METADATA +2513 -2571
  2. {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc5.dist-info}/RECORD +94 -87
  3. {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc5.dist-info}/entry_points.txt +2 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  7. datahub/api/entities/dataproduct/dataproduct.py +6 -3
  8. datahub/api/entities/dataset/dataset.py +9 -18
  9. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  10. datahub/api/graphql/operation.py +10 -6
  11. datahub/cli/docker_check.py +2 -2
  12. datahub/configuration/common.py +29 -1
  13. datahub/configuration/connection_resolver.py +5 -2
  14. datahub/configuration/import_resolver.py +7 -4
  15. datahub/configuration/pydantic_migration_helpers.py +0 -9
  16. datahub/configuration/source_common.py +3 -2
  17. datahub/configuration/validate_field_deprecation.py +5 -2
  18. datahub/configuration/validate_field_removal.py +5 -2
  19. datahub/configuration/validate_field_rename.py +6 -5
  20. datahub/configuration/validate_multiline_string.py +5 -2
  21. datahub/ingestion/autogenerated/capability_summary.json +33 -1
  22. datahub/ingestion/run/pipeline_config.py +2 -2
  23. datahub/ingestion/source/azure/azure_common.py +1 -1
  24. datahub/ingestion/source/bigquery_v2/bigquery_config.py +28 -14
  25. datahub/ingestion/source/bigquery_v2/queries_extractor.py +4 -5
  26. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  27. datahub/ingestion/source/data_lake_common/path_spec.py +16 -16
  28. datahub/ingestion/source/datahub/config.py +8 -9
  29. datahub/ingestion/source/delta_lake/config.py +1 -1
  30. datahub/ingestion/source/dremio/dremio_config.py +3 -4
  31. datahub/ingestion/source/feast.py +8 -10
  32. datahub/ingestion/source/fivetran/config.py +1 -1
  33. datahub/ingestion/source/ge_profiling_config.py +26 -22
  34. datahub/ingestion/source/grafana/grafana_config.py +2 -2
  35. datahub/ingestion/source/grafana/models.py +12 -14
  36. datahub/ingestion/source/hex/hex.py +6 -1
  37. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  38. datahub/ingestion/source/kafka_connect/common.py +2 -2
  39. datahub/ingestion/source/looker/looker_common.py +1 -1
  40. datahub/ingestion/source/looker/looker_config.py +15 -4
  41. datahub/ingestion/source/looker/looker_source.py +52 -3
  42. datahub/ingestion/source/looker/lookml_config.py +1 -1
  43. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  44. datahub/ingestion/source/metadata/lineage.py +1 -1
  45. datahub/ingestion/source/mode.py +13 -5
  46. datahub/ingestion/source/nifi.py +1 -1
  47. datahub/ingestion/source/powerbi/config.py +14 -21
  48. datahub/ingestion/source/preset.py +1 -1
  49. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  50. datahub/ingestion/source/redshift/config.py +6 -3
  51. datahub/ingestion/source/salesforce.py +13 -9
  52. datahub/ingestion/source/schema/json_schema.py +14 -14
  53. datahub/ingestion/source/sigma/data_classes.py +3 -0
  54. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  55. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  56. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  57. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  58. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  59. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  60. datahub/ingestion/source/snowflake/snowflake_config.py +12 -15
  61. datahub/ingestion/source/snowflake/snowflake_connection.py +8 -3
  62. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +15 -2
  63. datahub/ingestion/source/snowflake/snowflake_queries.py +4 -5
  64. datahub/ingestion/source/sql/athena.py +2 -1
  65. datahub/ingestion/source/sql/clickhouse.py +12 -7
  66. datahub/ingestion/source/sql/cockroachdb.py +5 -3
  67. datahub/ingestion/source/sql/druid.py +2 -2
  68. datahub/ingestion/source/sql/hive.py +4 -3
  69. datahub/ingestion/source/sql/hive_metastore.py +7 -9
  70. datahub/ingestion/source/sql/mssql/source.py +2 -2
  71. datahub/ingestion/source/sql/mysql.py +2 -2
  72. datahub/ingestion/source/sql/oracle.py +3 -3
  73. datahub/ingestion/source/sql/presto.py +2 -1
  74. datahub/ingestion/source/sql/teradata.py +4 -4
  75. datahub/ingestion/source/sql/trino.py +2 -1
  76. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  77. datahub/ingestion/source/sql/vertica.py +1 -1
  78. datahub/ingestion/source/sql_queries.py +6 -6
  79. datahub/ingestion/source/state/checkpoint.py +5 -1
  80. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  81. datahub/ingestion/source/state/stateful_ingestion_base.py +5 -8
  82. datahub/ingestion/source/superset.py +1 -2
  83. datahub/ingestion/source/tableau/tableau.py +20 -6
  84. datahub/ingestion/source/unity/config.py +7 -3
  85. datahub/ingestion/source/usage/usage_common.py +3 -3
  86. datahub/ingestion/source_config/pulsar.py +3 -1
  87. datahub/ingestion/transformer/set_browse_path.py +112 -0
  88. datahub/sdk/_shared.py +126 -0
  89. datahub/sdk/chart.py +87 -30
  90. datahub/sdk/dashboard.py +79 -32
  91. datahub/sdk/search_filters.py +1 -7
  92. {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc5.dist-info}/WHEEL +0 -0
  93. {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc5.dist-info}/licenses/LICENSE +0 -0
  94. {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc5.dist-info}/top_level.txt +0 -0
datahub/api/entities/dataset/dataset.py

```diff
@@ -27,7 +27,7 @@ from typing_extensions import TypeAlias
 import datahub.metadata.schema_classes as models
 from datahub.api.entities.structuredproperties.structuredproperties import AllowedTypes
-from datahub.configuration.common import ConfigModel
+from datahub.configuration.common import ConfigModel, LaxStr
 from datahub.emitter.mce_builder import (
     make_data_platform_urn,
     make_dataset_urn,
@@ -143,7 +143,6 @@ class SchemaFieldSpecification(StrictModel):
     jsonPath: Union[None, str] = None
     nullable: bool = False
     description: Union[None, str] = None
-    doc: Union[None, str] = None  # doc is an alias for description
     label: Optional[str] = None
     created: Optional[dict] = None
     lastModified: Optional[dict] = None
@@ -221,14 +220,14 @@ class SchemaFieldSpecification(StrictModel):
         return v

     @root_validator(pre=True)
-    def sync_description_and_doc(cls, values: Dict) -> Dict:
-        """Synchronize doc and description fields if one is provided but not the other."""
+    def sync_doc_into_description(cls, values: Dict) -> Dict:
+        """Synchronize doc into description field if doc is provided."""
         description = values.get("description")
-        doc = values.get("doc")
+        doc = values.pop("doc", None)

-        if description is not None and doc is None:
-            values["doc"] = description
-        elif doc is not None and description is None:
+        if doc is not None:
+            if description is not None:
+                raise ValueError("doc and description cannot both be provided")
             values["description"] = doc

         return values
@@ -296,10 +295,6 @@ class SchemaFieldSpecification(StrictModel):
         """Custom dict method for Pydantic v1 to handle YAML serialization properly."""
         exclude = kwargs.pop("exclude", None) or set()

-        # If description and doc are identical, exclude doc from the output
-        if self.description == self.doc and self.description is not None:
-            exclude.add("doc")
-
         # if nativeDataType and type are identical, exclude nativeDataType from the output
         if self.nativeDataType == self.type and self.nativeDataType is not None:
             exclude.add("nativeDataType")
@@ -327,10 +322,6 @@ class SchemaFieldSpecification(StrictModel):
         """Custom model_dump method for Pydantic v2 to handle YAML serialization properly."""
         exclude = kwargs.pop("exclude", None) or set()

-        # If description and doc are identical, exclude doc from the output
-        if self.description == self.doc and self.description is not None:
-            exclude.add("doc")
-
         # if nativeDataType and type are identical, exclude nativeDataType from the output
         if self.nativeDataType == self.type and self.nativeDataType is not None:
             exclude.add("nativeDataType")
@@ -387,7 +378,7 @@ class Dataset(StrictModel):
     name: Optional[str] = Field(None, validate_default=True)
     schema_metadata: Optional[SchemaSpecification] = Field(default=None, alias="schema")
     downstreams: Optional[List[str]] = None
-    properties: Optional[Dict[str, str]] = None
+    properties: Optional[Dict[str, LaxStr]] = None
     subtype: Optional[str] = None
     subtypes: Optional[List[str]] = None
     tags: Optional[List[str]] = None
@@ -605,7 +596,7 @@ class Dataset(StrictModel):
                 ],
                 platformSchema=OtherSchemaClass(
                     rawSchema=yaml.dump(
-                        self.schema_metadata.dict(
+                        self.schema_metadata.model_dump(
                             exclude_none=True, exclude_unset=True
                         )
                     )
```
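
In the dataset YAML spec, `doc` is no longer stored as a separate alias field: a pre root validator folds `doc` into `description` at parse time and rejects inputs that set both. A minimal sketch of the new behavior on a stand-in pydantic model (not the real SchemaFieldSpecification class):

```python
# Minimal sketch of the new doc/description handling; "FieldSpec" is a stand-in.
from typing import Dict, Optional

import pydantic


class FieldSpec(pydantic.BaseModel):
    id: str
    description: Optional[str] = None

    @pydantic.root_validator(pre=True)
    def sync_doc_into_description(cls, values: Dict) -> Dict:
        # "doc" is input-only sugar: it is folded into description
        # and never kept as a separate field.
        doc = values.pop("doc", None)
        if doc is not None:
            if values.get("description") is not None:
                raise ValueError("doc and description cannot both be provided")
            values["description"] = doc
        return values


print(FieldSpec(id="user_id", doc="Primary key").description)  # -> "Primary key"
# FieldSpec(id="user_id", doc="x", description="y") now raises a validation error.
```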
datahub/api/entities/structuredproperties/structuredproperties.py

```diff
@@ -1,7 +1,7 @@
 import logging
 from enum import Enum
 from pathlib import Path
-from typing import Iterable, List, Optional, Union
+from typing import Iterable, List, Optional, Type, Union

 import yaml
 from pydantic import Field, StrictStr, validator
@@ -48,7 +48,7 @@ VALID_ENTITY_TYPE_URNS = [
 _VALID_ENTITY_TYPES_STRING = f"Valid entity type urns are {', '.join(VALID_ENTITY_TYPE_URNS)}, etc... Ensure that the entity type is valid."


-def _validate_entity_type_urn(v: str) -> str:
+def _validate_entity_type_urn(cls: Type, v: str) -> str:
     urn = Urn.make_entity_type_urn(v)
     if urn not in VALID_ENTITY_TYPE_URNS:
         raise ValueError(
```
datahub/api/graphql/operation.py

```diff
@@ -1,7 +1,7 @@
 import logging
 from typing import Any, Dict, List, Optional

-from gql import gql
+from gql import GraphQLRequest

 from datahub.api.graphql.base import BaseApi

@@ -79,10 +79,12 @@ mutation reportOperation($urn: String!, $sourceType: OperationSourceType!, $oper
         if custom_properties is not None:
             variable_values["customProperties"] = custom_properties

-        result = self.client.execute(
-            gql(Operation.REPORT_OPERATION_MUTATION), variable_values=variable_values
+        request = GraphQLRequest(
+            Operation.REPORT_OPERATION_MUTATION, variable_values=variable_values
         )

+        result = self.client.execute(request)
+
         return result["reportOperation"]

     def query_operations(
@@ -109,12 +111,12 @@ mutation reportOperation($urn: String!, $sourceType: OperationSourceType!, $oper
         :param partition: The partition to check the operation.
         """

-        result = self.client.execute(
-            gql(Operation.QUERY_OPERATIONS),
+        request = GraphQLRequest(
+            Operation.QUERY_OPERATIONS,
             variable_values={
                 "urn": urn,
                 "startTimeMillis": start_time_millis,
-                "end_time_millis": end_time_millis,
+                "endTimeMillis": end_time_millis,
                 "limit": limit,
                 "filter": self.gen_filter(
                     {
@@ -125,6 +127,8 @@ mutation reportOperation($urn: String!, $sourceType: OperationSourceType!, $oper
                 ),
             },
         )
+
+        result = self.client.execute(request)
         if "dataset" in result and "operations" in result["dataset"]:
             operations = []
             if source_type is not None:
```
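
The operation helpers now build a `GraphQLRequest` and pass it to `client.execute()` instead of wrapping the query string with `gql(...)`, matching the newer gql client API; the diff also fixes the `endTimeMillis` variable name. A hedged sketch of the calling convention in isolation; the endpoint URL, token, and query below are placeholders, not values from this diff:

```python
# Sketch of the GraphQLRequest-based calling convention used above (gql >= 4 assumed).
from gql import Client, GraphQLRequest
from gql.transport.requests import RequestsHTTPTransport

transport = RequestsHTTPTransport(
    url="http://localhost:8080/api/graphql",      # assumed DataHub GraphQL endpoint
    headers={"Authorization": "Bearer <token>"},  # assumed access token
)
client = Client(transport=transport, fetch_schema_from_transport=False)

query = """
query dataset($urn: String!) {
  dataset(urn: $urn) { urn }
}
"""

# Old style: client.execute(gql(query), variable_values={...})
# New style: bundle the query and its variables into a GraphQLRequest.
request = GraphQLRequest(query, variable_values={"urn": "urn:li:dataset:(...)"})
result = client.execute(request)
print(result)
```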
datahub/cli/docker_check.py

```diff
@@ -13,8 +13,8 @@ import yaml
 from datahub.configuration.common import ExceptionWithProps

 # Docker seems to under-report memory allocated, so we also need a bit of buffer to account for it.
-MIN_MEMORY_NEEDED = 3.8  # GB
-MIN_DISK_SPACE_NEEDED = 12  # GB
+MIN_MEMORY_NEEDED = 4  # GB
+MIN_DISK_SPACE_NEEDED = 13  # GB

 DOCKER_COMPOSE_PROJECT_NAME = os.getenv("DATAHUB_COMPOSE_PROJECT_NAME", "datahub")
 DATAHUB_COMPOSE_PROJECT_FILTER = {
```
datahub/configuration/common.py

```diff
@@ -1,20 +1,25 @@
+import dataclasses
 import re
 import unittest.mock
 from abc import ABC, abstractmethod
 from enum import auto
 from typing import (
     IO,
+    TYPE_CHECKING,
+    Annotated,
     Any,
     ClassVar,
     Dict,
     List,
     Optional,
     Type,
+    TypeVar,
     Union,
     runtime_checkable,
 )

 import pydantic
+import pydantic_core
 from cached_property import cached_property
 from pydantic import BaseModel, Extra, ValidationError
 from pydantic.fields import Field
@@ -83,6 +88,29 @@ def redact_raw_config(obj: Any) -> Any:
     return obj


+if TYPE_CHECKING:
+    AnyType = TypeVar("AnyType")
+    HiddenFromDocs = Annotated[AnyType, ...]
+else:
+    HiddenFromDocs = pydantic.json_schema.SkipJsonSchema
+
+LaxStr = Annotated[str, pydantic.BeforeValidator(lambda v: str(v))]
+
+
+@dataclasses.dataclass(frozen=True)
+class SupportedSources:
+    sources: List[str]
+
+    def __get_pydantic_json_schema__(
+        self,
+        core_schema: pydantic_core.core_schema.CoreSchema,
+        handler: pydantic.GetJsonSchemaHandler,
+    ) -> pydantic.json_schema.JsonSchemaValue:
+        json_schema = handler(core_schema)
+        json_schema.setdefault("schema_extra", {})["supported_sources"] = self.sources
+        return json_schema
+
+
 class ConfigModel(BaseModel):
     class Config:
         @staticmethod
@@ -334,4 +362,4 @@ class KeyValuePattern(ConfigModel):


 class VersionedConfig(ConfigModel):
-    version: str = "1"
+    version: LaxStr = "1"
```
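
`LaxStr` is an `Annotated` str carrying a `BeforeValidator` that coerces its input with `str()`, so YAML values like `version: 1` or numeric custom properties no longer fail on str-typed fields (see `VersionedConfig.version` here and `Dataset.properties` above). A minimal sketch of the behavior, assuming pydantic v2 and stand-in model names:

```python
# Minimal sketch of what the LaxStr annotation does (pydantic v2 assumed).
from typing import Annotated, Dict, Optional

from pydantic import BaseModel, BeforeValidator

LaxStr = Annotated[str, BeforeValidator(lambda v: str(v))]


class VersionedThing(BaseModel):  # stand-in for VersionedConfig / Dataset
    version: LaxStr = "1"
    properties: Optional[Dict[str, LaxStr]] = None


v = VersionedThing(version=2, properties={"retention_days": 30})
print(v.version, v.properties)  # -> '2' {'retention_days': '30'}
# With a plain `str` annotation, pydantic v2 rejects ints instead of coercing them.
```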
datahub/configuration/connection_resolver.py

```diff
@@ -1,13 +1,16 @@
-from typing import Type
+from typing import TYPE_CHECKING, Type

 import pydantic

 from datahub.ingestion.api.global_context import get_graph_context

+if TYPE_CHECKING:
+    from pydantic.deprecated.class_validators import V1RootValidator
+

 def auto_connection_resolver(
     connection_field: str = "connection",
-) -> classmethod:
+) -> "V1RootValidator":
    def _resolve_connection(cls: Type, values: dict) -> dict:
        if connection_field in values:
            connection_urn = values.pop(connection_field)
```
datahub/configuration/import_resolver.py

```diff
@@ -1,15 +1,18 @@
-from typing import TypeVar, Union
+from typing import TYPE_CHECKING, Type, TypeVar, Union

 import pydantic

 from datahub.ingestion.api.registry import import_path

-T = TypeVar("T")
+if TYPE_CHECKING:
+    from pydantic.deprecated.class_validators import V1Validator

+_T = TypeVar("_T")

-def _pydantic_resolver(v: Union[T, str]) -> T:
+
+def _pydantic_resolver(cls: Type, v: Union[str, _T]) -> _T:
     return import_path(v) if isinstance(v, str) else v


-def pydantic_resolve_key(field: str) -> classmethod:
+def pydantic_resolve_key(field: str) -> "V1Validator":
     return pydantic.validator(field, pre=True, allow_reuse=True)(_pydantic_resolver)
```
datahub/configuration/pydantic_migration_helpers.py

```diff
@@ -9,14 +9,6 @@ PYDANTIC_VERSION_2 = _pydantic_version >= Version("2.0")
 # https://docs.pydantic.dev/latest/changelog/#v250-2023-11-13
 PYDANTIC_SUPPORTS_CALLABLE_DISCRIMINATOR = _pydantic_version >= Version("2.5.0")

-# This can be used to silence deprecation warnings while we migrate.
-if PYDANTIC_VERSION_2:
-    from pydantic import PydanticDeprecatedSince20  # type: ignore
-else:
-
-    class PydanticDeprecatedSince20(Warning):  # type: ignore
-        pass
-

 if PYDANTIC_VERSION_2:
     from pydantic import BaseModel as GenericModel
@@ -52,7 +44,6 @@ class v1_ConfigModel(v1_BaseModel):
 __all__ = [
     "PYDANTIC_VERSION_2",
     "PYDANTIC_SUPPORTS_CALLABLE_DISCRIMINATOR",
-    "PydanticDeprecatedSince20",
     "GenericModel",
     "v1_ConfigModel",
     "v1_Field",
```
datahub/configuration/source_common.py

```diff
@@ -1,6 +1,6 @@
 from typing import Dict, Optional

-from pydantic import validator
+import pydantic
 from pydantic.fields import Field

 from datahub.configuration.common import ConfigModel
@@ -30,7 +30,8 @@ class EnvConfigMixin(ConfigModel):
         description="The environment that all assets produced by this connector belong to",
     )

-    @validator("env")
+    @pydantic.field_validator("env", mode="after")
+    @classmethod
     def env_must_be_one_of(cls, v: str) -> str:
         if v.upper() not in ALL_ENV_TYPES:
             raise ValueError(f"env must be one of {ALL_ENV_TYPES}, found {v}")
```
datahub/configuration/validate_field_deprecation.py

```diff
@@ -1,11 +1,14 @@
 import warnings
-from typing import Any, Optional, Type
+from typing import TYPE_CHECKING, Any, Optional, Type

 import pydantic

 from datahub.configuration.common import ConfigurationWarning
 from datahub.utilities.global_warning_util import add_global_warning

+if TYPE_CHECKING:
+    from pydantic.deprecated.class_validators import V1RootValidator
+
 _unset = object()


@@ -13,7 +16,7 @@ def pydantic_field_deprecated(
     field: str,
     warn_if_value_is_not: Any = _unset,
     message: Optional[str] = None,
-) -> classmethod:
+) -> "V1RootValidator":
     if message:
         output = message
     else:
```
datahub/configuration/validate_field_removal.py

```diff
@@ -1,15 +1,18 @@
 import warnings
-from typing import Type
+from typing import TYPE_CHECKING, Type

 import pydantic

 from datahub.configuration.common import ConfigurationWarning

+if TYPE_CHECKING:
+    from pydantic.deprecated.class_validators import V1RootValidator
+

 def pydantic_removed_field(
     field: str,
     print_warning: bool = True,
-) -> classmethod:
+) -> "V1RootValidator":
     def _validate_field_removal(cls: Type, values: dict) -> dict:
         if field in values:
             if print_warning:
```
datahub/configuration/validate_field_rename.py

```diff
@@ -1,11 +1,14 @@
 import warnings
-from typing import Callable, Type, TypeVar
+from typing import TYPE_CHECKING, Callable, Type, TypeVar

 import pydantic

 from datahub.configuration.common import ConfigurationWarning
 from datahub.utilities.global_warning_util import add_global_warning

+if TYPE_CHECKING:
+    from pydantic.deprecated.class_validators import V1RootValidator
+
 _T = TypeVar("_T")


@@ -18,7 +21,7 @@ def pydantic_renamed_field(
     new_name: str,
     transform: Callable = _default_rename_transform,
     print_warning: bool = True,
-) -> classmethod:
+) -> "V1RootValidator":
     def _validate_field_rename(cls: Type, values: dict) -> dict:
         if old_name in values:
             if new_name in values:
@@ -49,6 +52,4 @@ def pydantic_renamed_field(
     # validator with pre=True gets all the values that were passed in.
     # Given that a renamed field doesn't show up in the fields list, we can't use
     # the field-level validator, even with a different field name.
-    return pydantic.root_validator(pre=True, skip_on_failure=True, allow_reuse=True)(
-        _validate_field_rename
-    )
+    return pydantic.root_validator(pre=True, allow_reuse=True)(_validate_field_rename)
```
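
These helpers (and the deprecation, removal, and multiline helpers around them) return a reusable pre root validator, which is why the inaccurate `classmethod` return annotation is replaced by pydantic's `V1RootValidator` or `V1Validator`, imported only under `TYPE_CHECKING`. A hedged sketch of the rename pattern they implement, on a toy model with illustrative names (the real helper is `pydantic_renamed_field` and also emits warnings):

```python
# Hedged sketch of the "renamed field" pre-root-validator pattern.
from typing import Dict

import pydantic


def renamed_field(old_name: str, new_name: str):
    def _rename(cls, values: Dict) -> Dict:
        if old_name in values:
            if new_name in values:
                raise ValueError(f"cannot specify both {old_name} and {new_name}")
            values[new_name] = values.pop(old_name)
        return values

    # pre=True so the validator sees the raw input, including keys that are
    # not declared fields (a renamed key never shows up in the fields list).
    return pydantic.root_validator(pre=True, allow_reuse=True)(_rename)


class MyConfig(pydantic.BaseModel):
    table_pattern: str = ".*"

    _rename_deprecated = renamed_field("table_regex", "table_pattern")


print(MyConfig(table_regex="foo.*").table_pattern)  # -> "foo.*"
```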
datahub/configuration/validate_multiline_string.py

```diff
@@ -1,9 +1,12 @@
-from typing import Optional, Type, Union
+from typing import TYPE_CHECKING, Optional, Type, Union

 import pydantic

+if TYPE_CHECKING:
+    from pydantic.deprecated.class_validators import V1Validator

-def pydantic_multiline_string(field: str) -> classmethod:
+
+def pydantic_multiline_string(field: str) -> "V1Validator":
     """If the field is present and contains an escaped newline, replace it with a real newline.

     This makes the assumption that the field value is never supposed to have a
```
datahub/ingestion/autogenerated/capability_summary.json

```diff
@@ -2968,6 +2968,38 @@
       "platform_name": "Slack",
       "support_status": "TESTING"
     },
+    "snaplogic": {
+      "capabilities": [
+        {
+          "capability": "LINEAGE_FINE",
+          "description": "Enabled by default",
+          "subtype_modifier": null,
+          "supported": true
+        },
+        {
+          "capability": "DELETION_DETECTION",
+          "description": "Not supported yet",
+          "subtype_modifier": null,
+          "supported": false
+        },
+        {
+          "capability": "PLATFORM_INSTANCE",
+          "description": "Snaplogic does not support platform instances",
+          "subtype_modifier": null,
+          "supported": false
+        },
+        {
+          "capability": "LINEAGE_COARSE",
+          "description": "Enabled by default",
+          "subtype_modifier": null,
+          "supported": true
+        }
+      ],
+      "classname": "datahub.ingestion.source.snaplogic.snaplogic.SnaplogicSource",
+      "platform_id": "snaplogic",
+      "platform_name": "Snaplogic",
+      "support_status": "TESTING"
+    },
     "snowflake": {
       "capabilities": [
         {
@@ -3617,4 +3649,4 @@
       "support_status": "CERTIFIED"
     }
   }
-}
+}
```
datahub/ingestion/run/pipeline_config.py

```diff
@@ -6,7 +6,7 @@ from typing import Any, Dict, List, Optional

 from pydantic import Field, validator

-from datahub.configuration.common import ConfigModel, DynamicTypedConfig
+from datahub.configuration.common import ConfigModel, DynamicTypedConfig, HiddenFromDocs
 from datahub.ingestion.graph.config import DatahubClientConfig
 from datahub.ingestion.sink.file import FileSinkConfig

@@ -85,7 +85,7 @@ class PipelineConfig(ConfigModel):
     source: SourceConfig
     sink: Optional[DynamicTypedConfig] = None
     transformers: Optional[List[DynamicTypedConfig]] = None
-    flags: FlagsConfig = Field(default=FlagsConfig(), hidden_from_docs=True)
+    flags: HiddenFromDocs[FlagsConfig] = FlagsConfig()
     reporting: List[ReporterConfig] = []
     run_id: str = DEFAULT_RUN_ID
     datahub_api: Optional[DatahubClientConfig] = None
```
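
Across this release, the custom `hidden_from_docs=True` / `hidden_from_schema=True` Field extras are replaced by the `HiddenFromDocs[...]` annotation, which the `configuration/common.py` hunk above aliases to pydantic v2's `SkipJsonSchema` at runtime: the field still exists and validates, it is simply dropped from the generated JSON schema that the config docs are built from. A hedged sketch on a toy model (names are illustrative):

```python
# Toy model showing the effect of the HiddenFromDocs / SkipJsonSchema annotation.
from pydantic import BaseModel
from pydantic.json_schema import SkipJsonSchema

HiddenFromDocs = SkipJsonSchema  # runtime alias, mirroring configuration/common.py


class PipelineLikeConfig(BaseModel):
    run_id: str = "default"
    flags: HiddenFromDocs[int] = 0  # still a real, validated field


cfg = PipelineLikeConfig(flags=5)
print(cfg.flags)  # -> 5 (the field works normally)
print(PipelineLikeConfig.model_json_schema()["properties"].keys())
# -> dict_keys(['run_id'])  ("flags" is omitted from the published schema)
```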
datahub/ingestion/source/azure/azure_common.py

```diff
@@ -81,7 +81,7 @@ class AzureConnectionConfig(ConfigModel):
         )
         return self.sas_token if self.sas_token is not None else self.account_key

-    @root_validator()
+    @root_validator(skip_on_failure=True)
     def _check_credential_values(cls, values: Dict) -> Dict:
         if (
             values.get("account_key")
```
datahub/ingestion/source/bigquery_v2/bigquery_config.py

```diff
@@ -1,12 +1,13 @@
 import logging
 import os
 import re
+from copy import deepcopy
 from datetime import timedelta
 from typing import Dict, List, Optional, Union

 from pydantic import Field, PositiveInt, PrivateAttr, root_validator, validator

-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     LowerCaseDatasetUrnConfigMixin,
@@ -73,8 +74,10 @@ class BigQueryBaseConfig(ConfigModel):
             ) from e
         return v

-    @root_validator(pre=True, skip_on_failure=True)
+    @root_validator(pre=True)
     def project_id_backward_compatibility_configs_set(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         project_id = values.pop("project_id", None)
         project_ids = values.get("project_ids")

@@ -182,13 +185,14 @@ class BigQueryFilterConfig(SQLFilterConfig):
     )

     # NOTE: `schema_pattern` is added here only to hide it from docs.
-    schema_pattern: AllowDenyPattern = Field(
+    schema_pattern: HiddenFromDocs[AllowDenyPattern] = Field(
         default=AllowDenyPattern.allow_all(),
-        hidden_from_docs=True,
     )

     @root_validator(pre=False, skip_on_failure=True)
     def backward_compatibility_configs_set(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         dataset_pattern: Optional[AllowDenyPattern] = values.get("dataset_pattern")
         schema_pattern = values.get("schema_pattern")
         if (
@@ -320,8 +324,7 @@ class BigQueryV2Config(
         description="Include full payload into events. It is only for debugging and internal use.",
     )

-    number_of_datasets_process_in_batch: int = Field(
-        hidden_from_docs=True,
+    number_of_datasets_process_in_batch: HiddenFromDocs[int] = Field(
         default=10000,
         description="Number of table queried in batch when getting metadata. This is a low level config property "
         "which should be touched with care.",
@@ -436,17 +439,15 @@ class BigQueryV2Config(

     upstream_lineage_in_report: bool = Field(
         default=False,
-        description="Useful for debugging lineage information. Set to True to see the raw lineage created internally.",
+        description="Useful for debugging lineage information. Set to True to see the raw lineage created internally. Only works with legacy approach (`use_queries_v2: False`).",
     )

-    run_optimized_column_query: bool = Field(
-        hidden_from_docs=True,
+    run_optimized_column_query: HiddenFromDocs[bool] = Field(
         default=False,
         description="Run optimized column query to get column information. This is an experimental feature and may not work for all cases.",
     )

-    file_backed_cache_size: int = Field(
-        hidden_from_docs=True,
+    file_backed_cache_size: HiddenFromDocs[int] = Field(
         default=2000,
         description="Maximum number of entries for the in-memory caches of FileBacked data structures.",
     )
@@ -456,10 +457,9 @@ class BigQueryV2Config(
         description="Option to exclude empty projects from being ingested.",
     )

-    schema_resolution_batch_size: int = Field(
+    schema_resolution_batch_size: HiddenFromDocs[int] = Field(
         default=100,
         description="The number of tables to process in a batch when resolving schema from DataHub.",
-        hidden_from_schema=True,
     )

     max_threads_dataset_parallelism: int = Field(
@@ -480,6 +480,8 @@ class BigQueryV2Config(

     @root_validator(pre=True)
     def set_include_schema_metadata(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         # Historically this is used to disable schema ingestion
         if (
             "include_tables" in values
@@ -498,6 +500,8 @@ class BigQueryV2Config(

     @root_validator(skip_on_failure=True)
     def profile_default_settings(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         # Extra default SQLAlchemy option for better connection pooling and threading.
         # https://docs.sqlalchemy.org/en/14/core/pooling.html#sqlalchemy.pool.QueuePool.params.max_overflow
         values["options"].setdefault("max_overflow", -1)
@@ -515,9 +519,19 @@ class BigQueryV2Config(

         return v

+    @validator("upstream_lineage_in_report")
+    def validate_upstream_lineage_in_report(cls, v: bool, values: Dict) -> bool:
+        if v and values.get("use_queries_v2", True):
+            logging.warning(
+                "`upstream_lineage_in_report` is enabled but will be ignored because `use_queries_v2` is enabled."
+                "This debugging feature only works with the legacy lineage approach (`use_queries_v2: false`)."
+            )
+
+        return v
+
     def get_table_pattern(self, pattern: List[str]) -> str:
         return "|".join(pattern) if pattern else ""

-    platform_instance_not_supported_for_bigquery = pydantic_removed_field(
+    _platform_instance_not_supported_for_bigquery = pydantic_removed_field(
         "platform_instance"
     )
```
8
8
  from google.cloud.bigquery import Client
9
9
  from pydantic import Field, PositiveInt
10
10
 
11
- from datahub.configuration.common import AllowDenyPattern
11
+ from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
12
12
  from datahub.configuration.time_window_config import (
13
13
  BaseTimeWindowConfig,
14
14
  get_time_bucket,
@@ -86,12 +86,11 @@ class BigQueryQueriesExtractorConfig(BigQueryBaseConfig):
86
86
  # TODO: Support stateful ingestion for the time windows.
87
87
  window: BaseTimeWindowConfig = BaseTimeWindowConfig()
88
88
 
89
- local_temp_path: Optional[pathlib.Path] = Field(
90
- default=None,
91
- description="Local path to store the audit log.",
89
+ local_temp_path: HiddenFromDocs[Optional[pathlib.Path]] = Field(
92
90
  # TODO: For now, this is simply an advanced config to make local testing easier.
93
91
  # Eventually, we will want to store date-specific files in the directory and use it as a cache.
94
- hidden_from_docs=True,
92
+ default=None,
93
+ description="Local path to store the audit log.",
95
94
  )
96
95
 
97
96
  user_email_pattern: AllowDenyPattern = Field(
datahub/ingestion/source/common/gcp_credentials_config.py

```diff
@@ -9,7 +9,9 @@ from datahub.configuration.validate_multiline_string import pydantic_multiline_s


 class GCPCredential(ConfigModel):
-    project_id: Optional[str] = Field(description="Project id to set the credentials")
+    project_id: Optional[str] = Field(
+        None, description="Project id to set the credentials"
+    )
     private_key_id: str = Field(description="Private key id")
     private_key: str = Field(
         description="Private key in a form of '-----BEGIN PRIVATE KEY-----\\nprivate-key\\n-----END PRIVATE KEY-----\\n'"
```