acryl-datahub 1.2.0.10rc2__py3-none-any.whl → 1.2.0.10rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (93)
  1. {acryl_datahub-1.2.0.10rc2.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/METADATA +2525 -2609
  2. {acryl_datahub-1.2.0.10rc2.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/RECORD +93 -93
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/assertion/assertion.py +1 -1
  5. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  6. datahub/api/entities/dataproduct/dataproduct.py +6 -3
  7. datahub/api/entities/dataset/dataset.py +9 -18
  8. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  9. datahub/api/graphql/operation.py +10 -6
  10. datahub/cli/docker_check.py +2 -2
  11. datahub/configuration/common.py +29 -1
  12. datahub/configuration/connection_resolver.py +5 -2
  13. datahub/configuration/import_resolver.py +7 -4
  14. datahub/configuration/pydantic_migration_helpers.py +0 -9
  15. datahub/configuration/source_common.py +3 -2
  16. datahub/configuration/validate_field_deprecation.py +5 -2
  17. datahub/configuration/validate_field_removal.py +5 -2
  18. datahub/configuration/validate_field_rename.py +6 -5
  19. datahub/configuration/validate_multiline_string.py +5 -2
  20. datahub/ingestion/run/pipeline_config.py +2 -2
  21. datahub/ingestion/source/azure/azure_common.py +1 -1
  22. datahub/ingestion/source/bigquery_v2/bigquery_config.py +28 -14
  23. datahub/ingestion/source/bigquery_v2/queries_extractor.py +4 -5
  24. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  25. datahub/ingestion/source/data_lake_common/path_spec.py +16 -16
  26. datahub/ingestion/source/datahub/config.py +8 -9
  27. datahub/ingestion/source/delta_lake/config.py +1 -1
  28. datahub/ingestion/source/dremio/dremio_config.py +3 -4
  29. datahub/ingestion/source/feast.py +8 -10
  30. datahub/ingestion/source/fivetran/config.py +1 -1
  31. datahub/ingestion/source/ge_profiling_config.py +26 -22
  32. datahub/ingestion/source/grafana/grafana_config.py +2 -2
  33. datahub/ingestion/source/grafana/models.py +12 -14
  34. datahub/ingestion/source/hex/hex.py +6 -1
  35. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  36. datahub/ingestion/source/kafka_connect/common.py +2 -2
  37. datahub/ingestion/source/looker/looker_common.py +55 -75
  38. datahub/ingestion/source/looker/looker_config.py +15 -4
  39. datahub/ingestion/source/looker/looker_source.py +445 -548
  40. datahub/ingestion/source/looker/lookml_config.py +1 -1
  41. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  42. datahub/ingestion/source/metadata/lineage.py +1 -1
  43. datahub/ingestion/source/mode.py +13 -5
  44. datahub/ingestion/source/nifi.py +1 -1
  45. datahub/ingestion/source/powerbi/config.py +14 -21
  46. datahub/ingestion/source/preset.py +1 -1
  47. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  48. datahub/ingestion/source/redshift/config.py +6 -3
  49. datahub/ingestion/source/salesforce.py +13 -9
  50. datahub/ingestion/source/schema/json_schema.py +14 -14
  51. datahub/ingestion/source/sigma/data_classes.py +3 -0
  52. datahub/ingestion/source/snowflake/snowflake_config.py +12 -15
  53. datahub/ingestion/source/snowflake/snowflake_connection.py +8 -3
  54. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +15 -2
  55. datahub/ingestion/source/snowflake/snowflake_queries.py +4 -5
  56. datahub/ingestion/source/sql/athena.py +2 -1
  57. datahub/ingestion/source/sql/clickhouse.py +12 -7
  58. datahub/ingestion/source/sql/cockroachdb.py +5 -3
  59. datahub/ingestion/source/sql/druid.py +2 -2
  60. datahub/ingestion/source/sql/hive.py +4 -3
  61. datahub/ingestion/source/sql/hive_metastore.py +7 -9
  62. datahub/ingestion/source/sql/mssql/source.py +2 -2
  63. datahub/ingestion/source/sql/mysql.py +2 -2
  64. datahub/ingestion/source/sql/oracle.py +3 -3
  65. datahub/ingestion/source/sql/presto.py +2 -1
  66. datahub/ingestion/source/sql/teradata.py +4 -4
  67. datahub/ingestion/source/sql/trino.py +2 -1
  68. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  69. datahub/ingestion/source/sql/vertica.py +1 -1
  70. datahub/ingestion/source/sql_queries.py +6 -6
  71. datahub/ingestion/source/state/checkpoint.py +5 -1
  72. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  73. datahub/ingestion/source/state/stateful_ingestion_base.py +5 -8
  74. datahub/ingestion/source/superset.py +29 -4
  75. datahub/ingestion/source/tableau/tableau.py +65 -11
  76. datahub/ingestion/source/tableau/tableau_common.py +5 -0
  77. datahub/ingestion/source/tableau/tableau_constant.py +1 -0
  78. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  79. datahub/ingestion/source/unity/config.py +7 -3
  80. datahub/ingestion/source/usage/usage_common.py +3 -3
  81. datahub/ingestion/source_config/pulsar.py +3 -1
  82. datahub/metadata/_internal_schema_classes.py +45 -1
  83. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  84. datahub/metadata/schema.avsc +24 -1
  85. datahub/metadata/schemas/InstitutionalMemory.avsc +22 -0
  86. datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
  87. datahub/metadata/schemas/MetadataChangeEvent.avsc +22 -0
  88. datahub/sdk/dashboard.py +0 -2
  89. datahub/sdk/search_filters.py +1 -7
  90. {acryl_datahub-1.2.0.10rc2.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/WHEEL +0 -0
  91. {acryl_datahub-1.2.0.10rc2.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/entry_points.txt +0 -0
  92. {acryl_datahub-1.2.0.10rc2.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/licenses/LICENSE +0 -0
  93. {acryl_datahub-1.2.0.10rc2.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/athena.py:

```diff
@@ -16,6 +16,7 @@ from sqlalchemy.engine.reflection import Inspector
 from sqlalchemy.types import TypeEngine
 from sqlalchemy_bigquery import STRUCT
 
+from datahub.configuration.common import HiddenFromDocs
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.emitter.mcp_builder import ContainerKey, DatabaseKey
 from datahub.ingestion.api.decorators import (
@@ -251,7 +252,7 @@ class CustomAthenaRestDialect(AthenaRestDialect):
 
 
 class AthenaConfig(SQLCommonConfig):
-    scheme: str = "awsathena+rest"
+    scheme: HiddenFromDocs[str] = "awsathena+rest"
     username: Optional[str] = pydantic.Field(
         default=None,
         description="Username credential. If not specified, detected with boto3 rules. See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html",
```
datahub/ingestion/source/sql/clickhouse.py:

```diff
@@ -18,6 +18,7 @@ from sqlalchemy.sql import sqltypes
 from sqlalchemy.types import BOOLEAN, DATE, DATETIME, INTEGER
 
 import datahub.emitter.mce_builder as builder
+from datahub.configuration.common import HiddenFromDocs, LaxStr
 from datahub.configuration.source_common import DatasetLineageProviderConfigBase
 from datahub.configuration.time_window_config import BaseTimeWindowConfig
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
@@ -128,16 +129,20 @@ class ClickHouseConfig(
 ):
     # defaults
     host_port: str = Field(default="localhost:8123", description="ClickHouse host URL.")
-    scheme: str = Field(default="clickhouse", description="", hidden_from_docs=True)
+    scheme: HiddenFromDocs[str] = Field(default="clickhouse")
     password: pydantic.SecretStr = Field(
         default=pydantic.SecretStr(""), description="password"
     )
-    secure: Optional[bool] = Field(default=None, description="")
-    protocol: Optional[str] = Field(default=None, description="")
+    secure: Optional[bool] = Field(
+        default=None, description="[deprecated] Use uri_opts instead."
+    )
+    protocol: Optional[str] = Field(
+        default=None, description="[deprecated] Use uri_opts instead."
+    )
     _deprecate_secure = pydantic_field_deprecated("secure")
     _deprecate_protocol = pydantic_field_deprecated("protocol")
 
-    uri_opts: Dict[str, str] = Field(
+    uri_opts: Dict[str, LaxStr] = Field(
         default={},
         description="The part of the URI and it's used to provide additional configuration options or parameters for the database connection.",
     )
```
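`LaxStr` is also new in `datahub/configuration/common.py`, and its definition is not part of the hunks shown here. Given the pydantic v2 migration running through this release, a plausible reading is a string type that still coerces scalars the way v1 did; a hypothetical sketch in pydantic v2 style:

```python
# Hypothetical LaxStr sketch; the real definition in
# datahub/configuration/common.py may differ. pydantic v2's str no longer
# accepts ints/floats/bools, so uri_opts={"secure": True} would fail
# validation against a plain Dict[str, str].
from typing import Annotated, Any, Dict

from pydantic import BaseModel, BeforeValidator


def _coerce_scalar(v: Any) -> Any:
    # Stringify scalar inputs; leave everything else for normal validation.
    return str(v) if isinstance(v, (bool, int, float)) else v


LaxStr = Annotated[str, BeforeValidator(_coerce_scalar)]


class Opts(BaseModel):
    uri_opts: Dict[str, LaxStr] = {}


assert Opts(uri_opts={"secure": True, "timeout": 300}).uri_opts == {
    "secure": "True",
    "timeout": "300",
}
```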
datahub/ingestion/source/sql/clickhouse.py (continued):

```diff
@@ -185,9 +190,9 @@ class ClickHouseConfig(
                 "Initializing uri_opts from deprecated secure or protocol options"
             )
             values["uri_opts"] = {}
-            if secure:
-                values["uri_opts"]["secure"] = secure
-            if protocol:
+            if secure is not None:
+                values["uri_opts"]["secure"] = str(secure)
+            if protocol is not None:
                 values["uri_opts"]["protocol"] = protocol
             logger.debug(f"uri_opts: {uri_opts}")
         elif (secure or protocol) and uri_opts:
```
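The validator fix is worth pausing on: the old truthiness checks silently dropped an explicit `secure=False`, and stored a raw bool where the `uri_opts` values are now strings. In miniature:

```python
# Why `if secure:` was lossy: an explicit secure=False never reached uri_opts.
secure = False  # user explicitly disabled the option

uri_opts = {}
if secure:  # old check: False is falsy, so the setting vanished
    uri_opts["secure"] = secure
assert uri_opts == {}

if secure is not None:  # new check: only the unset (None) case is skipped
    uri_opts["secure"] = str(secure)  # str() matches the Dict[str, LaxStr] type
assert uri_opts == {"secure": "False"}
```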
datahub/ingestion/source/sql/cockroachdb.py:

```diff
@@ -1,6 +1,6 @@
 from pydantic.fields import Field
 
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -14,8 +14,10 @@ from datahub.ingestion.source.sql.postgres import PostgresConfig, PostgresSource
 
 
 class CockroachDBConfig(PostgresConfig):
-    scheme = Field(default="cockroachdb+psycopg2", description="database scheme")
-    schema_pattern = Field(
+    scheme: HiddenFromDocs[str] = Field(
+        default="cockroachdb+psycopg2", description="database scheme"
+    )
+    schema_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern(deny=["information_schema", "crdb_internal"])
     )
 
```
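The annotations added to `scheme` and `schema_pattern` are not cosmetic. pydantic v1 let a bare `name = Field(...)` assignment override a field inherited from the parent model; pydantic v2 raises at class-definition time instead. The teradata.py hunk further down makes the same fix. A minimal reproduction, assuming pydantic v2:

```python
# pydantic v2 rejects un-annotated Field assignments that v1 tolerated.
import pydantic


class Parent(pydantic.BaseModel):
    scheme: str = "postgresql+psycopg2"


try:

    class Broken(Parent):
        scheme = pydantic.Field(default="cockroachdb+psycopg2")  # no annotation

except pydantic.PydanticUserError as e:
    print(e)  # "A non-annotated attribute was detected: ..."


class Fixed(Parent):
    scheme: str = pydantic.Field(default="cockroachdb+psycopg2")  # annotated
```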
datahub/ingestion/source/sql/druid.py:

```diff
@@ -6,7 +6,7 @@ from pydantic.fields import Field
 from pydruid.db.sqlalchemy import DruidDialect
 from sqlalchemy.exc import ResourceClosedError
 
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.ingestion.api.decorators import (
     SourceCapability,
     SupportStatus,
@@ -34,7 +34,7 @@ DruidDialect.get_table_names = get_table_names
 
 class DruidConfig(BasicSQLAlchemyConfig):
     # defaults
-    scheme: str = "druid"
+    scheme: HiddenFromDocs[str] = "druid"
     schema_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern(deny=["^(lookup|sysgit|view).*"]),
         description="regex patterns for schemas to filter in ingestion.",
```
datahub/ingestion/source/sql/hive.py:

```diff
@@ -6,7 +6,7 @@ from enum import Enum
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 from urllib.parse import urlparse
 
-from pydantic.class_validators import validator
+from pydantic import validator
 from pydantic.fields import Field
 
 # This import verifies that the dependencies are available.
```
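`pydantic.class_validators` is a v1-only module path; the top-level import works under both majors (pydantic v2 keeps `validator` as a deprecated alias of `field_validator`). The same one-line change shows up in vertica.py and superset.py below. For reference, a minimal validator that works with either import:

```python
# `validator` is importable from the package root in both pydantic v1 and v2;
# only v1 has the pydantic.class_validators module.
from pydantic import BaseModel, validator


class ModeConfig(BaseModel):
    mode: str = "ALL"

    @validator("mode")
    def check_mode(cls, v: str) -> str:
        if v not in ("ALL", "DBA"):
            raise ValueError("Specify one of: 'ALL', 'DBA'.")
        return v
```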
datahub/ingestion/source/sql/hive.py (continued):

```diff
@@ -14,6 +14,7 @@ from pyhive import hive # noqa: F401
 from pyhive.sqlalchemy_hive import HiveDate, HiveDecimal, HiveDialect, HiveTimestamp
 from sqlalchemy.engine.reflection import Inspector
 
+from datahub.configuration.common import HiddenFromDocs
 from datahub.emitter.mce_builder import (
     make_data_platform_urn,
     make_dataplatform_instance_urn,
@@ -651,10 +652,10 @@ HiveDialect.get_view_definition = get_view_definition_patched
 
 class HiveConfig(TwoTierSQLAlchemyConfig):
     # defaults
-    scheme: str = Field(default="hive", hidden_from_docs=True)
+    scheme: HiddenFromDocs[str] = Field(default="hive")
 
     # Overriding as table location lineage is richer implementation here than with include_table_location_lineage
-    include_table_location_lineage: bool = Field(default=False, hidden_from_docs=True)
+    include_table_location_lineage: HiddenFromDocs[bool] = Field(default=False)
 
     emit_storage_lineage: bool = Field(
         default=False,
```
datahub/ingestion/source/sql/hive_metastore.py:

```diff
@@ -1,17 +1,15 @@
 import base64
+import dataclasses
 import json
 import logging
 from collections import namedtuple
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 
-from pydantic.dataclasses import dataclass
-from pydantic.fields import Field
-
-# This import verifies that the dependencies are available.
+from pydantic import Field
 from sqlalchemy import create_engine, text
 from sqlalchemy.engine.reflection import Inspector
 
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
@@ -73,7 +71,7 @@ class HiveMetastoreConfigMode(StrEnum):
     trino = "trino"
 
 
-@dataclass
+@dataclasses.dataclass
 class ViewDataset:
     dataset_name: str
     schema_name: str
@@ -99,7 +97,7 @@ class HiveMetastore(BasicSQLAlchemyConfig):
         default="localhost:3306",
         description="Host URL and port to connect to. Example: localhost:3306",
     )
-    scheme: str = Field(default="mysql+pymysql", description="", hidden_from_docs=True)
+    scheme: HiddenFromDocs[str] = Field(default="mysql+pymysql")
 
     database_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
@@ -123,8 +121,8 @@ class HiveMetastore(BasicSQLAlchemyConfig):
         description="Dataset Subtype name to be 'Table' or 'View' Valid options: ['True', 'False']",
     )
 
-    include_view_lineage: bool = Field(
-        default=False, description="", hidden_from_docs=True
+    include_view_lineage: HiddenFromDocs[bool] = Field(
+        default=False,
     )
 
     include_catalog_name_in_ids: bool = Field(
```
datahub/ingestion/source/sql/mssql/source.py:

```diff
@@ -13,7 +13,7 @@ from sqlalchemy.exc import ProgrammingError, ResourceClosedError
 from sqlalchemy.sql import quoted_name
 
 import datahub.metadata.schema_classes as models
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.pattern_utils import UUID_REGEX
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
@@ -75,7 +75,7 @@ DEFAULT_TEMP_TABLES_PATTERNS = [
 class SQLServerConfig(BasicSQLAlchemyConfig):
     # defaults
     host_port: str = Field(default="localhost:1433", description="MSSQL host URL.")
-    scheme: str = Field(default="mssql+pytds", description="", hidden_from_docs=True)
+    scheme: HiddenFromDocs[str] = Field(default="mssql+pytds")
 
     # TODO: rename to include_procedures ?
     include_stored_procedures: bool = Field(
```
datahub/ingestion/source/sql/mysql.py:

```diff
@@ -9,7 +9,7 @@ from sqlalchemy.dialects.mysql import BIT, base
 from sqlalchemy.dialects.mysql.enumerated import SET
 from sqlalchemy.engine.reflection import Inspector
 
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.ingestion.api.decorators import (
     SourceCapability,
     SupportStatus,
@@ -57,7 +57,7 @@ base.ischema_names["decimal128"] = DECIMAL128
 class MySQLConnectionConfig(SQLAlchemyConnectionConfig):
     # defaults
     host_port: str = Field(default="localhost:3306", description="MySQL host URL.")
-    scheme: str = "mysql+pymysql"
+    scheme: HiddenFromDocs[str] = "mysql+pymysql"
 
 
 class MySQLConfig(MySQLConnectionConfig, TwoTierSQLAlchemyConfig):
```
datahub/ingestion/source/sql/oracle.py:

```diff
@@ -110,10 +110,10 @@ class OracleConfig(BasicSQLAlchemyConfig):
         return v
 
     @pydantic.validator("data_dictionary_mode")
-    def check_data_dictionary_mode(cls, values):
-        if values not in ("ALL", "DBA"):
+    def check_data_dictionary_mode(cls, value):
+        if value not in ("ALL", "DBA"):
             raise ValueError("Specify one of data dictionary views mode: 'ALL', 'DBA'.")
-        return values
+        return value
 
     @pydantic.validator("thick_mode_lib_dir", always=True)
     def check_thick_mode_lib_dir(cls, v, values):
```
datahub/ingestion/source/sql/presto.py:

```diff
@@ -8,6 +8,7 @@ from sqlalchemy import exc, sql
 from sqlalchemy.engine import reflection
 from sqlalchemy.engine.base import Engine
 
+from datahub.configuration.common import HiddenFromDocs
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -87,7 +88,7 @@ PrestoDialect._get_full_table = _get_full_table
 
 class PrestoConfig(TrinoConfig):
     # defaults
-    scheme: str = Field(default="presto", description="", hidden_from_docs=True)
+    scheme: HiddenFromDocs[str] = Field(default="presto")
 
 
 @platform_name("Presto", doc_order=1)
```
datahub/ingestion/source/sql/teradata.py:

```diff
@@ -468,23 +468,23 @@ class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig):
         ),
     )
 
-    database_pattern = Field(
+    database_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern(deny=EXCLUDED_DATABASES),
         description="Regex patterns for databases to filter in ingestion.",
     )
-    include_table_lineage = Field(
+    include_table_lineage: bool = Field(
         default=False,
         description="Whether to include table lineage in the ingestion. "
         "This requires to have the table lineage feature enabled.",
     )
 
-    include_view_lineage = Field(
+    include_view_lineage: bool = Field(
         default=True,
         description="Whether to include view lineage in the ingestion. "
         "This requires to have the view lineage feature enabled.",
     )
 
-    include_queries = Field(
+    include_queries: bool = Field(
         default=True,
         description="Whether to generate query entities for SQL queries. "
         "Query entities provide metadata about individual SQL queries including "
```
datahub/ingestion/source/sql/trino.py:

```diff
@@ -18,6 +18,7 @@ from sqlalchemy.types import TypeEngine
 from trino.sqlalchemy import datatype
 from trino.sqlalchemy.dialect import TrinoDialect
 
+from datahub.configuration.common import HiddenFromDocs
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
@@ -222,7 +223,7 @@ class ConnectorDetail(PlatformInstanceConfigMixin, EnvConfigMixin):
 
 class TrinoConfig(BasicSQLAlchemyConfig):
     # defaults
-    scheme: str = Field(default="trino", description="", hidden_from_docs=True)
+    scheme: HiddenFromDocs[str] = Field(default="trino")
     database: str = Field(description="database (catalog)")
 
     catalog_to_connector_details: Dict[str, ConnectorDetail] = Field(
```
datahub/ingestion/source/sql/two_tier_sql_source.py:

```diff
@@ -7,7 +7,7 @@ from sqlalchemy import create_engine, inspect
 from sqlalchemy.engine import URL
 from sqlalchemy.engine.reflection import Inspector
 
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.emitter.mcp_builder import ContainerKey
 from datahub.ingestion.api.workunit import MetadataWorkUnit
@@ -27,11 +27,10 @@ class TwoTierSQLAlchemyConfig(BasicSQLAlchemyConfig):
         default=AllowDenyPattern.allow_all(),
         description="Regex patterns for databases to filter in ingestion.",
     )
-    schema_pattern: AllowDenyPattern = Field(
+    schema_pattern: HiddenFromDocs[AllowDenyPattern] = Field(
         # The superclass contains a `schema_pattern` field, so we need this here
         # to override the documentation.
         default=AllowDenyPattern.allow_all(),
-        hidden_from_docs=True,
         description="Deprecated in favour of database_pattern.",
     )
 
```
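Context for the `schema_pattern` hunk: redeclaring a field in a pydantic subclass replaces its metadata wholesale, which is why the default has to be restated even though only the documentation (and now the hidden marker) changes. A simplified illustration, using `str` in place of `AllowDenyPattern` and pydantic v2's `model_fields`:

```python
from pydantic import BaseModel, Field


class Base(BaseModel):
    schema_pattern: str = Field(default=".*", description="Regex patterns for schemas.")


class TwoTier(Base):
    # Must restate the default; only the documentation changes.
    schema_pattern: str = Field(
        default=".*", description="Deprecated in favour of database_pattern."
    )


assert TwoTier.model_fields["schema_pattern"].description.startswith("Deprecated")
```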
datahub/ingestion/source/sql/vertica.py:

```diff
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Tupl
 
 import pydantic
 import pytest
-from pydantic.class_validators import validator
+from pydantic import validator
 from vertica_sqlalchemy_dialect.base import VerticaInspector
 
 from datahub.configuration.common import AllowDenyPattern
```
datahub/ingestion/source/sql_queries.py:

```diff
@@ -8,6 +8,7 @@ from typing import ClassVar, Iterable, List, Optional, Union
 
 from pydantic import BaseModel, Field, validator
 
+from datahub.configuration.common import HiddenFromDocs
 from datahub.configuration.datetimes import parse_user_datetime
 from datahub.configuration.source_common import (
     EnvConfigMixin,
@@ -66,22 +67,21 @@ class SqlQueriesSourceConfig(
         default=BaseUsageConfig(),
     )
 
-    use_schema_resolver: bool = Field(
+    use_schema_resolver: HiddenFromDocs[bool] = Field(
+        True,
         description="Read SchemaMetadata aspects from DataHub to aid in SQL parsing. Turn off only for testing.",
-        default=True,
-        hidden_from_docs=True,
     )
     default_db: Optional[str] = Field(
+        None,
         description="The default database to use for unqualified table names",
-        default=None,
     )
     default_schema: Optional[str] = Field(
+        None,
         description="The default schema to use for unqualified table names",
-        default=None,
     )
     override_dialect: Optional[str] = Field(
+        None,
         description="The SQL dialect to use when parsing queries. Overrides automatic dialect detection.",
-        default=None,
     )
 
 
```
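A side note on the reshuffled `Field(...)` calls in this hunk: the first positional argument of `Field` is the default, so `Field(True, description=...)` is equivalent to `Field(description=..., default=True)`; the diff standardizes on the positional form while moving the hidden flag into the annotation:

```python
# Field's first positional argument is the default value, in both
# pydantic v1 and v2.
from pydantic import BaseModel, Field


class Cfg(BaseModel):
    positional: bool = Field(True, description="default passed positionally")
    keyword: bool = Field(default=True, description="default passed by keyword")


c = Cfg()
assert c.positional is True and c.keyword is True
```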
datahub/ingestion/source/state/checkpoint.py:

```diff
@@ -68,7 +68,11 @@ class CheckpointStateBase(ConfigModel):
 
     @staticmethod
     def _to_bytes_utf8(model: ConfigModel) -> bytes:
-        return model.json(exclude={"version", "serde"}).encode("utf-8")
+        pydantic_json = model.model_dump_json(exclude={"version", "serde"})
+        # We decode and re-encode so that Python's default whitespace is included.
+        # This is purely to keep tests consistent as we migrate to pydantic v2,
+        # and can be removed once we're fully migrated.
+        return json.dumps(json.loads(pydantic_json)).encode("utf-8")
 
     @staticmethod
     def _to_bytes_base85_json(
```
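The comment in the hunk is about byte-for-byte stability: pydantic v2's `model_dump_json()` emits compact JSON, while v1's `model.json()` used `json.dumps` defaults, which insert a space after `:` and `,`. The round-trip restores the v1 shape:

```python
import json

# Shape of pydantic v2's model_dump_json() output: compact separators.
compact = '{"urns":["urn:li:dataset:one"],"last_updated":1700000000}'

# json.dumps defaults to ", " and ": " separators, matching pydantic v1's
# model.json() output, so checkpoint bytes stay identical across versions.
roundtripped = json.dumps(json.loads(compact))
assert roundtripped == '{"urns": ["urn:li:dataset:one"], "last_updated": 1700000000}'
```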
datahub/ingestion/source/state/entity_removal_state.py:

```diff
@@ -1,4 +1,4 @@
-from typing import Any, Dict, Iterable, List, Tuple, Type
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Tuple, Type
 
 import pydantic
 
@@ -8,13 +8,16 @@ from datahub.utilities.checkpoint_state_util import CheckpointStateUtil
 from datahub.utilities.dedup_list import deduplicate_list
 from datahub.utilities.urns.urn import guess_entity_type
 
+if TYPE_CHECKING:
+    from pydantic.deprecated.class_validators import V1RootValidator
+
 STATEFUL_INGESTION_IGNORED_ENTITY_TYPES = {
     "dataProcessInstance",
     "query",
 }
 
 
-def pydantic_state_migrator(mapping: Dict[str, str]) -> classmethod:
+def pydantic_state_migrator(mapping: Dict[str, str]) -> "V1RootValidator":
     # mapping would be something like:
     # {
     #     'encoded_view_urns': 'dataset',
```
datahub/ingestion/source/state/stateful_ingestion_base.py:

```diff
@@ -10,6 +10,7 @@ from datahub.configuration.common import (
     ConfigModel,
     ConfigurationError,
     DynamicTypedConfig,
+    HiddenFromDocs,
 )
 from datahub.configuration.pydantic_migration_helpers import GenericModel
 from datahub.configuration.time_window_config import BaseTimeWindowConfig
@@ -55,25 +56,21 @@ class StatefulIngestionConfig(ConfigModel):
         description="Whether or not to enable stateful ingest. "
         "Default: True if a pipeline_name is set and either a datahub-rest sink or `datahub_api` is specified, otherwise False",
     )
-    max_checkpoint_state_size: pydantic.PositiveInt = Field(
+    max_checkpoint_state_size: HiddenFromDocs[pydantic.PositiveInt] = Field(
         default=2**24,  # 16 MB
         description="The maximum size of the checkpoint state in bytes. Default is 16MB",
-        hidden_from_docs=True,
     )
-    state_provider: Optional[DynamicTypedStateProviderConfig] = Field(
+    state_provider: HiddenFromDocs[Optional[DynamicTypedStateProviderConfig]] = Field(
         default=None,
         description="The ingestion state provider configuration.",
-        hidden_from_docs=True,
     )
-    ignore_old_state: bool = Field(
+    ignore_old_state: HiddenFromDocs[bool] = Field(
         default=False,
         description="If set to True, ignores the previous checkpoint state.",
-        hidden_from_docs=True,
     )
-    ignore_new_state: bool = Field(
+    ignore_new_state: HiddenFromDocs[bool] = Field(
         default=False,
         description="If set to True, ignores the current checkpoint state.",
-        hidden_from_docs=True,
     )
 
     @pydantic.root_validator(skip_on_failure=True)
```
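One detail here: `HiddenFromDocs` wraps not just scalars but full types like `Optional[DynamicTypedStateProviderConfig]`. If the alias is `Annotated`-based as sketched in the athena.py note above, that composition falls out for free:

```python
# Annotated composes with Optional, so HiddenFromDocs[Optional[X]] is simply
# Annotated[Optional[X], marker]; the field stays nullable. Uses the sketch
# definition from the athena.py note, not the real one.
from typing import Annotated, Optional, TypeVar

T = TypeVar("T")
HiddenFromDocs = Annotated[T, "hidden_from_docs"]

assert HiddenFromDocs[Optional[int]] == Annotated[Optional[int], "hidden_from_docs"]
```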
datahub/ingestion/source/superset.py:

```diff
@@ -9,9 +9,10 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 import dateutil.parser as dp
 import requests
 import sqlglot
-from pydantic import BaseModel
-from pydantic.class_validators import root_validator, validator
+from pydantic import BaseModel, root_validator, validator
 from pydantic.fields import Field
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
 
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern
@@ -109,6 +110,12 @@ logger = logging.getLogger(__name__)
 
 PAGE_SIZE = 25
 
+# Retry configuration constants
+RETRY_MAX_TIMES = 3
+RETRY_STATUS_CODES = [429, 500, 502, 503, 504]
+RETRY_BACKOFF_FACTOR = 1
+RETRY_ALLOWED_METHODS = ["GET"]
+
 
 chart_type_from_viz_type = {
     "line": ChartTypeClass.LINE,
```
datahub/ingestion/source/superset.py (continued):

```diff
@@ -328,6 +335,19 @@ class SupersetSource(StatefulIngestionSourceBase):
         logger.debug("Got access token from superset")
 
         requests_session = requests.Session()
+
+        # Configure retry strategy for transient failures
+        retry_strategy = Retry(
+            total=RETRY_MAX_TIMES,
+            status_forcelist=RETRY_STATUS_CODES,
+            backoff_factor=RETRY_BACKOFF_FACTOR,
+            allowed_methods=RETRY_ALLOWED_METHODS,
+            raise_on_status=False,
+        )
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        requests_session.mount("http://", adapter)
+        requests_session.mount("https://", adapter)
+
         requests_session.headers.update(
             {
                 "Authorization": f"Bearer {self.access_token}",
```
datahub/ingestion/source/superset.py (continued):

```diff
@@ -360,8 +380,13 @@ class SupersetSource(StatefulIngestionSourceBase):
             )
 
             if response.status_code != 200:
-                logger.warning(f"Failed to get {entity_type} data: {response.text}")
-                continue
+                self.report.warning(
+                    title="Failed to fetch data from Superset API",
+                    message="Incomplete metadata extraction due to Superset API failure",
+                    context=f"Entity Type: {entity_type}, HTTP Status Code: {response.status_code}, Page: {current_page}. Response: {response.text}",
+                )
+                # we stop pagination for this entity type and we continue the overall ingestion
+                break
 
             payload = response.json()
             # Update total_items with the actual count from the response
```