acryl-datahub 0.15.0rc25__py3-none-any.whl → 0.15.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of acryl-datahub was flagged as a potentially problematic release.

Files changed (120)
  1. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/METADATA +2236 -2240
  2. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/RECORD +116 -106
  3. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/entry_points.txt +1 -1
  5. datahub/__init__.py +1 -1
  6. datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
  7. datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
  8. datahub/configuration/common.py +2 -5
  9. datahub/configuration/source_common.py +13 -0
  10. datahub/emitter/mce_builder.py +20 -4
  11. datahub/emitter/mcp_builder.py +2 -7
  12. datahub/emitter/mcp_patch_builder.py +37 -13
  13. datahub/emitter/rest_emitter.py +25 -3
  14. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +98 -0
  15. datahub/ingestion/api/closeable.py +3 -3
  16. datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
  17. datahub/ingestion/api/report.py +4 -1
  18. datahub/ingestion/api/sink.py +4 -3
  19. datahub/ingestion/api/source.py +4 -0
  20. datahub/ingestion/api/source_helpers.py +2 -6
  21. datahub/ingestion/glossary/classifier.py +2 -3
  22. datahub/ingestion/graph/client.py +6 -3
  23. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
  24. datahub/ingestion/source/aws/aws_common.py +231 -27
  25. datahub/ingestion/source/aws/glue.py +12 -2
  26. datahub/ingestion/source/bigquery_v2/bigquery.py +10 -18
  27. datahub/ingestion/source/bigquery_v2/bigquery_config.py +3 -9
  28. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
  29. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -17
  30. datahub/ingestion/source/bigquery_v2/lineage.py +9 -22
  31. datahub/ingestion/source/datahub/config.py +22 -1
  32. datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
  33. datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
  34. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  35. datahub/ingestion/source/dbt/dbt_cloud.py +10 -3
  36. datahub/ingestion/source/gc/datahub_gc.py +21 -5
  37. datahub/ingestion/source/gc/dataprocess_cleanup.py +23 -10
  38. datahub/ingestion/source/gc/execution_request_cleanup.py +61 -16
  39. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +178 -83
  40. datahub/ingestion/source/iceberg/iceberg.py +27 -1
  41. datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
  42. datahub/ingestion/source/kafka_connect/__init__.py +0 -0
  43. datahub/ingestion/source/kafka_connect/common.py +202 -0
  44. datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
  45. datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
  46. datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
  47. datahub/ingestion/source/looker/looker_common.py +63 -2
  48. datahub/ingestion/source/looker/looker_dataclasses.py +7 -9
  49. datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
  50. datahub/ingestion/source/looker/looker_source.py +31 -4
  51. datahub/ingestion/source/looker/looker_usage.py +23 -17
  52. datahub/ingestion/source/mlflow.py +30 -5
  53. datahub/ingestion/source/mode.py +40 -27
  54. datahub/ingestion/source/powerbi/config.py +1 -14
  55. datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
  56. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
  57. datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +16 -2
  58. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +16 -15
  59. datahub/ingestion/source/s3/source.py +1 -1
  60. datahub/ingestion/source/snowflake/snowflake_config.py +13 -34
  61. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +14 -64
  62. datahub/ingestion/source/snowflake/snowflake_queries.py +44 -14
  63. datahub/ingestion/source/snowflake/snowflake_query.py +5 -10
  64. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +53 -7
  65. datahub/ingestion/source/snowflake/snowflake_shares.py +1 -1
  66. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +2 -5
  67. datahub/ingestion/source/snowflake/snowflake_utils.py +22 -18
  68. datahub/ingestion/source/snowflake/snowflake_v2.py +38 -34
  69. datahub/ingestion/source/sql/hive.py +621 -8
  70. datahub/ingestion/source/sql/hive_metastore.py +7 -0
  71. datahub/ingestion/source/sql/mssql/job_models.py +30 -1
  72. datahub/ingestion/source/sql/mssql/source.py +15 -1
  73. datahub/ingestion/source/sql/sql_common.py +41 -102
  74. datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
  75. datahub/ingestion/source/sql/sql_report.py +2 -0
  76. datahub/ingestion/source/state/checkpoint.py +2 -1
  77. datahub/ingestion/source/tableau/tableau.py +122 -45
  78. datahub/ingestion/source/tableau/tableau_common.py +18 -0
  79. datahub/ingestion/source/tableau/tableau_constant.py +3 -1
  80. datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
  81. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  82. datahub/ingestion/source/unity/proxy.py +8 -27
  83. datahub/ingestion/source/usage/usage_common.py +15 -1
  84. datahub/ingestion/source_report/ingestion_stage.py +3 -0
  85. datahub/metadata/_schema_classes.py +256 -3
  86. datahub/metadata/_urns/urn_defs.py +168 -168
  87. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
  88. datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py +2 -0
  89. datahub/metadata/schema.avsc +252 -33
  90. datahub/metadata/schemas/DataJobKey.avsc +2 -1
  91. datahub/metadata/schemas/DataProcessInstanceKey.avsc +5 -1
  92. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  93. datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
  94. datahub/metadata/schemas/MLModelGroupProperties.avsc +82 -0
  95. datahub/metadata/schemas/MLModelProperties.avsc +62 -2
  96. datahub/metadata/schemas/MLTrainingRunProperties.avsc +171 -0
  97. datahub/metadata/schemas/MetadataChangeEvent.avsc +94 -2
  98. datahub/specific/aspect_helpers/__init__.py +0 -0
  99. datahub/specific/aspect_helpers/custom_properties.py +79 -0
  100. datahub/specific/aspect_helpers/ownership.py +67 -0
  101. datahub/specific/aspect_helpers/structured_properties.py +72 -0
  102. datahub/specific/aspect_helpers/tags.py +42 -0
  103. datahub/specific/aspect_helpers/terms.py +43 -0
  104. datahub/specific/chart.py +28 -184
  105. datahub/specific/dashboard.py +31 -196
  106. datahub/specific/datajob.py +34 -189
  107. datahub/specific/dataproduct.py +24 -86
  108. datahub/specific/dataset.py +48 -133
  109. datahub/specific/form.py +12 -32
  110. datahub/specific/structured_property.py +9 -9
  111. datahub/sql_parsing/sql_parsing_aggregator.py +10 -9
  112. datahub/sql_parsing/sqlglot_lineage.py +15 -5
  113. datahub/sql_parsing/tool_meta_extractor.py +119 -5
  114. datahub/utilities/time.py +8 -3
  115. datahub/utilities/urns/_urn_base.py +5 -7
  116. datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
  117. datahub/specific/custom_properties.py +0 -37
  118. datahub/specific/ownership.py +0 -48
  119. datahub/specific/structured_properties.py +0 -53
  120. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/top_level.txt +0 -0

datahub/api/circuit_breaker/assertion_circuit_breaker.py
@@ -1,6 +1,6 @@
 import logging
 from dataclasses import dataclass
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
 from typing import Any, Dict, List, Optional
 
 from pydantic import Field
@@ -10,6 +10,7 @@ from datahub.api.circuit_breaker.circuit_breaker import (
     CircuitBreakerConfig,
 )
 from datahub.api.graphql import Assertion, Operation
+from datahub.emitter.mce_builder import parse_ts_millis
 
 logger: logging.Logger = logging.getLogger(__name__)
 
@@ -49,7 +50,7 @@ class AssertionCircuitBreaker(AbstractCircuitBreaker):
         if not operations:
             return None
         else:
-            return datetime.fromtimestamp(operations[0]["lastUpdatedTimestamp"] / 1000)
+            return parse_ts_millis(operations[0]["lastUpdatedTimestamp"])
 
     def _check_if_assertion_failed(
         self, assertions: List[Dict[str, Any]], last_updated: Optional[datetime] = None
@@ -93,7 +94,7 @@ class AssertionCircuitBreaker(AbstractCircuitBreaker):
                 logger.info(f"Found successful assertion: {assertion_urn}")
                 result = False
             if last_updated is not None:
-                last_run = datetime.fromtimestamp(last_assertion.time / 1000)
+                last_run = parse_ts_millis(last_assertion.time)
                 if last_updated > last_run:
                     logger.error(
                         f"Missing assertion run for {assertion_urn}. The dataset was updated on {last_updated} but the last assertion run was at {last_run}"
@@ -117,7 +118,7 @@ class AssertionCircuitBreaker(AbstractCircuitBreaker):
             )
 
         if not last_updated:
-            last_updated = datetime.now() - self.config.time_delta
+            last_updated = datetime.now(tz=timezone.utc) - self.config.time_delta
             logger.info(
                 f"Dataset {urn} doesn't have last updated or check_last_assertion_time is false, using calculated min assertion date {last_updated}"
             )

datahub/api/entities/structuredproperties/structuredproperties.py
@@ -14,7 +14,7 @@ from datahub.metadata.schema_classes import (
     PropertyValueClass,
     StructuredPropertyDefinitionClass,
 )
-from datahub.metadata.urns import StructuredPropertyUrn, Urn
+from datahub.metadata.urns import DataTypeUrn, StructuredPropertyUrn, Urn
 from datahub.utilities.urns._urn_base import URN_TYPES
 
 logging.basicConfig(level=logging.INFO)
@@ -86,19 +86,31 @@ class StructuredProperties(ConfigModel):
 
     @validator("type")
     def validate_type(cls, v: str) -> str:
-        # Convert to lowercase if needed
-        if not v.islower():
+        # This logic is somewhat hacky, since we need to deal with
+        # 1. fully qualified urns
+        # 2. raw data types, that need to get the datahub namespace prefix
+        # While keeping the user-facing interface and error messages clean.
+
+        if not v.startswith("urn:li:") and not v.islower():
+            # Convert to lowercase if needed
+            v = v.lower()
             logger.warning(
-                f"Structured property type should be lowercase. Updated to {v.lower()}"
+                f"Structured property type should be lowercase. Updated to {v}"
             )
-            v = v.lower()
+
+        urn = Urn.make_data_type_urn(v)
 
         # Check if type is allowed
-        if not AllowedTypes.check_allowed_type(v):
+        data_type_urn = DataTypeUrn.from_string(urn)
+        unqualified_data_type = data_type_urn.id
+        if unqualified_data_type.startswith("datahub."):
+            unqualified_data_type = unqualified_data_type[len("datahub.") :]
+        if not AllowedTypes.check_allowed_type(unqualified_data_type):
            raise ValueError(
-                f"Type {v} is not allowed. Allowed types are {AllowedTypes.values()}"
+                f"Type {unqualified_data_type} is not allowed. Allowed types are {AllowedTypes.values()}"
            )
-        return v
+
+        return urn
 
     @property
     def fqn(self) -> str:
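
The net effect of the new validator is that `type` is normalized to a data type urn before the allow-list check. A minimal standalone sketch of that normalization, assuming data type urns take the shape `urn:li:dataType:datahub.<type>` (the real shape comes from `Urn.make_data_type_urn`) and using an illustrative subset of the allowed types:

ALLOWED = {"string", "number", "urn", "rich_text", "date"}  # illustrative subset

def normalize_type(v: str) -> str:
    # Lowercase only raw type names, not fully qualified urns.
    if not v.startswith("urn:li:") and not v.islower():
        v = v.lower()
    # Qualify raw names with the datahub namespace (assumed urn shape).
    urn = v if v.startswith("urn:li:") else f"urn:li:dataType:datahub.{v}"
    # Validate against the allow-list using the unqualified name.
    unqualified = urn.rsplit(":", 1)[-1]
    if unqualified.startswith("datahub."):
        unqualified = unqualified[len("datahub."):]
    if unqualified not in ALLOWED:
        raise ValueError(f"Type {unqualified} is not allowed")
    return urn

assert normalize_type("STRING") == "urn:li:dataType:datahub.string"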

datahub/configuration/common.py
@@ -10,7 +10,6 @@ from typing import (
     List,
     Optional,
     Type,
-    TypeVar,
     Union,
     runtime_checkable,
 )
@@ -19,14 +18,12 @@ import pydantic
 from cached_property import cached_property
 from pydantic import BaseModel, Extra, ValidationError
 from pydantic.fields import Field
-from typing_extensions import Protocol
+from typing_extensions import Protocol, Self
 
 from datahub.configuration._config_enum import ConfigEnum as ConfigEnum  # noqa: I250
 from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2
 from datahub.utilities.dedup_list import deduplicate_list
 
-_ConfigSelf = TypeVar("_ConfigSelf", bound="ConfigModel")
-
 REDACT_KEYS = {
     "password",
     "token",
@@ -109,7 +106,7 @@ class ConfigModel(BaseModel):
         schema_extra = _schema_extra
 
     @classmethod
-    def parse_obj_allow_extras(cls: Type[_ConfigSelf], obj: Any) -> _ConfigSelf:
+    def parse_obj_allow_extras(cls, obj: Any) -> Self:
         if PYDANTIC_VERSION_2:
             try:
                 with unittest.mock.patch.dict(
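
Several files in this release make the same typing change (see also closeable.py, sink.py, and ingestion_job_checkpointing_provider_base.py below): a module-level `TypeVar(..., bound=...)` used to annotate `cls` or `self` is dropped in favor of `typing_extensions.Self`. A minimal before/after sketch of the pattern, with hypothetical classes that are not part of the package:

from typing import Any, Dict, Type, TypeVar

from typing_extensions import Self

_TOld = TypeVar("_TOld", bound="OldStyle")

class OldStyle:
    # Old pattern: an explicit TypeVar bound to the class.
    @classmethod
    def create(cls: Type[_TOld], config: Dict[str, Any]) -> _TOld:
        return cls()

class NewStyle:
    # New pattern: Self tracks the subclass automatically, no TypeVar needed.
    @classmethod
    def create(cls, config: Dict[str, Any]) -> Self:
        return cls()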

datahub/configuration/source_common.py
@@ -63,3 +63,16 @@ class DatasetLineageProviderConfigBase(EnvConfigMixin):
         default=None,
         description="A holder for platform -> platform_instance mappings to generate correct dataset urns",
     )
+
+
+class PlatformDetail(ConfigModel):
+    platform_instance: Optional[str] = Field(
+        default=None,
+        description="DataHub platform instance name. To generate correct urn for upstream dataset, this should match "
+        "with platform instance name used in ingestion "
+        "recipe of other datahub sources.",
+    )
+    env: str = Field(
+        default=DEFAULT_ENV,
+        description="The environment that all assets produced by DataHub platform ingestion source belong to",
+    )
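
With this release installed, `PlatformDetail` becomes importable from `datahub.configuration.source_common`. A small usage sketch with hypothetical values; `platform_instance` should match the instance name used in the recipe that ingests the upstream platform:

from datahub.configuration.source_common import PlatformDetail

# Hypothetical values for illustration only.
upstream = PlatformDetail(platform_instance="warehouse_prod", env="PROD")
print(upstream.platform_instance, upstream.env)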

datahub/emitter/mce_builder.py
@@ -6,7 +6,7 @@ import logging
 import os
 import re
 import time
-from datetime import datetime
+from datetime import datetime, timezone
 from enum import Enum
 from typing import (
     TYPE_CHECKING,
@@ -24,6 +24,7 @@ from typing import (
 
 import typing_inspect
 from avrogen.dict_wrapper import DictWrapper
+from typing_extensions import assert_never
 
 from datahub.emitter.enum_helpers import get_enum_options
 from datahub.metadata.schema_classes import (
@@ -103,6 +104,22 @@ def make_ts_millis(ts: Optional[datetime]) -> Optional[int]:
     return int(ts.timestamp() * 1000)
 
 
+@overload
+def parse_ts_millis(ts: float) -> datetime:
+    ...
+
+
+@overload
+def parse_ts_millis(ts: None) -> None:
+    ...
+
+
+def parse_ts_millis(ts: Optional[float]) -> Optional[datetime]:
+    if ts is None:
+        return None
+    return datetime.fromtimestamp(ts / 1000, tz=timezone.utc)
+
+
 def make_data_platform_urn(platform: str) -> str:
     if platform.startswith("urn:li:dataPlatform:"):
         return platform
@@ -253,9 +270,8 @@ def make_owner_urn(owner: str, owner_type: OwnerType) -> str:
         return make_user_urn(owner)
     elif owner_type == OwnerType.GROUP:
         return make_group_urn(owner)
-    # This should pretty much never happen.
-    # TODO: With Python 3.11, we can use typing.assert_never() here.
-    return f"urn:li:{owner_type.value}:{owner}"
+    else:
+        assert_never(owner_type)
 
 
 def make_ownership_type_urn(type: str) -> str:
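
The new `parse_ts_millis` helper centralizes epoch-millisecond parsing and, unlike the `datetime.fromtimestamp(ts / 1000)` calls it replaces elsewhere in this release, always returns a timezone-aware UTC datetime; `None` passes through thanks to the overloads. A quick sketch of the expected behavior:

from datetime import timezone

from datahub.emitter.mce_builder import make_ts_millis, parse_ts_millis

dt = parse_ts_millis(1700000000000)
assert dt.tzinfo == timezone.utc            # aware UTC datetime (2023-11-14 22:13:20+00:00)
assert parse_ts_millis(None) is None        # Optional passes through
assert make_ts_millis(dt) == 1700000000000  # round-trips with make_ts_millis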

datahub/emitter/mcp_builder.py
@@ -4,8 +4,8 @@ from pydantic.fields import Field
 from pydantic.main import BaseModel
 
 from datahub.cli.env_utils import get_boolean_env_variable
-from datahub.emitter.enum_helpers import get_enum_options
 from datahub.emitter.mce_builder import (
+    ALL_ENV_TYPES,
     Aspect,
     datahub_guid,
     make_container_urn,
@@ -25,7 +25,6 @@ from datahub.metadata.schema_classes import (
     ContainerClass,
     DomainsClass,
     EmbedClass,
-    FabricTypeClass,
     GlobalTagsClass,
     MetadataChangeEventClass,
     OwnerClass,
@@ -206,11 +205,7 @@ def gen_containers(
     # Extra validation on the env field.
     # In certain cases (mainly for backwards compatibility), the env field will actually
     # have a platform instance name.
-    env = (
-        container_key.env
-        if container_key.env in get_enum_options(FabricTypeClass)
-        else None
-    )
+    env = container_key.env if container_key.env in ALL_ENV_TYPES else None
 
     container_urn = container_key.as_urn()
 

datahub/emitter/mcp_patch_builder.py
@@ -2,7 +2,19 @@ import json
 import time
 from collections import defaultdict
 from dataclasses import dataclass
-from typing import Any, Dict, Iterable, List, Optional, Sequence, Union
+from typing import (
+    Any,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    Protocol,
+    Tuple,
+    Union,
+    runtime_checkable,
+)
+
+from typing_extensions import LiteralString
 
 from datahub.emitter.aspect import JSON_PATCH_CONTENT_TYPE
 from datahub.emitter.serialization_helper import pre_json_transform
@@ -19,25 +31,36 @@ from datahub.metadata.urns import Urn
 from datahub.utilities.urns.urn import guess_entity_type
 
 
+@runtime_checkable
+class SupportsToObj(Protocol):
+    def to_obj(self) -> Any:
+        ...
+
+
 def _recursive_to_obj(obj: Any) -> Any:
     if isinstance(obj, list):
         return [_recursive_to_obj(v) for v in obj]
-    elif hasattr(obj, "to_obj"):
+    elif isinstance(obj, SupportsToObj):
         return obj.to_obj()
     else:
         return obj
 
 
+PatchPath = Tuple[Union[LiteralString, Urn], ...]
+PatchOp = Literal["add", "remove", "replace"]
+
+
 @dataclass
-class _Patch:
-    op: str  # one of ['add', 'remove', 'replace']; we don't support move, copy or test
-    path: str
+class _Patch(SupportsToObj):
+    op: PatchOp
+    path: PatchPath
     value: Any
 
     def to_obj(self) -> Dict:
+        quoted_path = "/" + "/".join(MetadataPatchProposal.quote(p) for p in self.path)
         return {
             "op": self.op,
-            "path": self.path,
+            "path": quoted_path,
             "value": _recursive_to_obj(self.value),
         }
 
@@ -63,19 +86,20 @@ class MetadataPatchProposal:
 
     # Json Patch quoting based on https://jsonpatch.com/#json-pointer
     @classmethod
-    def quote(cls, value: str) -> str:
-        return value.replace("~", "~0").replace("/", "~1")
+    def quote(cls, value: Union[str, Urn]) -> str:
+        return str(value).replace("~", "~0").replace("/", "~1")
 
     def _add_patch(
-        self, aspect_name: str, op: str, path: Union[str, Sequence[str]], value: Any
+        self,
+        aspect_name: str,
+        op: PatchOp,
+        path: PatchPath,
+        value: Any,
     ) -> None:
-        if not isinstance(path, str):
-            path = "/" + "/".join(self.quote(p) for p in path)
-
         # TODO: Validate that aspectName is a valid aspect for this entityType
         self.patches[aspect_name].append(_Patch(op, path, value))
 
-    def build(self) -> Iterable[MetadataChangeProposalClass]:
+    def build(self) -> List[MetadataChangeProposalClass]:
         return [
             MetadataChangeProposalClass(
                 entityUrn=self.urn,
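
Path handling moves out of `_add_patch` and into `_Patch.to_obj`: paths are now structured tuples (`PatchPath`) whose segments, including urns that contain `/` or `~`, are escaped with the JSON Pointer rules only at serialization time. A standalone sketch of the escaping (the urn below is illustrative):

def quote(value: str) -> str:
    # JSON Pointer escaping, per https://jsonpatch.com/#json-pointer
    return value.replace("~", "~0").replace("/", "~1")

path = ("upstreams", "urn:li:dataset:(urn:li:dataPlatform:hive,db/table,PROD)")
pointer = "/" + "/".join(quote(p) for p in path)
assert pointer == "/upstreams/urn:li:dataset:(urn:li:dataPlatform:hive,db~1table,PROD)"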

datahub/emitter/rest_emitter.py
@@ -3,7 +3,7 @@ import json
 import logging
 import os
 from json.decoder import JSONDecodeError
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Sequence, Union
 
 import requests
 from deprecated import deprecated
@@ -13,6 +13,7 @@ from requests.exceptions import HTTPError, RequestException
 from datahub import nice_version_name
 from datahub.cli import config_utils
 from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url
+from datahub.cli.env_utils import get_boolean_env_variable
 from datahub.configuration.common import ConfigurationError, OperationalError
 from datahub.emitter.generic_emitter import Emitter
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
@@ -46,6 +47,8 @@ _DEFAULT_RETRY_MAX_TIMES = int(
     os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4")
 )
 
+_DATAHUB_EMITTER_TRACE = get_boolean_env_variable("DATAHUB_EMITTER_TRACE", False)
+
 # The limit is 16mb. We will use a max of 15mb to have some space
 # for overhead like request headers.
 # This applies to pretty much all calls to GMS.
@@ -288,9 +291,11 @@ class DataHubRestEmitter(Closeable, Emitter):
 
     def emit_mcps(
         self,
-        mcps: List[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
+        mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
         async_flag: Optional[bool] = None,
     ) -> int:
+        if _DATAHUB_EMITTER_TRACE:
+            logger.debug(f"Attempting to emit MCP batch of size {len(mcps)}")
         url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
         for mcp in mcps:
             ensure_has_system_metadata(mcp)
@@ -303,15 +308,25 @@ class DataHubRestEmitter(Closeable, Emitter):
         current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
         for mcp_obj in mcp_objs:
             mcp_obj_size = len(json.dumps(mcp_obj))
+            if _DATAHUB_EMITTER_TRACE:
+                logger.debug(
+                    f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}"
+                )
 
             if (
                 mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES
                 or len(mcp_obj_chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
             ):
+                if _DATAHUB_EMITTER_TRACE:
+                    logger.debug("Decided to create new chunk")
                 mcp_obj_chunks.append([])
                 current_chunk_size = 0
             mcp_obj_chunks[-1].append(mcp_obj)
             current_chunk_size += mcp_obj_size
+        if len(mcp_obj_chunks) > 0:
+            logger.debug(
+                f"Decided to send {len(mcps)} MCP batch in {len(mcp_obj_chunks)} chunks"
+            )
 
         for mcp_obj_chunk in mcp_obj_chunks:
             # TODO: We're calling json.dumps on each MCP object twice, once to estimate
@@ -338,8 +353,15 @@ class DataHubRestEmitter(Closeable, Emitter):
 
     def _emit_generic(self, url: str, payload: str) -> None:
         curl_command = make_curl_command(self._session, "POST", url, payload)
+        payload_size = len(payload)
+        if payload_size > INGEST_MAX_PAYLOAD_BYTES:
+            # since we know total payload size here, we could simply avoid sending such payload at all and report a warning, with current approach we are going to cause whole ingestion to fail
+            logger.warning(
+                f"Apparent payload size exceeded {INGEST_MAX_PAYLOAD_BYTES}, might fail with an exception due to the size"
+            )
         logger.debug(
-            "Attempting to emit to DataHub GMS; using curl equivalent to:\n%s",
+            "Attempting to emit aspect (size: %s) to DataHub GMS; using curl equivalent to:\n%s",
+            payload_size,
             curl_command,
         )
         try:
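
`emit_mcps` splits the serialized MCPs greedily into chunks bounded by both a byte budget (`INGEST_MAX_PAYLOAD_BYTES`) and a per-request count limit (`BATCH_INGEST_MAX_PAYLOAD_LENGTH`). A standalone sketch of that chunking, with illustrative limits standing in for the real constants:

import json
from typing import Any, Dict, List

MAX_BYTES = 15 * 1024 * 1024  # stands in for INGEST_MAX_PAYLOAD_BYTES
MAX_ITEMS = 200               # stands in for BATCH_INGEST_MAX_PAYLOAD_LENGTH (illustrative value)


def chunk_mcp_objs(mcp_objs: List[Dict[str, Any]]) -> List[List[Dict[str, Any]]]:
    chunks: List[List[Dict[str, Any]]] = []
    # Start "full" so the first object always opens a fresh chunk; the byte check
    # short-circuits the `or`, so chunks[-1] is never read while chunks is empty.
    current_size = MAX_BYTES
    for obj in mcp_objs:
        size = len(json.dumps(obj))
        if size + current_size > MAX_BYTES or len(chunks[-1]) >= MAX_ITEMS:
            chunks.append([])
            current_size = 0
        chunks[-1].append(obj)
        current_size += size
    return chunks


assert len(chunk_mcp_objs([{"aspectName": "status"}] * 450)) == 3  # 200 + 200 + 50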

datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py
@@ -0,0 +1,98 @@
+import json
+import logging
+from typing import TYPE_CHECKING, Iterable, List
+
+from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES
+from datahub.emitter.serialization_helper import pre_json_transform
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.metadata.schema_classes import (
+    DatasetProfileClass,
+    SchemaFieldClass,
+    SchemaMetadataClass,
+)
+
+if TYPE_CHECKING:
+    from datahub.ingestion.api.source import SourceReport
+
+logger = logging.getLogger(__name__)
+
+
+class EnsureAspectSizeProcessor:
+    def __init__(
+        self, report: "SourceReport", payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES
+    ):
+        self.report = report
+        self.payload_constraint = payload_constraint
+
+    def ensure_dataset_profile_size(
+        self, dataset_urn: str, profile: DatasetProfileClass
+    ) -> None:
+        """
+        This is quite arbitrary approach to ensuring dataset profile aspect does not exceed allowed size, might be adjusted
+        in the future
+        """
+        sample_fields_size = 0
+        if profile.fieldProfiles:
+            logger.debug(f"Length of field profiles: {len(profile.fieldProfiles)}")
+            for field in profile.fieldProfiles:
+                if field.sampleValues:
+                    values_len = 0
+                    for value in field.sampleValues:
+                        if value:
+                            values_len += len(value)
+                    logger.debug(
+                        f"Field {field.fieldPath} has {len(field.sampleValues)} sample values, taking total bytes {values_len}"
+                    )
+                    if sample_fields_size + values_len > self.payload_constraint:
+                        field.sampleValues = []
+                        self.report.warning(
+                            title="Dataset profile truncated due to size constraint",
+                            message="Dataset profile contained too much data and would have caused ingestion to fail",
+                            context=f"Sample values for field {field.fieldPath} were removed from dataset profile for {dataset_urn} due to aspect size constraints",
+                        )
+                    else:
+                        sample_fields_size += values_len
+                else:
+                    logger.debug(f"Field {field.fieldPath} has no sample values")
+
+    def ensure_schema_metadata_size(
+        self, dataset_urn: str, schema: SchemaMetadataClass
+    ) -> None:
+        """
+        This is quite arbitrary approach to ensuring schema metadata aspect does not exceed allowed size, might be adjusted
+        in the future
+        """
+        total_fields_size = 0
+        logger.debug(f"Amount of schema fields: {len(schema.fields)}")
+        accepted_fields: List[SchemaFieldClass] = []
+        for field in schema.fields:
+            field_size = len(json.dumps(pre_json_transform(field.to_obj())))
+            logger.debug(f"Field {field.fieldPath} takes total {field_size}")
+            if total_fields_size + field_size < self.payload_constraint:
+                accepted_fields.append(field)
+                total_fields_size += field_size
+            else:
+                self.report.warning(
+                    title="Schema truncated due to size constraint",
+                    message="Dataset schema contained too much data and would have caused ingestion to fail",
+                    context=f"Field {field.fieldPath} was removed from schema for {dataset_urn} due to aspect size constraints",
+                )
+
+        schema.fields = accepted_fields
+
+    def ensure_aspect_size(
+        self,
+        stream: Iterable[MetadataWorkUnit],
+    ) -> Iterable[MetadataWorkUnit]:
+        """
+        We have hard limitation of aspect size being 16 MB. Some aspects can exceed that value causing an exception
+        on GMS side and failure of the entire ingestion. This processor will attempt to trim suspected aspects.
+        """
+        for wu in stream:
+            logger.debug(f"Ensuring size of workunit: {wu.id}")
+
+            if schema := wu.get_aspect_of_type(SchemaMetadataClass):
+                self.ensure_schema_metadata_size(wu.get_urn(), schema)
+            elif profile := wu.get_aspect_of_type(DatasetProfileClass):
+                self.ensure_dataset_profile_size(wu.get_urn(), profile)
+            yield wu
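
As the datahub/ingestion/api/source.py hunk further below shows, this processor is appended to every source's default workunit processors, so the trimming happens automatically during ingestion. Its methods can also be exercised directly; a small sketch with a deliberately tiny byte budget (the urn and values are illustrative):

from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
    EnsureAspectSizeProcessor,
)
from datahub.ingestion.api.source import SourceReport
from datahub.metadata.schema_classes import DatasetFieldProfileClass, DatasetProfileClass

profile = DatasetProfileClass(
    timestampMillis=1700000000000,
    fieldProfiles=[
        DatasetFieldProfileClass(fieldPath="payload", sampleValues=["x" * 1024] * 64),
        DatasetFieldProfileClass(fieldPath="id", sampleValues=["1", "2", "3"]),
    ],
)

# Tiny budget to force truncation; the default is INGEST_MAX_PAYLOAD_BYTES (~15 MB).
processor = EnsureAspectSizeProcessor(SourceReport(), payload_constraint=10_000)
processor.ensure_dataset_profile_size(
    "urn:li:dataset:(urn:li:dataPlatform:hive,db.events,PROD)", profile
)
print([len(f.sampleValues or []) for f in profile.fieldProfiles or []])
# Expected: the oversized field's sample values are dropped, the small field keeps its samples.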

datahub/ingestion/api/closeable.py
@@ -1,9 +1,9 @@
 from abc import abstractmethod
 from contextlib import AbstractContextManager
 from types import TracebackType
-from typing import Optional, Type, TypeVar
+from typing import Optional, Type
 
-_Self = TypeVar("_Self", bound="Closeable")
+from typing_extensions import Self
 
 
 class Closeable(AbstractContextManager):
@@ -11,7 +11,7 @@ class Closeable(AbstractContextManager):
     def close(self) -> None:
         pass
 
-    def __enter__(self: _Self) -> _Self:
+    def __enter__(self) -> Self:
         # This method is mainly required for type checking.
         return self
 

datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py
@@ -1,6 +1,8 @@
 from abc import abstractmethod
 from dataclasses import dataclass
-from typing import Any, Dict, NewType, Optional, Type, TypeVar
+from typing import Any, Dict, NewType, Optional
+
+from typing_extensions import Self
 
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import ConfigModel
@@ -17,9 +19,6 @@ class IngestionCheckpointingProviderConfig(ConfigModel):
     pass
 
 
-_Self = TypeVar("_Self", bound="IngestionCheckpointingProviderBase")
-
-
 @dataclass()
 class IngestionCheckpointingProviderBase(StatefulCommittable[CheckpointJobStatesMap]):
     """
@@ -32,9 +31,7 @@ class IngestionCheckpointingProviderBase(StatefulCommittable[CheckpointJobStatesMap]):
 
     @classmethod
     @abstractmethod
-    def create(
-        cls: Type[_Self], config_dict: Dict[str, Any], ctx: PipelineContext
-    ) -> "_Self":
+    def create(cls, config_dict: Dict[str, Any], ctx: PipelineContext) -> Self:
         pass
 
     @abstractmethod

datahub/ingestion/api/report.py
@@ -42,7 +42,10 @@ class Report(SupportsAsObj):
             return some_val.as_obj()
         elif isinstance(some_val, pydantic.BaseModel):
             return Report.to_pure_python_obj(some_val.dict())
-        elif dataclasses.is_dataclass(some_val):
+        elif dataclasses.is_dataclass(some_val) and not isinstance(some_val, type):
+            # The `is_dataclass` function returns `True` for both instances and classes.
+            # We need an extra check to ensure an instance was passed in.
+            # https://docs.python.org/3/library/dataclasses.html#dataclasses.is_dataclass
             return dataclasses.asdict(some_val)
         elif isinstance(some_val, list):
             return [Report.to_pure_python_obj(v) for v in some_val if v is not None]
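
The extra `isinstance(some_val, type)` guard matters because `dataclasses.is_dataclass` returns `True` for both a dataclass class object and its instances, while `dataclasses.asdict` only accepts instances:

import dataclasses


@dataclasses.dataclass
class Point:
    x: int = 0


assert dataclasses.is_dataclass(Point)          # True for the class object itself
assert dataclasses.is_dataclass(Point())        # True for an instance
assert dataclasses.asdict(Point()) == {"x": 0}
# dataclasses.asdict(Point) raises TypeError: asdict() should be called on dataclass instances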

datahub/ingestion/api/sink.py
@@ -3,6 +3,8 @@ from abc import ABCMeta, abstractmethod
 from dataclasses import dataclass, field
 from typing import Any, Generic, Optional, Type, TypeVar, cast
 
+from typing_extensions import Self
+
 from datahub.configuration.common import ConfigModel
 from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
@@ -79,7 +81,6 @@ class NoopWriteCallback(WriteCallback):
 
 SinkReportType = TypeVar("SinkReportType", bound=SinkReport, covariant=True)
 SinkConfig = TypeVar("SinkConfig", bound=ConfigModel, covariant=True)
-Self = TypeVar("Self", bound="Sink")
 
 
 class Sink(Generic[SinkConfig, SinkReportType], Closeable, metaclass=ABCMeta):
@@ -90,7 +91,7 @@ class Sink(Generic[SinkConfig, SinkReportType], Closeable, metaclass=ABCMeta):
     report: SinkReportType
 
     @classmethod
-    def get_config_class(cls: Type[Self]) -> Type[SinkConfig]:
+    def get_config_class(cls) -> Type[SinkConfig]:
         config_class = get_class_from_annotation(cls, Sink, ConfigModel)
         assert config_class, "Sink subclasses must define a config class"
         return cast(Type[SinkConfig], config_class)
@@ -112,7 +113,7 @@ class Sink(Generic[SinkConfig, SinkReportType], Closeable, metaclass=ABCMeta):
         pass
 
     @classmethod
-    def create(cls: Type[Self], config_dict: dict, ctx: PipelineContext) -> "Self":
+    def create(cls, config_dict: dict, ctx: PipelineContext) -> "Self":
         return cls(ctx, cls.get_config_class().parse_obj(config_dict))
 
     def handle_work_unit_start(self, workunit: WorkUnit) -> None:

datahub/ingestion/api/source.py
@@ -31,6 +31,9 @@ from datahub.emitter.mcp_builder import mcps_from_mce
 from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
     auto_patch_last_modified,
 )
+from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
+    EnsureAspectSizeProcessor,
+)
 from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
 from datahub.ingestion.api.report import Report
@@ -450,6 +453,7 @@ class Source(Closeable, metaclass=ABCMeta):
             browse_path_processor,
             partial(auto_workunit_reporter, self.get_report()),
             auto_patch_last_modified,
+            EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
         ]
 
     @staticmethod

datahub/ingestion/api/source_helpers.py
@@ -1,5 +1,4 @@
 import logging
-from datetime import datetime, timezone
 from typing import (
     TYPE_CHECKING,
     Dict,
@@ -14,7 +13,7 @@ from typing import (
 )
 
 from datahub.configuration.time_window_config import BaseTimeWindowConfig
-from datahub.emitter.mce_builder import make_dataplatform_instance_urn
+from datahub.emitter.mce_builder import make_dataplatform_instance_urn, parse_ts_millis
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import entity_supports_aspect
 from datahub.ingestion.api.workunit import MetadataWorkUnit
@@ -479,10 +478,7 @@ def auto_empty_dataset_usage_statistics(
     if invalid_timestamps:
         logger.warning(
             f"Usage statistics with unexpected timestamps, bucket_duration={config.bucket_duration}:\n"
-            ", ".join(
-                str(datetime.fromtimestamp(ts / 1000, tz=timezone.utc))
-                for ts in invalid_timestamps
-            )
+            ", ".join(str(parse_ts_millis(ts)) for ts in invalid_timestamps)
         )
 
     for bucket in bucket_timestamps:

datahub/ingestion/glossary/classifier.py
@@ -1,4 +1,3 @@
-import os
 from abc import ABCMeta, abstractmethod
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional
@@ -38,8 +37,8 @@ class ClassificationConfig(ConfigModel):
     )
 
     max_workers: int = Field(
-        default=(os.cpu_count() or 4),
-        description="Number of worker processes to use for classification. Set to 1 to disable.",
+        default=1,
+        description="Number of worker processes to use for classification. Note that any number above 1 might lead to a deadlock. Set to 1 to disable.",
     )
 
     table_pattern: AllowDenyPattern = Field(

datahub/ingestion/graph/client.py
@@ -188,9 +188,12 @@ class DataHubGraph(DatahubRestEmitter):
                 retry_max_times=emitter._retry_max_times,
                 extra_headers=emitter._session.headers,
                 disable_ssl_verification=emitter._session.verify is False,
-                # TODO: Support these headers.
-                # ca_certificate_path=emitter._ca_certificate_path,
-                # client_certificate_path=emitter._client_certificate_path,
+                ca_certificate_path=(
+                    emitter._session.verify
+                    if isinstance(emitter._session.verify, str)
+                    else None
+                ),
+                client_certificate_path=emitter._session.cert,
             )
         )