acryl-datahub 1.2.0.3rc1__py3-none-any.whl → 1.2.0.4rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (38)
  1. {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4rc1.dist-info}/METADATA +2535 -2535
  2. {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4rc1.dist-info}/RECORD +38 -38
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/external/external_tag.py +6 -4
  5. datahub/api/entities/external/lake_formation_external_entites.py +50 -49
  6. datahub/api/entities/external/restricted_text.py +107 -182
  7. datahub/api/entities/external/unity_catalog_external_entites.py +51 -52
  8. datahub/emitter/rest_emitter.py +18 -5
  9. datahub/ingestion/api/source.py +81 -7
  10. datahub/ingestion/autogenerated/capability_summary.json +47 -19
  11. datahub/ingestion/graph/client.py +19 -3
  12. datahub/ingestion/sink/datahub_rest.py +2 -0
  13. datahub/ingestion/source/abs/source.py +9 -0
  14. datahub/ingestion/source/aws/glue.py +18 -2
  15. datahub/ingestion/source/aws/tag_entities.py +2 -2
  16. datahub/ingestion/source/datahub/datahub_source.py +8 -1
  17. datahub/ingestion/source/dbt/dbt_common.py +10 -0
  18. datahub/ingestion/source/delta_lake/source.py +8 -1
  19. datahub/ingestion/source/dremio/dremio_source.py +19 -2
  20. datahub/ingestion/source/fivetran/fivetran.py +9 -3
  21. datahub/ingestion/source/ge_data_profiler.py +8 -0
  22. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  23. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  24. datahub/ingestion/source/mock_data/datahub_mock_data.py +26 -10
  25. datahub/ingestion/source/powerbi/powerbi.py +4 -1
  26. datahub/ingestion/source/redshift/redshift.py +1 -0
  27. datahub/ingestion/source/salesforce.py +8 -0
  28. datahub/ingestion/source/sql/athena_properties_extractor.py +2 -2
  29. datahub/ingestion/source/sql/hive_metastore.py +8 -0
  30. datahub/ingestion/source/sql/teradata.py +8 -1
  31. datahub/ingestion/source/sql/trino.py +9 -0
  32. datahub/ingestion/source/unity/tag_entities.py +3 -3
  33. datahub/sdk/entity_client.py +22 -7
  34. datahub/utilities/mapping.py +29 -2
  35. {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4rc1.dist-info}/WHEEL +0 -0
  36. {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4rc1.dist-info}/entry_points.txt +0 -0
  37. {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4rc1.dist-info}/licenses/LICENSE +0 -0
  38. {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4rc1.dist-info}/top_level.txt +0 -0
@@ -11,41 +11,12 @@ Features:
11
11
 
12
12
  from __future__ import annotations
13
13
 
14
- from typing import Any, ClassVar, Optional, Set, Union
14
+ from typing import ClassVar, Optional, Set
15
15
 
16
- # Check Pydantic version and import accordingly
17
- try:
18
- from pydantic import VERSION
16
+ from datahub.configuration.common import ConfigModel
19
17
 
20
- PYDANTIC_V2 = int(VERSION.split(".")[0]) >= 2
21
- except (ImportError, AttributeError):
22
- # Fallback for older versions that don't have VERSION
23
- PYDANTIC_V2 = False
24
18
 
25
- if PYDANTIC_V2:
26
- from pydantic import GetCoreSchemaHandler # type: ignore[attr-defined]
27
- from pydantic_core import core_schema
28
- else:
29
- from pydantic.validators import str_validator
30
-
31
-
32
- class RestrictedTextConfig:
33
- """Configuration class for RestrictedText."""
34
-
35
- def __init__(
36
- self,
37
- max_length: Optional[int] = None,
38
- forbidden_chars: Optional[Set[str]] = None,
39
- replacement_char: Optional[str] = None,
40
- truncation_suffix: Optional[str] = None,
41
- ):
42
- self.max_length = max_length
43
- self.forbidden_chars = forbidden_chars
44
- self.replacement_char = replacement_char
45
- self.truncation_suffix = truncation_suffix
46
-
47
-
48
- class RestrictedText(str):
19
+ class RestrictedText(ConfigModel):
49
20
  """A string type that stores the original value but returns a truncated and sanitized version.
50
21
 
51
22
  This type allows you to:
@@ -60,8 +31,9 @@ class RestrictedText(str):
60
31
  # Basic usage with default settings
61
32
  name: RestrictedText
62
33
 
63
- # Custom max length and character replacement using Field
64
- custom_field: RestrictedText = RestrictedText.with_config(
34
+ # Custom max length and character replacement
35
+ custom_field: RestrictedText = RestrictedText(
36
+ text="hello-world.test",
65
37
  max_length=10,
66
38
  forbidden_chars={' ', '-', '.'},
67
39
  replacement_char='_'
@@ -73,175 +45,128 @@ class RestrictedText(str):
73
45
  custom_field="hello-world.test"
74
46
  )
75
47
 
76
- print(model.name) # Truncated and sanitized version
77
- print(model.name.original) # Original value
78
- print(model.custom_field) # "hello_worl..."
48
+ # model.name returns truncated and sanitized version
49
+ # model.name.raw_text returns original value
50
+ # model.custom_field returns "hello_worl..."
79
51
  ```
80
52
  """
81
53
 
82
54
  # Default configuration
83
- _default_max_length: ClassVar[Optional[int]] = 50
84
- _default_forbidden_chars: ClassVar[Set[str]] = {" ", "\t", "\n", "\r"}
85
- _default_replacement_char: ClassVar[str] = "_"
86
- _default_truncation_suffix: ClassVar[str] = "..."
87
-
88
- def __new__(cls, value: str = "") -> "RestrictedText":
89
- """Create a new string instance."""
90
- instance = str.__new__(cls, "") # We'll set the display value later
91
- return instance
92
-
93
- def __init__(self, value: str = ""):
94
- """Initialize the RestrictedText with a value."""
95
- self.original: str = value
96
- self.max_length = self._default_max_length
97
- self.forbidden_chars = self._default_forbidden_chars
98
- self.replacement_char = self._default_replacement_char
99
- self.truncation_suffix = self._default_truncation_suffix
100
-
101
- # Process the value
102
- self._processed_value = self._process_value(value)
103
-
104
- def _configure(
55
+ DEFAULT_MAX_LENGTH: ClassVar[Optional[int]] = 50
56
+ DEFAULT_FORBIDDEN_CHARS: ClassVar[Set[str]] = {" ", "\t", "\n", "\r"}
57
+ DEFAULT_REPLACEMENT_CHAR: ClassVar[str] = "_"
58
+ DEFAULT_TRUNCATION_SUFFIX: ClassVar[str] = "..."
59
+
60
+ raw_text: str
61
+ max_length: Optional[int] = None
62
+ forbidden_chars: Optional[Set[str]] = None
63
+ replacement_char: Optional[str] = None
64
+ truncation_suffix: Optional[str] = None
65
+ _processed_value: Optional[str] = None
66
+
67
+ def __init__(self, **data):
68
+ super().__init__(**data)
69
+ self.validate_text()
70
+
71
+ @classmethod
72
+ def __get_validators__(cls):
73
+ yield cls.pydantic_accept_raw_text
74
+ yield cls.validate
75
+ yield cls.pydantic_validate_text
76
+
77
+ @classmethod
78
+ def pydantic_accept_raw_text(cls, v):
79
+ if isinstance(v, (RestrictedText, dict)):
80
+ return v
81
+ assert isinstance(v, str), "text must be a string"
82
+ return {"text": v}
83
+
84
+ @classmethod
85
+ def pydantic_validate_text(cls, v):
86
+ assert isinstance(v, RestrictedText)
87
+ assert v.validate_text()
88
+ return v
89
+
90
+ @classmethod
91
+ def validate(cls, v):
92
+ """Validate and create a RestrictedText instance."""
93
+ if isinstance(v, RestrictedText):
94
+ return v
95
+
96
+ # This should be a dict at this point from pydantic_accept_raw_text
97
+ if isinstance(v, dict):
98
+ instance = cls(**v)
99
+ instance.validate_text()
100
+ return instance
101
+
102
+ raise ValueError(f"Unable to validate RestrictedText from {type(v)}")
103
+
104
+ def validate_text(self) -> bool:
105
+ """Validate the text and apply restrictions."""
106
+ # Set defaults if not provided
107
+ max_length = (
108
+ self.max_length if self.max_length is not None else self.DEFAULT_MAX_LENGTH
109
+ )
110
+ forbidden_chars = (
111
+ self.forbidden_chars
112
+ if self.forbidden_chars is not None
113
+ else self.DEFAULT_FORBIDDEN_CHARS
114
+ )
115
+ replacement_char = (
116
+ self.replacement_char
117
+ if self.replacement_char is not None
118
+ else self.DEFAULT_REPLACEMENT_CHAR
119
+ )
120
+ truncation_suffix = (
121
+ self.truncation_suffix
122
+ if self.truncation_suffix is not None
123
+ else self.DEFAULT_TRUNCATION_SUFFIX
124
+ )
125
+
126
+ # Store processed value
127
+ self._processed_value = self._process_value(
128
+ self.raw_text,
129
+ max_length,
130
+ forbidden_chars,
131
+ replacement_char,
132
+ truncation_suffix,
133
+ )
134
+ return True
135
+
136
+ def _process_value(
105
137
  self,
106
- max_length: Optional[int] = None,
107
- forbidden_chars: Optional[Set[str]] = None,
108
- replacement_char: Optional[str] = None,
109
- truncation_suffix: Optional[str] = None,
110
- ) -> "RestrictedText":
111
- """Configure this instance with custom settings."""
112
- if max_length is not None:
113
- self.max_length = max_length
114
- if forbidden_chars is not None:
115
- self.forbidden_chars = forbidden_chars
116
- if replacement_char is not None:
117
- self.replacement_char = replacement_char
118
- if truncation_suffix is not None:
119
- self.truncation_suffix = truncation_suffix
120
-
121
- # Reprocess the value with new configuration
122
- self._processed_value = self._process_value(self.original)
123
- return self
124
-
125
- def _process_value(self, value: str) -> str:
138
+ value: str,
139
+ max_length: Optional[int],
140
+ forbidden_chars: Set[str],
141
+ replacement_char: str,
142
+ truncation_suffix: str,
143
+ ) -> str:
126
144
  """Process the value by replacing characters and truncating."""
127
145
  # Replace specified characters
128
146
  processed = value
129
- for char in self.forbidden_chars:
130
- processed = processed.replace(char, self.replacement_char)
147
+ for char in forbidden_chars:
148
+ processed = processed.replace(char, replacement_char)
131
149
 
132
150
  # Truncate if necessary
133
- if self.max_length is not None and len(processed) > self.max_length:
134
- if len(self.truncation_suffix) >= self.max_length:
151
+ if max_length is not None and len(processed) > max_length:
152
+ if len(truncation_suffix) >= max_length:
135
153
  # If suffix is too long, just truncate without suffix
136
- processed = processed[: self.max_length]
154
+ processed = processed[:max_length]
137
155
  else:
138
156
  # Truncate and add suffix
139
- truncate_length = self.max_length - len(self.truncation_suffix)
140
- processed = processed[:truncate_length] + self.truncation_suffix
157
+ truncate_length = max_length - len(truncation_suffix)
158
+ processed = processed[:truncate_length] + truncation_suffix
141
159
 
142
160
  return processed
143
161
 
144
162
  def __str__(self) -> str:
145
163
  """Return the processed (truncated and sanitized) value."""
146
- return self._processed_value
164
+ return self._processed_value or ""
147
165
 
148
166
  def __repr__(self) -> str:
149
- return f"{self.__class__.__name__}({self._processed_value!r})"
167
+ return f"{self.__class__.__name__}({self.raw_text!r})"
150
168
 
151
169
  @property
152
170
  def processed(self) -> str:
153
171
  """Get the processed (truncated and sanitized) value."""
154
- return self._processed_value
155
-
156
- @classmethod
157
- def with_config(
158
- cls,
159
- max_length: Optional[int] = None,
160
- forbidden_chars: Optional[Set[str]] = None,
161
- replacement_char: Optional[str] = None,
162
- truncation_suffix: Optional[str] = None,
163
- ) -> RestrictedTextConfig:
164
- """Create a configuration object for use as field default.
165
-
166
- Args:
167
- max_length: Maximum length of the processed string
168
- forbidden_chars: Set of characters to replace
169
- replacement_char: Character to use as replacement
170
- truncation_suffix: Suffix to add when truncating
171
-
172
- Returns:
173
- A configuration object that can be used as field default
174
- """
175
- return RestrictedTextConfig(
176
- max_length=max_length,
177
- forbidden_chars=forbidden_chars,
178
- replacement_char=replacement_char,
179
- truncation_suffix=truncation_suffix,
180
- )
181
-
182
- # Pydantic v2 methods
183
- if PYDANTIC_V2:
184
-
185
- @classmethod
186
- def _validate(
187
- cls,
188
- __input_value: Union[str, "RestrictedText"],
189
- _: core_schema.ValidationInfo,
190
- ) -> "RestrictedText":
191
- """Validate and create a RestrictedText instance."""
192
- if isinstance(__input_value, RestrictedText):
193
- return __input_value
194
- return cls(__input_value)
195
-
196
- @classmethod
197
- def __get_pydantic_core_schema__(
198
- cls, source: type[Any], handler: GetCoreSchemaHandler
199
- ) -> core_schema.CoreSchema:
200
- """Get the Pydantic core schema for this type."""
201
- return core_schema.with_info_after_validator_function(
202
- cls._validate,
203
- core_schema.str_schema(),
204
- field_name=cls.__name__,
205
- )
206
-
207
- # Pydantic v1 methods
208
- else:
209
-
210
- @classmethod
211
- def __get_validators__(cls):
212
- """Pydantic v1 validator method."""
213
- yield cls.validate
214
-
215
- @classmethod
216
- def validate(cls, v, field=None):
217
- """Validate and create a RestrictedText instance for Pydantic v1."""
218
- if isinstance(v, RestrictedText):
219
- return v
220
-
221
- if not isinstance(v, str):
222
- # Let pydantic handle the string validation
223
- v = str_validator(v)
224
-
225
- # Create instance
226
- instance = cls(v)
227
-
228
- # Check if there's a field default that contains configuration
229
- if (
230
- field
231
- and hasattr(field, "default")
232
- and isinstance(field.default, RestrictedTextConfig)
233
- ):
234
- config = field.default
235
- instance._configure(
236
- max_length=config.max_length,
237
- forbidden_chars=config.forbidden_chars,
238
- replacement_char=config.replacement_char,
239
- truncation_suffix=config.truncation_suffix,
240
- )
241
-
242
- return instance
243
-
244
- @classmethod
245
- def __modify_schema__(cls, field_schema):
246
- """Modify the JSON schema for Pydantic v1."""
247
- field_schema.update(type="string", examples=["example string"])
172
+ return self._processed_value or ""
@@ -10,8 +10,10 @@
10
10
  # Tag search using the workspace search UI is supported only for tables, views, and table columns.
11
11
  # Tag search requires exact term matching.
12
12
  # https://learn.microsoft.com/en-us/azure/databricks/database-objects/tags#constraint
13
- from typing import Any, Dict, Optional, Set, Union
13
+ from typing import Any, Dict, Optional, Set
14
14
 
15
+ # Import validator for Pydantic v1 (always needed since we removed conditional logic)
16
+ from pydantic import validator
15
17
  from typing_extensions import ClassVar
16
18
 
17
19
  from datahub.api.entities.external.external_tag import ExternalTag
@@ -21,9 +23,9 @@ from datahub.api.entities.external.restricted_text import RestrictedText
21
23
  class UnityCatalogTagKeyText(RestrictedText):
22
24
  """RestrictedText configured for Unity Catalog tag keys."""
23
25
 
24
- _default_max_length: ClassVar[int] = 255
25
- # Unity Catalog tag keys: alphanumeric, hyphens, underscores, periods only
26
- _default_forbidden_chars: ClassVar[Set[str]] = {
26
+ DEFAULT_MAX_LENGTH: ClassVar[int] = 255
27
+ # Unity Catalog tag keys: forbidden characters based on constraints
28
+ DEFAULT_FORBIDDEN_CHARS: ClassVar[Set[str]] = {
27
29
  "\t",
28
30
  "\n",
29
31
  "\r",
@@ -34,18 +36,18 @@ class UnityCatalogTagKeyText(RestrictedText):
34
36
  "/",
35
37
  ":",
36
38
  }
37
- _default_replacement_char: ClassVar[str] = "_"
38
- _default_truncation_suffix: ClassVar[str] = "" # No suffix for clean identifiers
39
+ DEFAULT_REPLACEMENT_CHAR: ClassVar[str] = "_"
40
+ DEFAULT_TRUNCATION_SUFFIX: ClassVar[str] = "" # No suffix for clean identifiers
39
41
 
40
42
 
41
43
  class UnityCatalogTagValueText(RestrictedText):
42
44
  """RestrictedText configured for Unity Catalog tag values."""
43
45
 
44
- _default_max_length: ClassVar[int] = 1000
46
+ DEFAULT_MAX_LENGTH: ClassVar[int] = 1000
45
47
  # Unity Catalog tag values are more permissive but still have some restrictions
46
- _default_forbidden_chars: ClassVar[Set[str]] = {"\t", "\n", "\r"}
47
- _default_replacement_char: ClassVar[str] = " "
48
- _default_truncation_suffix: ClassVar[str] = "..."
48
+ DEFAULT_FORBIDDEN_CHARS: ClassVar[Set[str]] = {"\t", "\n", "\r"}
49
+ DEFAULT_REPLACEMENT_CHAR: ClassVar[str] = " "
50
+ DEFAULT_TRUNCATION_SUFFIX: ClassVar[str] = "..."
49
51
 
50
52
 
51
53
  class UnityCatalogTag(ExternalTag):
@@ -60,46 +62,43 @@ class UnityCatalogTag(ExternalTag):
60
62
  key: UnityCatalogTagKeyText
61
63
  value: Optional[UnityCatalogTagValueText] = None
62
64
 
63
- def __init__(
64
- self,
65
- key: Optional[Union[str, UnityCatalogTagKeyText]] = None,
66
- value: Optional[Union[str, UnityCatalogTagValueText]] = None,
67
- **data: Any,
68
- ) -> None:
69
- """
70
- Initialize UnityCatalogTag from either a DataHub Tag URN or explicit key/value.
65
+ # Pydantic v1 validators
66
+ @validator("key", pre=True)
67
+ @classmethod
68
+ def _validate_key(cls, v: Any) -> UnityCatalogTagKeyText:
69
+ """Validate and convert key field for Pydantic v1."""
70
+ if isinstance(v, UnityCatalogTagKeyText):
71
+ return v
71
72
 
72
- Args:
73
- key: Explicit key value (optional for Pydantic initialization)
74
- value: Explicit value (optional)
75
- **data: Additional Pydantic data
76
- """
77
- if key is not None:
78
- # Direct initialization with key/value
79
- processed_key = (
80
- UnityCatalogTagKeyText(key)
81
- if not isinstance(key, UnityCatalogTagKeyText)
82
- else key
83
- )
84
- processed_value = None
85
- if value is not None:
86
- processed_value = (
87
- UnityCatalogTagValueText(value)
88
- if not isinstance(value, UnityCatalogTagValueText)
89
- else value
90
- )
91
- # If value is an empty string, set it to None to not generater empty value in DataHub tag which results in key: tags
92
- if not str(value):
93
- processed_value = None
94
-
95
- super().__init__(
96
- key=processed_key,
97
- value=processed_value,
98
- **data,
99
- )
100
- else:
101
- # Standard pydantic initialization
102
- super().__init__(**data)
73
+ # If we get a RestrictedText object from parent class validation, use its raw_text value
74
+ if hasattr(v, "raw_text"):
75
+ return UnityCatalogTagKeyText(raw_text=v.raw_text)
76
+
77
+ return UnityCatalogTagKeyText(raw_text=v)
78
+
79
+ @validator("value", pre=True)
80
+ @classmethod
81
+ def _validate_value(cls, v: Any) -> Optional[UnityCatalogTagValueText]:
82
+ """Validate and convert value field for Pydantic v1."""
83
+ if v is None:
84
+ return None
85
+
86
+ if isinstance(v, UnityCatalogTagValueText):
87
+ return v
88
+
89
+ # If we get a RestrictedText object from parent class validation, use its raw_text value
90
+ if hasattr(v, "raw_text"):
91
+ text_value = v.raw_text
92
+ # If value is an empty string, set it to None to not generate empty value in DataHub tag
93
+ if not str(text_value):
94
+ return None
95
+ return UnityCatalogTagValueText(raw_text=text_value)
96
+
97
+ # If value is an empty string, set it to None to not generate empty value in DataHub tag
98
+ if not str(v):
99
+ return None
100
+
101
+ return UnityCatalogTagValueText(raw_text=v)
103
102
 
104
103
  def __eq__(self, other: object) -> bool:
105
104
  """Check equality based on key and value."""
@@ -124,7 +123,7 @@ class UnityCatalogTag(ExternalTag):
124
123
  Returns:
125
124
  UnityCatalogTag instance
126
125
  """
127
- return cls(key=tag_dict["key"], value=tag_dict.get("value"))
126
+ return cls(**tag_dict)
128
127
 
129
128
  @classmethod
130
129
  def from_key_value(cls, key: str, value: Optional[str] = None) -> "UnityCatalogTag":
@@ -149,9 +148,9 @@ class UnityCatalogTag(ExternalTag):
149
148
  Returns:
150
149
  Dictionary with 'key' and optionally 'value'
151
150
  """
152
- result: Dict[str, str] = {"key": self.key.original}
151
+ result: Dict[str, str] = {"key": self.key.raw_text}
153
152
  if self.value is not None:
154
- result["value"] = self.value.original
153
+ result["value"] = self.value.raw_text
155
154
  return result
156
155
 
157
156
  def to_display_dict(self) -> Dict[str, str]:
@@ -95,7 +95,7 @@ TRACE_INITIAL_BACKOFF = 1.0 # Start with 1 second
95
95
  TRACE_MAX_BACKOFF = 300.0 # Cap at 5 minutes
96
96
  TRACE_BACKOFF_FACTOR = 2.0 # Double the wait time each attempt
97
97
 
98
- # The limit is 16mb. We will use a max of 15mb to have some space
98
+ # The limit is 16,000,000 bytes. We will use a max of 15mb to have some space
99
99
  # for overhead like request headers.
100
100
  # This applies to pretty much all calls to GMS.
101
101
  INGEST_MAX_PAYLOAD_BYTES = int(
@@ -586,6 +586,11 @@ class DataHubRestEmitter(Closeable, Emitter):
586
586
  "systemMetadata": system_metadata_obj,
587
587
  }
588
588
  payload = json.dumps(snapshot)
589
+ if len(payload) > INGEST_MAX_PAYLOAD_BYTES:
590
+ logger.warning(
591
+ f"MCE object has size {len(payload)} that exceeds the max payload size of {INGEST_MAX_PAYLOAD_BYTES}, "
592
+ "so this metadata will likely fail to be emitted."
593
+ )
589
594
 
590
595
  self._emit_generic(url, payload)
591
596
 
@@ -764,16 +769,24 @@ class DataHubRestEmitter(Closeable, Emitter):
764
769
  url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
765
770
 
766
771
  mcp_objs = [pre_json_transform(mcp.to_obj()) for mcp in mcps]
772
+ if len(mcp_objs) == 0:
773
+ return 0
767
774
 
768
775
  # As a safety mechanism, we need to make sure we don't exceed the max payload size for GMS.
769
776
  # If we will exceed the limit, we need to break it up into chunks.
770
- mcp_obj_chunks: List[List[str]] = []
771
- current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
777
+ mcp_obj_chunks: List[List[str]] = [[]]
778
+ current_chunk_size = 0
772
779
  for mcp_obj in mcp_objs:
780
+ mcp_identifier = f"{mcp_obj.get('entityUrn')}-{mcp_obj.get('aspectName')}"
773
781
  mcp_obj_size = len(json.dumps(mcp_obj))
774
782
  if _DATAHUB_EMITTER_TRACE:
775
783
  logger.debug(
776
- f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}"
784
+ f"Iterating through object ({mcp_identifier}) with size {mcp_obj_size}"
785
+ )
786
+ if mcp_obj_size > INGEST_MAX_PAYLOAD_BYTES:
787
+ logger.warning(
788
+ f"MCP object {mcp_identifier} has size {mcp_obj_size} that exceeds the max payload size of {INGEST_MAX_PAYLOAD_BYTES}, "
789
+ "so this metadata will likely fail to be emitted."
777
790
  )
778
791
 
779
792
  if (
@@ -786,7 +799,7 @@ class DataHubRestEmitter(Closeable, Emitter):
786
799
  current_chunk_size = 0
787
800
  mcp_obj_chunks[-1].append(mcp_obj)
788
801
  current_chunk_size += mcp_obj_size
789
- if len(mcp_obj_chunks) > 0:
802
+ if len(mcp_obj_chunks) > 1 or _DATAHUB_EMITTER_TRACE:
790
803
  logger.debug(
791
804
  f"Decided to send {len(mcps)} MCP batch in {len(mcp_obj_chunks)} chunks"
792
805
  )