acryl-datahub 0.15.0.1rc16__py3-none-any.whl → 0.15.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of acryl-datahub was flagged by the registry as a potentially problematic release.

Files changed (214)
  1. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2462 -2460
  2. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +214 -210
  3. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/assertion/assertion_operator.py +3 -5
  6. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  7. datahub/api/entities/datacontract/assertion_operator.py +3 -5
  8. datahub/api/entities/dataproduct/dataproduct.py +4 -4
  9. datahub/api/entities/dataset/dataset.py +2 -1
  10. datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
  11. datahub/cli/cli_utils.py +13 -2
  12. datahub/cli/delete_cli.py +3 -3
  13. datahub/cli/docker_cli.py +6 -6
  14. datahub/cli/ingest_cli.py +25 -15
  15. datahub/cli/lite_cli.py +2 -2
  16. datahub/cli/migrate.py +5 -5
  17. datahub/cli/specific/assertions_cli.py +3 -3
  18. datahub/cli/specific/structuredproperties_cli.py +84 -0
  19. datahub/cli/timeline_cli.py +1 -1
  20. datahub/configuration/common.py +1 -2
  21. datahub/configuration/config_loader.py +73 -50
  22. datahub/configuration/git.py +2 -2
  23. datahub/configuration/time_window_config.py +10 -5
  24. datahub/emitter/mce_builder.py +4 -8
  25. datahub/emitter/mcp_builder.py +27 -0
  26. datahub/emitter/mcp_patch_builder.py +1 -2
  27. datahub/emitter/rest_emitter.py +141 -93
  28. datahub/entrypoints.py +6 -0
  29. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +5 -3
  30. datahub/ingestion/api/incremental_lineage_helper.py +2 -8
  31. datahub/ingestion/api/report.py +1 -2
  32. datahub/ingestion/api/source.py +8 -2
  33. datahub/ingestion/api/source_helpers.py +1 -1
  34. datahub/ingestion/extractor/json_schema_util.py +3 -3
  35. datahub/ingestion/extractor/schema_util.py +3 -5
  36. datahub/ingestion/fs/s3_fs.py +3 -3
  37. datahub/ingestion/glossary/classifier.py +2 -3
  38. datahub/ingestion/glossary/datahub_classifier.py +6 -4
  39. datahub/ingestion/graph/client.py +22 -19
  40. datahub/ingestion/graph/config.py +1 -1
  41. datahub/ingestion/run/pipeline.py +8 -7
  42. datahub/ingestion/run/pipeline_config.py +3 -3
  43. datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
  44. datahub/ingestion/source/abs/source.py +19 -8
  45. datahub/ingestion/source/aws/glue.py +77 -47
  46. datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
  47. datahub/ingestion/source/aws/s3_util.py +24 -1
  48. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  49. datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
  50. datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
  51. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
  52. datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
  53. datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
  54. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
  55. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
  56. datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
  57. datahub/ingestion/source/bigquery_v2/queries.py +1 -3
  58. datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
  59. datahub/ingestion/source/bigquery_v2/usage.py +60 -60
  60. datahub/ingestion/source/cassandra/cassandra.py +0 -1
  61. datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
  62. datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
  63. datahub/ingestion/source/confluent_schema_registry.py +6 -6
  64. datahub/ingestion/source/csv_enricher.py +29 -29
  65. datahub/ingestion/source/datahub/config.py +20 -0
  66. datahub/ingestion/source/datahub/datahub_database_reader.py +7 -19
  67. datahub/ingestion/source/datahub/datahub_source.py +13 -3
  68. datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
  69. datahub/ingestion/source/dbt/dbt_common.py +9 -7
  70. datahub/ingestion/source/delta_lake/source.py +0 -5
  71. datahub/ingestion/source/demo_data.py +1 -1
  72. datahub/ingestion/source/dremio/dremio_api.py +4 -4
  73. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
  74. datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
  75. datahub/ingestion/source/dremio/dremio_source.py +2 -2
  76. datahub/ingestion/source/elastic_search.py +4 -4
  77. datahub/ingestion/source/fivetran/fivetran.py +1 -6
  78. datahub/ingestion/source/gc/datahub_gc.py +11 -14
  79. datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
  80. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
  81. datahub/ingestion/source/gcs/gcs_source.py +3 -2
  82. datahub/ingestion/source/ge_data_profiler.py +2 -5
  83. datahub/ingestion/source/ge_profiling_config.py +3 -3
  84. datahub/ingestion/source/iceberg/iceberg.py +13 -6
  85. datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
  86. datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
  87. datahub/ingestion/source/identity/azure_ad.py +3 -3
  88. datahub/ingestion/source/identity/okta.py +3 -3
  89. datahub/ingestion/source/kafka/kafka.py +11 -9
  90. datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
  91. datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
  92. datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
  93. datahub/ingestion/source/looker/looker_common.py +19 -19
  94. datahub/ingestion/source/looker/looker_config.py +11 -6
  95. datahub/ingestion/source/looker/looker_source.py +25 -25
  96. datahub/ingestion/source/looker/looker_template_language.py +3 -3
  97. datahub/ingestion/source/looker/looker_usage.py +5 -7
  98. datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
  99. datahub/ingestion/source/looker/lookml_source.py +13 -15
  100. datahub/ingestion/source/looker/view_upstream.py +5 -5
  101. datahub/ingestion/source/metabase.py +1 -6
  102. datahub/ingestion/source/mlflow.py +4 -9
  103. datahub/ingestion/source/mode.py +5 -5
  104. datahub/ingestion/source/mongodb.py +6 -4
  105. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  106. datahub/ingestion/source/nifi.py +24 -31
  107. datahub/ingestion/source/openapi.py +9 -9
  108. datahub/ingestion/source/powerbi/config.py +12 -12
  109. datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
  110. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
  111. datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
  112. datahub/ingestion/source/powerbi/powerbi.py +6 -6
  113. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
  114. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
  115. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  116. datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
  117. datahub/ingestion/source/redash.py +0 -5
  118. datahub/ingestion/source/redshift/config.py +3 -3
  119. datahub/ingestion/source/redshift/redshift.py +45 -46
  120. datahub/ingestion/source/redshift/usage.py +33 -33
  121. datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
  122. datahub/ingestion/source/s3/source.py +11 -15
  123. datahub/ingestion/source/salesforce.py +26 -25
  124. datahub/ingestion/source/schema/json_schema.py +1 -1
  125. datahub/ingestion/source/sigma/sigma.py +3 -3
  126. datahub/ingestion/source/sigma/sigma_api.py +12 -10
  127. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  128. datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
  129. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -0
  130. datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
  131. datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
  132. datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
  133. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
  134. datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
  135. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
  136. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
  137. datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
  138. datahub/ingestion/source/sql/athena.py +1 -3
  139. datahub/ingestion/source/sql/clickhouse.py +8 -14
  140. datahub/ingestion/source/sql/oracle.py +1 -3
  141. datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
  142. datahub/ingestion/source/sql/sql_types.py +1 -2
  143. datahub/ingestion/source/sql/sql_utils.py +5 -0
  144. datahub/ingestion/source/sql/teradata.py +18 -5
  145. datahub/ingestion/source/state/profiling_state_handler.py +3 -3
  146. datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
  147. datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
  148. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
  149. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  150. datahub/ingestion/source/superset.py +1 -6
  151. datahub/ingestion/source/tableau/tableau.py +343 -117
  152. datahub/ingestion/source/tableau/tableau_common.py +5 -2
  153. datahub/ingestion/source/unity/config.py +3 -1
  154. datahub/ingestion/source/unity/proxy.py +1 -1
  155. datahub/ingestion/source/unity/source.py +74 -78
  156. datahub/ingestion/source/unity/usage.py +3 -1
  157. datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
  158. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
  159. datahub/ingestion/source/usage/usage_common.py +1 -1
  160. datahub/ingestion/source_report/ingestion_stage.py +24 -20
  161. datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
  162. datahub/ingestion/transformer/add_dataset_properties.py +3 -3
  163. datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
  164. datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
  165. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
  166. datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
  167. datahub/ingestion/transformer/tags_to_terms.py +7 -7
  168. datahub/integrations/assertion/snowflake/compiler.py +10 -10
  169. datahub/lite/duckdb_lite.py +12 -10
  170. datahub/metadata/_schema_classes.py +317 -44
  171. datahub/metadata/_urns/urn_defs.py +69 -15
  172. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  173. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  174. datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
  175. datahub/metadata/schema.avsc +302 -89
  176. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  177. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  178. datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
  179. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
  180. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  181. datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
  182. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
  183. datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
  184. datahub/metadata/schemas/MLModelKey.avsc +2 -1
  185. datahub/metadata/schemas/MLModelProperties.avsc +96 -48
  186. datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
  187. datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
  188. datahub/metadata/schemas/VersionProperties.avsc +216 -0
  189. datahub/metadata/schemas/VersionSetKey.avsc +26 -0
  190. datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
  191. datahub/secret/datahub_secrets_client.py +12 -21
  192. datahub/secret/secret_common.py +14 -8
  193. datahub/specific/aspect_helpers/custom_properties.py +1 -2
  194. datahub/sql_parsing/schema_resolver.py +5 -10
  195. datahub/sql_parsing/sql_parsing_aggregator.py +26 -21
  196. datahub/sql_parsing/sqlglot_lineage.py +3 -3
  197. datahub/sql_parsing/sqlglot_utils.py +1 -1
  198. datahub/telemetry/stats.py +1 -2
  199. datahub/testing/mcp_diff.py +1 -1
  200. datahub/utilities/file_backed_collections.py +11 -11
  201. datahub/utilities/hive_schema_to_avro.py +2 -2
  202. datahub/utilities/logging_manager.py +2 -2
  203. datahub/utilities/lossy_collections.py +3 -3
  204. datahub/utilities/mapping.py +3 -3
  205. datahub/utilities/memory_footprint.py +3 -2
  206. datahub/utilities/perf_timer.py +11 -6
  207. datahub/utilities/serialized_lru_cache.py +3 -1
  208. datahub/utilities/sqlalchemy_query_combiner.py +6 -6
  209. datahub/utilities/sqllineage_patch.py +1 -1
  210. datahub/utilities/stats_collections.py +3 -1
  211. datahub/utilities/urns/_urn_base.py +28 -5
  212. datahub/utilities/urns/urn_iter.py +2 -2
  213. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
  214. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0
datahub/configuration/time_window_config.py CHANGED
@@ -47,7 +47,10 @@ class BaseTimeWindowConfig(ConfigModel):
         default_factory=lambda: datetime.now(tz=timezone.utc),
         description="Latest date of lineage/usage to consider. Default: Current time in UTC",
     )
-    start_time: datetime = Field(default=None, description="Earliest date of lineage/usage to consider. Default: Last full day in UTC (or hour, depending on `bucket_duration`). You can also specify relative time with respect to end_time such as '-7 days' Or '-7d'.")  # type: ignore
+    start_time: datetime = Field(
+        default=None,
+        description="Earliest date of lineage/usage to consider. Default: Last full day in UTC (or hour, depending on `bucket_duration`). You can also specify relative time with respect to end_time such as '-7 days' Or '-7d'.",
+    )  # type: ignore
 
     @pydantic.validator("start_time", pre=True, always=True)
     def default_start_time(
@@ -63,12 +66,14 @@ class BaseTimeWindowConfig(ConfigModel):
            # This is where start_time str is resolved to datetime
            try:
                delta = parse_relative_timespan(v)
-               assert delta < timedelta(
-                   0
-               ), "Relative start time should start with minus sign (-) e.g. '-2 days'."
+               assert delta < timedelta(0), (
+                   "Relative start time should start with minus sign (-) e.g. '-2 days'."
+               )
                assert abs(delta) >= get_bucket_duration_delta(
                    values["bucket_duration"]
-               ), "Relative start time should be in terms of configured bucket duration. e.g '-2 days' or '-2 hours'."
+               ), (
+                   "Relative start time should be in terms of configured bucket duration. e.g '-2 days' or '-2 hours'."
+               )
 
                # The end_time's default value is not yet populated, in which case
                # we can just manually generate it here.
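
Note: the relative start_time syntax exercised by these validators can be sketched as follows. This is illustrative only and not part of the diff; it assumes BaseTimeWindowConfig can be instantiated directly through its pydantic API.

from datahub.configuration.time_window_config import BaseTimeWindowConfig

# "-2 days" must be negative and aligned with bucket_duration (DAY by default);
# the default_start_time validator above resolves it relative to end_time.
config = BaseTimeWindowConfig.parse_obj({"start_time": "-2 days"})
print(config.start_time, config.end_time)  # start_time becomes a concrete UTC datetime
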
datahub/emitter/mce_builder.py CHANGED
@@ -88,13 +88,11 @@ def get_sys_time() -> int:
 
 
 @overload
-def make_ts_millis(ts: None) -> None:
-    ...
+def make_ts_millis(ts: None) -> None: ...
 
 
 @overload
-def make_ts_millis(ts: datetime) -> int:
-    ...
+def make_ts_millis(ts: datetime) -> int: ...
 
 
 def make_ts_millis(ts: Optional[datetime]) -> Optional[int]:
@@ -105,13 +103,11 @@ def make_ts_millis(ts: Optional[datetime]) -> Optional[int]:
 
 
 @overload
-def parse_ts_millis(ts: float) -> datetime:
-    ...
+def parse_ts_millis(ts: float) -> datetime: ...
 
 
 @overload
-def parse_ts_millis(ts: None) -> None:
-    ...
+def parse_ts_millis(ts: None) -> None: ...
 
 
 def parse_ts_millis(ts: Optional[float]) -> Optional[datetime]:
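
The condensed overloads are a formatting-only change; for reference, a small usage sketch (illustrative, based on the millisecond-epoch behavior of these helpers rather than on anything added in the diff):

from datetime import datetime, timezone
from datahub.emitter.mce_builder import make_ts_millis, parse_ts_millis

ts = datetime(2024, 1, 1, tzinfo=timezone.utc)
millis = make_ts_millis(ts)           # int (epoch milliseconds)
assert parse_ts_millis(millis) == ts  # round-trips for timezone-aware datetimes
assert make_ts_millis(None) is None   # the None overloads pass None through
assert parse_ts_millis(None) is None
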
datahub/emitter/mcp_builder.py CHANGED
@@ -31,9 +31,12 @@ from datahub.metadata.schema_classes import (
     OwnershipClass,
     OwnershipTypeClass,
     StatusClass,
+    StructuredPropertiesClass,
+    StructuredPropertyValueAssignmentClass,
     SubTypesClass,
     TagAssociationClass,
 )
+from datahub.metadata.urns import StructuredPropertyUrn
 
 # In https://github.com/datahub-project/datahub/pull/11214, we added a
 # new env field to container properties. However, populating this field
@@ -187,12 +190,31 @@ def add_tags_to_entity_wu(
     ).as_workunit()
 
 
+def add_structured_properties_to_entity_wu(
+    entity_urn: str, structured_properties: Dict[StructuredPropertyUrn, str]
+) -> Iterable[MetadataWorkUnit]:
+    aspect = StructuredPropertiesClass(
+        properties=[
+            StructuredPropertyValueAssignmentClass(
+                propertyUrn=urn.urn(),
+                values=[value],
+            )
+            for urn, value in structured_properties.items()
+        ]
+    )
+    yield MetadataChangeProposalWrapper(
+        entityUrn=entity_urn,
+        aspect=aspect,
+    ).as_workunit()
+
+
 def gen_containers(
     container_key: KeyType,
     name: str,
     sub_types: List[str],
     parent_container_key: Optional[ContainerKey] = None,
     extra_properties: Optional[Dict[str, str]] = None,
+    structured_properties: Optional[Dict[StructuredPropertyUrn, str]] = None,
     domain_urn: Optional[str] = None,
     description: Optional[str] = None,
     owner_urn: Optional[str] = None,
@@ -282,6 +304,11 @@ def gen_containers(
         tags=sorted(tags),
     )
 
+    if structured_properties:
+        yield from add_structured_properties_to_entity_wu(
+            entity_urn=container_urn, structured_properties=structured_properties
+        )
+
 
 def add_dataset_to_container(
     container_key: KeyType, dataset_urn: str
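
A short sketch of the new structured-properties support in gen_containers / add_structured_properties_to_entity_wu. Illustrative only; the container urn, property urn, and value below are hypothetical examples.

from datahub.emitter.mcp_builder import add_structured_properties_to_entity_wu
from datahub.metadata.urns import StructuredPropertyUrn

workunits = list(
    add_structured_properties_to_entity_wu(
        entity_urn="urn:li:container:abc123",  # hypothetical container urn
        structured_properties={
            StructuredPropertyUrn("io.acryl.privacy.retentionTime"): "90"
        },
    )
)
# Each dict entry becomes one StructuredPropertyValueAssignmentClass inside a single
# StructuredPropertiesClass aspect; gen_containers(structured_properties=...) emits
# the same aspect for the container it creates.
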
datahub/emitter/mcp_patch_builder.py CHANGED
@@ -33,8 +33,7 @@ from datahub.utilities.urns.urn import guess_entity_type
 
 @runtime_checkable
 class SupportsToObj(Protocol):
-    def to_obj(self) -> Any:
-        ...
+    def to_obj(self) -> Any: ...
 
 
 def _recursive_to_obj(obj: Any) -> Any:
datahub/emitter/rest_emitter.py CHANGED
@@ -1,9 +1,21 @@
+from __future__ import annotations
+
 import functools
 import json
 import logging
 import os
 from json.decoder import JSONDecodeError
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Sequence, Union
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    List,
+    Optional,
+    Sequence,
+    Tuple,
+    Union,
+)
 
 import requests
 from deprecated import deprecated
@@ -12,8 +24,13 @@ from requests.exceptions import HTTPError, RequestException
 
 from datahub import nice_version_name
 from datahub.cli import config_utils
-from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url
-from datahub.configuration.common import ConfigurationError, OperationalError
+from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url, get_or_else
+from datahub.cli.env_utils import get_boolean_env_variable
+from datahub.configuration.common import (
+    ConfigModel,
+    ConfigurationError,
+    OperationalError,
+)
 from datahub.emitter.generic_emitter import Emitter
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.request_helper import make_curl_command
@@ -30,10 +47,8 @@ if TYPE_CHECKING:
 
 logger = logging.getLogger(__name__)
 
-_DEFAULT_CONNECT_TIMEOUT_SEC = 30  # 30 seconds should be plenty to connect
-_DEFAULT_READ_TIMEOUT_SEC = (
-    30  # Any ingest call taking longer than 30 seconds should be abandoned
-)
+_DEFAULT_TIMEOUT_SEC = 30  # 30 seconds should be plenty to connect
+_TIMEOUT_LOWER_BOUND_SEC = 1  # if below this, we log a warning
 _DEFAULT_RETRY_STATUS_CODES = [  # Additional status codes to retry on
     429,
     500,
@@ -46,6 +61,8 @@ _DEFAULT_RETRY_MAX_TIMES = int(
     os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4")
 )
 
+_DATAHUB_EMITTER_TRACE = get_boolean_env_variable("DATAHUB_EMITTER_TRACE", False)
+
 # The limit is 16mb. We will use a max of 15mb to have some space
 # for overhead like request headers.
 # This applies to pretty much all calls to GMS.
@@ -60,15 +77,76 @@ BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
 )
 
 
+class RequestsSessionConfig(ConfigModel):
+    timeout: Union[float, Tuple[float, float], None] = _DEFAULT_TIMEOUT_SEC
+
+    retry_status_codes: List[int] = _DEFAULT_RETRY_STATUS_CODES
+    retry_methods: List[str] = _DEFAULT_RETRY_METHODS
+    retry_max_times: int = _DEFAULT_RETRY_MAX_TIMES
+
+    extra_headers: Dict[str, str] = {}
+
+    ca_certificate_path: Optional[str] = None
+    client_certificate_path: Optional[str] = None
+    disable_ssl_verification: bool = False
+
+    def build_session(self) -> requests.Session:
+        session = requests.Session()
+
+        if self.extra_headers:
+            session.headers.update(self.extra_headers)
+
+        if self.client_certificate_path:
+            session.cert = self.client_certificate_path
+
+        if self.ca_certificate_path:
+            session.verify = self.ca_certificate_path
+
+        if self.disable_ssl_verification:
+            session.verify = False
+
+        try:
+            # Set raise_on_status to False to propagate errors:
+            # https://stackoverflow.com/questions/70189330/determine-status-code-from-python-retry-exception
+            # Must call `raise_for_status` after making a request, which we do
+            retry_strategy = Retry(
+                total=self.retry_max_times,
+                status_forcelist=self.retry_status_codes,
+                backoff_factor=2,
+                allowed_methods=self.retry_methods,
+                raise_on_status=False,
+            )
+        except TypeError:
+            # Prior to urllib3 1.26, the Retry class used `method_whitelist` instead of `allowed_methods`.
+            retry_strategy = Retry(
+                total=self.retry_max_times,
+                status_forcelist=self.retry_status_codes,
+                backoff_factor=2,
+                method_whitelist=self.retry_methods,
+                raise_on_status=False,
+            )
+
+        adapter = HTTPAdapter(
+            pool_connections=100, pool_maxsize=100, max_retries=retry_strategy
+        )
+        session.mount("http://", adapter)
+        session.mount("https://", adapter)
+
+        if self.timeout is not None:
+            # Shim session.request to apply default timeout values.
+            # Via https://stackoverflow.com/a/59317604.
+            session.request = functools.partial(  # type: ignore
+                session.request,
+                timeout=self.timeout,
+            )
+
+        return session
+
+
 class DataHubRestEmitter(Closeable, Emitter):
     _gms_server: str
     _token: Optional[str]
     _session: requests.Session
-    _connect_timeout_sec: float = _DEFAULT_CONNECT_TIMEOUT_SEC
-    _read_timeout_sec: float = _DEFAULT_READ_TIMEOUT_SEC
-    _retry_status_codes: List[int] = _DEFAULT_RETRY_STATUS_CODES
-    _retry_methods: List[str] = _DEFAULT_RETRY_METHODS
-    _retry_max_times: int = _DEFAULT_RETRY_MAX_TIMES
 
     def __init__(
         self,
@@ -99,15 +177,13 @@ class DataHubRestEmitter(Closeable, Emitter):
 
         self._session = requests.Session()
 
-        self._session.headers.update(
-            {
-                "X-RestLi-Protocol-Version": "2.0.0",
-                "X-DataHub-Py-Cli-Version": nice_version_name(),
-                "Content-Type": "application/json",
-            }
-        )
+        headers = {
+            "X-RestLi-Protocol-Version": "2.0.0",
+            "X-DataHub-Py-Cli-Version": nice_version_name(),
+            "Content-Type": "application/json",
+        }
         if token:
-            self._session.headers.update({"Authorization": f"Bearer {token}"})
+            headers["Authorization"] = f"Bearer {token}"
         else:
             # HACK: When no token is provided but system auth env variables are set, we use them.
             # Ideally this should simply get passed in as config, instead of being sneakily injected
@@ -116,75 +192,43 @@ class DataHubRestEmitter(Closeable, Emitter):
             # rest emitter, and the rest sink uses the rest emitter under the hood.
             system_auth = config_utils.get_system_auth()
             if system_auth is not None:
-                self._session.headers.update({"Authorization": system_auth})
-
-        if extra_headers:
-            self._session.headers.update(extra_headers)
-
-        if client_certificate_path:
-            self._session.cert = client_certificate_path
+                headers["Authorization"] = system_auth
 
-        if ca_certificate_path:
-            self._session.verify = ca_certificate_path
-
-        if disable_ssl_verification:
-            self._session.verify = False
-
-        self._connect_timeout_sec = (
-            connect_timeout_sec or timeout_sec or _DEFAULT_CONNECT_TIMEOUT_SEC
-        )
-        self._read_timeout_sec = (
-            read_timeout_sec or timeout_sec or _DEFAULT_READ_TIMEOUT_SEC
-        )
-
-        if self._connect_timeout_sec < 1 or self._read_timeout_sec < 1:
-            logger.warning(
-                f"Setting timeout values lower than 1 second is not recommended. Your configuration is connect_timeout:{self._connect_timeout_sec}s, read_timeout:{self._read_timeout_sec}s"
-            )
-
-        if retry_status_codes is not None:  # Only if missing. Empty list is allowed
-            self._retry_status_codes = retry_status_codes
-
-        if retry_methods is not None:
-            self._retry_methods = retry_methods
-
-        if retry_max_times:
-            self._retry_max_times = retry_max_times
-
-        try:
-            # Set raise_on_status to False to propagate errors:
-            # https://stackoverflow.com/questions/70189330/determine-status-code-from-python-retry-exception
-            # Must call `raise_for_status` after making a request, which we do
-            retry_strategy = Retry(
-                total=self._retry_max_times,
-                status_forcelist=self._retry_status_codes,
-                backoff_factor=2,
-                allowed_methods=self._retry_methods,
-                raise_on_status=False,
-            )
-        except TypeError:
-            # Prior to urllib3 1.26, the Retry class used `method_whitelist` instead of `allowed_methods`.
-            retry_strategy = Retry(
-                total=self._retry_max_times,
-                status_forcelist=self._retry_status_codes,
-                backoff_factor=2,
-                method_whitelist=self._retry_methods,
-                raise_on_status=False,
+        timeout: float | tuple[float, float]
+        if connect_timeout_sec is not None or read_timeout_sec is not None:
+            timeout = (
+                connect_timeout_sec or timeout_sec or _DEFAULT_TIMEOUT_SEC,
+                read_timeout_sec or timeout_sec or _DEFAULT_TIMEOUT_SEC,
             )
+            if (
+                timeout[0] < _TIMEOUT_LOWER_BOUND_SEC
+                or timeout[1] < _TIMEOUT_LOWER_BOUND_SEC
+            ):
+                logger.warning(
+                    f"Setting timeout values lower than {_TIMEOUT_LOWER_BOUND_SEC} second is not recommended. Your configuration is (connect_timeout, read_timeout) = {timeout} seconds"
+                )
+        else:
+            timeout = get_or_else(timeout_sec, _DEFAULT_TIMEOUT_SEC)
+            if timeout < _TIMEOUT_LOWER_BOUND_SEC:
+                logger.warning(
+                    f"Setting timeout values lower than {_TIMEOUT_LOWER_BOUND_SEC} second is not recommended. Your configuration is timeout = {timeout} seconds"
+                )
 
-        adapter = HTTPAdapter(
-            pool_connections=100, pool_maxsize=100, max_retries=retry_strategy
-        )
-        self._session.mount("http://", adapter)
-        self._session.mount("https://", adapter)
-
-        # Shim session.request to apply default timeout values.
-        # Via https://stackoverflow.com/a/59317604.
-        self._session.request = functools.partial(  # type: ignore
-            self._session.request,
-            timeout=(self._connect_timeout_sec, self._read_timeout_sec),
+        self._session_config = RequestsSessionConfig(
+            timeout=timeout,
+            retry_status_codes=get_or_else(
+                retry_status_codes, _DEFAULT_RETRY_STATUS_CODES
+            ),
+            retry_methods=get_or_else(retry_methods, _DEFAULT_RETRY_METHODS),
+            retry_max_times=get_or_else(retry_max_times, _DEFAULT_RETRY_MAX_TIMES),
+            extra_headers={**headers, **(extra_headers or {})},
+            ca_certificate_path=ca_certificate_path,
+            client_certificate_path=client_certificate_path,
+            disable_ssl_verification=disable_ssl_verification,
         )
 
+        self._session = self._session_config.build_session()
+
 
     def test_connection(self) -> None:
         url = f"{self._gms_server}/config"
@@ -291,7 +335,8 @@ class DataHubRestEmitter(Closeable, Emitter):
         mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
         async_flag: Optional[bool] = None,
     ) -> int:
-        logger.debug("Attempting to emit batch mcps")
+        if _DATAHUB_EMITTER_TRACE:
+            logger.debug(f"Attempting to emit MCP batch of size {len(mcps)}")
         url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
         for mcp in mcps:
             ensure_has_system_metadata(mcp)
@@ -304,29 +349,32 @@ class DataHubRestEmitter(Closeable, Emitter):
         current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
         for mcp_obj in mcp_objs:
             mcp_obj_size = len(json.dumps(mcp_obj))
-            logger.debug(
-                f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}"
-            )
+            if _DATAHUB_EMITTER_TRACE:
+                logger.debug(
+                    f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}"
+                )
 
             if (
                 mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES
                 or len(mcp_obj_chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
             ):
-                logger.debug("Decided to create new chunk")
+                if _DATAHUB_EMITTER_TRACE:
+                    logger.debug("Decided to create new chunk")
                 mcp_obj_chunks.append([])
                 current_chunk_size = 0
             mcp_obj_chunks[-1].append(mcp_obj)
             current_chunk_size += mcp_obj_size
-        logger.debug(
-            f"Decided to send {len(mcps)} mcps in {len(mcp_obj_chunks)} chunks"
-        )
+        if len(mcp_obj_chunks) > 0:
+            logger.debug(
+                f"Decided to send {len(mcps)} MCP batch in {len(mcp_obj_chunks)} chunks"
+            )
 
         for mcp_obj_chunk in mcp_obj_chunks:
             # TODO: We're calling json.dumps on each MCP object twice, once to estimate
             # the size when chunking, and again for the actual request.
             payload_dict: dict = {"proposals": mcp_obj_chunk}
             if async_flag is not None:
-                payload_dict["async"] = True if async_flag else False
+                payload_dict["async"] = "true" if async_flag else "false"
 
             payload = json.dumps(payload_dict)
             self._emit_generic(url, payload)
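
The session setup that DataHubRestEmitter previously did inline is now reusable via RequestsSessionConfig, and the verbose per-object logging is gated behind the DATAHUB_EMITTER_TRACE environment variable. A sketch of using both directly (illustrative only; the URL, token, and timeout values are examples):

import os

# Must be set before importing the module, since the flag is read at import time.
os.environ["DATAHUB_EMITTER_TRACE"] = "true"

from datahub.emitter.rest_emitter import RequestsSessionConfig

config = RequestsSessionConfig(
    timeout=(5.0, 30.0),  # (connect, read) seconds; a single float applies to both
    extra_headers={"Authorization": "Bearer <token>"},
    retry_max_times=2,
)
session = config.build_session()  # retries, HTTPAdapter mounts, and timeout shim applied
response = session.get("https://my-datahub.example.com/config")
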
datahub/entrypoints.py CHANGED
@@ -45,6 +45,12 @@ _logging_configured: Optional[ContextManager] = None
 
 MAX_CONTENT_WIDTH = 120
 
+if sys.version_info >= (3, 12):
+    click.secho(
+        "Python versions above 3.11 are not tested with. Please use Python 3.11.",
+        fg="red",
+    )
+
 
 @click.group(
     context_settings=dict(
datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py CHANGED
@@ -1,10 +1,9 @@
 import json
 import logging
-from typing import Iterable, List
+from typing import TYPE_CHECKING, Iterable, List
 
 from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES
 from datahub.emitter.serialization_helper import pre_json_transform
-from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.schema_classes import (
     DatasetProfileClass,
@@ -12,12 +11,15 @@ from datahub.metadata.schema_classes import (
     SchemaMetadataClass,
 )
 
+if TYPE_CHECKING:
+    from datahub.ingestion.api.source import SourceReport
+
 logger = logging.getLogger(__name__)
 
 
 class EnsureAspectSizeProcessor:
     def __init__(
-        self, report: SourceReport, payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES
+        self, report: "SourceReport", payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES
     ):
         self.report = report
         self.payload_constraint = payload_constraint
datahub/ingestion/api/incremental_lineage_helper.py CHANGED
@@ -55,15 +55,9 @@ def convert_chart_info_to_patch(
         aspect.externalUrl
     ).set_type(aspect.type).set_title(aspect.title).set_access(
         aspect.access
-    ).set_last_modified(
-        aspect.lastModified
-    ).set_last_refreshed(
+    ).set_last_modified(aspect.lastModified).set_last_refreshed(
         aspect.lastRefreshed
-    ).set_description(
-        aspect.description
-    ).add_inputs(
-        aspect.inputs
-    )
+    ).set_description(aspect.description).add_inputs(aspect.inputs)
 
     values = patch_builder.build()
     if values:
datahub/ingestion/api/report.py CHANGED
@@ -21,8 +21,7 @@ LogLevel = Literal["ERROR", "WARNING", "INFO", "DEBUG"]
 
 @runtime_checkable
 class SupportsAsObj(Protocol):
-    def as_obj(self) -> dict:
-        ...
+    def as_obj(self) -> dict: ...
 
 
 @dataclass
datahub/ingestion/api/source.py CHANGED
@@ -23,7 +23,7 @@ from typing import (
 )
 
 from pydantic import BaseModel
-from typing_extensions import LiteralString
+from typing_extensions import LiteralString, Self
 
 from datahub.configuration.common import ConfigModel
 from datahub.configuration.source_common import PlatformInstanceConfigMixin
@@ -31,6 +31,9 @@ from datahub.emitter.mcp_builder import mcps_from_mce
 from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
     auto_patch_last_modified,
 )
+from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
+    EnsureAspectSizeProcessor,
+)
 from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
 from datahub.ingestion.api.report import Report
@@ -331,6 +334,8 @@ class SourceReport(Report):
         }
 
     def compute_stats(self) -> None:
+        super().compute_stats()
+
         duration = datetime.datetime.now() - self.start_time
         workunits_produced = self.events_produced
         if duration.total_seconds() > 0:
@@ -395,7 +400,7 @@ class Source(Closeable, metaclass=ABCMeta):
     ctx: PipelineContext
 
     @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> "Source":
+    def create(cls, config_dict: dict, ctx: PipelineContext) -> Self:
         # Technically, this method should be abstract. However, the @config_class
         # decorator automatically generates a create method at runtime if one is
         # not defined. Python still treats the class as abstract because it thinks
@@ -450,6 +455,7 @@ class Source(Closeable, metaclass=ABCMeta):
             browse_path_processor,
             partial(auto_workunit_reporter, self.get_report()),
             auto_patch_last_modified,
+            EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
         ]
 
     @staticmethod
datahub/ingestion/api/source_helpers.py CHANGED
@@ -48,7 +48,7 @@ logger = logging.getLogger(__name__)
 
 
 def auto_workunit(
-    stream: Iterable[Union[MetadataChangeEventClass, MetadataChangeProposalWrapper]]
+    stream: Iterable[Union[MetadataChangeEventClass, MetadataChangeProposalWrapper]],
 ) -> Iterable[MetadataWorkUnit]:
     """Convert a stream of MCEs and MCPs to a stream of :class:`MetadataWorkUnit`s."""
 
datahub/ingestion/extractor/json_schema_util.py CHANGED
@@ -131,9 +131,9 @@ class FieldPath:
         for i, schema_type in enumerate(p.schema_types):
             if schema_type == schema_str:
                 # return the corresponding type for the schema that's a match
-                assert (
-                    len(p.type) > i
-                ), f"p.type({len(p.type)})) and p.schema_types({len(p.schema_types)}) should have the same length"
+                assert len(p.type) > i, (
+                    f"p.type({len(p.type)})) and p.schema_types({len(p.schema_types)}) should have the same length"
+                )
                 return p.type[i]
         return None
 
datahub/ingestion/extractor/schema_util.py CHANGED
@@ -263,15 +263,13 @@ class AvroToMceSchemaConverter:
     @overload
     def _get_underlying_type_if_option_as_union(
        schema: SchemaOrField, default: SchemaOrField
-    ) -> SchemaOrField:
-        ...
+    ) -> SchemaOrField: ...
 
     @staticmethod
     @overload
     def _get_underlying_type_if_option_as_union(
        schema: SchemaOrField, default: Optional[SchemaOrField] = None
-    ) -> Optional[SchemaOrField]:
-        ...
+    ) -> Optional[SchemaOrField]: ...
 
     @staticmethod
     def _get_underlying_type_if_option_as_union(
@@ -386,7 +384,7 @@ class AvroToMceSchemaConverter:
 
         if "deprecated" in merged_props:
             description = (
-                f"<span style=\"color:red\">DEPRECATED: {merged_props['deprecated']}</span>\n"
+                f'<span style="color:red">DEPRECATED: {merged_props["deprecated"]}</span>\n'
                 + description
                 if description
                 else ""
datahub/ingestion/fs/s3_fs.py CHANGED
@@ -17,9 +17,9 @@ def parse_s3_path(path: str) -> "S3Path":
 
 def assert_ok_status(s3_response):
     is_ok = s3_response["ResponseMetadata"]["HTTPStatusCode"] == 200
-    assert (
-        is_ok
-    ), f"Failed to fetch S3 object, error message: {s3_response['Error']['Message']}"
+    assert is_ok, (
+        f"Failed to fetch S3 object, error message: {s3_response['Error']['Message']}"
+    )
 
 
 @dataclass
datahub/ingestion/glossary/classifier.py CHANGED
@@ -1,4 +1,3 @@
-import os
 from abc import ABCMeta, abstractmethod
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional
@@ -38,8 +37,8 @@ class ClassificationConfig(ConfigModel):
     )
 
     max_workers: int = Field(
-        default=(os.cpu_count() or 4),
-        description="Number of worker processes to use for classification. Set to 1 to disable.",
+        default=1,
+        description="Number of worker processes to use for classification. Note that any number above 1 might lead to a deadlock. Set to 1 to disable.",
     )
 
     table_pattern: AllowDenyPattern = Field(