acryl-datahub 0.15.0.1rc17__py3-none-any.whl → 0.15.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (211)
  1. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2440 -2438
  2. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +211 -207
  3. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/assertion/assertion_operator.py +3 -5
  6. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  7. datahub/api/entities/datacontract/assertion_operator.py +3 -5
  8. datahub/api/entities/dataproduct/dataproduct.py +4 -4
  9. datahub/api/entities/dataset/dataset.py +2 -1
  10. datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
  11. datahub/cli/cli_utils.py +13 -2
  12. datahub/cli/delete_cli.py +3 -3
  13. datahub/cli/docker_cli.py +6 -6
  14. datahub/cli/ingest_cli.py +25 -15
  15. datahub/cli/lite_cli.py +2 -2
  16. datahub/cli/migrate.py +5 -5
  17. datahub/cli/specific/assertions_cli.py +3 -3
  18. datahub/cli/specific/structuredproperties_cli.py +84 -0
  19. datahub/cli/timeline_cli.py +1 -1
  20. datahub/configuration/common.py +1 -2
  21. datahub/configuration/config_loader.py +73 -50
  22. datahub/configuration/git.py +2 -2
  23. datahub/configuration/time_window_config.py +10 -5
  24. datahub/emitter/mce_builder.py +4 -8
  25. datahub/emitter/mcp_builder.py +27 -0
  26. datahub/emitter/mcp_patch_builder.py +1 -2
  27. datahub/emitter/rest_emitter.py +126 -85
  28. datahub/entrypoints.py +6 -0
  29. datahub/ingestion/api/incremental_lineage_helper.py +2 -8
  30. datahub/ingestion/api/report.py +1 -2
  31. datahub/ingestion/api/source.py +4 -2
  32. datahub/ingestion/api/source_helpers.py +1 -1
  33. datahub/ingestion/extractor/json_schema_util.py +3 -3
  34. datahub/ingestion/extractor/schema_util.py +3 -5
  35. datahub/ingestion/fs/s3_fs.py +3 -3
  36. datahub/ingestion/glossary/datahub_classifier.py +6 -4
  37. datahub/ingestion/graph/client.py +22 -19
  38. datahub/ingestion/graph/config.py +1 -1
  39. datahub/ingestion/run/pipeline.py +8 -7
  40. datahub/ingestion/run/pipeline_config.py +3 -3
  41. datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
  42. datahub/ingestion/source/abs/source.py +19 -8
  43. datahub/ingestion/source/aws/glue.py +77 -47
  44. datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
  45. datahub/ingestion/source/aws/s3_util.py +24 -1
  46. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  47. datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
  48. datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
  49. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
  50. datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
  51. datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
  52. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
  53. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
  54. datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
  55. datahub/ingestion/source/bigquery_v2/queries.py +1 -3
  56. datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
  57. datahub/ingestion/source/bigquery_v2/usage.py +60 -60
  58. datahub/ingestion/source/cassandra/cassandra.py +0 -1
  59. datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
  60. datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
  61. datahub/ingestion/source/confluent_schema_registry.py +6 -6
  62. datahub/ingestion/source/csv_enricher.py +29 -29
  63. datahub/ingestion/source/datahub/config.py +10 -0
  64. datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
  65. datahub/ingestion/source/datahub/datahub_source.py +12 -2
  66. datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
  67. datahub/ingestion/source/dbt/dbt_common.py +9 -7
  68. datahub/ingestion/source/delta_lake/source.py +0 -5
  69. datahub/ingestion/source/demo_data.py +1 -1
  70. datahub/ingestion/source/dremio/dremio_api.py +4 -4
  71. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
  72. datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
  73. datahub/ingestion/source/dremio/dremio_source.py +2 -2
  74. datahub/ingestion/source/elastic_search.py +4 -4
  75. datahub/ingestion/source/fivetran/fivetran.py +1 -6
  76. datahub/ingestion/source/gc/datahub_gc.py +11 -14
  77. datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
  78. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
  79. datahub/ingestion/source/gcs/gcs_source.py +3 -2
  80. datahub/ingestion/source/ge_data_profiler.py +2 -5
  81. datahub/ingestion/source/ge_profiling_config.py +3 -3
  82. datahub/ingestion/source/iceberg/iceberg.py +13 -6
  83. datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
  84. datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
  85. datahub/ingestion/source/identity/azure_ad.py +3 -3
  86. datahub/ingestion/source/identity/okta.py +3 -3
  87. datahub/ingestion/source/kafka/kafka.py +11 -9
  88. datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
  89. datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
  90. datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
  91. datahub/ingestion/source/looker/looker_common.py +19 -19
  92. datahub/ingestion/source/looker/looker_config.py +11 -6
  93. datahub/ingestion/source/looker/looker_source.py +25 -25
  94. datahub/ingestion/source/looker/looker_template_language.py +3 -3
  95. datahub/ingestion/source/looker/looker_usage.py +5 -7
  96. datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
  97. datahub/ingestion/source/looker/lookml_source.py +13 -15
  98. datahub/ingestion/source/looker/view_upstream.py +5 -5
  99. datahub/ingestion/source/metabase.py +1 -6
  100. datahub/ingestion/source/mlflow.py +4 -9
  101. datahub/ingestion/source/mode.py +5 -5
  102. datahub/ingestion/source/mongodb.py +6 -4
  103. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  104. datahub/ingestion/source/nifi.py +24 -31
  105. datahub/ingestion/source/openapi.py +9 -9
  106. datahub/ingestion/source/powerbi/config.py +12 -12
  107. datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
  108. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
  109. datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
  110. datahub/ingestion/source/powerbi/powerbi.py +6 -6
  111. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
  112. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
  113. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  114. datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
  115. datahub/ingestion/source/redash.py +0 -5
  116. datahub/ingestion/source/redshift/config.py +3 -3
  117. datahub/ingestion/source/redshift/redshift.py +45 -46
  118. datahub/ingestion/source/redshift/usage.py +33 -33
  119. datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
  120. datahub/ingestion/source/s3/source.py +11 -15
  121. datahub/ingestion/source/salesforce.py +26 -25
  122. datahub/ingestion/source/schema/json_schema.py +1 -1
  123. datahub/ingestion/source/sigma/sigma.py +3 -3
  124. datahub/ingestion/source/sigma/sigma_api.py +12 -10
  125. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  126. datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
  127. datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
  128. datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
  129. datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
  130. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
  131. datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
  132. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
  133. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
  134. datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
  135. datahub/ingestion/source/sql/athena.py +1 -3
  136. datahub/ingestion/source/sql/clickhouse.py +8 -14
  137. datahub/ingestion/source/sql/oracle.py +1 -3
  138. datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
  139. datahub/ingestion/source/sql/sql_types.py +1 -2
  140. datahub/ingestion/source/sql/sql_utils.py +5 -0
  141. datahub/ingestion/source/sql/teradata.py +18 -5
  142. datahub/ingestion/source/state/profiling_state_handler.py +3 -3
  143. datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
  144. datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
  145. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
  146. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  147. datahub/ingestion/source/superset.py +1 -6
  148. datahub/ingestion/source/tableau/tableau.py +343 -117
  149. datahub/ingestion/source/tableau/tableau_common.py +5 -2
  150. datahub/ingestion/source/unity/config.py +3 -1
  151. datahub/ingestion/source/unity/proxy.py +1 -1
  152. datahub/ingestion/source/unity/source.py +74 -74
  153. datahub/ingestion/source/unity/usage.py +3 -1
  154. datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
  155. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
  156. datahub/ingestion/source/usage/usage_common.py +1 -1
  157. datahub/ingestion/source_report/ingestion_stage.py +24 -20
  158. datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
  159. datahub/ingestion/transformer/add_dataset_properties.py +3 -3
  160. datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
  161. datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
  162. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
  163. datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
  164. datahub/ingestion/transformer/tags_to_terms.py +7 -7
  165. datahub/integrations/assertion/snowflake/compiler.py +10 -10
  166. datahub/lite/duckdb_lite.py +12 -10
  167. datahub/metadata/_schema_classes.py +317 -44
  168. datahub/metadata/_urns/urn_defs.py +69 -15
  169. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  170. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  171. datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
  172. datahub/metadata/schema.avsc +302 -89
  173. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  174. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  175. datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
  176. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
  177. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  178. datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
  179. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
  180. datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
  181. datahub/metadata/schemas/MLModelKey.avsc +2 -1
  182. datahub/metadata/schemas/MLModelProperties.avsc +96 -48
  183. datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
  184. datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
  185. datahub/metadata/schemas/VersionProperties.avsc +216 -0
  186. datahub/metadata/schemas/VersionSetKey.avsc +26 -0
  187. datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
  188. datahub/secret/datahub_secrets_client.py +12 -21
  189. datahub/secret/secret_common.py +14 -8
  190. datahub/specific/aspect_helpers/custom_properties.py +1 -2
  191. datahub/sql_parsing/schema_resolver.py +5 -10
  192. datahub/sql_parsing/sql_parsing_aggregator.py +18 -16
  193. datahub/sql_parsing/sqlglot_lineage.py +3 -3
  194. datahub/sql_parsing/sqlglot_utils.py +1 -1
  195. datahub/telemetry/stats.py +1 -2
  196. datahub/testing/mcp_diff.py +1 -1
  197. datahub/utilities/file_backed_collections.py +11 -11
  198. datahub/utilities/hive_schema_to_avro.py +2 -2
  199. datahub/utilities/logging_manager.py +2 -2
  200. datahub/utilities/lossy_collections.py +3 -3
  201. datahub/utilities/mapping.py +3 -3
  202. datahub/utilities/memory_footprint.py +3 -2
  203. datahub/utilities/perf_timer.py +11 -6
  204. datahub/utilities/serialized_lru_cache.py +3 -1
  205. datahub/utilities/sqlalchemy_query_combiner.py +6 -6
  206. datahub/utilities/sqllineage_patch.py +1 -1
  207. datahub/utilities/stats_collections.py +3 -1
  208. datahub/utilities/urns/_urn_base.py +28 -5
  209. datahub/utilities/urns/urn_iter.py +2 -2
  210. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
  211. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0
datahub/configuration/time_window_config.py CHANGED
@@ -47,7 +47,10 @@ class BaseTimeWindowConfig(ConfigModel):
         default_factory=lambda: datetime.now(tz=timezone.utc),
         description="Latest date of lineage/usage to consider. Default: Current time in UTC",
     )
-    start_time: datetime = Field(default=None, description="Earliest date of lineage/usage to consider. Default: Last full day in UTC (or hour, depending on `bucket_duration`). You can also specify relative time with respect to end_time such as '-7 days' Or '-7d'.")  # type: ignore
+    start_time: datetime = Field(
+        default=None,
+        description="Earliest date of lineage/usage to consider. Default: Last full day in UTC (or hour, depending on `bucket_duration`). You can also specify relative time with respect to end_time such as '-7 days' Or '-7d'.",
+    )  # type: ignore
 
     @pydantic.validator("start_time", pre=True, always=True)
     def default_start_time(
@@ -63,12 +66,14 @@ class BaseTimeWindowConfig(ConfigModel):
         # This is where start_time str is resolved to datetime
         try:
             delta = parse_relative_timespan(v)
-            assert delta < timedelta(
-                0
-            ), "Relative start time should start with minus sign (-) e.g. '-2 days'."
+            assert delta < timedelta(0), (
+                "Relative start time should start with minus sign (-) e.g. '-2 days'."
+            )
             assert abs(delta) >= get_bucket_duration_delta(
                 values["bucket_duration"]
-            ), "Relative start time should be in terms of configured bucket duration. e.g '-2 days' or '-2 hours'."
+            ), (
+                "Relative start time should be in terms of configured bucket duration. e.g '-2 days' or '-2 hours'."
+            )
 
             # The end_time's default value is not yet populated, in which case
             # we can just manually generate it here.
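
Both hunks are formatting-only changes; the relative `start_time` behavior is unchanged. A minimal sketch of that behavior, assuming `BaseTimeWindowConfig` can be instantiated directly via pydantic (illustrative only):

    from datahub.configuration.time_window_config import BaseTimeWindowConfig

    # '-2 days' is resolved relative to end_time (which defaults to the current
    # time in UTC) and aligned to the configured bucket_duration.
    config = BaseTimeWindowConfig.parse_obj({"start_time": "-2 days"})
    print(config.start_time, config.end_time)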
datahub/emitter/mce_builder.py CHANGED
@@ -88,13 +88,11 @@ def get_sys_time() -> int:
 
 
 @overload
-def make_ts_millis(ts: None) -> None:
-    ...
+def make_ts_millis(ts: None) -> None: ...
 
 
 @overload
-def make_ts_millis(ts: datetime) -> int:
-    ...
+def make_ts_millis(ts: datetime) -> int: ...
 
 
 def make_ts_millis(ts: Optional[datetime]) -> Optional[int]:
@@ -105,13 +103,11 @@ def make_ts_millis(ts: Optional[datetime]) -> Optional[int]:
 
 
 @overload
-def parse_ts_millis(ts: float) -> datetime:
-    ...
+def parse_ts_millis(ts: float) -> datetime: ...
 
 
 @overload
-def parse_ts_millis(ts: None) -> None:
-    ...
+def parse_ts_millis(ts: None) -> None: ...
 
 
 def parse_ts_millis(ts: Optional[float]) -> Optional[datetime]:
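
These hunks only move the overload `...` bodies onto the signature line. For reference, a short sketch of the None-passthrough contract the overloads declare, assuming the helpers are imported from `datahub.emitter.mce_builder` as shown in this diff:

    from datetime import datetime, timezone

    from datahub.emitter.mce_builder import make_ts_millis, parse_ts_millis

    now = datetime.now(tz=timezone.utc)
    millis = make_ts_millis(now)        # int: epoch milliseconds
    restored = parse_ts_millis(millis)  # datetime in UTC, millisecond precision

    assert make_ts_millis(None) is None   # None passes straight through, per the overloads
    assert parse_ts_millis(None) is None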
datahub/emitter/mcp_builder.py CHANGED
@@ -31,9 +31,12 @@ from datahub.metadata.schema_classes import (
     OwnershipClass,
     OwnershipTypeClass,
     StatusClass,
+    StructuredPropertiesClass,
+    StructuredPropertyValueAssignmentClass,
     SubTypesClass,
     TagAssociationClass,
 )
+from datahub.metadata.urns import StructuredPropertyUrn
 
 # In https://github.com/datahub-project/datahub/pull/11214, we added a
 # new env field to container properties. However, populating this field
@@ -187,12 +190,31 @@ def add_tags_to_entity_wu(
     ).as_workunit()
 
 
+def add_structured_properties_to_entity_wu(
+    entity_urn: str, structured_properties: Dict[StructuredPropertyUrn, str]
+) -> Iterable[MetadataWorkUnit]:
+    aspect = StructuredPropertiesClass(
+        properties=[
+            StructuredPropertyValueAssignmentClass(
+                propertyUrn=urn.urn(),
+                values=[value],
+            )
+            for urn, value in structured_properties.items()
+        ]
+    )
+    yield MetadataChangeProposalWrapper(
+        entityUrn=entity_urn,
+        aspect=aspect,
+    ).as_workunit()
+
+
 def gen_containers(
     container_key: KeyType,
     name: str,
     sub_types: List[str],
     parent_container_key: Optional[ContainerKey] = None,
     extra_properties: Optional[Dict[str, str]] = None,
+    structured_properties: Optional[Dict[StructuredPropertyUrn, str]] = None,
     domain_urn: Optional[str] = None,
     description: Optional[str] = None,
     owner_urn: Optional[str] = None,
@@ -282,6 +304,11 @@ def gen_containers(
         tags=sorted(tags),
     )
 
+    if structured_properties:
+        yield from add_structured_properties_to_entity_wu(
+            entity_urn=container_urn, structured_properties=structured_properties
+        )
+
 
 def add_dataset_to_container(
     container_key: KeyType, dataset_urn: str
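
The mcp_builder change is the one substantive addition in this group: `gen_containers` gains a `structured_properties` argument, emitted as a `StructuredPropertiesClass` aspect on the container. A hedged sketch of how a source could use it; the `DatabaseKey` values and the property URN below are illustrative, not taken from this release:

    from datahub.emitter.mcp_builder import DatabaseKey, gen_containers
    from datahub.metadata.urns import StructuredPropertyUrn

    db_key = DatabaseKey(platform="postgres", database="analytics", env="PROD")
    workunits = gen_containers(
        container_key=db_key,
        name="analytics",
        sub_types=["Database"],
        structured_properties={
            # Assumes this structured property is already defined on the server.
            StructuredPropertyUrn("io.acryl.privacy.retentionTime"): "90",
        },
    )
    for wu in workunits:
        print(wu.id)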
datahub/emitter/mcp_patch_builder.py CHANGED
@@ -33,8 +33,7 @@ from datahub.utilities.urns.urn import guess_entity_type
 
 @runtime_checkable
 class SupportsToObj(Protocol):
-    def to_obj(self) -> Any:
-        ...
+    def to_obj(self) -> Any: ...
 
 
 def _recursive_to_obj(obj: Any) -> Any:
datahub/emitter/rest_emitter.py CHANGED
@@ -1,9 +1,21 @@
+from __future__ import annotations
+
 import functools
 import json
 import logging
 import os
 from json.decoder import JSONDecodeError
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Sequence, Union
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    List,
+    Optional,
+    Sequence,
+    Tuple,
+    Union,
+)
 
 import requests
 from deprecated import deprecated
@@ -12,9 +24,13 @@ from requests.exceptions import HTTPError, RequestException
 
 from datahub import nice_version_name
 from datahub.cli import config_utils
-from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url
+from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url, get_or_else
 from datahub.cli.env_utils import get_boolean_env_variable
-from datahub.configuration.common import ConfigurationError, OperationalError
+from datahub.configuration.common import (
+    ConfigModel,
+    ConfigurationError,
+    OperationalError,
+)
 from datahub.emitter.generic_emitter import Emitter
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.request_helper import make_curl_command
@@ -31,10 +47,8 @@ if TYPE_CHECKING:
 
 logger = logging.getLogger(__name__)
 
-_DEFAULT_CONNECT_TIMEOUT_SEC = 30  # 30 seconds should be plenty to connect
-_DEFAULT_READ_TIMEOUT_SEC = (
-    30  # Any ingest call taking longer than 30 seconds should be abandoned
-)
+_DEFAULT_TIMEOUT_SEC = 30  # 30 seconds should be plenty to connect
+_TIMEOUT_LOWER_BOUND_SEC = 1  # if below this, we log a warning
 _DEFAULT_RETRY_STATUS_CODES = [  # Additional status codes to retry on
     429,
     500,
@@ -63,15 +77,76 @@ BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
 )
 
 
+class RequestsSessionConfig(ConfigModel):
+    timeout: Union[float, Tuple[float, float], None] = _DEFAULT_TIMEOUT_SEC
+
+    retry_status_codes: List[int] = _DEFAULT_RETRY_STATUS_CODES
+    retry_methods: List[str] = _DEFAULT_RETRY_METHODS
+    retry_max_times: int = _DEFAULT_RETRY_MAX_TIMES
+
+    extra_headers: Dict[str, str] = {}
+
+    ca_certificate_path: Optional[str] = None
+    client_certificate_path: Optional[str] = None
+    disable_ssl_verification: bool = False
+
+    def build_session(self) -> requests.Session:
+        session = requests.Session()
+
+        if self.extra_headers:
+            session.headers.update(self.extra_headers)
+
+        if self.client_certificate_path:
+            session.cert = self.client_certificate_path
+
+        if self.ca_certificate_path:
+            session.verify = self.ca_certificate_path
+
+        if self.disable_ssl_verification:
+            session.verify = False
+
+        try:
+            # Set raise_on_status to False to propagate errors:
+            # https://stackoverflow.com/questions/70189330/determine-status-code-from-python-retry-exception
+            # Must call `raise_for_status` after making a request, which we do
+            retry_strategy = Retry(
+                total=self.retry_max_times,
+                status_forcelist=self.retry_status_codes,
+                backoff_factor=2,
+                allowed_methods=self.retry_methods,
+                raise_on_status=False,
+            )
+        except TypeError:
+            # Prior to urllib3 1.26, the Retry class used `method_whitelist` instead of `allowed_methods`.
+            retry_strategy = Retry(
+                total=self.retry_max_times,
+                status_forcelist=self.retry_status_codes,
+                backoff_factor=2,
+                method_whitelist=self.retry_methods,
+                raise_on_status=False,
+            )
+
+        adapter = HTTPAdapter(
+            pool_connections=100, pool_maxsize=100, max_retries=retry_strategy
+        )
+        session.mount("http://", adapter)
+        session.mount("https://", adapter)
+
+        if self.timeout is not None:
+            # Shim session.request to apply default timeout values.
+            # Via https://stackoverflow.com/a/59317604.
+            session.request = functools.partial(  # type: ignore
+                session.request,
+                timeout=self.timeout,
+            )
+
+        return session
+
+
 class DataHubRestEmitter(Closeable, Emitter):
     _gms_server: str
     _token: Optional[str]
     _session: requests.Session
-    _connect_timeout_sec: float = _DEFAULT_CONNECT_TIMEOUT_SEC
-    _read_timeout_sec: float = _DEFAULT_READ_TIMEOUT_SEC
-    _retry_status_codes: List[int] = _DEFAULT_RETRY_STATUS_CODES
-    _retry_methods: List[str] = _DEFAULT_RETRY_METHODS
-    _retry_max_times: int = _DEFAULT_RETRY_MAX_TIMES
 
     def __init__(
         self,
@@ -102,15 +177,13 @@ class DataHubRestEmitter(Closeable, Emitter):
 
         self._session = requests.Session()
 
-        self._session.headers.update(
-            {
-                "X-RestLi-Protocol-Version": "2.0.0",
-                "X-DataHub-Py-Cli-Version": nice_version_name(),
-                "Content-Type": "application/json",
-            }
-        )
+        headers = {
+            "X-RestLi-Protocol-Version": "2.0.0",
+            "X-DataHub-Py-Cli-Version": nice_version_name(),
+            "Content-Type": "application/json",
+        }
         if token:
-            self._session.headers.update({"Authorization": f"Bearer {token}"})
+            headers["Authorization"] = f"Bearer {token}"
         else:
             # HACK: When no token is provided but system auth env variables are set, we use them.
             # Ideally this should simply get passed in as config, instead of being sneakily injected
@@ -119,75 +192,43 @@
             # rest emitter, and the rest sink uses the rest emitter under the hood.
             system_auth = config_utils.get_system_auth()
             if system_auth is not None:
-                self._session.headers.update({"Authorization": system_auth})
-
-        if extra_headers:
-            self._session.headers.update(extra_headers)
-
-        if client_certificate_path:
-            self._session.cert = client_certificate_path
-
-        if ca_certificate_path:
-            self._session.verify = ca_certificate_path
-
-        if disable_ssl_verification:
-            self._session.verify = False
-
-        self._connect_timeout_sec = (
-            connect_timeout_sec or timeout_sec or _DEFAULT_CONNECT_TIMEOUT_SEC
-        )
-        self._read_timeout_sec = (
-            read_timeout_sec or timeout_sec or _DEFAULT_READ_TIMEOUT_SEC
-        )
-
-        if self._connect_timeout_sec < 1 or self._read_timeout_sec < 1:
-            logger.warning(
-                f"Setting timeout values lower than 1 second is not recommended. Your configuration is connect_timeout:{self._connect_timeout_sec}s, read_timeout:{self._read_timeout_sec}s"
-            )
-
-        if retry_status_codes is not None:  # Only if missing. Empty list is allowed
-            self._retry_status_codes = retry_status_codes
-
-        if retry_methods is not None:
-            self._retry_methods = retry_methods
-
-        if retry_max_times:
-            self._retry_max_times = retry_max_times
+                headers["Authorization"] = system_auth
 
-        try:
-            # Set raise_on_status to False to propagate errors:
-            # https://stackoverflow.com/questions/70189330/determine-status-code-from-python-retry-exception
-            # Must call `raise_for_status` after making a request, which we do
-            retry_strategy = Retry(
-                total=self._retry_max_times,
-                status_forcelist=self._retry_status_codes,
-                backoff_factor=2,
-                allowed_methods=self._retry_methods,
-                raise_on_status=False,
-            )
-        except TypeError:
-            # Prior to urllib3 1.26, the Retry class used `method_whitelist` instead of `allowed_methods`.
-            retry_strategy = Retry(
-                total=self._retry_max_times,
-                status_forcelist=self._retry_status_codes,
-                backoff_factor=2,
-                method_whitelist=self._retry_methods,
-                raise_on_status=False,
+        timeout: float | tuple[float, float]
+        if connect_timeout_sec is not None or read_timeout_sec is not None:
+            timeout = (
+                connect_timeout_sec or timeout_sec or _DEFAULT_TIMEOUT_SEC,
+                read_timeout_sec or timeout_sec or _DEFAULT_TIMEOUT_SEC,
             )
+            if (
+                timeout[0] < _TIMEOUT_LOWER_BOUND_SEC
+                or timeout[1] < _TIMEOUT_LOWER_BOUND_SEC
+            ):
+                logger.warning(
+                    f"Setting timeout values lower than {_TIMEOUT_LOWER_BOUND_SEC} second is not recommended. Your configuration is (connect_timeout, read_timeout) = {timeout} seconds"
+                )
+        else:
+            timeout = get_or_else(timeout_sec, _DEFAULT_TIMEOUT_SEC)
+            if timeout < _TIMEOUT_LOWER_BOUND_SEC:
+                logger.warning(
+                    f"Setting timeout values lower than {_TIMEOUT_LOWER_BOUND_SEC} second is not recommended. Your configuration is timeout = {timeout} seconds"
+                )
 
-        adapter = HTTPAdapter(
-            pool_connections=100, pool_maxsize=100, max_retries=retry_strategy
-        )
-        self._session.mount("http://", adapter)
-        self._session.mount("https://", adapter)
-
-        # Shim session.request to apply default timeout values.
-        # Via https://stackoverflow.com/a/59317604.
-        self._session.request = functools.partial(  # type: ignore
-            self._session.request,
-            timeout=(self._connect_timeout_sec, self._read_timeout_sec),
+        self._session_config = RequestsSessionConfig(
+            timeout=timeout,
+            retry_status_codes=get_or_else(
+                retry_status_codes, _DEFAULT_RETRY_STATUS_CODES
+            ),
+            retry_methods=get_or_else(retry_methods, _DEFAULT_RETRY_METHODS),
+            retry_max_times=get_or_else(retry_max_times, _DEFAULT_RETRY_MAX_TIMES),
+            extra_headers={**headers, **(extra_headers or {})},
+            ca_certificate_path=ca_certificate_path,
+            client_certificate_path=client_certificate_path,
+            disable_ssl_verification=disable_ssl_verification,
         )
 
+        self._session = self._session_config.build_session()
+
     def test_connection(self) -> None:
         url = f"{self._gms_server}/config"
         response = self._session.get(url)
@@ -333,7 +374,7 @@
         # the size when chunking, and again for the actual request.
         payload_dict: dict = {"proposals": mcp_obj_chunk}
         if async_flag is not None:
-            payload_dict["async"] = True if async_flag else False
+            payload_dict["async"] = "true" if async_flag else "false"
 
         payload = json.dumps(payload_dict)
         self._emit_generic(url, payload)
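
Taken together, the rest_emitter changes consolidate session construction (headers, retries, TLS options, timeouts) into the new `RequestsSessionConfig` model, which the emitter builds once and then reuses. A rough sketch of using it standalone, with a placeholder server URL and token:

    from datahub.emitter.rest_emitter import RequestsSessionConfig

    session_config = RequestsSessionConfig(
        timeout=(5, 30),  # (connect, read) seconds; a single float applies to both
        retry_max_times=2,
        extra_headers={"Authorization": "Bearer <token>"},  # placeholder token
    )
    # requests.Session with the retry adapter mounted and a default timeout applied
    session = session_config.build_session()
    response = session.get("https://datahub.example.com/config")
    response.raise_for_status()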
datahub/entrypoints.py CHANGED
@@ -45,6 +45,12 @@ _logging_configured: Optional[ContextManager] = None
 
 MAX_CONTENT_WIDTH = 120
 
+if sys.version_info >= (3, 12):
+    click.secho(
+        "Python versions above 3.11 are not tested with. Please use Python 3.11.",
+        fg="red",
+    )
+
 
 @click.group(
     context_settings=dict(
datahub/ingestion/api/incremental_lineage_helper.py CHANGED
@@ -55,15 +55,9 @@ def convert_chart_info_to_patch(
             aspect.externalUrl
         ).set_type(aspect.type).set_title(aspect.title).set_access(
             aspect.access
-        ).set_last_modified(
-            aspect.lastModified
-        ).set_last_refreshed(
+        ).set_last_modified(aspect.lastModified).set_last_refreshed(
             aspect.lastRefreshed
-        ).set_description(
-            aspect.description
-        ).add_inputs(
-            aspect.inputs
-        )
+        ).set_description(aspect.description).add_inputs(aspect.inputs)
 
         values = patch_builder.build()
         if values:
datahub/ingestion/api/report.py CHANGED
@@ -21,8 +21,7 @@ LogLevel = Literal["ERROR", "WARNING", "INFO", "DEBUG"]
 
 @runtime_checkable
 class SupportsAsObj(Protocol):
-    def as_obj(self) -> dict:
-        ...
+    def as_obj(self) -> dict: ...
 
 
 @dataclass
datahub/ingestion/api/source.py CHANGED
@@ -23,7 +23,7 @@ from typing import (
 )
 
 from pydantic import BaseModel
-from typing_extensions import LiteralString
+from typing_extensions import LiteralString, Self
 
 from datahub.configuration.common import ConfigModel
 from datahub.configuration.source_common import PlatformInstanceConfigMixin
@@ -334,6 +334,8 @@ class SourceReport(Report):
         }
 
     def compute_stats(self) -> None:
+        super().compute_stats()
+
         duration = datetime.datetime.now() - self.start_time
         workunits_produced = self.events_produced
         if duration.total_seconds() > 0:
@@ -398,7 +400,7 @@
     ctx: PipelineContext
 
     @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> "Source":
+    def create(cls, config_dict: dict, ctx: PipelineContext) -> Self:
         # Technically, this method should be abstract. However, the @config_class
         # decorator automatically generates a create method at runtime if one is
         # not defined. Python still treats the class as abstract because it thinks
datahub/ingestion/api/source_helpers.py CHANGED
@@ -48,7 +48,7 @@ logger = logging.getLogger(__name__)
 
 
 def auto_workunit(
-    stream: Iterable[Union[MetadataChangeEventClass, MetadataChangeProposalWrapper]]
+    stream: Iterable[Union[MetadataChangeEventClass, MetadataChangeProposalWrapper]],
 ) -> Iterable[MetadataWorkUnit]:
     """Convert a stream of MCEs and MCPs to a stream of :class:`MetadataWorkUnit`s."""
datahub/ingestion/extractor/json_schema_util.py CHANGED
@@ -131,9 +131,9 @@ class FieldPath:
         for i, schema_type in enumerate(p.schema_types):
             if schema_type == schema_str:
                 # return the corresponding type for the schema that's a match
-                assert (
-                    len(p.type) > i
-                ), f"p.type({len(p.type)})) and p.schema_types({len(p.schema_types)}) should have the same length"
+                assert len(p.type) > i, (
+                    f"p.type({len(p.type)})) and p.schema_types({len(p.schema_types)}) should have the same length"
+                )
                 return p.type[i]
         return None
 
datahub/ingestion/extractor/schema_util.py CHANGED
@@ -263,15 +263,13 @@ class AvroToMceSchemaConverter:
     @overload
     def _get_underlying_type_if_option_as_union(
         schema: SchemaOrField, default: SchemaOrField
-    ) -> SchemaOrField:
-        ...
+    ) -> SchemaOrField: ...
 
     @staticmethod
     @overload
     def _get_underlying_type_if_option_as_union(
         schema: SchemaOrField, default: Optional[SchemaOrField] = None
-    ) -> Optional[SchemaOrField]:
-        ...
+    ) -> Optional[SchemaOrField]: ...
 
     @staticmethod
     def _get_underlying_type_if_option_as_union(
@@ -386,7 +384,7 @@ class AvroToMceSchemaConverter:
 
         if "deprecated" in merged_props:
             description = (
-                f"<span style=\"color:red\">DEPRECATED: {merged_props['deprecated']}</span>\n"
+                f'<span style="color:red">DEPRECATED: {merged_props["deprecated"]}</span>\n'
                 + description
                 if description
                 else ""
datahub/ingestion/fs/s3_fs.py CHANGED
@@ -17,9 +17,9 @@ def parse_s3_path(path: str) -> "S3Path":
 
 def assert_ok_status(s3_response):
     is_ok = s3_response["ResponseMetadata"]["HTTPStatusCode"] == 200
-    assert (
-        is_ok
-    ), f"Failed to fetch S3 object, error message: {s3_response['Error']['Message']}"
+    assert is_ok, (
+        f"Failed to fetch S3 object, error message: {s3_response['Error']['Message']}"
+    )
 
 
 @dataclass
datahub/ingestion/glossary/datahub_classifier.py CHANGED
@@ -148,9 +148,9 @@ class DataHubClassifierConfig(ConfigModel):
                 weight,
             ) in custom_infotype_config.Prediction_Factors_and_Weights.dict().items():
                 if weight > 0:
-                    assert (
-                        getattr(custom_infotype_config, factor) is not None
-                    ), f"Missing Configuration for Prediction Factor {factor} for Custom Info Type {custom_infotype}"
+                    assert getattr(custom_infotype_config, factor) is not None, (
+                        f"Missing Configuration for Prediction Factor {factor} for Custom Info Type {custom_infotype}"
+                    )
 
             # Custom infotype supports only regex based prediction for column values
             if custom_infotype_config.Prediction_Factors_and_Weights.Values > 0:
@@ -158,7 +158,9 @@ class DataHubClassifierConfig(ConfigModel):
                 assert (
                     custom_infotype_config.Values.prediction_type
                     == ValuePredictionType.REGEX
-                ), f"Invalid Prediction Type for Values for Custom Info Type {custom_infotype}. Only `regex` is supported."
+                ), (
+                    f"Invalid Prediction Type for Values for Custom Info Type {custom_infotype}. Only `regex` is supported."
+                )
 
         return info_types_config
 
datahub/ingestion/graph/client.py CHANGED
@@ -179,21 +179,24 @@ class DataHubGraph(DatahubRestEmitter):
 
     @classmethod
     def from_emitter(cls, emitter: DatahubRestEmitter) -> "DataHubGraph":
+        session_config = emitter._session_config
+        if isinstance(session_config.timeout, tuple):
+            # TODO: This is slightly lossy. Eventually, we want to modify the emitter
+            # to accept a tuple for timeout_sec, and then we'll be able to remove this.
+            timeout_sec: Optional[float] = session_config.timeout[0]
+        else:
+            timeout_sec = session_config.timeout
         return cls(
             DatahubClientConfig(
                 server=emitter._gms_server,
                 token=emitter._token,
-                timeout_sec=emitter._read_timeout_sec,
-                retry_status_codes=emitter._retry_status_codes,
-                retry_max_times=emitter._retry_max_times,
-                extra_headers=emitter._session.headers,
-                disable_ssl_verification=emitter._session.verify is False,
-                ca_certificate_path=(
-                    emitter._session.verify
-                    if isinstance(emitter._session.verify, str)
-                    else None
-                ),
-                client_certificate_path=emitter._session.cert,
+                timeout_sec=timeout_sec,
+                retry_status_codes=session_config.retry_status_codes,
+                retry_max_times=session_config.retry_max_times,
+                extra_headers=session_config.extra_headers,
+                disable_ssl_verification=session_config.disable_ssl_verification,
+                ca_certificate_path=session_config.ca_certificate_path,
+                client_certificate_path=session_config.client_certificate_path,
             )
         )
 
@@ -245,9 +248,11 @@
         with DatahubRestSink(PipelineContext(run_id=run_id), sink_config) as sink:
             yield sink
         if sink.report.failures:
+            logger.error(
+                f"Failed to emit {len(sink.report.failures)} records\n{sink.report.as_string()}"
+            )
             raise OperationalError(
-                f"Failed to emit {len(sink.report.failures)} records",
-                info=sink.report.as_obj(),
+                f"Failed to emit {len(sink.report.failures)} records"
             )
 
     def emit_all(
@@ -514,9 +519,9 @@
         :return: Optionally, a map of aspect_name to aspect_value as a dictionary if present, aspect_value will be set to None if that aspect was not found. Returns None on HTTP status 404.
         :raises HttpError: if the HTTP response is not a 200
         """
-        assert len(aspects) == len(
-            aspect_types
-        ), f"number of aspects requested ({len(aspects)}) should be the same as number of aspect types provided ({len(aspect_types)})"
+        assert len(aspects) == len(aspect_types), (
+            f"number of aspects requested ({len(aspects)}) should be the same as number of aspect types provided ({len(aspect_types)})"
+        )
 
         # TODO: generate aspects list from type classes
         response_json = self.get_entity_raw(entity_urn, aspects)
@@ -1571,9 +1576,7 @@
                     ... assertionResult
                 }
             }
-        """ % (
-            self._assertion_result_shared()
-        )
+        """ % (self._assertion_result_shared())
 
         variables = {
             "assertionUrn": urn,
datahub/ingestion/graph/config.py CHANGED
@@ -10,7 +10,7 @@ class DatahubClientConfig(ConfigModel):
     # by callers / the CLI, but the actual client should not have any magic.
     server: str
     token: Optional[str] = None
-    timeout_sec: Optional[int] = None
+    timeout_sec: Optional[float] = None
     retry_status_codes: Optional[List[int]] = None
     retry_max_times: Optional[int] = None
     extra_headers: Optional[Dict[str, str]] = None
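
Widening `timeout_sec` from `int` to `float` makes fractional timeouts representable in the client config. A small example under the same placeholder assumptions as above:

    from datahub.ingestion.graph.client import DataHubGraph
    from datahub.ingestion.graph.config import DatahubClientConfig

    config = DatahubClientConfig(server="http://localhost:8080", timeout_sec=2.5)
    graph = DataHubGraph(config)  # connects to the (placeholder) server on construction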