acryl-datahub 0.15.0.1rc17__py3-none-any.whl → 0.15.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic; see the registry page for details.

Files changed (211)
  1. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2440 -2438
  2. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +211 -207
  3. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/assertion/assertion_operator.py +3 -5
  6. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  7. datahub/api/entities/datacontract/assertion_operator.py +3 -5
  8. datahub/api/entities/dataproduct/dataproduct.py +4 -4
  9. datahub/api/entities/dataset/dataset.py +2 -1
  10. datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
  11. datahub/cli/cli_utils.py +13 -2
  12. datahub/cli/delete_cli.py +3 -3
  13. datahub/cli/docker_cli.py +6 -6
  14. datahub/cli/ingest_cli.py +25 -15
  15. datahub/cli/lite_cli.py +2 -2
  16. datahub/cli/migrate.py +5 -5
  17. datahub/cli/specific/assertions_cli.py +3 -3
  18. datahub/cli/specific/structuredproperties_cli.py +84 -0
  19. datahub/cli/timeline_cli.py +1 -1
  20. datahub/configuration/common.py +1 -2
  21. datahub/configuration/config_loader.py +73 -50
  22. datahub/configuration/git.py +2 -2
  23. datahub/configuration/time_window_config.py +10 -5
  24. datahub/emitter/mce_builder.py +4 -8
  25. datahub/emitter/mcp_builder.py +27 -0
  26. datahub/emitter/mcp_patch_builder.py +1 -2
  27. datahub/emitter/rest_emitter.py +126 -85
  28. datahub/entrypoints.py +6 -0
  29. datahub/ingestion/api/incremental_lineage_helper.py +2 -8
  30. datahub/ingestion/api/report.py +1 -2
  31. datahub/ingestion/api/source.py +4 -2
  32. datahub/ingestion/api/source_helpers.py +1 -1
  33. datahub/ingestion/extractor/json_schema_util.py +3 -3
  34. datahub/ingestion/extractor/schema_util.py +3 -5
  35. datahub/ingestion/fs/s3_fs.py +3 -3
  36. datahub/ingestion/glossary/datahub_classifier.py +6 -4
  37. datahub/ingestion/graph/client.py +22 -19
  38. datahub/ingestion/graph/config.py +1 -1
  39. datahub/ingestion/run/pipeline.py +8 -7
  40. datahub/ingestion/run/pipeline_config.py +3 -3
  41. datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
  42. datahub/ingestion/source/abs/source.py +19 -8
  43. datahub/ingestion/source/aws/glue.py +77 -47
  44. datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
  45. datahub/ingestion/source/aws/s3_util.py +24 -1
  46. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  47. datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
  48. datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
  49. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
  50. datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
  51. datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
  52. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
  53. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
  54. datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
  55. datahub/ingestion/source/bigquery_v2/queries.py +1 -3
  56. datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
  57. datahub/ingestion/source/bigquery_v2/usage.py +60 -60
  58. datahub/ingestion/source/cassandra/cassandra.py +0 -1
  59. datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
  60. datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
  61. datahub/ingestion/source/confluent_schema_registry.py +6 -6
  62. datahub/ingestion/source/csv_enricher.py +29 -29
  63. datahub/ingestion/source/datahub/config.py +10 -0
  64. datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
  65. datahub/ingestion/source/datahub/datahub_source.py +12 -2
  66. datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
  67. datahub/ingestion/source/dbt/dbt_common.py +9 -7
  68. datahub/ingestion/source/delta_lake/source.py +0 -5
  69. datahub/ingestion/source/demo_data.py +1 -1
  70. datahub/ingestion/source/dremio/dremio_api.py +4 -4
  71. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
  72. datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
  73. datahub/ingestion/source/dremio/dremio_source.py +2 -2
  74. datahub/ingestion/source/elastic_search.py +4 -4
  75. datahub/ingestion/source/fivetran/fivetran.py +1 -6
  76. datahub/ingestion/source/gc/datahub_gc.py +11 -14
  77. datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
  78. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
  79. datahub/ingestion/source/gcs/gcs_source.py +3 -2
  80. datahub/ingestion/source/ge_data_profiler.py +2 -5
  81. datahub/ingestion/source/ge_profiling_config.py +3 -3
  82. datahub/ingestion/source/iceberg/iceberg.py +13 -6
  83. datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
  84. datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
  85. datahub/ingestion/source/identity/azure_ad.py +3 -3
  86. datahub/ingestion/source/identity/okta.py +3 -3
  87. datahub/ingestion/source/kafka/kafka.py +11 -9
  88. datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
  89. datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
  90. datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
  91. datahub/ingestion/source/looker/looker_common.py +19 -19
  92. datahub/ingestion/source/looker/looker_config.py +11 -6
  93. datahub/ingestion/source/looker/looker_source.py +25 -25
  94. datahub/ingestion/source/looker/looker_template_language.py +3 -3
  95. datahub/ingestion/source/looker/looker_usage.py +5 -7
  96. datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
  97. datahub/ingestion/source/looker/lookml_source.py +13 -15
  98. datahub/ingestion/source/looker/view_upstream.py +5 -5
  99. datahub/ingestion/source/metabase.py +1 -6
  100. datahub/ingestion/source/mlflow.py +4 -9
  101. datahub/ingestion/source/mode.py +5 -5
  102. datahub/ingestion/source/mongodb.py +6 -4
  103. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  104. datahub/ingestion/source/nifi.py +24 -31
  105. datahub/ingestion/source/openapi.py +9 -9
  106. datahub/ingestion/source/powerbi/config.py +12 -12
  107. datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
  108. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
  109. datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
  110. datahub/ingestion/source/powerbi/powerbi.py +6 -6
  111. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
  112. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
  113. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  114. datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
  115. datahub/ingestion/source/redash.py +0 -5
  116. datahub/ingestion/source/redshift/config.py +3 -3
  117. datahub/ingestion/source/redshift/redshift.py +45 -46
  118. datahub/ingestion/source/redshift/usage.py +33 -33
  119. datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
  120. datahub/ingestion/source/s3/source.py +11 -15
  121. datahub/ingestion/source/salesforce.py +26 -25
  122. datahub/ingestion/source/schema/json_schema.py +1 -1
  123. datahub/ingestion/source/sigma/sigma.py +3 -3
  124. datahub/ingestion/source/sigma/sigma_api.py +12 -10
  125. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  126. datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
  127. datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
  128. datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
  129. datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
  130. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
  131. datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
  132. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
  133. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
  134. datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
  135. datahub/ingestion/source/sql/athena.py +1 -3
  136. datahub/ingestion/source/sql/clickhouse.py +8 -14
  137. datahub/ingestion/source/sql/oracle.py +1 -3
  138. datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
  139. datahub/ingestion/source/sql/sql_types.py +1 -2
  140. datahub/ingestion/source/sql/sql_utils.py +5 -0
  141. datahub/ingestion/source/sql/teradata.py +18 -5
  142. datahub/ingestion/source/state/profiling_state_handler.py +3 -3
  143. datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
  144. datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
  145. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
  146. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  147. datahub/ingestion/source/superset.py +1 -6
  148. datahub/ingestion/source/tableau/tableau.py +343 -117
  149. datahub/ingestion/source/tableau/tableau_common.py +5 -2
  150. datahub/ingestion/source/unity/config.py +3 -1
  151. datahub/ingestion/source/unity/proxy.py +1 -1
  152. datahub/ingestion/source/unity/source.py +74 -74
  153. datahub/ingestion/source/unity/usage.py +3 -1
  154. datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
  155. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
  156. datahub/ingestion/source/usage/usage_common.py +1 -1
  157. datahub/ingestion/source_report/ingestion_stage.py +24 -20
  158. datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
  159. datahub/ingestion/transformer/add_dataset_properties.py +3 -3
  160. datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
  161. datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
  162. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
  163. datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
  164. datahub/ingestion/transformer/tags_to_terms.py +7 -7
  165. datahub/integrations/assertion/snowflake/compiler.py +10 -10
  166. datahub/lite/duckdb_lite.py +12 -10
  167. datahub/metadata/_schema_classes.py +317 -44
  168. datahub/metadata/_urns/urn_defs.py +69 -15
  169. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  170. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  171. datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
  172. datahub/metadata/schema.avsc +302 -89
  173. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  174. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  175. datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
  176. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
  177. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  178. datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
  179. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
  180. datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
  181. datahub/metadata/schemas/MLModelKey.avsc +2 -1
  182. datahub/metadata/schemas/MLModelProperties.avsc +96 -48
  183. datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
  184. datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
  185. datahub/metadata/schemas/VersionProperties.avsc +216 -0
  186. datahub/metadata/schemas/VersionSetKey.avsc +26 -0
  187. datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
  188. datahub/secret/datahub_secrets_client.py +12 -21
  189. datahub/secret/secret_common.py +14 -8
  190. datahub/specific/aspect_helpers/custom_properties.py +1 -2
  191. datahub/sql_parsing/schema_resolver.py +5 -10
  192. datahub/sql_parsing/sql_parsing_aggregator.py +18 -16
  193. datahub/sql_parsing/sqlglot_lineage.py +3 -3
  194. datahub/sql_parsing/sqlglot_utils.py +1 -1
  195. datahub/telemetry/stats.py +1 -2
  196. datahub/testing/mcp_diff.py +1 -1
  197. datahub/utilities/file_backed_collections.py +11 -11
  198. datahub/utilities/hive_schema_to_avro.py +2 -2
  199. datahub/utilities/logging_manager.py +2 -2
  200. datahub/utilities/lossy_collections.py +3 -3
  201. datahub/utilities/mapping.py +3 -3
  202. datahub/utilities/memory_footprint.py +3 -2
  203. datahub/utilities/perf_timer.py +11 -6
  204. datahub/utilities/serialized_lru_cache.py +3 -1
  205. datahub/utilities/sqlalchemy_query_combiner.py +6 -6
  206. datahub/utilities/sqllineage_patch.py +1 -1
  207. datahub/utilities/stats_collections.py +3 -1
  208. datahub/utilities/urns/_urn_base.py +28 -5
  209. datahub/utilities/urns/urn_iter.py +2 -2
  210. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
  211. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0
{acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.7.0)
+Generator: setuptools (75.8.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings
 
 # Published at https://pypi.org/project/acryl-datahub/.
 __package_name__ = "acryl-datahub"
-__version__ = "0.15.0.1rc17"
+__version__ = "0.15.0.2"
 
 
 def is_dev_mode() -> bool:
datahub/api/entities/assertion/assertion_operator.py CHANGED
@@ -20,15 +20,13 @@ class Operator(Protocol):
 
     operator: str
 
-    def id(self) -> str:
-        ...
+    def id(self) -> str: ...
 
-    def generate_parameters(self) -> AssertionStdParametersClass:
-        ...
+    def generate_parameters(self) -> AssertionStdParametersClass: ...
 
 
 def _generate_assertion_std_parameter(
-    value: Union[str, int, float, list]
+    value: Union[str, int, float, list],
 ) -> AssertionStdParameterClass:
     if isinstance(value, str):
         return AssertionStdParameterClass(
datahub/api/entities/corpgroup/corpgroup.py CHANGED
@@ -114,7 +114,7 @@ class CorpGroup(BaseModel):
                 )
                 urns_created.add(m.urn)
             else:
-                logger.warn(
+                logger.warning(
                     f"Suppressing emission of member {m.urn} before we already emitted metadata for it"
                 )
 
datahub/api/entities/datacontract/assertion_operator.py CHANGED
@@ -19,15 +19,13 @@ class Operator(Protocol):
 
     operator: str
 
-    def id(self) -> str:
-        ...
+    def id(self) -> str: ...
 
-    def generate_parameters(self) -> AssertionStdParametersClass:
-        ...
+    def generate_parameters(self) -> AssertionStdParametersClass: ...
 
 
 def _generate_assertion_std_parameter(
-    value: Union[str, int, float]
+    value: Union[str, int, float],
 ) -> AssertionStdParameterClass:
     if isinstance(value, str):
         return AssertionStdParameterClass(
datahub/api/entities/dataproduct/dataproduct.py CHANGED
@@ -321,9 +321,9 @@ class DataProduct(ConfigModel):
 
     @classmethod
     def from_datahub(cls, graph: DataHubGraph, id: str) -> DataProduct:
-        data_product_properties: Optional[
-            DataProductPropertiesClass
-        ] = graph.get_aspect(id, DataProductPropertiesClass)
+        data_product_properties: Optional[DataProductPropertiesClass] = (
+            graph.get_aspect(id, DataProductPropertiesClass)
+        )
         domains: Optional[DomainsClass] = graph.get_aspect(id, DomainsClass)
         assert domains, "Data Product must have an associated domain. Found none."
         owners: Optional[OwnershipClass] = graph.get_aspect(id, OwnershipClass)
@@ -438,7 +438,7 @@ class DataProduct(ConfigModel):
         for replace_index, replace_value in patches_replace.items():
            list_to_manipulate[replace_index] = replace_value
 
-        for drop_index, drop_value in patches_drop.items():
+        for drop_value in patches_drop.values():
            list_to_manipulate.remove(drop_value)
 
        for add_value in patches_add:
datahub/api/entities/dataset/dataset.py CHANGED
@@ -266,7 +266,8 @@ class Dataset(BaseModel):
         if self.schema_metadata.fields:
             for field in self.schema_metadata.fields:
                 field_urn = field.urn or make_schema_field_urn(
-                    self.urn, field.id  # type: ignore[arg-type]
+                    self.urn,  # type: ignore[arg-type]
+                    field.id,  # type: ignore[arg-type]
                 )
                 assert field_urn.startswith("urn:li:schemaField:")
 
datahub/api/entities/structuredproperties/structuredproperties.py CHANGED
@@ -1,7 +1,7 @@
 import logging
 from enum import Enum
 from pathlib import Path
-from typing import List, Optional
+from typing import Iterable, List, Optional
 
 import yaml
 from pydantic import validator
@@ -118,9 +118,9 @@ class StructuredProperties(ConfigModel):
         id = StructuredPropertyUrn.from_string(self.urn).id
         if self.qualified_name is not None:
             # ensure that qualified name and ID match
-            assert (
-                self.qualified_name == id
-            ), "ID in the urn and the qualified_name must match"
+            assert self.qualified_name == id, (
+                "ID in the urn and the qualified_name must match"
+            )
         return id
 
     @validator("urn", pre=True, always=True)
@@ -184,9 +184,9 @@ class StructuredProperties(ConfigModel):
 
     @classmethod
     def from_datahub(cls, graph: DataHubGraph, urn: str) -> "StructuredProperties":
-        structured_property: Optional[
-            StructuredPropertyDefinitionClass
-        ] = graph.get_aspect(urn, StructuredPropertyDefinitionClass)
+        structured_property: Optional[StructuredPropertyDefinitionClass] = (
+            graph.get_aspect(urn, StructuredPropertyDefinitionClass)
+        )
         if structured_property is None:
             raise Exception(
                 "StructuredPropertyDefinition aspect is None. Unable to create structured property."
@@ -226,3 +226,14 @@ class StructuredProperties(ConfigModel):
         yaml.indent(mapping=2, sequence=4, offset=2)
         yaml.default_flow_style = False
         yaml.dump(self.dict(), fp)
+
+    @staticmethod
+    def list_urns(graph: DataHubGraph) -> Iterable[str]:
+        return graph.get_urns_by_filter(
+            entity_types=["structuredProperty"],
+        )
+
+    @staticmethod
+    def list(graph: DataHubGraph) -> Iterable["StructuredProperties"]:
+        for urn in StructuredProperties.list_urns(graph):
+            yield StructuredProperties.from_datahub(graph, urn)
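
Note on usage: the new list_urns/list helpers make the set of structured properties enumerable from code. A minimal sketch, assuming a reachable DataHub instance and the get_default_graph client factory that the CLI modules in this release import from datahub.ingestion.graph.client:

    from datahub.api.entities.structuredproperties.structuredproperties import (
        StructuredProperties,
    )
    from datahub.ingestion.graph.client import get_default_graph

    with get_default_graph() as graph:
        # list_urns() yields bare urns; list() additionally fetches the
        # StructuredPropertyDefinition aspect for each urn via from_datahub().
        for urn in StructuredProperties.list_urns(graph):
            print(urn)
        for prop in StructuredProperties.list(graph):
            print(prop.dict(exclude_unset=True, exclude_none=True))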
datahub/cli/cli_utils.py CHANGED
@@ -3,7 +3,7 @@ import logging
 import time
 import typing
 from datetime import datetime
-from typing import Any, Dict, List, Optional, Tuple, Type, Union
+from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union
 
 import click
 import requests
@@ -33,6 +33,15 @@ def first_non_null(ls: List[Optional[str]]) -> Optional[str]:
     return next((el for el in ls if el is not None and el.strip() != ""), None)
 
 
+_T = TypeVar("_T")
+
+
+def get_or_else(value: Optional[_T], default: _T) -> _T:
+    # Normally we'd use `value or default`. However, that runs into issues
+    # when value is falsey but not None.
+    return value if value is not None else default
+
+
 def parse_run_restli_response(response: requests.Response) -> dict:
     response_json = response.json()
     if response.status_code != 200:
@@ -321,6 +330,8 @@ def get_frontend_session_login_as(
 def _ensure_valid_gms_url_acryl_cloud(url: str) -> str:
     if "acryl.io" not in url:
         return url
+    if url.endswith(":8080"):
+        url = url.replace(":8080", "")
     if url.startswith("http://"):
         url = url.replace("http://", "https://")
     if url.endswith("acryl.io"):
@@ -401,7 +412,7 @@ def generate_access_token(
 def ensure_has_system_metadata(
     event: Union[
         MetadataChangeProposal, MetadataChangeProposalWrapper, MetadataChangeEvent
-    ]
+    ],
 ) -> None:
     if event.systemMetadata is None:
         event.systemMetadata = SystemMetadataClass()
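
The new get_or_else helper exists because the idiomatic "value or default" falls back whenever value is falsey, not only when it is None. A standalone sketch of the distinction (plain Python, no DataHub imports needed):

    # `or` replaces any falsey value; the None-check keeps legitimate
    # falsey values like 0 and "" intact.
    def get_or_else(value, default):
        return value if value is not None else default

    assert (0 or 10) == 10             # 0 silently replaced by `or`
    assert get_or_else(0, 10) == 0     # 0 preserved
    assert ("" or "x") == "x"          # empty string silently replaced
    assert get_or_else("", "x") == ""  # empty string preserved
    assert get_or_else(None, 10) == 10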
datahub/cli/delete_cli.py CHANGED
@@ -1,8 +1,8 @@
 import logging
+import random
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass
 from datetime import datetime
-from random import choices
 from typing import Dict, List, Optional
 
 import click
@@ -457,11 +457,11 @@ def by_filter(
        click.echo("Found urns of multiple entity types")
        for entity_type, entity_urns in urns_by_type.items():
            click.echo(
-                f"- {len(entity_urns)} {entity_type} urn(s). Sample: {choices(entity_urns, k=min(5, len(entity_urns)))}"
+                f"- {len(entity_urns)} {entity_type} urn(s). Sample: {random.sample(entity_urns, k=min(5, len(entity_urns)))}"
            )
    else:
        click.echo(
-            f"Found {len(urns)} {entity_type} urn(s). Sample: {choices(urns, k=min(5, len(urns)))}"
+            f"Found {len(urns)} {entity_type} urn(s). Sample: {random.sample(urns, k=min(5, len(urns)))}"
        )
 
    if not force and not dry_run:
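
The choices-to-sample switch above is behavioral, not cosmetic: random.choices draws with replacement, so the printed "sample" of urns could contain the same urn more than once, while random.sample draws without replacement. A quick illustration:

    import random

    urns = ["urn:li:dataset:a", "urn:li:dataset:b", "urn:li:dataset:c"]
    random.seed(7)  # seeded only to make the run repeatable
    print(random.choices(urns, k=3))  # drawn with replacement; may repeat
    print(random.sample(urns, k=3))   # always three distinct urns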
datahub/cli/docker_cli.py CHANGED
@@ -296,9 +296,9 @@
     restore_indices: Optional[bool],
     primary_restore_file: Optional[str],
 ) -> int:
-    assert (
-        restore_primary or restore_indices
-    ), "Either restore_primary or restore_indices must be set"
+    assert restore_primary or restore_indices, (
+        "Either restore_primary or restore_indices must be set"
+    )
     msg = "datahub> "
     if restore_primary:
         msg += f"Will restore primary database from {primary_restore_file}. "
@@ -314,9 +314,9 @@
         assert primary_restore_file
         resolved_restore_file = os.path.expanduser(primary_restore_file)
         logger.info(f"Restoring primary db from backup at {resolved_restore_file}")
-        assert os.path.exists(
-            resolved_restore_file
-        ), f"File {resolved_restore_file} does not exist"
+        assert os.path.exists(resolved_restore_file), (
+            f"File {resolved_restore_file} does not exist"
+        )
         with open(resolved_restore_file) as fp:
             result = subprocess.run(
                 [
datahub/cli/ingest_cli.py CHANGED
@@ -507,15 +507,11 @@ def list_source_runs(page_offset: int, page_size: int, urn: str, source: str) ->
        click.echo("No response received from the server.")
        return
 
-    # when urn or source filter does not match, exit gracefully
-    if (
-        not isinstance(data.get("data"), dict)
-        or "listIngestionSources" not in data["data"]
-    ):
-        click.echo("No matching ingestion sources found. Please check your filters.")
-        return
+    # a lot of responses can be null if there's errors in the run
+    ingestion_sources = (
+        data.get("data", {}).get("listIngestionSources", {}).get("ingestionSources", [])
+    )
 
-    ingestion_sources = data["data"]["listIngestionSources"]["ingestionSources"]
    if not ingestion_sources:
        click.echo("No ingestion sources or executions found.")
        return
@@ -526,18 +522,32 @@ def list_source_runs(page_offset: int, page_size: int, urn: str, source: str) ->
        name = ingestion_source.get("name", "N/A")
 
        executions = ingestion_source.get("executions", {}).get("executionRequests", [])
+
        for execution in executions:
+            if execution is None:
+                continue
+
            execution_id = execution.get("id", "N/A")
-            start_time = execution.get("result", {}).get("startTimeMs", "N/A")
-            start_time = (
-                datetime.fromtimestamp(start_time / 1000).strftime("%Y-%m-%d %H:%M:%S")
-                if start_time != "N/A"
-                else "N/A"
-            )
-            status = execution.get("result", {}).get("status", "N/A")
+            result = execution.get("result") or {}
+            status = result.get("status", "N/A")
+
+            try:
+                start_time = (
+                    datetime.fromtimestamp(
+                        result.get("startTimeMs", 0) / 1000
+                    ).strftime("%Y-%m-%d %H:%M:%S")
+                    if status != "DUPLICATE" and result.get("startTimeMs") is not None
+                    else "N/A"
+                )
+            except (TypeError, ValueError):
+                start_time = "N/A"
 
            rows.append([execution_id, name, start_time, status, urn])
 
+    if not rows:
+        click.echo("No execution data found.")
+        return
+
    click.echo(
        tabulate(
            rows,
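
The rewritten loop's execution.get("result") or {} is the key defensive change: dict.get(key, default) only applies the default when the key is absent, so a key that is present with an explicit null (as GraphQL returns for failed runs) comes back as None. A small demonstration of why the old chained .get() could crash:

    execution = {"id": "abc", "result": None}  # server returned a null result

    broken = execution.get("result", {})  # -> None; the default is NOT used
    safe = execution.get("result") or {}  # -> {}

    print(safe.get("status", "N/A"))  # prints "N/A"
    # broken.get("status") would raise AttributeError: 'NoneType' object
    # has no attribute 'get'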
datahub/cli/lite_cli.py CHANGED
@@ -176,7 +176,7 @@ def get(
        )
    )
    end_time = time.time()
-    logger.debug(f"Time taken: {int((end_time - start_time)*1000.0)} millis")
+    logger.debug(f"Time taken: {int((end_time - start_time) * 1000.0)} millis")
 
 
 @lite.command()
@@ -228,7 +228,7 @@ def ls(path: Optional[str]) -> None:
    try:
        browseables = lite.ls(path)
        end_time = time.time()
-        logger.debug(f"Time taken: {int((end_time - start_time)*1000.0)} millis")
+        logger.debug(f"Time taken: {int((end_time - start_time) * 1000.0)} millis")
        auto_complete: List[AutoComplete] = [
            b.auto_complete for b in browseables if b.auto_complete is not None
        ]
datahub/cli/migrate.py CHANGED
@@ -179,7 +179,7 @@ def dataplatform2instance_func(
 
    if not force and not dry_run:
        # get a confirmation from the operator before proceeding if this is not a dry run
-        sampled_urns_to_migrate = random.choices(
+        sampled_urns_to_migrate = random.sample(
            urns_to_migrate, k=min(10, len(urns_to_migrate))
        )
        sampled_new_urns: List[str] = [
@@ -193,7 +193,7 @@ def dataplatform2instance_func(
            if key
        ]
        click.echo(
-            f"Will migrate {len(urns_to_migrate)} urns such as {random.choices(urns_to_migrate, k=min(10, len(urns_to_migrate)))}"
+            f"Will migrate {len(urns_to_migrate)} urns such as {random.sample(urns_to_migrate, k=min(10, len(urns_to_migrate)))}"
        )
        click.echo(f"New urns will look like {sampled_new_urns}")
        click.confirm("Ok to proceed?", abort=True)
@@ -426,9 +426,9 @@ def batch_get_ids(
            entities_yielded += 1
            log.debug(f"yielding {x}")
            yield x
-        assert (
-            entities_yielded == num_entities
-        ), "Did not delete all entities, try running this command again!"
+        assert entities_yielded == num_entities, (
+            "Did not delete all entities, try running this command again!"
+        )
    else:
        log.error(f"Failed to execute batch get with {str(response.content)}")
        response.raise_for_status()
datahub/cli/specific/assertions_cli.py CHANGED
@@ -136,9 +136,9 @@ def extras_list_to_dict(extras: List[str]) -> Dict[str, str]:
    extra_properties: Dict[str, str] = dict()
    for x in extras:
        parts = x.split("=")
-        assert (
-            len(parts) == 2
-        ), f"Invalid value for extras {x}, should be in format key=value"
+        assert len(parts) == 2, (
+            f"Invalid value for extras {x}, should be in format key=value"
+        )
        extra_properties[parts[0]] = parts[1]
    return extra_properties
 
datahub/cli/specific/structuredproperties_cli.py CHANGED
@@ -1,9 +1,11 @@
 import json
 import logging
 from pathlib import Path
+from typing import Iterable
 
 import click
 from click_default_group import DefaultGroup
+from ruamel.yaml import YAML
 
 from datahub.api.entities.structuredproperties.structuredproperties import (
     StructuredProperties,
@@ -61,3 +63,85 @@ def get(urn: str, to_file: str) -> None:
        )
    else:
        click.secho(f"Structured property {urn} does not exist")
+
+
+@properties.command(
+    name="list",
+)
+@click.option("--details/--no-details", is_flag=True, default=True)
+@click.option("--to-file", required=False, type=str)
+@telemetry.with_telemetry()
+def list(details: bool, to_file: str) -> None:
+    """List structured properties in DataHub"""
+
+    def to_yaml_list(
+        objects: Iterable[StructuredProperties],  # iterable of objects to dump
+        file: Path,
+    ) -> None:
+        # if file exists, first we read it
+        yaml = YAML(typ="rt")  # default, if not specfied, is 'rt' (round-trip)
+        yaml.indent(mapping=2, sequence=4, offset=2)
+        yaml.default_flow_style = False
+        serialized_objects = []
+        if file.exists():
+            with open(file, "r") as fp:
+                existing_objects = yaml.load(fp)  # this is a list of dicts
+                existing_objects = [
+                    StructuredProperties.parse_obj(obj) for obj in existing_objects
+                ]
+            objects = [obj for obj in objects]
+            # do a positional update of the existing objects
+            existing_urns = {obj.urn for obj in existing_objects}
+            # existing_urns = {obj["urn"] if "urn" in obj else f"urn:li:structuredProperty:{obj['id']}" for obj in existing_objects}
+            for i, obj in enumerate(existing_objects):
+                # existing_urn = obj["urn"] if "urn" in obj else f"urn:li:structuredProperty:{obj['id']}"
+                existing_urn = obj.urn
+                # breakpoint()
+                if existing_urn in {obj.urn for obj in objects}:
+                    existing_objects[i] = next(
+                        obj.dict(exclude_unset=True, exclude_none=True)
+                        for obj in objects
+                        if obj.urn == existing_urn
+                    )
+            new_objects = [
+                obj.dict(exclude_unset=True, exclude_none=True)
+                for obj in objects
+                if obj.urn not in existing_urns
+            ]
+            serialized_objects = existing_objects + new_objects
+        else:
+            serialized_objects = [
+                obj.dict(exclude_unset=True, exclude_none=True) for obj in objects
+            ]
+
+        with open(file, "w") as fp:
+            yaml.dump(serialized_objects, fp)
+
+    with get_default_graph() as graph:
+        if details:
+            logger.info(
+                "Listing structured properties with details. Use --no-details for urns only"
+            )
+            structuredproperties = StructuredProperties.list(graph)
+            if to_file:
+                to_yaml_list(structuredproperties, Path(to_file))
+            else:
+                for structuredproperty in structuredproperties:
+                    click.secho(
+                        f"{json.dumps(structuredproperty.dict(exclude_unset=True, exclude_none=True), indent=2)}"
+                    )
+        else:
+            logger.info(
+                "Listing structured property urns only, use --details for more information"
+            )
+            structured_property_urns = StructuredProperties.list_urns(graph)
+            if to_file:
+                with open(to_file, "w") as f:
+                    for urn in structured_property_urns:
+                        f.write(f"{urn}\n")
+                click.secho(
+                    f"Structured property urns written to {to_file}", fg="green"
+                )
+            else:
+                for urn in structured_property_urns:
+                    click.secho(f"{urn}")
datahub/cli/timeline_cli.py CHANGED
@@ -50,7 +50,7 @@ def pretty_id(id: Optional[str]) -> str:
    if id.startswith("urn:li:dataset"):
        dataset_key = dataset_urn_to_key(id)
        if dataset_key:
-            return f"{click.style('dataset', fg='cyan')}:{click.style(dataset_key.platform[len('urn:li:dataPlatform:'):], fg='white')}:{click.style(dataset_key.name, fg='white')}"
+            return f"{click.style('dataset', fg='cyan')}:{click.style(dataset_key.platform[len('urn:li:dataPlatform:') :], fg='white')}:{click.style(dataset_key.name, fg='white')}"
    # failed to prettify, return original
    return id
 
datahub/configuration/common.py CHANGED
@@ -200,8 +200,7 @@ class IgnorableError(MetaError):
 
 @runtime_checkable
 class ExceptionWithProps(Protocol):
-    def get_telemetry_props(self) -> Dict[str, Any]:
-        ...
+    def get_telemetry_props(self) -> Dict[str, Any]: ...
 
 
 def should_show_stack_trace(exc: Exception) -> bool:
datahub/configuration/config_loader.py CHANGED
@@ -19,64 +19,87 @@ from datahub.configuration.yaml import YamlConfigurationMechanism
 Environ = Mapping[str, str]
 
 
-def _resolve_element(element: str, environ: Environ) -> str:
-    if re.search(r"(\$\{).+(\})", element):
-        return expand(element, nounset=True, environ=environ)
-    elif element.startswith("$"):
-        try:
-            return expand(element, nounset=True, environ=environ)
-        except UnboundVariable:
-            return element
-    else:
-        return element
-
-
-def _resolve_list(ele_list: list, environ: Environ) -> list:
-    new_v: list = []
-    for ele in ele_list:
-        if isinstance(ele, str):
-            new_v.append(_resolve_element(ele, environ=environ))
-        elif isinstance(ele, list):
-            new_v.append(_resolve_list(ele, environ=environ))
-        elif isinstance(ele, dict):
-            new_v.append(resolve_env_variables(ele, environ=environ))
-        else:
-            new_v.append(ele)
-    return new_v
-
-
 def resolve_env_variables(config: dict, environ: Environ) -> dict:
-    new_dict: Dict[Any, Any] = {}
-    for k, v in config.items():
-        if isinstance(v, dict):
-            new_dict[k] = resolve_env_variables(v, environ=environ)
-        elif isinstance(v, list):
-            new_dict[k] = _resolve_list(v, environ=environ)
-        elif isinstance(v, str):
-            new_dict[k] = _resolve_element(v, environ=environ)
-        else:
-            new_dict[k] = v
-    return new_dict
+    # TODO: This is kept around for backwards compatibility.
+    return EnvResolver(environ).resolve(config)
 
 
 def list_referenced_env_variables(config: dict) -> Set[str]:
-    # This is a bit of a hack, but expandvars does a bunch of escaping
-    # and other logic that we don't want to duplicate here.
+    # TODO: This is kept around for backwards compatibility.
+    return EnvResolver(environ=os.environ).list_referenced_variables(config)
+
+
+class EnvResolver:
+    def __init__(self, environ: Environ, strict_env_syntax: bool = False):
+        self.environ = environ
+        self.strict_env_syntax = strict_env_syntax
 
-    vars = set()
+    def resolve(self, config: dict) -> dict:
+        return self._resolve_dict(config)
 
-    def mock_get_env(key: str, default: Optional[str] = None) -> str:
-        vars.add(key)
-        if default is not None:
-            return default
-        return "mocked_value"
+    @classmethod
+    def list_referenced_variables(
+        cls,
+        config: dict,
+        strict_env_syntax: bool = False,
+    ) -> Set[str]:
+        # This is a bit of a hack, but expandvars does a bunch of escaping
+        # and other logic that we don't want to duplicate here.
 
-    mock = unittest.mock.MagicMock()
-    mock.get.side_effect = mock_get_env
+        vars = set()
 
-    resolve_env_variables(config, environ=mock)
+        def mock_get_env(key: str, default: Optional[str] = None) -> str:
+            vars.add(key)
+            if default is not None:
+                return default
+            return "mocked_value"
+
+        mock = unittest.mock.MagicMock()
+        mock.get.side_effect = mock_get_env
+
+        resolver = EnvResolver(environ=mock, strict_env_syntax=strict_env_syntax)
+        resolver._resolve_dict(config)
+
+        return vars
+
+    def _resolve_element(self, element: str) -> str:
+        if re.search(r"(\$\{).+(\})", element):
+            return expand(element, nounset=True, environ=self.environ)
+        elif not self.strict_env_syntax and element.startswith("$"):
+            try:
+                return expand(element, nounset=True, environ=self.environ)
+            except UnboundVariable:
+                # TODO: This fallback is kept around for backwards compatibility, but
+                # doesn't make a ton of sense from first principles.
+                return element
+        else:
+            return element
 
-    return vars
+    def _resolve_list(self, ele_list: list) -> list:
+        new_v: list = []
+        for ele in ele_list:
+            if isinstance(ele, str):
+                new_v.append(self._resolve_element(ele))
+            elif isinstance(ele, list):
+                new_v.append(self._resolve_list(ele))
+            elif isinstance(ele, dict):
+                new_v.append(self._resolve_dict(ele))
+            else:
+                new_v.append(ele)
+        return new_v
+
+    def _resolve_dict(self, config: dict) -> dict:
+        new_dict: Dict[Any, Any] = {}
+        for k, v in config.items():
+            if isinstance(v, dict):
+                new_dict[k] = self._resolve_dict(v)
+            elif isinstance(v, list):
+                new_dict[k] = self._resolve_list(v)
+            elif isinstance(v, str):
+                new_dict[k] = self._resolve_element(v)
+            else:
+                new_dict[k] = v
+        return new_dict
 
 
 WRITE_TO_FILE_DIRECTIVE_PREFIX = "__DATAHUB_TO_FILE_"
@@ -159,7 +182,7 @@ def load_config_file(
 
    config = raw_config.copy()
    if resolve_env_vars:
-        config = resolve_env_variables(config, environ=os.environ)
+        config = EnvResolver(environ=os.environ).resolve(config)
    if process_directives:
        config = _process_directives(config)
 
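The refactor folds the three module-level resolver functions into an EnvResolver class, keeps resolve_env_variables and list_referenced_env_variables as thin backwards-compatible wrappers, and adds an opt-in strict_env_syntax flag: when set, only ${VAR} references are expanded and bare $VAR strings pass through untouched. A usage sketch against a hypothetical recipe dict (it assumes DATAHUB_GMS_URL is set in the environment, since expand(..., nounset=True) raises for unset ${...} references):

    import os

    from datahub.configuration.config_loader import EnvResolver

    config = {"sink": {"server": "${DATAHUB_GMS_URL}", "note": "$PRICE"}}

    # Lenient (default): both ${...} and bare $VAR are treated as references;
    # an unset bare $VAR falls back to the literal string.
    resolved = EnvResolver(environ=os.environ).resolve(config)

    # Strict: only ${...} is expanded; "$PRICE" passes through verbatim.
    strict = EnvResolver(environ=os.environ, strict_env_syntax=True).resolve(config)

    # Static inspection, without reading the real environment:
    print(EnvResolver.list_referenced_variables(config))
    # -> {'DATAHUB_GMS_URL', 'PRICE'} under the default lenient syntax
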
datahub/configuration/git.py CHANGED
@@ -121,9 +121,9 @@ class GitInfo(GitReference):
 
        repo: str = values["repo"]
        if repo.startswith(_GITHUB_PREFIX):
-            return f"git@github.com:{repo[len(_GITHUB_PREFIX):]}.git"
+            return f"git@github.com:{repo[len(_GITHUB_PREFIX) :]}.git"
        elif repo.startswith(_GITLAB_PREFIX):
-            return f"git@gitlab.com:{repo[len(_GITLAB_PREFIX):]}.git"
+            return f"git@gitlab.com:{repo[len(_GITLAB_PREFIX) :]}.git"
        else:
            raise ValueError(
                "Unable to infer repo_ssh_locator from repo. Please set repo_ssh_locator manually."