acryl-datahub 0.14.1.13rc9__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of acryl-datahub has been flagged as a potentially problematic release.

Files changed (133)
  1. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2348 -2298
  2. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +130 -125
  3. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
  6. datahub/cli/cli_utils.py +2 -0
  7. datahub/cli/delete_cli.py +103 -24
  8. datahub/cli/ingest_cli.py +110 -0
  9. datahub/cli/put_cli.py +1 -1
  10. datahub/cli/specific/dataproduct_cli.py +1 -1
  11. datahub/cli/specific/structuredproperties_cli.py +2 -1
  12. datahub/configuration/common.py +3 -3
  13. datahub/configuration/git.py +7 -1
  14. datahub/configuration/kafka_consumer_config.py +31 -1
  15. datahub/emitter/mcp_patch_builder.py +43 -0
  16. datahub/emitter/rest_emitter.py +17 -4
  17. datahub/ingestion/api/incremental_properties_helper.py +69 -0
  18. datahub/ingestion/api/source.py +6 -1
  19. datahub/ingestion/api/source_helpers.py +4 -2
  20. datahub/ingestion/graph/client.py +2 -0
  21. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
  22. datahub/ingestion/run/pipeline.py +6 -5
  23. datahub/ingestion/run/pipeline_config.py +6 -0
  24. datahub/ingestion/sink/datahub_rest.py +15 -4
  25. datahub/ingestion/source/abs/source.py +4 -0
  26. datahub/ingestion/source/aws/aws_common.py +13 -1
  27. datahub/ingestion/source/aws/sagemaker.py +8 -0
  28. datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
  29. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
  30. datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
  31. datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
  32. datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
  33. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  34. datahub/ingestion/source/common/subtypes.py +2 -0
  35. datahub/ingestion/source/csv_enricher.py +1 -1
  36. datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
  37. datahub/ingestion/source/datahub/datahub_source.py +8 -1
  38. datahub/ingestion/source/dbt/dbt_common.py +7 -61
  39. datahub/ingestion/source/dremio/dremio_api.py +204 -86
  40. datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
  41. datahub/ingestion/source/dremio/dremio_config.py +5 -0
  42. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
  43. datahub/ingestion/source/dremio/dremio_entities.py +4 -0
  44. datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
  45. datahub/ingestion/source/dremio/dremio_source.py +7 -2
  46. datahub/ingestion/source/elastic_search.py +1 -1
  47. datahub/ingestion/source/feast.py +97 -6
  48. datahub/ingestion/source/gc/datahub_gc.py +46 -35
  49. datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
  50. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
  51. datahub/ingestion/source/ge_data_profiler.py +23 -1
  52. datahub/ingestion/source/iceberg/iceberg.py +12 -5
  53. datahub/ingestion/source/kafka/kafka.py +39 -19
  54. datahub/ingestion/source/kafka/kafka_connect.py +81 -51
  55. datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
  56. datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
  57. datahub/ingestion/source/looker/view_upstream.py +65 -30
  58. datahub/ingestion/source/metadata/business_glossary.py +35 -18
  59. datahub/ingestion/source/mode.py +0 -23
  60. datahub/ingestion/source/neo4j/__init__.py +0 -0
  61. datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
  62. datahub/ingestion/source/powerbi/__init__.py +0 -1
  63. datahub/ingestion/source/powerbi/config.py +3 -3
  64. datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
  65. datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
  66. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
  67. datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
  68. datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
  69. datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
  70. datahub/ingestion/source/powerbi/powerbi.py +12 -6
  71. datahub/ingestion/source/preset.py +1 -0
  72. datahub/ingestion/source/pulsar.py +21 -2
  73. datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
  74. datahub/ingestion/source/redash.py +13 -63
  75. datahub/ingestion/source/redshift/config.py +1 -0
  76. datahub/ingestion/source/redshift/redshift.py +3 -0
  77. datahub/ingestion/source/s3/source.py +2 -3
  78. datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
  79. datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
  80. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
  81. datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
  82. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  83. datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
  84. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
  85. datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
  86. datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
  87. datahub/ingestion/source/sql/athena.py +46 -22
  88. datahub/ingestion/source/sql/mssql/source.py +0 -2
  89. datahub/ingestion/source/sql/sql_common.py +34 -21
  90. datahub/ingestion/source/sql/sql_report.py +1 -0
  91. datahub/ingestion/source/sql/sql_types.py +85 -8
  92. datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
  93. datahub/ingestion/source/superset.py +215 -65
  94. datahub/ingestion/source/tableau/tableau.py +237 -76
  95. datahub/ingestion/source/tableau/tableau_common.py +12 -6
  96. datahub/ingestion/source/tableau/tableau_constant.py +2 -0
  97. datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
  98. datahub/ingestion/source/tableau/tableau_validation.py +48 -0
  99. datahub/ingestion/source/unity/proxy_types.py +1 -0
  100. datahub/ingestion/source/unity/source.py +4 -0
  101. datahub/ingestion/source/unity/usage.py +20 -11
  102. datahub/ingestion/transformer/add_dataset_tags.py +1 -1
  103. datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
  104. datahub/integrations/assertion/common.py +1 -1
  105. datahub/lite/duckdb_lite.py +12 -17
  106. datahub/metadata/_schema_classes.py +512 -392
  107. datahub/metadata/_urns/urn_defs.py +1355 -1355
  108. datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
  109. datahub/metadata/schema.avsc +17222 -17499
  110. datahub/metadata/schemas/FormInfo.avsc +4 -0
  111. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
  112. datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
  113. datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
  114. datahub/specific/chart.py +0 -39
  115. datahub/specific/dashboard.py +0 -39
  116. datahub/specific/datajob.py +7 -57
  117. datahub/sql_parsing/schema_resolver.py +23 -0
  118. datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
  119. datahub/sql_parsing/sqlglot_lineage.py +55 -14
  120. datahub/sql_parsing/sqlglot_utils.py +8 -2
  121. datahub/telemetry/telemetry.py +23 -9
  122. datahub/testing/compare_metadata_json.py +1 -1
  123. datahub/testing/doctest.py +12 -0
  124. datahub/utilities/file_backed_collections.py +35 -2
  125. datahub/utilities/partition_executor.py +1 -1
  126. datahub/utilities/urn_encoder.py +2 -1
  127. datahub/utilities/urns/_urn_base.py +1 -1
  128. datahub/utilities/urns/structured_properties_urn.py +1 -1
  129. datahub/utilities/sql_lineage_parser_impl.py +0 -160
  130. datahub/utilities/sql_parser.py +0 -94
  131. datahub/utilities/sql_parser_base.py +0 -21
  132. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
  133. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0
{acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt CHANGED
@@ -68,12 +68,13 @@ mode = datahub.ingestion.source.mode:ModeSource
  mongodb = datahub.ingestion.source.mongodb:MongoDBSource
  mssql = datahub.ingestion.source.sql.mssql:SQLServerSource
  mysql = datahub.ingestion.source.sql.mysql:MySQLSource
+ neo4j = datahub.ingestion.source.neo4j.neo4j_source:Neo4jSource
  nifi = datahub.ingestion.source.nifi:NifiSource
  okta = datahub.ingestion.source.identity.okta:OktaSource
  openapi = datahub.ingestion.source.openapi:OpenApiSource
  oracle = datahub.ingestion.source.sql.oracle:OracleSource
  postgres = datahub.ingestion.source.sql.postgres:PostgresSource
- powerbi = datahub.ingestion.source.powerbi:PowerBiDashboardSource
+ powerbi = datahub.ingestion.source.powerbi.powerbi:PowerBiDashboardSource
  powerbi-report-server = datahub.ingestion.source.powerbi_report_server:PowerBiReportServerDashboardSource
  preset = datahub.ingestion.source.preset:PresetSource
  presto = datahub.ingestion.source.sql.presto:PrestoSource
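
The neo4j plugin added above and the relocated powerbi module are both wired up through Python entry points, so a registered source can be resolved by name at runtime. A minimal sketch, assuming the sources are registered under the entry-point group datahub.ingestion.source.plugins (confirm against entry_points.txt in the wheel) and the Python 3.10+ importlib.metadata API:

    from importlib.metadata import entry_points

    # Resolve a registered ingestion source class by its entry-point name.
    # The group name is an assumption based on how this wheel registers sources.
    source_plugins = entry_points(group="datahub.ingestion.source.plugins")
    neo4j_source_cls = source_plugins["neo4j"].load()
    print(neo4j_source_cls)  # expected: the Neo4jSource class registered above
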
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings
 
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "0.14.1.13rc9"
+ __version__ = "0.15.0"
 
 
  def is_dev_mode() -> bool:
datahub/api/entities/structuredproperties/structuredproperties.py CHANGED
@@ -1,8 +1,7 @@
  import logging
- from contextlib import contextmanager
  from enum import Enum
  from pathlib import Path
- from typing import Generator, List, Optional
+ from typing import List, Optional
 
  import yaml
  from pydantic import validator
@@ -10,39 +9,18 @@ from ruamel.yaml import YAML
 
  from datahub.configuration.common import ConfigModel
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
- from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
+ from datahub.ingestion.graph.client import DataHubGraph
  from datahub.metadata.schema_classes import (
      PropertyValueClass,
      StructuredPropertyDefinitionClass,
  )
- from datahub.utilities.urns.urn import Urn
+ from datahub.metadata.urns import StructuredPropertyUrn, Urn
+ from datahub.utilities.urns._urn_base import URN_TYPES
 
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)
 
 
- class StructuredPropertiesConfig:
-     """Configuration class to hold the graph client"""
-
-     _graph: Optional[DataHubGraph] = None
-
-     @classmethod
-     @contextmanager
-     def use_graph(cls, graph: DataHubGraph) -> Generator[None, None, None]:
-         """Context manager to temporarily set a custom graph"""
-         previous_graph = cls._graph
-         cls._graph = graph
-         try:
-             yield
-         finally:
-             cls._graph = previous_graph
-
-     @classmethod
-     def get_graph(cls) -> DataHubGraph:
-         """Get the current graph, falling back to default if none set"""
-         return cls._graph if cls._graph is not None else get_default_graph()
-
-
  class AllowedTypes(Enum):
      STRING = "string"
      RICH_TEXT = "rich_text"
@@ -64,29 +42,28 @@ class AllowedValue(ConfigModel):
      description: Optional[str] = None
 
 
- VALID_ENTITY_TYPES_PREFIX_STRING = ", ".join(
-     [
-         f"urn:li:entityType:datahub.{x}"
-         for x in ["dataset", "dashboard", "dataFlow", "schemaField"]
-     ]
- )
- VALID_ENTITY_TYPES_STRING = f"Valid entity type urns are {VALID_ENTITY_TYPES_PREFIX_STRING}, etc... Ensure that the entity type is valid."
+ VALID_ENTITY_TYPE_URNS = [
+     Urn.make_entity_type_urn(entity_type) for entity_type in URN_TYPES.keys()
+ ]
+ _VALID_ENTITY_TYPES_STRING = f"Valid entity type urns are {', '.join(VALID_ENTITY_TYPE_URNS)}, etc... Ensure that the entity type is valid."
+
+
+ def _validate_entity_type_urn(v: str) -> str:
+     urn = Urn.make_entity_type_urn(v)
+     if urn not in VALID_ENTITY_TYPE_URNS:
+         raise ValueError(
+             f"Input {v} is not a valid entity type urn. {_VALID_ENTITY_TYPES_STRING}"
+         )
+     v = str(urn)
+     return v
 
 
  class TypeQualifierAllowedTypes(ConfigModel):
      allowed_types: List[str]
 
-     @validator("allowed_types", each_item=True)
-     def validate_allowed_types(cls, v):
-         if v:
-             graph = StructuredPropertiesConfig.get_graph()
-             validated_urn = Urn.make_entity_type_urn(v)
-             if not graph.exists(validated_urn):
-                 raise ValueError(
-                     f"Input {v} is not a valid entity type urn. {VALID_ENTITY_TYPES_STRING}"
-                 )
-             v = str(validated_urn)
-         return v
+     _check_allowed_types = validator("allowed_types", each_item=True, allow_reuse=True)(
+         _validate_entity_type_urn
+     )
 
 
  class StructuredProperties(ConfigModel):
@@ -103,26 +80,36 @@ class StructuredProperties(ConfigModel):
      type_qualifier: Optional[TypeQualifierAllowedTypes] = None
      immutable: Optional[bool] = False
 
-     @validator("entity_types", each_item=True)
-     def validate_entity_types(cls, v):
-         if v:
-             graph = StructuredPropertiesConfig.get_graph()
-             validated_urn = Urn.make_entity_type_urn(v)
-             if not graph.exists(validated_urn):
-                 raise ValueError(
-                     f"Input {v} is not a valid entity type urn. {VALID_ENTITY_TYPES_STRING}"
-                 )
-             v = str(validated_urn)
+     _check_entity_types = validator("entity_types", each_item=True, allow_reuse=True)(
+         _validate_entity_type_urn
+     )
+
+     @validator("type")
+     def validate_type(cls, v: str) -> str:
+         # Convert to lowercase if needed
+         if not v.islower():
+             logger.warning(
+                 f"Structured property type should be lowercase. Updated to {v.lower()}"
+             )
+             v = v.lower()
+
+         # Check if type is allowed
+         if not AllowedTypes.check_allowed_type(v):
+             raise ValueError(
+                 f"Type {v} is not allowed. Allowed types are {AllowedTypes.values()}"
+             )
          return v
 
      @property
      def fqn(self) -> str:
          assert self.urn is not None
-         return (
-             self.qualified_name
-             or self.id
-             or Urn.create_from_string(self.urn).get_entity_id()[0]
-         )
+         id = StructuredPropertyUrn.from_string(self.urn).id
+         if self.qualified_name is not None:
+             # ensure that qualified name and ID match
+             assert (
+                 self.qualified_name == id
+             ), "ID in the urn and the qualified_name must match"
+         return id
 
      @validator("urn", pre=True, always=True)
      def urn_must_be_present(cls, v, values):
@@ -133,100 +120,90 @@ class StructuredProperties(ConfigModel):
          return v
 
      @staticmethod
-     def create(file: str, graph: Optional[DataHubGraph] = None) -> None:
-         emitter: DataHubGraph = graph if graph else get_default_graph()
-         with StructuredPropertiesConfig.use_graph(emitter):
-             print("Using graph")
-             with open(file) as fp:
-                 structuredproperties: List[dict] = yaml.safe_load(fp)
-                 for structuredproperty_raw in structuredproperties:
-                     structuredproperty = StructuredProperties.parse_obj(
-                         structuredproperty_raw
-                     )
-                     if not structuredproperty.type.islower():
-                         structuredproperty.type = structuredproperty.type.lower()
-                         logger.warn(
-                             f"Structured property type should be lowercase. Updated to {structuredproperty.type}"
-                         )
-                     if not AllowedTypes.check_allowed_type(structuredproperty.type):
-                         raise ValueError(
-                             f"Type {structuredproperty.type} is not allowed. Allowed types are {AllowedTypes.values()}"
-                         )
-                     mcp = MetadataChangeProposalWrapper(
-                         entityUrn=structuredproperty.urn,
-                         aspect=StructuredPropertyDefinitionClass(
-                             qualifiedName=structuredproperty.fqn,
-                             valueType=Urn.make_data_type_urn(structuredproperty.type),
-                             displayName=structuredproperty.display_name,
-                             description=structuredproperty.description,
-                             entityTypes=[
-                                 Urn.make_entity_type_urn(entity_type)
-                                 for entity_type in structuredproperty.entity_types or []
-                             ],
-                             cardinality=structuredproperty.cardinality,
-                             immutable=structuredproperty.immutable,
-                             allowedValues=(
-                                 [
-                                     PropertyValueClass(
-                                         value=v.value, description=v.description
-                                     )
-                                     for v in structuredproperty.allowed_values
-                                 ]
-                                 if structuredproperty.allowed_values
-                                 else None
-                             ),
-                             typeQualifier=(
-                                 {
-                                     "allowedTypes": structuredproperty.type_qualifier.allowed_types
-                                 }
-                                 if structuredproperty.type_qualifier
-                                 else None
-                             ),
-                         ),
-                     )
-                     emitter.emit_mcp(mcp)
-
-                     logger.info(f"Created structured property {structuredproperty.urn}")
-
-     @classmethod
-     def from_datahub(cls, graph: DataHubGraph, urn: str) -> "StructuredProperties":
-         with StructuredPropertiesConfig.use_graph(graph):
-             structured_property: Optional[
-                 StructuredPropertyDefinitionClass
-             ] = graph.get_aspect(urn, StructuredPropertyDefinitionClass)
-             if structured_property is None:
-                 raise Exception(
-                     "StructuredPropertyDefinition aspect is None. Unable to create structured property."
-                 )
-             return StructuredProperties(
-                 urn=urn,
-                 qualified_name=structured_property.qualifiedName,
-                 display_name=structured_property.displayName,
-                 type=structured_property.valueType,
-                 description=structured_property.description,
-                 entity_types=structured_property.entityTypes,
-                 cardinality=structured_property.cardinality,
-                 allowed_values=(
+     def from_yaml(file: str) -> List["StructuredProperties"]:
+         with open(file) as fp:
+             structuredproperties: List[dict] = yaml.safe_load(fp)
+
+         result: List[StructuredProperties] = []
+         for structuredproperty_raw in structuredproperties:
+             result.append(StructuredProperties.parse_obj(structuredproperty_raw))
+         return result
+
+     def generate_mcps(self) -> List[MetadataChangeProposalWrapper]:
+         mcp = MetadataChangeProposalWrapper(
+             entityUrn=self.urn,
+             aspect=StructuredPropertyDefinitionClass(
+                 qualifiedName=self.fqn,
+                 valueType=Urn.make_data_type_urn(self.type),
+                 displayName=self.display_name,
+                 description=self.description,
+                 entityTypes=[
+                     Urn.make_entity_type_urn(entity_type)
+                     for entity_type in self.entity_types or []
+                 ],
+                 cardinality=self.cardinality,
+                 immutable=self.immutable,
+                 allowedValues=(
                      [
-                         AllowedValue(
-                             value=av.value,
-                             description=av.description,
-                         )
-                         for av in structured_property.allowedValues or []
+                         PropertyValueClass(value=v.value, description=v.description)
+                         for v in self.allowed_values
                      ]
-                     if structured_property.allowedValues is not None
+                     if self.allowed_values
                      else None
                  ),
-                 type_qualifier=(
-                     {
-                         "allowed_types": structured_property.typeQualifier.get(
-                             "allowedTypes"
-                         )
-                     }
-                     if structured_property.typeQualifier
+                 typeQualifier=(
+                     {"allowedTypes": self.type_qualifier.allowed_types}
+                     if self.type_qualifier
                      else None
                  ),
+             ),
+         )
+         return [mcp]
+
+     @staticmethod
+     def create(file: str, graph: DataHubGraph) -> None:
+         # TODO: Deprecate this method.
+         structuredproperties = StructuredProperties.from_yaml(file)
+         for structuredproperty in structuredproperties:
+             for mcp in structuredproperty.generate_mcps():
+                 graph.emit_mcp(mcp)
+
+             logger.info(f"Created structured property {structuredproperty.urn}")
+
+     @classmethod
+     def from_datahub(cls, graph: DataHubGraph, urn: str) -> "StructuredProperties":
+         structured_property: Optional[
+             StructuredPropertyDefinitionClass
+         ] = graph.get_aspect(urn, StructuredPropertyDefinitionClass)
+         if structured_property is None:
+             raise Exception(
+                 "StructuredPropertyDefinition aspect is None. Unable to create structured property."
              )
+         return StructuredProperties(
+             urn=urn,
+             qualified_name=structured_property.qualifiedName,
+             display_name=structured_property.displayName,
+             type=structured_property.valueType,
+             description=structured_property.description,
+             entity_types=structured_property.entityTypes,
+             cardinality=structured_property.cardinality,
+             allowed_values=(
+                 [
+                     AllowedValue(
+                         value=av.value,
+                         description=av.description,
+                     )
+                     for av in structured_property.allowedValues or []
+                 ]
+                 if structured_property.allowedValues is not None
+                 else None
+             ),
+             type_qualifier=(
+                 {"allowed_types": structured_property.typeQualifier.get("allowedTypes")}
+                 if structured_property.typeQualifier
+                 else None
+             ),
+         )
 
      def to_yaml(
          self,
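
The net effect of this refactor: YAML parsing and URN validation no longer need a live graph connection, MCP construction is isolated in generate_mcps, and create becomes a thin wrapper over both. A minimal sketch of the new flow, assuming a local structured_properties.yaml (hypothetical path) and a configured default DataHub connection:

    from datahub.api.entities.structuredproperties.structuredproperties import (
        StructuredProperties,
    )
    from datahub.ingestion.graph.client import get_default_graph

    # Parse and validate the YAML offline, then emit the resulting MCPs.
    properties = StructuredProperties.from_yaml("structured_properties.yaml")
    graph = get_default_graph()
    for prop in properties:
        for mcp in prop.generate_mcps():
            graph.emit_mcp(mcp)
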
datahub/cli/cli_utils.py CHANGED
@@ -327,6 +327,8 @@ def _ensure_valid_gms_url_acryl_cloud(url: str) -> str:
          url = f"{url}/gms"
      elif url.endswith("acryl.io/"):
          url = f"{url}gms"
+     if url.endswith("acryl.io/api/gms"):
+         url = url.replace("acryl.io/api/gms", "acryl.io/gms")
 
      return url
 
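The two added lines normalize legacy Acryl Cloud URLs that still point at the /api/gms path. A standalone sketch of just that behavior (the function name here is illustrative, not the real helper):

    def rewrite_legacy_api_gms(url: str) -> str:
        # Mirrors the added lines: an acryl.io/api/gms suffix is rewritten to acryl.io/gms.
        if url.endswith("acryl.io/api/gms"):
            url = url.replace("acryl.io/api/gms", "acryl.io/gms")
        return url

    assert rewrite_legacy_api_gms("https://example.acryl.io/api/gms") == "https://example.acryl.io/gms"
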
datahub/cli/delete_cli.py CHANGED
@@ -1,4 +1,5 @@
  import logging
+ from concurrent.futures import ThreadPoolExecutor, as_completed
  from dataclasses import dataclass
  from datetime import datetime
  from random import choices
@@ -214,14 +215,47 @@ def references(urn: str, dry_run: bool, force: bool) -> None:
 
 
  @delete.command()
- @click.option("--urn", required=True, type=str, help="the urn of the entity")
- def undo_by_filter(urn: str) -> None:
+ @click.option("--urn", required=False, type=str, help="the urn of the entity")
+ @click.option(
+     "-p",
+     "--platform",
+     required=False,
+     type=str,
+     help="Platform filter (e.g. snowflake)",
+ )
+ @click.option(
+     "-b",
+     "--batch-size",
+     required=False,
+     default=3000,
+     type=int,
+     help="Batch size when querying for entities to un-soft delete."
+     "Maximum 10000. Large batch sizes may cause timeouts.",
+ )
+ def undo_by_filter(
+     urn: Optional[str], platform: Optional[str], batch_size: int
+ ) -> None:
      """
-     Undo a soft deletion of an entity
+     Undo soft deletion by filters
      """
      graph = get_default_graph()
      logger.info(f"Using {graph}")
-     graph.set_soft_delete_status(urn=urn, delete=False)
+     if urn:
+         graph.set_soft_delete_status(urn=urn, delete=False)
+     else:
+         urns = list(
+             graph.get_urns_by_filter(
+                 platform=platform,
+                 query="*",
+                 status=RemovedStatusFilter.ONLY_SOFT_DELETED,
+                 batch_size=batch_size,
+             )
+         )
+         logger.info(f"Going to un-soft delete {len(urns)} urns")
+         urns_iter = progressbar.progressbar(urns, redirect_stdout=True)
+         for urn in urns_iter:
+             assert urn
+             graph.set_soft_delete_status(urn=urn, delete=False)
 
 
  @delete.command(no_args_is_help=True)
@@ -312,6 +346,9 @@ def undo_by_filter(urn: str) -> None:
      default=False,
      help="Only delete soft-deleted entities, for hard deletion",
  )
+ @click.option(
+     "--workers", type=int, default=1, help="Num of workers to use for deletion."
+ )
  @upgrade.check_upgrade
  @telemetry.with_telemetry()
  def by_filter(
@@ -329,6 +366,7 @@ def by_filter(
      batch_size: int,
      dry_run: bool,
      only_soft_deleted: bool,
+     workers: int = 1,
  ) -> None:
      """Delete metadata from datahub using a single urn or a combination of filters."""
 
@@ -349,16 +387,19 @@ def by_filter(
      # TODO: add some validation on entity_type
 
      if not force and not soft and not dry_run:
+         message = (
+             "Hard deletion will permanently delete data from DataHub and can be slow. "
+             "We generally recommend using soft deletes instead. "
+             "Do you want to continue?"
+         )
          if only_soft_deleted:
              click.confirm(
-                 "This will permanently delete data from DataHub. Do you want to continue?",
+                 message,
                  abort=True,
              )
          else:
              click.confirm(
-                 "Hard deletion will permanently delete data from DataHub and can be slow. "
-                 "We generally recommend using soft deletes instead. "
-                 "Do you want to continue?",
+                 message,
                  abort=True,
              )
 
@@ -429,26 +470,64 @@ def by_filter(
          abort=True,
      )
 
-     urns_iter = urns
-     if not delete_by_urn and not dry_run:
-         urns_iter = progressbar.progressbar(urns, redirect_stdout=True)
+     _delete_urns_parallel(
+         graph=graph,
+         urns=urns,
+         aspect_name=aspect,
+         soft=soft,
+         dry_run=dry_run,
+         delete_by_urn=delete_by_urn,
+         start_time=start_time,
+         end_time=end_time,
+         workers=workers,
+     )
+
 
-     # Run the deletion.
+ def _delete_urns_parallel(
+     graph: DataHubGraph,
+     urns: List[str],
+     delete_by_urn: bool,
+     start_time: Optional[datetime],
+     end_time: Optional[datetime],
+     aspect_name: Optional[str] = None,
+     soft: bool = True,
+     dry_run: bool = False,
+     workers: int = 1,
+ ) -> None:
      deletion_result = DeletionResult()
-     with PerfTimer() as timer:
-         for urn in urns_iter:
-             one_result = _delete_one_urn(
-                 graph=graph,
-                 urn=urn,
-                 aspect_name=aspect,
-                 soft=soft,
-                 dry_run=dry_run,
-                 start_time=start_time,
-                 end_time=end_time,
+
+     def process_urn(urn):
+         return _delete_one_urn(
+             graph=graph,
+             urn=urn,
+             aspect_name=aspect_name,
+             soft=soft,
+             dry_run=dry_run,
+             start_time=start_time,
+             end_time=end_time,
+         )
+
+     with PerfTimer() as timer, ThreadPoolExecutor(max_workers=workers) as executor:
+         future_to_urn = {executor.submit(process_urn, urn): urn for urn in urns}
+
+         completed_futures = as_completed(future_to_urn)
+         if not delete_by_urn and not dry_run:
+             futures_iter = progressbar.progressbar(
+                 as_completed(future_to_urn),
+                 max_value=len(future_to_urn),
+                 redirect_stdout=True,
              )
-             deletion_result.merge(one_result)
+         else:
+             futures_iter = completed_futures
+
+         for future in futures_iter:
+             try:
+                 one_result = future.result()
+                 deletion_result.merge(one_result)
+             except Exception as e:
+                 urn = future_to_urn[future]
+                 click.secho(f"Error processing URN {urn}: {e}", fg="red")
 
-     # Report out a summary of the deletion result.
      click.echo(
          deletion_result.format_message(
              dry_run=dry_run, soft=soft, time_sec=timer.elapsed_seconds()
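
The new --workers option drives the ThreadPoolExecutor fan-out in _delete_urns_parallel above. A stripped-down sketch of the same pattern, with delete_one standing in for _delete_one_urn and the progress bar omitted:

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def delete_all(urns, delete_one, workers=1):
        # Submit one deletion per URN and merge results as futures complete, so a
        # failing URN is reported without aborting the rest of the batch.
        results = []
        with ThreadPoolExecutor(max_workers=workers) as executor:
            future_to_urn = {executor.submit(delete_one, urn): urn for urn in urns}
            for future in as_completed(future_to_urn):
                try:
                    results.append(future.result())
                except Exception as e:
                    print(f"Error processing URN {future_to_urn[future]}: {e}")
        return results
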
datahub/cli/ingest_cli.py CHANGED
@@ -27,6 +27,7 @@ from datahub.utilities.perf_timer import PerfTimer
 
  logger = logging.getLogger(__name__)
 
+ INGEST_SRC_TABLE_COLUMNS = ["runId", "source", "startTime", "status", "URN"]
  RUNS_TABLE_COLUMNS = ["runId", "rows", "created at"]
  RUN_TABLE_COLUMNS = ["urn", "aspect name", "created at"]
 
@@ -437,6 +438,115 @@ def mcps(path: str) -> None:
      sys.exit(ret)
 
 
+ @ingest.command()
+ @click.argument("page_offset", type=int, default=0)
+ @click.argument("page_size", type=int, default=100)
+ @click.option("--urn", type=str, default=None, help="Filter by ingestion source URN.")
+ @click.option(
+     "--source", type=str, default=None, help="Filter by ingestion source name."
+ )
+ @upgrade.check_upgrade
+ @telemetry.with_telemetry()
+ def list_source_runs(page_offset: int, page_size: int, urn: str, source: str) -> None:
+     """List ingestion source runs with their details, optionally filtered by URN or source."""
+
+     query = """
+         query listIngestionRuns($input: ListIngestionSourcesInput!) {
+             listIngestionSources(input: $input) {
+                 ingestionSources {
+                     urn
+                     name
+                     executions {
+                         executionRequests {
+                             id
+                             result {
+                                 startTimeMs
+                                 status
+                             }
+                         }
+                     }
+                 }
+             }
+         }
+     """
+
+     # filter by urn and/or source using CONTAINS
+     filters = []
+     if urn:
+         filters.append({"field": "urn", "values": [urn], "condition": "CONTAIN"})
+     if source:
+         filters.append({"field": "name", "values": [source], "condition": "CONTAIN"})
+
+     variables = {
+         "input": {
+             "start": page_offset,
+             "count": page_size,
+             "filters": filters,
+         }
+     }
+
+     client = get_default_graph()
+     session = client._session
+     gms_host = client.config.server
+
+     url = f"{gms_host}/api/graphql"
+     try:
+         response = session.post(url, json={"query": query, "variables": variables})
+         response.raise_for_status()
+     except Exception as e:
+         click.echo(f"Error fetching data: {str(e)}")
+         return
+
+     try:
+         data = response.json()
+     except ValueError:
+         click.echo("Failed to parse JSON response from server.")
+         return
+
+     if not data:
+         click.echo("No response received from the server.")
+         return
+
+     # when urn or source filter does not match, exit gracefully
+     if (
+         not isinstance(data.get("data"), dict)
+         or "listIngestionSources" not in data["data"]
+     ):
+         click.echo("No matching ingestion sources found. Please check your filters.")
+         return
+
+     ingestion_sources = data["data"]["listIngestionSources"]["ingestionSources"]
+     if not ingestion_sources:
+         click.echo("No ingestion sources or executions found.")
+         return
+
+     rows = []
+     for ingestion_source in ingestion_sources:
+         urn = ingestion_source.get("urn", "N/A")
+         name = ingestion_source.get("name", "N/A")
+
+         executions = ingestion_source.get("executions", {}).get("executionRequests", [])
+         for execution in executions:
+             execution_id = execution.get("id", "N/A")
+             start_time = execution.get("result", {}).get("startTimeMs", "N/A")
+             start_time = (
+                 datetime.fromtimestamp(start_time / 1000).strftime("%Y-%m-%d %H:%M:%S")
+                 if start_time != "N/A"
+                 else "N/A"
+             )
+             status = execution.get("result", {}).get("status", "N/A")
+
+             rows.append([execution_id, name, start_time, status, urn])
+
+     click.echo(
+         tabulate(
+             rows,
+             headers=INGEST_SRC_TABLE_COLUMNS,
+             tablefmt="grid",
+         )
+     )
+
+
  @ingest.command()
  @click.argument("page_offset", type=int, default=0)
  @click.argument("page_size", type=int, default=100)
datahub/cli/put_cli.py CHANGED
@@ -105,7 +105,7 @@ def platform(
      """
 
      if name.startswith(f"urn:li:{DataPlatformUrn.ENTITY_TYPE}"):
-         platform_urn = DataPlatformUrn.create_from_string(name)
+         platform_urn = DataPlatformUrn.from_string(name)
          platform_name = platform_urn.get_entity_id_as_string()
      else:
          platform_name = name.lower()
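
The call site now uses the from_string constructor on the typed URN class instead of the older create_from_string alias. For illustration (the snowflake URN is only an example):

    from datahub.metadata.urns import DataPlatformUrn

    platform_urn = DataPlatformUrn.from_string("urn:li:dataPlatform:snowflake")
    print(platform_urn.get_entity_id_as_string())  # -> "snowflake"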