acryl-datahub 0.15.0rc23__py3-none-any.whl → 0.15.0rc25__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release. This version of acryl-datahub might be problematic.

@@ -1,4 +1,4 @@
- datahub/__init__.py,sha256=eOmo10Qg3UHdXM-mhXsProWUviox9Ng9kfUMS-B8xpo,575
+ datahub/__init__.py,sha256=U5x9yuhIDpX_smTqMjMG3FyKLV9w8fZiOK34-Pl2vd0,575
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
  datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -52,7 +52,7 @@ datahub/api/entities/forms/forms_graphql_constants.py,sha256=DKpnKlMKTjmnyrCTvp6
  datahub/api/entities/platformresource/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/api/entities/platformresource/platform_resource.py,sha256=pVAjv6NoH746Mfvdak7ji0eqlEcEeV-Ji7M5gyNXmds,10603
  datahub/api/entities/structuredproperties/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/api/entities/structuredproperties/structuredproperties.py,sha256=YO4mdn6BziOzvzoFe-g2KfZlOZy8gqwMyyzj_7vF4BY,8845
+ datahub/api/entities/structuredproperties/structuredproperties.py,sha256=tYEVp2oqJa9FhlrnbAf2Zw82WqicJI9lF0P5U9soY9E,7502
  datahub/api/graphql/__init__.py,sha256=5yl0dJxO-2d_QuykdJrDIbWq4ja9bo0t2dAEh89JOog,142
  datahub/api/graphql/assertion.py,sha256=ponITypRQ8vE8kiqRNpvdoniNJzi4aeBK97UvkF0VhA,2818
  datahub/api/graphql/base.py,sha256=9q637r6v-RGOd8Mk8HW2g0vt9zpqFexsQ5R6TPEHVbs,1614
@@ -85,7 +85,7 @@ datahub/cli/specific/dataset_cli.py,sha256=AwSmIiuV3XbgprW4_1Wj-EJq1OPqFyolSNczQ
  datahub/cli/specific/file_loader.py,sha256=YMyv_evdKyHSft5Tm_kOcqJ4ALpRmMm54ZJAyl7Nxqs,773
  datahub/cli/specific/forms_cli.py,sha256=OLVeG8NtK1eDBuUKCT5Ald35np8__f8mLzbZM_zUfWU,1484
  datahub/cli/specific/group_cli.py,sha256=xPUYk48VbVXLMj-z9VNW0RZzXOe4rQsc2jLwSOGCoec,1967
- datahub/cli/specific/structuredproperties_cli.py,sha256=Q-ew8JBPmSbyj2IGFaaMHs1VL4f6PP1I-MC-TuD80Z0,1887
+ datahub/cli/specific/structuredproperties_cli.py,sha256=qP7kHpN7y3cOR0IGZkD4PGlRzFqLdXqZ6yrbCKVmG8M,1937
  datahub/cli/specific/user_cli.py,sha256=jGAokb1NRu8obs6P2g4OL2NQdFgpUBa9De55TBBtun0,1897
  datahub/configuration/__init__.py,sha256=5TN3a7CWNsLRHpdj-sv2bxKWF2IslvJwE6EpNMFrIS4,123
  datahub/configuration/_config_enum.py,sha256=ul2hr5gMmdLvBINicFkMNMi1ApmnmZSwNdUYYted5nk,1447
@@ -138,8 +138,8 @@ datahub/ingestion/api/registry.py,sha256=LGElUdzhNQoEr-k2SN23mJaIYnA1PYfF97LQxBm
  datahub/ingestion/api/report.py,sha256=CpQHqLAoYGV4bxNIpYQugLY0EUoxROlp2NUM9ONHj_I,4364
  datahub/ingestion/api/report_helpers.py,sha256=WbUC1kQeaKqIagGV3XzfPmPs7slAT1mfNY4og2BH2A8,994
  datahub/ingestion/api/sink.py,sha256=6g01wou8pv79s0leDWyK12cgl7eLtpiwSUHqOw08vx4,4503
- datahub/ingestion/api/source.py,sha256=W_GkXkEXGdwwO0OEaR2BgxoBAATsvY9VIubCTXSHfB8,18774
- datahub/ingestion/api/source_helpers.py,sha256=ninruzG4MwJuEmkOzpqLONzVi4OOi2x3RLWoogoELY4,19708
+ datahub/ingestion/api/source.py,sha256=pHfFIBZa57ySpZWnt03mmayWLdbbBAGOhWqWZnf1KUA,18815
+ datahub/ingestion/api/source_helpers.py,sha256=k40ofHHPsfbYFJgZXWaD6ORvEa0SBmsaOacF1ttolcQ,19743
  datahub/ingestion/api/transform.py,sha256=X0GpjMJzYkLuZx8MTWxH50cWGm9rGsnn3k188mmC8J8,582
  datahub/ingestion/api/workunit.py,sha256=e8n8RfSjHZZm2R4ShNH0UuMtUkMjyqqM2j2t7oL74lo,6327
  datahub/ingestion/api/auto_work_units/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -202,7 +202,7 @@ datahub/ingestion/source/nifi.py,sha256=ttsjZ9aRUvINmewvKFIQD8Rwa4jcl35WFG-F-jPG
  datahub/ingestion/source/openapi.py,sha256=3ea2ORz1cuq4e7L2hSjxG9Cw3__pVoJ5UNYTJS3EnKU,17386
  datahub/ingestion/source/openapi_parser.py,sha256=1_68wHWe_SzWYEyC1YVDw9vxoadKjW1yv8DecvyIhwY,13606
  datahub/ingestion/source/preset.py,sha256=fByqamRLnXxsfCGdLPzWN_5LJR_s2_G2f_zwSKUc8EA,3981
- datahub/ingestion/source/pulsar.py,sha256=H8XJC7xIX8Kdkd7006PxllAGVO_Pjza8Xx9VUBOvpPc,19827
+ datahub/ingestion/source/pulsar.py,sha256=7rTOEqYmeOuRZl5DG8d5OFkb4l9H6-1bETZfa-4DfmI,20163
  datahub/ingestion/source/redash.py,sha256=g-wBJ4e54EdA2A2D5XmoNBilCDyh5b32M_C_fY1bhmA,30055
  datahub/ingestion/source/salesforce.py,sha256=S6LSM6mzl8-zKbrJPoINhM1SCpYfM244Xb74pbEI-J0,31792
  datahub/ingestion/source/source_registry.py,sha256=a2mLjJPLkSI-gYCTb_7U7Jo4D8jGknNQ_yScPIihXFk,1208
@@ -301,10 +301,10 @@ datahub/ingestion/source/fivetran/fivetran.py,sha256=uKbM5czPz-6LOseoh1FwavWDIuL
  datahub/ingestion/source/fivetran/fivetran_log_api.py,sha256=EAak3hJpe75WZSgz6wP_CyAT5Cian2N4a-lb8x1NKHk,12776
  datahub/ingestion/source/fivetran/fivetran_query.py,sha256=vLrTj7e-0NxZ2U4bWTB57pih42WirqPlUvwtIRfStlQ,5275
  datahub/ingestion/source/gc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/gc/datahub_gc.py,sha256=oWeaIGBDolz-Rf6qgGJ5VlQ9H1IY4hJFPAetoUWFyL4,12394
- datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=ficLiRb9DEx4YFXZqWO8o-6ndVIrNW_yR-Yn2SXfDxc,15836
+ datahub/ingestion/source/gc/datahub_gc.py,sha256=AHlKGwDD-E_TEHcJIpRtwk6ikjT-KiyfTo-BXZnMSk0,12114
+ datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=u90XEmW1vRFbvp4CQ8ujPxTGJUyJqO2U6ApcI6mFrjE,16588
  datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=cHJmxz4NmA7VjTX2iGEo3wZ_SDrjC_rCQcnRxKgfUVI,8713
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=lEqZEfNMoC7FoUKsZJ91x3WHo14cH8sCaG7PZRuYCQU,7353
+ datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=wRnRaIVUG483tY4nyDkEn6Xi2RL5MjrVvoCoZimqwSg,7514
  datahub/ingestion/source/gcs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/gcs/gcs_source.py,sha256=iwvj4JwjyVWRP1Vq106sUtQhh0GuOYVSu9zCa1wCZN0,6189
  datahub/ingestion/source/gcs/gcs_utils.py,sha256=_78KM863XXgkVLmZLtYGF5PJNnZas1go-XRtOq-79lo,1047
@@ -976,8 +976,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-0.15.0rc23.dist-info/METADATA,sha256=cPlJko8JF1pZEIihXKsAct2ai4okUHAMu8e3sAha7mU,173559
- acryl_datahub-0.15.0rc23.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
- acryl_datahub-0.15.0rc23.dist-info/entry_points.txt,sha256=Yj0PWB0LQOq4Rj2fyR6ETx4BUGw4TOcNL0ZNoAZ9kQg,9504
- acryl_datahub-0.15.0rc23.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-0.15.0rc23.dist-info/RECORD,,
+ acryl_datahub-0.15.0rc25.dist-info/METADATA,sha256=G7pCS6M1IzZefvuba5jKgr6IvehacgjV1JsauQfsMsk,173639
+ acryl_datahub-0.15.0rc25.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ acryl_datahub-0.15.0rc25.dist-info/entry_points.txt,sha256=Yj0PWB0LQOq4Rj2fyR6ETx4BUGw4TOcNL0ZNoAZ9kQg,9504
+ acryl_datahub-0.15.0rc25.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-0.15.0rc25.dist-info/RECORD,,
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings
 
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "0.15.0rc23"
+ __version__ = "0.15.0rc25"
 
 
  def is_dev_mode() -> bool:
datahub/api/entities/structuredproperties/structuredproperties.py CHANGED
@@ -9,27 +9,18 @@ from ruamel.yaml import YAML
 
  from datahub.configuration.common import ConfigModel
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
- from datahub.ingestion.api.global_context import get_graph_context, set_graph_context
- from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
+ from datahub.ingestion.graph.client import DataHubGraph
  from datahub.metadata.schema_classes import (
      PropertyValueClass,
      StructuredPropertyDefinitionClass,
  )
- from datahub.utilities.urns.urn import Urn
+ from datahub.metadata.urns import StructuredPropertyUrn, Urn
+ from datahub.utilities.urns._urn_base import URN_TYPES
 
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)
 
 
- class StructuredPropertiesConfig:
-     """Configuration class to hold the graph client"""
-
-     @classmethod
-     def get_graph_required(cls) -> DataHubGraph:
-         """Get the current graph, falling back to default if none set"""
-         return get_graph_context() or get_default_graph()
-
-
  class AllowedTypes(Enum):
      STRING = "string"
      RICH_TEXT = "rich_text"
@@ -51,29 +42,28 @@ class AllowedValue(ConfigModel)
      description: Optional[str] = None
 
 
- VALID_ENTITY_TYPES_PREFIX_STRING = ", ".join(
-     [
-         f"urn:li:entityType:datahub.{x}"
-         for x in ["dataset", "dashboard", "dataFlow", "schemaField"]
-     ]
- )
- VALID_ENTITY_TYPES_STRING = f"Valid entity type urns are {VALID_ENTITY_TYPES_PREFIX_STRING}, etc... Ensure that the entity type is valid."
+ VALID_ENTITY_TYPE_URNS = [
+     Urn.make_entity_type_urn(entity_type) for entity_type in URN_TYPES.keys()
+ ]
+ _VALID_ENTITY_TYPES_STRING = f"Valid entity type urns are {', '.join(VALID_ENTITY_TYPE_URNS)}, etc... Ensure that the entity type is valid."
+
+
+ def _validate_entity_type_urn(v: str) -> str:
+     urn = Urn.make_entity_type_urn(v)
+     if urn not in VALID_ENTITY_TYPE_URNS:
+         raise ValueError(
+             f"Input {v} is not a valid entity type urn. {_VALID_ENTITY_TYPES_STRING}"
+         )
+     v = str(urn)
+     return v
 
 
  class TypeQualifierAllowedTypes(ConfigModel):
      allowed_types: List[str]
 
-     @validator("allowed_types", each_item=True)
-     def validate_allowed_types(cls, v):
-         if v:
-             graph = StructuredPropertiesConfig.get_graph_required()
-             validated_urn = Urn.make_entity_type_urn(v)
-             if not graph.exists(validated_urn):
-                 raise ValueError(
-                     f"Input {v} is not a valid entity type urn. {VALID_ENTITY_TYPES_STRING}"
-                 )
-             v = str(validated_urn)
-         return v
+     _check_allowed_types = validator("allowed_types", each_item=True, allow_reuse=True)(
+         _validate_entity_type_urn
+     )
 
 
  class StructuredProperties(ConfigModel):
@@ -90,22 +80,30 @@
      type_qualifier: Optional[TypeQualifierAllowedTypes] = None
      immutable: Optional[bool] = False
 
-     @validator("entity_types", each_item=True)
-     def validate_entity_types(cls, v):
-         if v:
-             graph = StructuredPropertiesConfig.get_graph_required()
-             validated_urn = Urn.make_entity_type_urn(v)
-             if not graph.exists(validated_urn):
-                 raise ValueError(
-                     f"Input {v} is not a valid entity type urn. {VALID_ENTITY_TYPES_STRING}"
-                 )
-             v = str(validated_urn)
+     _check_entity_types = validator("entity_types", each_item=True, allow_reuse=True)(
+         _validate_entity_type_urn
+     )
+
+     @validator("type")
+     def validate_type(cls, v: str) -> str:
+         # Convert to lowercase if needed
+         if not v.islower():
+             logger.warning(
+                 f"Structured property type should be lowercase. Updated to {v.lower()}"
+             )
+             v = v.lower()
+
+         # Check if type is allowed
+         if not AllowedTypes.check_allowed_type(v):
+             raise ValueError(
+                 f"Type {v} is not allowed. Allowed types are {AllowedTypes.values()}"
+             )
          return v
 
      @property
      def fqn(self) -> str:
          assert self.urn is not None
-         id = Urn.create_from_string(self.urn).get_entity_id()[0]
+         id = StructuredPropertyUrn.from_string(self.urn).id
          if self.qualified_name is not None:
              # ensure that qualified name and ID match
              assert (
@@ -122,101 +120,90 @@
          return v
 
      @staticmethod
-     def create(file: str, graph: Optional[DataHubGraph] = None) -> None:
-         with set_graph_context(graph):
-             graph = StructuredPropertiesConfig.get_graph_required()
-
-             with open(file) as fp:
-                 structuredproperties: List[dict] = yaml.safe_load(fp)
-                 for structuredproperty_raw in structuredproperties:
-                     structuredproperty = StructuredProperties.parse_obj(
-                         structuredproperty_raw
-                     )
-
-                     if not structuredproperty.type.islower():
-                         structuredproperty.type = structuredproperty.type.lower()
-                         logger.warning(
-                             f"Structured property type should be lowercase. Updated to {structuredproperty.type}"
-                         )
-                     if not AllowedTypes.check_allowed_type(structuredproperty.type):
-                         raise ValueError(
-                             f"Type {structuredproperty.type} is not allowed. Allowed types are {AllowedTypes.values()}"
-                         )
-                     mcp = MetadataChangeProposalWrapper(
-                         entityUrn=structuredproperty.urn,
-                         aspect=StructuredPropertyDefinitionClass(
-                             qualifiedName=structuredproperty.fqn,
-                             valueType=Urn.make_data_type_urn(structuredproperty.type),
-                             displayName=structuredproperty.display_name,
-                             description=structuredproperty.description,
-                             entityTypes=[
-                                 Urn.make_entity_type_urn(entity_type)
-                                 for entity_type in structuredproperty.entity_types or []
-                             ],
-                             cardinality=structuredproperty.cardinality,
-                             immutable=structuredproperty.immutable,
-                             allowedValues=(
-                                 [
-                                     PropertyValueClass(
-                                         value=v.value, description=v.description
-                                     )
-                                     for v in structuredproperty.allowed_values
-                                 ]
-                                 if structuredproperty.allowed_values
-                                 else None
-                             ),
-                             typeQualifier=(
-                                 {
-                                     "allowedTypes": structuredproperty.type_qualifier.allowed_types
-                                 }
-                                 if structuredproperty.type_qualifier
-                                 else None
-                             ),
-                         ),
-                     )
-                     graph.emit_mcp(mcp)
-
-                     logger.info(f"Created structured property {structuredproperty.urn}")
-
-     @classmethod
-     def from_datahub(cls, graph: DataHubGraph, urn: str) -> "StructuredProperties":
-         with set_graph_context(graph):
-             structured_property: Optional[
-                 StructuredPropertyDefinitionClass
-             ] = graph.get_aspect(urn, StructuredPropertyDefinitionClass)
-             if structured_property is None:
-                 raise Exception(
-                     "StructuredPropertyDefinition aspect is None. Unable to create structured property."
-                 )
-             return StructuredProperties(
-                 urn=urn,
-                 qualified_name=structured_property.qualifiedName,
-                 display_name=structured_property.displayName,
-                 type=structured_property.valueType,
-                 description=structured_property.description,
-                 entity_types=structured_property.entityTypes,
-                 cardinality=structured_property.cardinality,
-                 allowed_values=(
+     def from_yaml(file: str) -> List["StructuredProperties"]:
+         with open(file) as fp:
+             structuredproperties: List[dict] = yaml.safe_load(fp)
+
+         result: List[StructuredProperties] = []
+         for structuredproperty_raw in structuredproperties:
+             result.append(StructuredProperties.parse_obj(structuredproperty_raw))
+         return result
+
+     def generate_mcps(self) -> List[MetadataChangeProposalWrapper]:
+         mcp = MetadataChangeProposalWrapper(
+             entityUrn=self.urn,
+             aspect=StructuredPropertyDefinitionClass(
+                 qualifiedName=self.fqn,
+                 valueType=Urn.make_data_type_urn(self.type),
+                 displayName=self.display_name,
+                 description=self.description,
+                 entityTypes=[
+                     Urn.make_entity_type_urn(entity_type)
+                     for entity_type in self.entity_types or []
+                 ],
+                 cardinality=self.cardinality,
+                 immutable=self.immutable,
+                 allowedValues=(
                      [
-                         AllowedValue(
-                             value=av.value,
-                             description=av.description,
-                         )
-                         for av in structured_property.allowedValues or []
+                         PropertyValueClass(value=v.value, description=v.description)
+                         for v in self.allowed_values
                      ]
-                     if structured_property.allowedValues is not None
+                     if self.allowed_values
                      else None
                  ),
-                 type_qualifier=(
-                     {
-                         "allowed_types": structured_property.typeQualifier.get(
-                             "allowedTypes"
-                         )
-                     }
-                     if structured_property.typeQualifier
+                 typeQualifier=(
+                     {"allowedTypes": self.type_qualifier.allowed_types}
+                     if self.type_qualifier
                      else None
                  ),
+             ),
+         )
+         return [mcp]
+
+     @staticmethod
+     def create(file: str, graph: DataHubGraph) -> None:
+         # TODO: Deprecate this method.
+         structuredproperties = StructuredProperties.from_yaml(file)
+         for structuredproperty in structuredproperties:
+             for mcp in structuredproperty.generate_mcps():
+                 graph.emit_mcp(mcp)
+
+             logger.info(f"Created structured property {structuredproperty.urn}")
+
+     @classmethod
+     def from_datahub(cls, graph: DataHubGraph, urn: str) -> "StructuredProperties":
+         structured_property: Optional[
+             StructuredPropertyDefinitionClass
+         ] = graph.get_aspect(urn, StructuredPropertyDefinitionClass)
+         if structured_property is None:
+             raise Exception(
+                 "StructuredPropertyDefinition aspect is None. Unable to create structured property."
              )
+         return StructuredProperties(
+             urn=urn,
+             qualified_name=structured_property.qualifiedName,
+             display_name=structured_property.displayName,
+             type=structured_property.valueType,
+             description=structured_property.description,
+             entity_types=structured_property.entityTypes,
+             cardinality=structured_property.cardinality,
+             allowed_values=(
+                 [
+                     AllowedValue(
+                         value=av.value,
+                         description=av.description,
+                     )
+                     for av in structured_property.allowedValues or []
+                 ]
+                 if structured_property.allowedValues is not None
+                 else None
+             ),
+             type_qualifier=(
+                 {"allowed_types": structured_property.typeQualifier.get("allowedTypes")}
+                 if structured_property.typeQualifier
+                 else None
+             ),
+         )
 
      def to_yaml(
          self,
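
Taken together, the structuredproperties.py hunks replace the old graph-coupled create() flow with a pure from_yaml() + generate_mcps() pipeline: validation now happens in Pydantic validators at parse time, so no graph connection is needed until emission. A minimal sketch of how the new pieces compose (the YAML path is hypothetical; get_default_graph() assumes a configured DataHub connection):

    from datahub.api.entities.structuredproperties.structuredproperties import (
        StructuredProperties,
    )
    from datahub.ingestion.graph.client import get_default_graph

    # Parsing and validation are now graph-free.
    props = StructuredProperties.from_yaml("structured_properties.yaml")

    with get_default_graph() as graph:
        for prop in props:
            # generate_mcps() only builds the proposals; emitting them
            # against the graph is a separate, explicit step.
            for mcp in prop.generate_mcps():
                graph.emit_mcp(mcp)
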
datahub/cli/specific/structuredproperties_cli.py CHANGED
@@ -31,7 +31,8 @@ def properties() -> None:
  def upsert(file: Path) -> None:
      """Upsert structured properties in DataHub."""
 
-     StructuredProperties.create(str(file))
+     with get_default_graph() as graph:
+         StructuredProperties.create(str(file), graph)
 
 
  @properties.command(
datahub/ingestion/api/source.py CHANGED
@@ -184,6 +184,7 @@ class StructuredLogs(Report):
 
  @dataclass
  class SourceReport(Report):
+     event_not_produced_warn: bool = True
      events_produced: int = 0
      events_produced_per_sec: int = 0
 
datahub/ingestion/api/source_helpers.py CHANGED
@@ -150,7 +150,7 @@ def auto_workunit_reporter(report: "SourceReport", stream: Iterable[T]) -> Itera
          report.report_workunit(wu)
          yield wu
 
-     if report.events_produced == 0:
+     if report.event_not_produced_warn and report.events_produced == 0:
          report.warning(
              title="No metadata was produced by the source",
              message="Please check the source configuration, filters, and permissions.",
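
The source.py and source_helpers.py hunks work together: SourceReport gains an event_not_produced_warn flag, and auto_workunit_reporter consults it before warning about an empty run. A source that legitimately emits no workunits can opt out; a minimal sketch:

    from datahub.ingestion.api.source import SourceReport

    report = SourceReport()
    # Pure maintenance sources may produce zero workunits by design;
    # clearing the flag suppresses the "No metadata was produced" warning.
    report.event_not_produced_warn = False

DataHubGcSource does exactly this in its constructor below.
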
datahub/ingestion/source/gc/datahub_gc.py CHANGED
@@ -65,18 +65,18 @@ class DataHubGcSourceConfig(ConfigModel):
          description="Sleep between truncation monitoring.",
      )
 
-     dataprocess_cleanup: Optional[DataProcessCleanupConfig] = Field(
-         default=None,
+     dataprocess_cleanup: DataProcessCleanupConfig = Field(
+         default_factory=DataProcessCleanupConfig,
          description="Configuration for data process cleanup",
      )
 
-     soft_deleted_entities_cleanup: Optional[SoftDeletedEntitiesCleanupConfig] = Field(
-         default=None,
+     soft_deleted_entities_cleanup: SoftDeletedEntitiesCleanupConfig = Field(
+         default_factory=SoftDeletedEntitiesCleanupConfig,
          description="Configuration for soft deleted entities cleanup",
      )
 
-     execution_request_cleanup: Optional[DatahubExecutionRequestCleanupConfig] = Field(
-         default=None,
+     execution_request_cleanup: DatahubExecutionRequestCleanupConfig = Field(
+         default_factory=DatahubExecutionRequestCleanupConfig,
          description="Configuration for execution request cleanup",
      )
 
@@ -108,28 +108,22 @@ class DataHubGcSource(Source):
          self.ctx = ctx
          self.config = config
          self.report = DataHubGcSourceReport()
+         self.report.event_not_produced_warn = False
          self.graph = ctx.require_graph("The DataHubGc source")
-         self.dataprocess_cleanup: Optional[DataProcessCleanup] = None
-         self.soft_deleted_entities_cleanup: Optional[SoftDeletedEntitiesCleanup] = None
-         self.execution_request_cleanup: Optional[DatahubExecutionRequestCleanup] = None
-
-         if self.config.dataprocess_cleanup:
-             self.dataprocess_cleanup = DataProcessCleanup(
-                 ctx, self.config.dataprocess_cleanup, self.report, self.config.dry_run
-             )
-         if self.config.soft_deleted_entities_cleanup:
-             self.soft_deleted_entities_cleanup = SoftDeletedEntitiesCleanup(
-                 ctx,
-                 self.config.soft_deleted_entities_cleanup,
-                 self.report,
-                 self.config.dry_run,
-             )
-         if self.config.execution_request_cleanup:
-             self.execution_request_cleanup = DatahubExecutionRequestCleanup(
-                 config=self.config.execution_request_cleanup,
-                 graph=self.graph,
-                 report=self.report,
-             )
+         self.dataprocess_cleanup = DataProcessCleanup(
+             ctx, self.config.dataprocess_cleanup, self.report, self.config.dry_run
+         )
+         self.soft_deleted_entities_cleanup = SoftDeletedEntitiesCleanup(
+             ctx,
+             self.config.soft_deleted_entities_cleanup,
+             self.report,
+             self.config.dry_run,
+         )
+         self.execution_request_cleanup = DatahubExecutionRequestCleanup(
+             config=self.config.execution_request_cleanup,
+             graph=self.graph,
+             report=self.report,
+         )
 
      @classmethod
      def create(cls, config_dict, ctx):
@@ -153,19 +147,19 @@ class DataHubGcSource(Source):
              self.truncate_indices()
          except Exception as e:
              self.report.failure("While trying to truncate indices ", exc=e)
-         if self.soft_deleted_entities_cleanup:
+         if self.config.soft_deleted_entities_cleanup.enabled:
              try:
                  self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
              except Exception as e:
                  self.report.failure(
                      "While trying to cleanup soft deleted entities ", exc=e
                  )
-         if self.execution_request_cleanup:
+         if self.config.execution_request_cleanup.enabled:
              try:
                  self.execution_request_cleanup.run()
              except Exception as e:
                  self.report.failure("While trying to cleanup execution request ", exc=e)
-         if self.dataprocess_cleanup:
+         if self.config.dataprocess_cleanup.enabled:
              try:
                  yield from self.dataprocess_cleanup.get_workunits_internal()
              except Exception as e:
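
The net effect of the datahub_gc.py hunks: all three cleanup sub-configs are always instantiated via default_factory, and whether each task runs is now decided by its own enabled flag rather than by the presence of its config section. An illustrative config dict for DataHubGcSource.create(config_dict, ctx); the keys mirror DataHubGcSourceConfig, the values are made up:

    # Every cleanup now defaults to enabled; disable one explicitly
    # instead of omitting its section.
    config_dict = {
        "dry_run": True,
        "dataprocess_cleanup": {"enabled": False},
        "soft_deleted_entities_cleanup": {"retention_days": 10},
    }
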
datahub/ingestion/source/gc/dataprocess_cleanup.py CHANGED
@@ -98,6 +98,9 @@ query getDataJobRuns($dataJobUrn: String!, $start: Int!, $count: Int!) {
 
 
  class DataProcessCleanupConfig(ConfigModel):
+     enabled: bool = Field(
+         default=True, description="Whether to do data process cleanup."
+     )
      retention_days: Optional[int] = Field(
          10,
          description="Number of days to retain metadata in DataHub",
@@ -371,17 +374,26 @@ class DataProcessCleanup:
          previous_scroll_id: Optional[str] = None
 
          while True:
-             result = self.ctx.graph.execute_graphql(
-                 DATAFLOW_QUERY,
-                 {
-                     "query": "*",
-                     "scrollId": scroll_id if scroll_id else None,
-                     "batchSize": self.config.batch_size,
-                 },
-             )
+             result = None
+             try:
+                 result = self.ctx.graph.execute_graphql(
+                     DATAFLOW_QUERY,
+                     {
+                         "query": "*",
+                         "scrollId": scroll_id if scroll_id else None,
+                         "batchSize": self.config.batch_size,
+                     },
+                 )
+             except Exception as e:
+                 self.report.failure(
+                     f"While trying to get dataflows with {scroll_id}", exc=e
+                 )
+                 break
+
              scrollAcrossEntities = result.get("scrollAcrossEntities")
              if not scrollAcrossEntities:
                  raise ValueError("Missing scrollAcrossEntities in response")
+             logger.info(f"Got {scrollAcrossEntities.get('count')} DataFlow entities")
 
              scroll_id = scrollAcrossEntities.get("nextScrollId")
              for flow in scrollAcrossEntities.get("searchResults"):
@@ -398,6 +410,8 @@
              previous_scroll_id = scroll_id
 
      def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+         if not self.config.enabled:
+             return []
          assert self.ctx.graph
 
          dataFlows: Dict[str, DataFlowEntity] = {}
@@ -411,14 +425,20 @@
          deleted_jobs: int = 0
 
          while True:
-             result = self.ctx.graph.execute_graphql(
-                 DATAJOB_QUERY,
-                 {
-                     "query": "*",
-                     "scrollId": scroll_id if scroll_id else None,
-                     "batchSize": self.config.batch_size,
-                 },
-             )
+             try:
+                 result = self.ctx.graph.execute_graphql(
+                     DATAJOB_QUERY,
+                     {
+                         "query": "*",
+                         "scrollId": scroll_id if scroll_id else None,
+                         "batchSize": self.config.batch_size,
+                     },
+                 )
+             except Exception as e:
+                 self.report.failure(
+                     f"While trying to get data jobs with {scroll_id}", exc=e
+                 )
+                 break
              scrollAcrossEntities = result.get("scrollAcrossEntities")
              if not scrollAcrossEntities:
                  raise ValueError("Missing scrollAcrossEntities in response")
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py CHANGED
@@ -20,6 +20,9 @@ logger = logging.getLogger(__name__)
 
 
  class SoftDeletedEntitiesCleanupConfig(ConfigModel):
+     enabled: bool = Field(
+         default=True, description="Whether to do soft deletion cleanup."
+     )
      retention_days: Optional[int] = Field(
          10,
          description="Number of days to retain metadata in DataHub",
@@ -156,6 +159,8 @@ class SoftDeletedEntitiesCleanup:
                  self.delete_entity(urn)
 
      def cleanup_soft_deleted_entities(self) -> None:
+         if not self.config.enabled:
+             return
          assert self.ctx.graph
          start_time = time.time()
 
datahub/ingestion/source/pulsar.py CHANGED
@@ -89,7 +89,16 @@ class PulsarSchema:
              logger.error(f"Invalid JSON schema: {schema_data}. Error: {str(e)}")
              avro_schema = {}
 
-         self.schema_name = avro_schema.get("namespace") + "." + avro_schema.get("name")
+         self.schema_name = "null"
+         if avro_schema.get("namespace") and avro_schema.get("name"):
+             self.schema_name = (
+                 avro_schema.get("namespace") + "." + avro_schema.get("name")
+             )
+         elif avro_schema.get("namespace"):
+             self.schema_name = avro_schema.get("namespace")
+         elif avro_schema.get("name"):
+             self.schema_name = avro_schema.get("name")
+
          self.schema_description = avro_schema.get("doc")
          self.schema_type = schema.get("type")
          self.schema_str = schema.get("data")
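
The old line raised a TypeError whenever the Avro schema lacked a namespace or a name (None + "." + ...). The new fallback prefers "namespace.name", then whichever part exists, then the literal "null". A condensed sketch of the same logic; resolve_schema_name is a hypothetical helper, not part of the source:

    def resolve_schema_name(avro_schema: dict) -> str:
        # Mirrors the fallback chain added above.
        namespace = avro_schema.get("namespace")
        name = avro_schema.get("name")
        if namespace and name:
            return f"{namespace}.{name}"
        return namespace or name or "null"

    assert resolve_schema_name({"namespace": "com.example", "name": "User"}) == "com.example.User"
    assert resolve_schema_name({"name": "User"}) == "User"
    assert resolve_schema_name({}) == "null"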