acryl-datahub-cloud 0.3.14.1rc5__py3-none-any.whl → 0.3.15rc0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
- acryl_datahub_cloud/_codegen_config.json +1 -1
- acryl_datahub_cloud/datahub_forms_notifications/forms_notifications_source.py +1 -1
- acryl_datahub_cloud/datahub_reporting/datahub_dataset.py +30 -7
- acryl_datahub_cloud/datahub_reporting/datahub_form_reporting.py +1 -1
- acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +22 -18
- acryl_datahub_cloud/elasticsearch/graph_service.py +23 -9
- acryl_datahub_cloud/lineage_features/source.py +77 -6
- acryl_datahub_cloud/metadata/_urns/urn_defs.py +60 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/execution/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- acryl_datahub_cloud/metadata/schema.avsc +420 -21
- acryl_datahub_cloud/metadata/schema_classes.py +521 -8
- acryl_datahub_cloud/metadata/schemas/Actors.avsc +38 -1
- acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +37 -15
- acryl_datahub_cloud/metadata/schemas/AssertionInfo.avsc +18 -15
- acryl_datahub_cloud/metadata/schemas/AssertionRunEvent.avsc +19 -15
- acryl_datahub_cloud/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/DataHubFileInfo.avsc +230 -0
- acryl_datahub_cloud/metadata/schemas/DataHubFileKey.avsc +21 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPageModuleProperties.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/ExecutionRequestArtifactsLocation.avsc +16 -0
- acryl_datahub_cloud/metadata/schemas/ExecutionRequestKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/GlobalSettingsInfo.avsc +72 -0
- acryl_datahub_cloud/metadata/schemas/LineageFeatures.avsc +67 -42
- acryl_datahub_cloud/metadata/schemas/LogicalParent.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/MonitorInfo.avsc +24 -15
- acryl_datahub_cloud/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- acryl_datahub_cloud/sdk/assertion_input/assertion_input.py +22 -6
- acryl_datahub_cloud/sdk/assertions_client.py +35 -7
- acryl_datahub_cloud/sdk/entities/subscription.py +22 -6
- acryl_datahub_cloud/sdk/subscription_client.py +8 -2
- {acryl_datahub_cloud-0.3.14.1rc5.dist-info → acryl_datahub_cloud-0.3.15rc0.dist-info}/METADATA +44 -47
- {acryl_datahub_cloud-0.3.14.1rc5.dist-info → acryl_datahub_cloud-0.3.15rc0.dist-info}/RECORD +40 -36
- {acryl_datahub_cloud-0.3.14.1rc5.dist-info → acryl_datahub_cloud-0.3.15rc0.dist-info}/WHEEL +0 -0
- {acryl_datahub_cloud-0.3.14.1rc5.dist-info → acryl_datahub_cloud-0.3.15rc0.dist-info}/entry_points.txt +0 -0
- {acryl_datahub_cloud-0.3.14.1rc5.dist-info → acryl_datahub_cloud-0.3.15rc0.dist-info}/top_level.txt +0 -0
@@ -391,7 +391,7 @@ class DataHubFormsNotificationsSource(Source):
         user_urns = []
         group_urns = []
 
-        extra_fields = [f for f in DataHubDatasetSearchRow.
+        extra_fields = [f for f in DataHubDatasetSearchRow.model_fields]
         results = self.graph.get_results_by_filter(
             extra_or_filters=self._get_incomplete_assets_for_form(form_urn, form.type),
             extra_source_fields=extra_fields,

@@ -5,7 +5,7 @@ import pathlib
 import tempfile
 import time
 from enum import Enum
-from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+from typing import Any, Dict, Iterable, List, Literal, Optional, Tuple, Union, cast
 
 import boto3
 import duckdb

@@ -73,7 +73,9 @@ class FileStoreBackedDatasetConfig(ConfigModel):
     store_platform: str = "s3"
     file_name: str = "data"
     file_extension: str = "parquet"
-    file_compression:
+    file_compression: Literal[
+        "gzip", "bz2", "brotli", "lz4", "zstd", "snappy", "none"
+    ] = "snappy"
     file_overwrite_existing: bool = True
     snapshot_partitioning_strategy: str = PartitioningStrategy.DATE
     generate_presigned_url: bool = True

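The new file_compression field constrains the codec to a fixed set of values. A minimal sketch of how such a Literal-typed field behaves, assuming ConfigModel is pydantic-based (the sketch class below is illustrative, not the real config):

from typing import Literal

from pydantic import BaseModel, ValidationError


class FileStoreConfigSketch(BaseModel):
    # Mirrors FileStoreBackedDatasetConfig.file_compression from the diff above.
    file_compression: Literal[
        "gzip", "bz2", "brotli", "lz4", "zstd", "snappy", "none"
    ] = "snappy"


print(FileStoreConfigSketch().file_compression)                           # snappy
print(FileStoreConfigSketch(file_compression="zstd").file_compression)    # zstd

try:
    FileStoreConfigSketch(file_compression="7z")  # not in the Literal, so rejected
except ValidationError as e:
    print("rejected unsupported codec:", e.errors()[0]["loc"])
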
@@ -119,9 +121,14 @@ class DataHubBasedS3Dataset:
         self.local_file_path: str = (
             config.file if config.file else self._initialize_local_file()
         )
-        self.file_writer = None
+        self.file_writer: Optional[pq.ParquetWriter] = None
         self.schema = (
-            pa.schema(
+            pa.schema(
+                [
+                    pa.field(x.name, BaseModelRow.string_to_pyarrow_type(x.type))
+                    for x in self.dataset_metadata.schemaFields
+                ]
+            )
             if self.dataset_metadata.schemaFields
             else None
         )

@@ -163,14 +170,28 @@ class DataHubBasedS3Dataset:
             self.schema = row.arrow_schema()
         else:
             # hail mary: infer schema from the first row and cast everything to string
-            self.schema = pa.schema([(key, pa.string()) for key in row])
+            self.schema = pa.schema([pa.field(key, pa.string()) for key in row])
             self.stringify_row = True
 
         self._initialize_local_file()
+        # Map compression names to PyArrow format (most are direct mappings)
+        compression_map = {
+            "gzip": "gzip",
+            "bz2": "brotli",  # PyArrow doesn't support bz2, use brotli
+            "brotli": "brotli",
+            "lz4": "lz4",
+            "zstd": "zstd",
+            "snappy": "snappy",
+            "none": "none",
+        }
+        compression = cast(
+            Literal["gzip", "bz2", "brotli", "lz4", "zstd", "snappy", "none"],
+            compression_map.get(self.config.file_compression, "snappy"),
+        )
         self.file_writer = pq.ParquetWriter(
             self.local_file_path,
             self.schema,
-            compression=
+            compression=compression,
         )
         if isinstance(row, (BaseModel, BaseModelRow)):
             # for anything extending BaseModel, we want to use the dict representation

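The write path now maps the configured codec onto one PyArrow actually supports before opening the Parquet writer. A standalone sketch of that mapping, with an illustrative output path and table (not the source's own helper):

from typing import Literal, cast

import pyarrow as pa
import pyarrow.parquet as pq

compression_map = {
    "gzip": "gzip",
    "bz2": "brotli",  # PyArrow has no bz2 codec, so the source falls back to brotli
    "brotli": "brotli",
    "lz4": "lz4",
    "zstd": "zstd",
    "snappy": "snappy",
    "none": "none",
}

configured = "bz2"  # stand-in for FileStoreBackedDatasetConfig.file_compression
compression = cast(
    Literal["gzip", "bz2", "brotli", "lz4", "zstd", "snappy", "none"],
    compression_map.get(configured, "snappy"),
)

schema = pa.schema([pa.field("urn", pa.string()), pa.field("count", pa.int64())])
table = pa.table({"urn": ["urn:li:dataset:x"], "count": [1]}, schema=schema)
with pq.ParquetWriter("/tmp/data.parquet", schema, compression=compression) as writer:
    writer.write_table(table)
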
@@ -396,7 +417,9 @@ class DataHubBasedS3Dataset:
         assert dataset_profiles.fieldProfiles is not None
         dataset_profiles.fieldProfiles.append(field_profile)
         logger.info("Generated dataset profile")
-        schema_metadata = self._generate_schema_metadata(
+        schema_metadata = self._generate_schema_metadata(
+            [(col[0], col[1]) for col in columns]
+        )
         return dataset_profiles, schema_metadata
 
     def register_dataset(

@@ -306,7 +306,7 @@ class DataHubFormReportingData(FormData):
         on_asset_scanned: Optional[Callable[[str], Any]] = None,
         on_form_scanned: Optional[Callable[[str], Any]] = None,
     ) -> Iterable[FormReportingRow]:
-        extra_fields = [f for f in self.DataHubDatasetSearchRow.
+        extra_fields = [f for f in self.DataHubDatasetSearchRow.model_fields]
         # TODO: Replace with the new search/filter SDK.
         result = self.graph.get_results_by_filter(
             extra_or_filters=self.get_form_existence_or_filters(),

@@ -42,7 +42,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.source_helpers import auto_workunit_reporter
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.graph.client import DatahubClientConfig

@@ -239,7 +239,7 @@ def exp_cdf(series: polars.Series) -> polars.Series:
 
 
 @dataclass
-class DatahubUsageFeatureReport(
+class DatahubUsageFeatureReport(StatefulIngestionReport, IngestionStageReport):
     dataset_platforms_count: Dict[str, int] = field(
         default_factory=lambda: defaultdict(lambda: 0)
     )

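The report's counters are declared as dataclass fields backed by defaultdict, so callers can increment per-platform counts without pre-initialising keys; mutable defaults require default_factory. A small illustrative sketch of the same pattern:

from collections import defaultdict
from dataclasses import dataclass, field
from typing import Dict


@dataclass
class ReportSketch:
    # Same shape as dataset_platforms_count in the diff above.
    dataset_platforms_count: Dict[str, int] = field(
        default_factory=lambda: defaultdict(lambda: 0)
    )


report = ReportSketch()
report.dataset_platforms_count["snowflake"] += 1
report.dataset_platforms_count["snowflake"] += 1
print(dict(report.dataset_platforms_count))  # {'snowflake': 2}
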
@@ -738,17 +738,20 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             return pa.dictionary(index_type=pa.int32(), value_type=pa.string())
         elif isinstance(polars_dtype, polars.Struct):
             return pa.struct(
-
-                field.name
+                [
+                    pa.field(field.name, convert_dtype(field.dtype))
                     for field in polars_dtype.fields
-
+                ]
             )
         elif isinstance(polars_dtype, polars.List):
             return pa.list_(convert_dtype(polars_dtype.inner))
         else:
             raise ValueError(f"Unsupported Polars dtype: {polars_dtype}")
 
-        fields = [
+        fields = [
+            pa.field(name, convert_dtype(dtype))
+            for name, dtype in polars_schema.items()
+        ]
         return pa.schema(fields)
 
     def batch_write_parquet(

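The helper above builds a PyArrow schema field by field from a Polars schema. A minimal sketch of that conversion for a couple of simple dtypes (the real convert_dtype in the source handles many more cases):

import polars
import pyarrow as pa


def convert_dtype_sketch(dtype: polars.DataType) -> pa.DataType:
    # Simplified mapping: strings, 64-bit ints, and lists of either.
    if dtype == polars.Utf8:
        return pa.string()
    if dtype == polars.Int64:
        return pa.int64()
    if isinstance(dtype, polars.List):
        return pa.list_(convert_dtype_sketch(dtype.inner))
    raise ValueError(f"Unsupported Polars dtype: {dtype}")


df = polars.DataFrame({"urn": ["urn:li:dataset:x"], "counts": [[1, 2]]})
fields = [
    pa.field(name, convert_dtype_sketch(dtype)) for name, dtype in df.schema.items()
]
print(pa.schema(fields))  # urn: string, counts: list<item: int64>
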
@@ -971,26 +974,27 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         if self.config.user_usage_enabled:
-            self.report.new_stage("generate user usage")
-
+            with self.report.new_stage("generate user usage"):
+                yield from self.generate_user_usage_mcps()
 
         if self.config.dataset_usage_enabled:
-            self.report.new_stage("generate dataset usage")
-
+            with self.report.new_stage("generate dataset usage"):
+                yield from self.generate_dataset_usage_mcps()
 
         if self.config.dashboard_usage_enabled:
-            self.report.new_stage("generate dashboard usage")
-
+            with self.report.new_stage("generate dashboard usage"):
+                yield from self.generate_dashboard_usage_mcps()
 
         if self.config.chart_usage_enabled:
-            self.report.new_stage("generate chart usage")
-
+            with self.report.new_stage("generate chart usage"):
+                yield from self.generate_chart_usage_mcps()
 
         if self.config.query_usage_enabled:
-            self.report.new_stage("generate query usage")
-
+            with self.report.new_stage("generate query usage"):
+                yield from self.generate_query_usage_mcps()
 
-        self.report.new_stage("end so time is calculated for last stage")
+        with self.report.new_stage("end so time is calculated for last stage"):
+            pass
 
     def generate_mcp_from_lazyframe(
         self, lazy_frame: polars.LazyFrame

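new_stage() is now used as a context manager, so each stage's timing is captured when its block exits. An illustrative re-implementation of that pattern (not the actual IngestionStageReport API):

import time
from contextlib import contextmanager
from typing import Dict, Iterator


class StageReportSketch:
    def __init__(self) -> None:
        self.stage_durations: Dict[str, float] = {}

    @contextmanager
    def new_stage(self, name: str) -> Iterator[None]:
        # Record how long the body of the with-block took, even if it raises.
        start = time.perf_counter()
        try:
            yield
        finally:
            self.stage_durations[name] = time.perf_counter() - start


report = StageReportSketch()
with report.new_stage("generate user usage"):
    time.sleep(0.01)  # stand-in for generate_user_usage_mcps()
print(report.stage_durations)
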
@@ -2091,5 +2095,5 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             )
             time.sleep(delay)
 
-    def get_report(self) ->
+    def get_report(self) -> "DatahubUsageFeatureReport":
         return self.report

@@ -32,15 +32,31 @@ class BaseModelRow(BaseModel):
         else:
             raise ValueError(f"No mapping for type {type_}")
 
+    @staticmethod
+    def string_to_pyarrow_type(type_string: str) -> pa.DataType:
+        """Convert string representation back to pyarrow type by converting to Python type first."""
+        # Mapping of pyarrow string representations to Python types
+        type_mapping = {
+            "string": str,
+            "int64": int,
+            "float64": float,
+            "bool": bool,
+            "timestamp[ns]": datetime.datetime,
+            "date32[day]": datetime.date,
+        }
+
+        python_type = type_mapping.get(
+            type_string, str
+        )  # Default to str for unknown types
+        return BaseModelRow.pydantic_type_to_pyarrow(python_type)
+
     @classmethod
     def arrow_schema(cls) -> pa.Schema:
         fields = []
-        for field_name, field_model in cls.
-            pyarrow_type = BaseModelRow.pydantic_type_to_pyarrow(
-                field_model.outer_type_
-            )
+        for field_name, field_model in cls.model_fields.items():
+            pyarrow_type = BaseModelRow.pydantic_type_to_pyarrow(field_model.annotation)
             pyarrow_field = pa.field(field_name, pyarrow_type)
-            if not field_model.
+            if not field_model.is_required():
                 pyarrow_field = pyarrow_field.with_nullable(True)
             else:
                 pyarrow_field = pyarrow_field.with_nullable(False)

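arrow_schema() and the new string_to_pyarrow_type() rely on pydantic v2 introspection (model_fields, annotation, is_required()) instead of the v1 __fields__ API. A self-contained sketch of that introspection, using an illustrative row model and a simplified type map rather than the real BaseModelRow:

import datetime
from typing import Optional

import pyarrow as pa
from pydantic import BaseModel


def pydantic_type_to_pyarrow_sketch(type_) -> pa.DataType:
    mapping = {
        str: pa.string(),
        int: pa.int64(),
        float: pa.float64(),
        bool: pa.bool_(),
        datetime.datetime: pa.timestamp("ns"),
    }
    # Unwrap Optional[X] to X before the lookup (simplified handling).
    args = [a for a in getattr(type_, "__args__", ()) if a is not type(None)]
    return mapping[args[0] if args else type_]


class RowSketch(BaseModel):
    urn: str
    row_count: Optional[int] = None


fields = []
for name, model_field in RowSketch.model_fields.items():
    arrow_type = pydantic_type_to_pyarrow_sketch(model_field.annotation)
    fields.append(pa.field(name, arrow_type, nullable=not model_field.is_required()))
print(pa.schema(fields))  # urn: string not null, row_count: int64 (nullable)
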
@@ -50,10 +66,8 @@ class BaseModelRow(BaseModel):
     @classmethod
     def datahub_schema(cls) -> List[SchemaField]:
         fields = []
-        for field_name, field_model in cls.
-            pyarrow_type = BaseModelRow.pydantic_type_to_pyarrow(
-                field_model.outer_type_
-            )
+        for field_name, field_model in cls.model_fields.items():
+            pyarrow_type = BaseModelRow.pydantic_type_to_pyarrow(field_model.annotation)
             fields.append(SchemaField(name=field_name, type=str(pyarrow_type)))
         return fields
 

@@ -3,7 +3,6 @@ import os
 import time
 from collections import defaultdict
 from dataclasses import dataclass
-from datetime import datetime, timezone
 from typing import Any, Callable, Dict, Iterable, List, Set
 
 from opensearchpy import OpenSearch

@@ -53,6 +52,12 @@ class LineageFeaturesSourceConfig(ConfigModel):
     retry_delay_seconds: int = 5
     retry_backoff_multiplier: float = 2.0
 
+    # Cleanup old features when they have not been updated for this many days
+    # This is required because we only emit this feature for cases where we find a lineage
+    # in the graph index
+    cleanup_batch_size: int = 100
+    cleanup_old_features_days: int = 2
+
     @validator("max_retries")
     def validate_max_retries(cls, v: int) -> int:
         if v < 1:

@@ -79,6 +84,12 @@ class LineageExtractGraphSourceReport(SourceReport, IngestionStageReport):
     downstream_count: int = 0
     edges_scanned: int = 0
     skipped_materialized_urns_count: int = 0
+    zero_upstream_count: int = 0
+    zero_downstream_count: int = 0
+    has_asset_level_lineage_count: int = 0
+    zero_asset_level_lineage_count: int = 0
+    cleanup_old_features_time: int = 0
+    cleanup_old_features_count: int = 0
 
 
 @platform_name(id="datahub", platform_name="DataHub")

@@ -255,7 +266,6 @@ class DataHubLineageFeaturesSource(Source):
         with self.report.new_stage("Load valid URNs"):
             self.populate_valid_urns()
 
-        timestamp = datetime.now(tz=timezone.utc)
         server = self._create_opensearch_client_with_retry()
 
         query = {

@@ -326,7 +336,58 @@ class DataHubLineageFeaturesSource(Source):
         self._update_report()
         self._delete_pit_with_retry(server, pit)
 
-        self.report.new_stage("
+        with self.report.new_stage("emission of lineage features"):
+            yield from self._emit_lineage_features()
+
+        with self.report.new_stage("cleanup old lineage features"):
+            yield from self._cleanup_old_features()
+
+    def _cleanup_old_features(self) -> Iterable[MetadataWorkUnit]:
+        """
+        This is required because we only emit this feature for cases where we find a lineage
+        in the graph index
+        """
+        cutoff_time = int(
+            (time.time() - (self.config.cleanup_old_features_days * 24 * 60 * 60))
+            * 1000
+        )
+        self.report.cleanup_old_features_time = cutoff_time
+
+        for urn in self.ctx.require_graph("Cleanup old features").get_urns_by_filter(
+            extraFilters=[
+                {
+                    "field": "hasAssetLevelLineageFeature",
+                    "negated": False,
+                    "condition": "EQUAL",
+                    "values": ["true"],
+                },
+                {
+                    "field": "lineageFeaturesComputedAt",
+                    "negated": False,
+                    "condition": "LESS_THAN",
+                    "values": [str(cutoff_time)],
+                },
+            ],
+            batch_size=self.config.cleanup_batch_size,
+        ):
+            # Emit lineage features with zero upstreams and downstreams for cleanup
+            wu = MetadataChangeProposalWrapper(
+                entityUrn=urn,
+                aspect=LineageFeaturesClass(
+                    upstreamCount=0,
+                    downstreamCount=0,
+                    hasAssetLevelLineage=False,
+                    computedAt=AuditStampClass(
+                        time=int(time.time() * 1000),
+                        actor=SYSTEM_ACTOR,
+                    ),
+                ),
+            ).as_workunit()
+            self.report.cleanup_old_features_count += 1
+            self.report.report_workunit(wu)
+            yield wu
+
+    def _emit_lineage_features(self) -> Iterable[MetadataWorkUnit]:
         # In Python 3.9, can be replaced by `self.self.upstream_counts.keys() | self.downstream_counts.keys()`
         for urn in set(self.upstream_counts.keys()).union(
             self.downstream_counts.keys()

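A quick worked example of the cutoff arithmetic used by _cleanup_old_features: cleanup_old_features_days is converted into a millisecond epoch timestamp that the lineageFeaturesComputedAt filter compares against (the value of 2 days mirrors the new config default):

import time

cleanup_old_features_days = 2
cutoff_time = int((time.time() - cleanup_old_features_days * 24 * 60 * 60) * 1000)

now_ms = int(time.time() * 1000)
print(cutoff_time, now_ms - cutoff_time)  # gap is roughly 172_800_000 ms (2 days)
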
@@ -337,21 +398,31 @@ class DataHubLineageFeaturesSource(Source):
             logger.debug(
                 f"{urn}: {self.upstream_counts[urn]}, {self.downstream_counts[urn]}"
             )
+            if self.upstream_counts[urn] == 0:
+                self.report.zero_upstream_count += 1
+            if self.downstream_counts[urn] == 0:
+                self.report.zero_downstream_count += 1
+            has_asset_level_lineage = (
+                self.upstream_counts[urn] > 0 or self.downstream_counts[urn] > 0
+            )
+            if has_asset_level_lineage:
+                self.report.has_asset_level_lineage_count += 1
+            else:
+                self.report.zero_asset_level_lineage_count += 1
             wu = MetadataChangeProposalWrapper(
                 entityUrn=urn,
                 aspect=LineageFeaturesClass(
                     upstreamCount=self.upstream_counts[urn],
                     downstreamCount=self.downstream_counts[urn],
+                    hasAssetLevelLineage=has_asset_level_lineage,
                     computedAt=AuditStampClass(
-                        time=int(
+                        time=int(time.time() * 1000),
                         actor=SYSTEM_ACTOR,
                     ),
                 ),
             ).as_workunit()
             self.report.report_workunit(wu)
             yield wu
-        # So previous stage's calculations are done
-        self.report.new_stage("end emission of lineage features")
 
     def get_report(self) -> SourceReport:
         return self.report

@@ -2391,6 +2391,62 @@ class ActionRequestUrn(_SpecificUrn):
     def id(self) -> str:
         return self._entity_ids[0]
 
+if TYPE_CHECKING:
+    from datahub.metadata.schema_classes import DataHubFileKeyClass
+
+class DataHubFileUrn(_SpecificUrn):
+    ENTITY_TYPE: ClassVar[Literal["dataHubFile"]] = "dataHubFile"
+    _URN_PARTS: ClassVar[int] = 1
+
+    def __init__(self, id: Union["DataHubFileUrn", str], *, _allow_coercion: bool = True) -> None:
+        if _allow_coercion:
+            # Field coercion logic (if any is required).
+            if isinstance(id, str):
+                if id.startswith('urn:li:'):
+                    try:
+                        id = DataHubFileUrn.from_string(id)
+                    except InvalidUrnError:
+                        raise InvalidUrnError(f'Expecting a DataHubFileUrn but got {id}')
+                else:
+                    id = UrnEncoder.encode_string(id)
+
+        # Validation logic.
+        if not id:
+            raise InvalidUrnError("DataHubFileUrn id cannot be empty")
+        if isinstance(id, DataHubFileUrn):
+            id = id.id
+        elif isinstance(id, Urn):
+            raise InvalidUrnError(f'Expecting a DataHubFileUrn but got {id}')
+        if UrnEncoder.contains_reserved_char(id):
+            raise InvalidUrnError(f'DataHubFileUrn id contains reserved characters')
+
+        super().__init__(self.ENTITY_TYPE, [id])
+
+    @classmethod
+    def _parse_ids(cls, entity_ids: List[str]) -> "DataHubFileUrn":
+        if len(entity_ids) != cls._URN_PARTS:
+            raise InvalidUrnError(f"DataHubFileUrn should have {cls._URN_PARTS} parts, got {len(entity_ids)}: {entity_ids}")
+        return cls(id=entity_ids[0], _allow_coercion=False)
+
+    @classmethod
+    def underlying_key_aspect_type(cls) -> Type["DataHubFileKeyClass"]:
+        from datahub.metadata.schema_classes import DataHubFileKeyClass
+
+        return DataHubFileKeyClass
+
+    def to_key_aspect(self) -> "DataHubFileKeyClass":
+        from datahub.metadata.schema_classes import DataHubFileKeyClass
+
+        return DataHubFileKeyClass(id=self.id)
+
+    @classmethod
+    def from_key_aspect(cls, key_aspect: "DataHubFileKeyClass") -> "DataHubFileUrn":
+        return cls(id=key_aspect.id)
+
+    @property
+    def id(self) -> str:
+        return self._entity_ids[0]
+
 if TYPE_CHECKING:
     from datahub.metadata.schema_classes import DataProcessInstanceKeyClass
 

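A hedged usage sketch for the generated DataHubFileUrn above (it follows the same conventions as the other generated urn classes); the id value here is illustrative:

from acryl_datahub_cloud.metadata._urns.urn_defs import DataHubFileUrn

file_urn = DataHubFileUrn("my-file-id")
print(str(file_urn))  # urn:li:dataHubFile:my-file-id
print(file_urn.id)    # my-file-id

# Round-trip through the key aspect and back.
key = file_urn.to_key_aspect()
assert DataHubFileUrn.from_key_aspect(key) == file_urn

# Coercion from a full urn string is also accepted.
assert DataHubFileUrn("urn:li:dataHubFile:my-file-id") == file_urn
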
@@ -3537,6 +3593,10 @@ class DataJobUrn(_SpecificUrn):
     def get_data_flow_urn(self) -> "DataFlowUrn":
         return DataFlowUrn.from_string(self.flow)
 
+    @property
+    def orchestrator(self) -> str:
+        return self.get_data_flow_urn().orchestrator
+
     @deprecated(reason="Use .job_id instead")
     def get_job_id(self) -> str:
         return self.job_id

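The new orchestrator property simply delegates to the parent DataFlowUrn. An illustrative usage sketch, assuming the generated constructors take the keyword arguments shown (orchestrator, flow_id, cluster for DataFlowUrn; flow, job_id for DataJobUrn):

from acryl_datahub_cloud.metadata._urns.urn_defs import DataFlowUrn, DataJobUrn

flow = DataFlowUrn(orchestrator="airflow", flow_id="daily_etl", cluster="prod")
job = DataJobUrn(flow=str(flow), job_id="load_users")

print(job.orchestrator)  # airflow
assert job.orchestrator == job.get_data_flow_urn().orchestrator
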
@@ -7,6 +7,7 @@
 # pylint: skip-file
 # fmt: off
 # isort: skip_file
+from .....schema_classes import ExecutionRequestArtifactsLocationClass
 from .....schema_classes import ExecutionRequestInputClass
 from .....schema_classes import ExecutionRequestResultClass
 from .....schema_classes import ExecutionRequestSignalClass

@@ -14,6 +15,7 @@ from .....schema_classes import ExecutionRequestSourceClass
 from .....schema_classes import StructuredExecutionReportClass
 
 
+ExecutionRequestArtifactsLocation = ExecutionRequestArtifactsLocationClass
 ExecutionRequestInput = ExecutionRequestInputClass
 ExecutionRequestResult = ExecutionRequestResultClass
 ExecutionRequestSignal = ExecutionRequestSignalClass

@@ -0,0 +1,19 @@
+# mypy: ignore-errors
+# flake8: noqa
+
+# This file is autogenerated by /metadata-ingestion/scripts/avro_codegen.py
+# Do not modify manually!
+
+# pylint: skip-file
+# fmt: off
+# isort: skip_file
+from .....schema_classes import BucketStorageLocationClass
+from .....schema_classes import DataHubFileInfoClass
+from .....schema_classes import FileUploadScenarioClass
+
+
+BucketStorageLocation = BucketStorageLocationClass
+DataHubFileInfo = DataHubFileInfoClass
+FileUploadScenario = FileUploadScenarioClass
+
+# fmt: on

@@ -22,6 +22,7 @@ from ......schema_classes import DataFlowKeyClass
 from ......schema_classes import DataHubAccessTokenKeyClass
 from ......schema_classes import DataHubActionKeyClass
 from ......schema_classes import DataHubConnectionKeyClass
+from ......schema_classes import DataHubFileKeyClass
 from ......schema_classes import DataHubIngestionSourceKeyClass
 from ......schema_classes import DataHubMetricCubeKeyClass
 from ......schema_classes import DataHubOpenAPISchemaKeyClass

@@ -92,6 +93,7 @@ DataFlowKey = DataFlowKeyClass
 DataHubAccessTokenKey = DataHubAccessTokenKeyClass
 DataHubActionKey = DataHubActionKeyClass
 DataHubConnectionKey = DataHubConnectionKeyClass
+DataHubFileKey = DataHubFileKeyClass
 DataHubIngestionSourceKey = DataHubIngestionSourceKeyClass
 DataHubMetricCubeKey = DataHubMetricCubeKeyClass
 DataHubOpenAPISchemaKey = DataHubOpenAPISchemaKeyClass

@@ -8,11 +8,13 @@
 # fmt: off
 # isort: skip_file
 from .....schema_classes import ActorsClass
+from .....schema_classes import RoleGroupClass
 from .....schema_classes import RolePropertiesClass
 from .....schema_classes import RoleUserClass
 
 
 Actors = ActorsClass
+RoleGroup = RoleGroupClass
 RoleProperties = RolePropertiesClass
 RoleUser = RoleUserClass
 

@@ -23,6 +23,8 @@ from ......schema_classes import GlobalSettingsInfoClass
 from ......schema_classes import GlobalViewsSettingsClass
 from ......schema_classes import GlobalVisualSettingsClass
 from ......schema_classes import HelpLinkClass
+from ......schema_classes import OAuthProviderClass
+from ......schema_classes import OAuthSettingsClass
 from ......schema_classes import OidcSettingsClass
 from ......schema_classes import SlackIntegrationSettingsClass
 from ......schema_classes import SsoSettingsClass

@@ -47,6 +49,8 @@ GlobalSettingsInfo = GlobalSettingsInfoClass
 GlobalViewsSettings = GlobalViewsSettingsClass
 GlobalVisualSettings = GlobalVisualSettingsClass
 HelpLink = HelpLinkClass
+OAuthProvider = OAuthProviderClass
+OAuthSettings = OAuthSettingsClass
 OidcSettings = OidcSettingsClass
 SlackIntegrationSettings = SlackIntegrationSettingsClass
 SsoSettings = SsoSettingsClass