acryl-datahub 0.14.1.13rc9__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2348 -2298
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +130 -125
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
- datahub/cli/cli_utils.py +2 -0
- datahub/cli/delete_cli.py +103 -24
- datahub/cli/ingest_cli.py +110 -0
- datahub/cli/put_cli.py +1 -1
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/structuredproperties_cli.py +2 -1
- datahub/configuration/common.py +3 -3
- datahub/configuration/git.py +7 -1
- datahub/configuration/kafka_consumer_config.py +31 -1
- datahub/emitter/mcp_patch_builder.py +43 -0
- datahub/emitter/rest_emitter.py +17 -4
- datahub/ingestion/api/incremental_properties_helper.py +69 -0
- datahub/ingestion/api/source.py +6 -1
- datahub/ingestion/api/source_helpers.py +4 -2
- datahub/ingestion/graph/client.py +2 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
- datahub/ingestion/run/pipeline.py +6 -5
- datahub/ingestion/run/pipeline_config.py +6 -0
- datahub/ingestion/sink/datahub_rest.py +15 -4
- datahub/ingestion/source/abs/source.py +4 -0
- datahub/ingestion/source/aws/aws_common.py +13 -1
- datahub/ingestion/source/aws/sagemaker.py +8 -0
- datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
- datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
- datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
- datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/common/subtypes.py +2 -0
- datahub/ingestion/source/csv_enricher.py +1 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
- datahub/ingestion/source/datahub/datahub_source.py +8 -1
- datahub/ingestion/source/dbt/dbt_common.py +7 -61
- datahub/ingestion/source/dremio/dremio_api.py +204 -86
- datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
- datahub/ingestion/source/dremio/dremio_config.py +5 -0
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
- datahub/ingestion/source/dremio/dremio_entities.py +4 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
- datahub/ingestion/source/dremio/dremio_source.py +7 -2
- datahub/ingestion/source/elastic_search.py +1 -1
- datahub/ingestion/source/feast.py +97 -6
- datahub/ingestion/source/gc/datahub_gc.py +46 -35
- datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
- datahub/ingestion/source/ge_data_profiler.py +23 -1
- datahub/ingestion/source/iceberg/iceberg.py +12 -5
- datahub/ingestion/source/kafka/kafka.py +39 -19
- datahub/ingestion/source/kafka/kafka_connect.py +81 -51
- datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
- datahub/ingestion/source/looker/view_upstream.py +65 -30
- datahub/ingestion/source/metadata/business_glossary.py +35 -18
- datahub/ingestion/source/mode.py +0 -23
- datahub/ingestion/source/neo4j/__init__.py +0 -0
- datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
- datahub/ingestion/source/powerbi/__init__.py +0 -1
- datahub/ingestion/source/powerbi/config.py +3 -3
- datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
- datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
- datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
- datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
- datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
- datahub/ingestion/source/powerbi/powerbi.py +12 -6
- datahub/ingestion/source/preset.py +1 -0
- datahub/ingestion/source/pulsar.py +21 -2
- datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
- datahub/ingestion/source/redash.py +13 -63
- datahub/ingestion/source/redshift/config.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +3 -0
- datahub/ingestion/source/s3/source.py +2 -3
- datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
- datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
- datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
- datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
- datahub/ingestion/source/sql/athena.py +46 -22
- datahub/ingestion/source/sql/mssql/source.py +0 -2
- datahub/ingestion/source/sql/sql_common.py +34 -21
- datahub/ingestion/source/sql/sql_report.py +1 -0
- datahub/ingestion/source/sql/sql_types.py +85 -8
- datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
- datahub/ingestion/source/superset.py +215 -65
- datahub/ingestion/source/tableau/tableau.py +237 -76
- datahub/ingestion/source/tableau/tableau_common.py +12 -6
- datahub/ingestion/source/tableau/tableau_constant.py +2 -0
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
- datahub/ingestion/source/tableau/tableau_validation.py +48 -0
- datahub/ingestion/source/unity/proxy_types.py +1 -0
- datahub/ingestion/source/unity/source.py +4 -0
- datahub/ingestion/source/unity/usage.py +20 -11
- datahub/ingestion/transformer/add_dataset_tags.py +1 -1
- datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
- datahub/integrations/assertion/common.py +1 -1
- datahub/lite/duckdb_lite.py +12 -17
- datahub/metadata/_schema_classes.py +512 -392
- datahub/metadata/_urns/urn_defs.py +1355 -1355
- datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
- datahub/metadata/schema.avsc +17222 -17499
- datahub/metadata/schemas/FormInfo.avsc +4 -0
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
- datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
- datahub/specific/chart.py +0 -39
- datahub/specific/dashboard.py +0 -39
- datahub/specific/datajob.py +7 -57
- datahub/sql_parsing/schema_resolver.py +23 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
- datahub/sql_parsing/sqlglot_lineage.py +55 -14
- datahub/sql_parsing/sqlglot_utils.py +8 -2
- datahub/telemetry/telemetry.py +23 -9
- datahub/testing/compare_metadata_json.py +1 -1
- datahub/testing/doctest.py +12 -0
- datahub/utilities/file_backed_collections.py +35 -2
- datahub/utilities/partition_executor.py +1 -1
- datahub/utilities/urn_encoder.py +2 -1
- datahub/utilities/urns/_urn_base.py +1 -1
- datahub/utilities/urns/structured_properties_urn.py +1 -1
- datahub/utilities/sql_lineage_parser_impl.py +0 -160
- datahub/utilities/sql_parser.py +0 -94
- datahub/utilities/sql_parser_base.py +0 -21
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0
datahub/metadata/schemas/StructuredPropertySettings.avsc
ADDED

@@ -0,0 +1,114 @@
+{
+  "type": "record",
+  "Aspect": {
+    "name": "structuredPropertySettings"
+  },
+  "name": "StructuredPropertySettings",
+  "namespace": "com.linkedin.pegasus2avro.structured",
+  "fields": [
+    {
+      "Searchable": {
+        "fieldType": "BOOLEAN"
+      },
+      "type": "boolean",
+      "name": "isHidden",
+      "default": false,
+      "doc": "Whether or not this asset should be hidden in the main application"
+    },
+    {
+      "Searchable": {
+        "fieldType": "BOOLEAN"
+      },
+      "type": "boolean",
+      "name": "showInSearchFilters",
+      "default": false,
+      "doc": "Whether or not this asset should be displayed as a search filter"
+    },
+    {
+      "Searchable": {
+        "fieldType": "BOOLEAN"
+      },
+      "type": "boolean",
+      "name": "showInAssetSummary",
+      "default": false,
+      "doc": "Whether or not this asset should be displayed in the asset sidebar"
+    },
+    {
+      "Searchable": {
+        "fieldType": "BOOLEAN"
+      },
+      "type": "boolean",
+      "name": "showAsAssetBadge",
+      "default": false,
+      "doc": "Whether or not this asset should be displayed as an asset badge on other\nasset's headers"
+    },
+    {
+      "Searchable": {
+        "fieldType": "BOOLEAN"
+      },
+      "type": "boolean",
+      "name": "showInColumnsTable",
+      "default": false,
+      "doc": "Whether or not this asset should be displayed as a column in the schema field table\nin a Dataset's \"Columns\" tab."
+    },
+    {
+      "Searchable": {
+        "/time": {
+          "fieldName": "lastModifiedSettings",
+          "fieldType": "DATETIME"
+        }
+      },
+      "type": [
+        "null",
+        {
+          "type": "record",
+          "name": "AuditStamp",
+          "namespace": "com.linkedin.pegasus2avro.common",
+          "fields": [
+            {
+              "type": "long",
+              "name": "time",
+              "doc": "When did the resource/association/sub-resource move into the specific lifecycle stage represented by this AuditEvent."
+            },
+            {
+              "java": {
+                "class": "com.linkedin.pegasus2avro.common.urn.Urn"
+              },
+              "type": "string",
+              "name": "actor",
+              "doc": "The entity (e.g. a member URN) which will be credited for moving the resource/association/sub-resource into the specific lifecycle stage. It is also the one used to authorize the change.",
+              "Urn": "Urn"
+            },
+            {
+              "java": {
+                "class": "com.linkedin.pegasus2avro.common.urn.Urn"
+              },
+              "type": [
+                "null",
+                "string"
+              ],
+              "name": "impersonator",
+              "default": null,
+              "doc": "The entity (e.g. a service URN) which performs the change on behalf of the Actor and must be authorized to act as the Actor.",
+              "Urn": "Urn"
+            },
+            {
+              "type": [
+                "null",
+                "string"
+              ],
+              "name": "message",
+              "default": null,
+              "doc": "Additional context around how DataHub was informed of the particular change. For example: was the change created by an automated process, or manually."
+            }
+          ],
+          "doc": "Data captured on a resource/association/sub-resource level giving insight into when that resource/association/sub-resource moved into a particular lifecycle stage, and who acted to move it into that specific lifecycle stage."
+        }
+      ],
+      "name": "lastModified",
+      "default": null,
+      "doc": "Last Modified Audit stamp"
+    }
+  ],
+  "doc": "Settings specific to a structured property entity"
+}
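The new StructuredPropertySettings aspect controls where a structured property surfaces in the UI. Below is a hedged sketch of emitting it from Python, assuming the 0.15.0 codegen exposes the record as StructuredPropertySettingsClass in datahub.metadata.schema_classes; the property urn and server URL are made-up examples.

```python
# Hypothetical sketch: attach the new settings aspect to a structured property.
# Field names mirror the Avro record above; lastModified is optional and omitted.
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import StructuredPropertySettingsClass

settings = StructuredPropertySettingsClass(
    isHidden=False,
    showInSearchFilters=True,
    showInAssetSummary=True,
    showAsAssetBadge=False,
    showInColumnsTable=False,
)

mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:structuredProperty:io.acryl.example.retentionTime",  # example urn
    aspect=settings,
)
DatahubRestEmitter("http://localhost:8080").emit(mcp)
```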
datahub/specific/chart.py
CHANGED

@@ -1,10 +1,8 @@
-import time
 from typing import Dict, List, Optional, Union
 
 from datahub.emitter.mcp_patch_builder import MetadataPatchProposal
 from datahub.metadata.schema_classes import (
     AccessLevelClass,
-    AuditStampClass,
     ChangeAuditStampsClass,
     ChartInfoClass as ChartInfo,
     ChartTypeClass,
@@ -47,43 +45,6 @@ class ChartPatchBuilder(MetadataPatchProposal):
         )
         self.ownership_patch_helper = OwnershipPatchHelper(self)
 
-    def _mint_auditstamp(self, message: Optional[str] = None) -> AuditStampClass:
-        """
-        Creates an AuditStampClass instance with the current timestamp and other default values.
-
-        Args:
-            message: The message associated with the audit stamp (optional).
-
-        Returns:
-            An instance of AuditStampClass.
-        """
-        return AuditStampClass(
-            time=int(time.time() * 1000.0),
-            actor="urn:li:corpuser:datahub",
-            message=message,
-        )
-
-    def _ensure_urn_type(
-        self, entity_type: str, edges: List[Edge], context: str
-    ) -> None:
-        """
-        Ensures that the destination URNs in the given edges have the specified entity type.
-
-        Args:
-            entity_type: The entity type to check against.
-            edges: A list of Edge objects.
-            context: The context or description of the operation.
-
-        Raises:
-            ValueError: If any of the destination URNs is not of the specified entity type.
-        """
-        for e in edges:
-            urn = Urn.create_from_string(e.destinationUrn)
-            if not urn.get_type() == entity_type:
-                raise ValueError(
-                    f"{context}: {e.destinationUrn} is not of type {entity_type}"
-                )
-
     def add_owner(self, owner: Owner) -> "ChartPatchBuilder":
         """
         Adds an owner to the ChartPatchBuilder.
datahub/specific/dashboard.py
CHANGED

@@ -1,10 +1,8 @@
-import time
 from typing import Dict, List, Optional, Union
 
 from datahub.emitter.mcp_patch_builder import MetadataPatchProposal
 from datahub.metadata.schema_classes import (
     AccessLevelClass,
-    AuditStampClass,
     ChangeAuditStampsClass,
     DashboardInfoClass as DashboardInfo,
     EdgeClass as Edge,
@@ -46,43 +44,6 @@ class DashboardPatchBuilder(MetadataPatchProposal):
         )
         self.ownership_patch_helper = OwnershipPatchHelper(self)
 
-    def _mint_auditstamp(self, message: Optional[str] = None) -> AuditStampClass:
-        """
-        Creates an AuditStampClass instance with the current timestamp and other default values.
-
-        Args:
-            message: The message associated with the audit stamp (optional).
-
-        Returns:
-            An instance of AuditStampClass.
-        """
-        return AuditStampClass(
-            time=int(time.time() * 1000.0),
-            actor="urn:li:corpuser:datahub",
-            message=message,
-        )
-
-    def _ensure_urn_type(
-        self, entity_type: str, edges: List[Edge], context: str
-    ) -> None:
-        """
-        Ensures that the destination URNs in the given edges have the specified entity type.
-
-        Args:
-            entity_type: The entity type to check against.
-            edges: A list of Edge objects.
-            context: The context or description of the operation.
-
-        Raises:
-            ValueError: If any of the destination URNs is not of the specified entity type.
-        """
-        for e in edges:
-            urn = Urn.create_from_string(e.destinationUrn)
-            if not urn.get_type() == entity_type:
-                raise ValueError(
-                    f"{context}: {e.destinationUrn} is not of type {entity_type}"
-                )
-
     def add_owner(self, owner: Owner) -> "DashboardPatchBuilder":
         """
         Adds an owner to the DashboardPatchBuilder.
datahub/specific/datajob.py
CHANGED

@@ -1,9 +1,7 @@
-import time
 from typing import Dict, List, Optional, Union
 
 from datahub.emitter.mcp_patch_builder import MetadataPatchProposal
 from datahub.metadata.schema_classes import (
-    AuditStampClass,
     DataJobInfoClass as DataJobInfo,
     DataJobInputOutputClass as DataJobInputOutput,
     EdgeClass as Edge,
@@ -16,10 +14,9 @@ from datahub.metadata.schema_classes import (
     SystemMetadataClass,
     TagAssociationClass as Tag,
 )
+from datahub.metadata.urns import SchemaFieldUrn, TagUrn, Urn
 from datahub.specific.custom_properties import CustomPropertiesPatchHelper
 from datahub.specific.ownership import OwnershipPatchHelper
-from datahub.utilities.urns.tag_urn import TagUrn
-from datahub.utilities.urns.urn import Urn
 
 
 class DataJobPatchBuilder(MetadataPatchProposal):
@@ -45,43 +42,6 @@ class DataJobPatchBuilder(MetadataPatchProposal):
         )
         self.ownership_patch_helper = OwnershipPatchHelper(self)
 
-    def _mint_auditstamp(self, message: Optional[str] = None) -> AuditStampClass:
-        """
-        Creates an AuditStampClass instance with the current timestamp and other default values.
-
-        Args:
-            message: The message associated with the audit stamp (optional).
-
-        Returns:
-            An instance of AuditStampClass.
-        """
-        return AuditStampClass(
-            time=int(time.time() * 1000.0),
-            actor="urn:li:corpuser:datahub",
-            message=message,
-        )
-
-    def _ensure_urn_type(
-        self, entity_type: str, edges: List[Edge], context: str
-    ) -> None:
-        """
-        Ensures that the destination URNs in the given edges have the specified entity type.
-
-        Args:
-            entity_type: The entity type to check against.
-            edges: A list of Edge objects.
-            context: The context or description of the operation.
-
-        Raises:
-            ValueError: If any of the destination URNs is not of the specified entity type.
-        """
-        for e in edges:
-            urn = Urn.create_from_string(e.destinationUrn)
-            if not urn.get_type() == entity_type:
-                raise ValueError(
-                    f"{context}: {e.destinationUrn} is not of type {entity_type}"
-                )
-
     def add_owner(self, owner: Owner) -> "DataJobPatchBuilder":
         """
         Adds an owner to the DataJobPatchBuilder.
@@ -142,7 +102,7 @@ class DataJobPatchBuilder(MetadataPatchProposal):
 
         Notes:
             If `input` is an Edge object, it is used directly. If `input` is a Urn object or string,
-            it is converted to an Edge object and added
+            it is converted to an Edge object and added without any audit stamps.
         """
         if isinstance(input, Edge):
             input_urn: str = input.destinationUrn
@@ -154,8 +114,6 @@ class DataJobPatchBuilder(MetadataPatchProposal):
 
             input_edge = Edge(
                 destinationUrn=input_urn,
-                created=self._mint_auditstamp(),
-                lastModified=self._mint_auditstamp(),
             )
 
         self._ensure_urn_type("dataJob", [input_edge], "add_input_datajob")
@@ -225,7 +183,7 @@ class DataJobPatchBuilder(MetadataPatchProposal):
 
         Notes:
             If `input` is an Edge object, it is used directly. If `input` is a Urn object or string,
-            it is converted to an Edge object and added
+            it is converted to an Edge object and added without any audit stamps.
         """
         if isinstance(input, Edge):
             input_urn: str = input.destinationUrn
@@ -237,8 +195,6 @@ class DataJobPatchBuilder(MetadataPatchProposal):
 
             input_edge = Edge(
                 destinationUrn=input_urn,
-                created=self._mint_auditstamp(),
-                lastModified=self._mint_auditstamp(),
             )
 
         self._ensure_urn_type("dataset", [input_edge], "add_input_dataset")
@@ -310,7 +266,7 @@ class DataJobPatchBuilder(MetadataPatchProposal):
 
         Notes:
             If `output` is an Edge object, it is used directly. If `output` is a Urn object or string,
-            it is converted to an Edge object and added
+            it is converted to an Edge object and added without any audit stamps.
         """
         if isinstance(output, Edge):
             output_urn: str = output.destinationUrn
@@ -322,15 +278,13 @@ class DataJobPatchBuilder(MetadataPatchProposal):
 
             output_edge = Edge(
                 destinationUrn=output_urn,
-                created=self._mint_auditstamp(),
-                lastModified=self._mint_auditstamp(),
             )
 
         self._ensure_urn_type("dataset", [output_edge], "add_output_dataset")
         self._add_patch(
             DataJobInputOutput.ASPECT_NAME,
             "add",
-            path=f"/outputDatasetEdges/{self.quote(
+            path=f"/outputDatasetEdges/{self.quote(output_urn)}",
             value=output_edge,
         )
         return self
@@ -392,9 +346,7 @@ class DataJobPatchBuilder(MetadataPatchProposal):
             ValueError: If the input is not a Schema Field urn.
         """
         input_urn = str(input)
-
-        if not urn.get_type() == "schemaField":
-            raise ValueError(f"Input {input} is not a Schema Field urn")
+        assert SchemaFieldUrn.from_string(input_urn)
 
         self._add_patch(
             DataJobInputOutput.ASPECT_NAME,
@@ -466,9 +418,7 @@ class DataJobPatchBuilder(MetadataPatchProposal):
             ValueError: If the output is not a Schema Field urn.
         """
         output_urn = str(output)
-
-        if not urn.get_type() == "schemaField":
-            raise ValueError(f"Input {output} is not a Schema Field urn")
+        assert SchemaFieldUrn.from_string(output_urn)
 
         self._add_patch(
             DataJobInputOutput.ASPECT_NAME,
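Across the chart, dashboard, and datajob patch builders, the private _mint_auditstamp/_ensure_urn_type helpers disappear from the subclasses (the shared MetadataPatchProposal base in mcp_patch_builder.py grows by 43 lines, which is presumably where that logic now lives), edges built from bare urns no longer carry synthetic audit stamps, and schema-field urns are validated with SchemaFieldUrn. A rough usage sketch of the updated builder follows; the urns and endpoint are illustrative only.

```python
# Illustrative sketch of the updated DataJobPatchBuilder behaviour.
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.specific.datajob import DataJobPatchBuilder

patch = DataJobPatchBuilder(
    "urn:li:dataJob:(urn:li:dataFlow:(airflow,example_dag,PROD),example_task)"
)
# Inputs/outputs given as urn strings are added as plain Edges,
# with no created/lastModified audit stamps minted on the client.
patch.add_input_dataset(
    "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.upstream,PROD)"
)
patch.add_output_dataset(
    "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.downstream,PROD)"
)

emitter = DatahubRestEmitter("http://localhost:8080")
for mcp in patch.build():
    emitter.emit(mcp)
```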
datahub/sql_parsing/schema_resolver.py
CHANGED

@@ -123,6 +123,13 @@ class SchemaResolver(Closeable, SchemaResolverInterface):
         )
         return urn
 
+    def resolve_urn(self, urn: str) -> Tuple[str, Optional[SchemaInfo]]:
+        schema_info = self._resolve_schema_info(urn)
+        if schema_info:
+            return urn, schema_info
+
+        return urn, None
+
     def resolve_table(self, table: _TableName) -> Tuple[str, Optional[SchemaInfo]]:
         urn = self.get_urn_for_table(table)
 
@@ -293,3 +300,19 @@ def _convert_schema_field_list_to_info(
 
 def _convert_schema_aspect_to_info(schema_metadata: SchemaMetadataClass) -> SchemaInfo:
     return _convert_schema_field_list_to_info(schema_metadata.fields)
+
+
+def match_columns_to_schema(
+    schema_info: SchemaInfo, input_columns: List[str]
+) -> List[str]:
+    column_from_gms: List[str] = list(schema_info.keys())  # list() to silent lint
+
+    gms_column_map: Dict[str, str] = {
+        column.lower(): column for column in column_from_gms
+    }
+
+    output_columns: List[str] = [
+        gms_column_map.get(column.lower(), column) for column in input_columns
+    ]
+
+    return output_columns
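A quick illustration of the new match_columns_to_schema helper: it maps parsed column names onto the canonical casing reported by GMS and leaves unknown columns untouched. The values below are made up; SchemaInfo is just a column-name-to-type mapping.

```python
from datahub.sql_parsing.schema_resolver import match_columns_to_schema

# SchemaInfo maps column name -> type; the type values are irrelevant here.
schema_info = {"ID": "NUMBER", "OrderDate": "DATE"}

print(match_columns_to_schema(schema_info, ["id", "orderdate", "unknown_col"]))
# -> ['ID', 'OrderDate', 'unknown_col']
```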
datahub/sql_parsing/sql_parsing_aggregator.py
CHANGED

@@ -1383,8 +1383,7 @@ class SqlParsingAggregator(Closeable):
         return QueryUrn(query_id).urn()
 
     @classmethod
-    def _composite_query_id(cls, composed_of_queries:
-        composed_of_queries = list(composed_of_queries)
+    def _composite_query_id(cls, composed_of_queries: List[QueryId]) -> str:
         combined = json.dumps(composed_of_queries)
         return f"composite_{generate_hash(combined)}"
 
datahub/sql_parsing/sqlglot_lineage.py
CHANGED

@@ -1181,6 +1181,45 @@ def sqlglot_lineage(
     )
 
 
+@functools.lru_cache(maxsize=128)
+def create_and_cache_schema_resolver(
+    platform: str,
+    env: str,
+    graph: Optional[DataHubGraph] = None,
+    platform_instance: Optional[str] = None,
+    schema_aware: bool = True,
+) -> SchemaResolver:
+    return create_schema_resolver(
+        platform=platform,
+        env=env,
+        graph=graph,
+        platform_instance=platform_instance,
+        schema_aware=schema_aware,
+    )
+
+
+def create_schema_resolver(
+    platform: str,
+    env: str,
+    graph: Optional[DataHubGraph] = None,
+    platform_instance: Optional[str] = None,
+    schema_aware: bool = True,
+) -> SchemaResolver:
+    if graph and schema_aware:
+        return graph._make_schema_resolver(
+            platform=platform,
+            platform_instance=platform_instance,
+            env=env,
+        )
+
+    return SchemaResolver(
+        platform=platform,
+        platform_instance=platform_instance,
+        env=env,
+        graph=None,
+    )
+
+
 def create_lineage_sql_parsed_result(
     query: str,
     default_db: Optional[str],
@@ -1191,21 +1230,17 @@ def create_lineage_sql_parsed_result(
     graph: Optional[DataHubGraph] = None,
     schema_aware: bool = True,
 ) -> SqlParsingResult:
+    schema_resolver = create_schema_resolver(
+        platform=platform,
+        platform_instance=platform_instance,
+        env=env,
+        schema_aware=schema_aware,
+        graph=graph,
+    )
+
+    needs_close: bool = True
     if graph and schema_aware:
         needs_close = False
-        schema_resolver = graph._make_schema_resolver(
-            platform=platform,
-            platform_instance=platform_instance,
-            env=env,
-        )
-    else:
-        needs_close = True
-        schema_resolver = SchemaResolver(
-            platform=platform,
-            platform_instance=platform_instance,
-            env=env,
-            graph=None,
-        )
 
     try:
         return sqlglot_lineage(
@@ -1243,13 +1278,19 @@ def infer_output_schema(result: SqlParsingResult) -> Optional[List[SchemaFieldCl
 def view_definition_lineage_helper(
     result: SqlParsingResult, view_urn: str
 ) -> SqlParsingResult:
-    if result.query_type is QueryType.SELECT
+    if result.query_type is QueryType.SELECT or (
+        result.out_tables and result.out_tables != [view_urn]
+    ):
         # Some platforms (e.g. postgres) store only <select statement> from view definition
         # `create view V as <select statement>` . For such view definitions, `result.out_tables` and
         # `result.column_lineage[].downstream` are empty in `sqlglot_lineage` response, whereas upstream
         # details and downstream column details are extracted correctly.
         # Here, we inject view V's urn in `result.out_tables` and `result.column_lineage[].downstream`
         # to get complete lineage result.
+
+        # Some platforms(e.g. mssql) may have slightly different view name in view definition than
+        # actual view name used elsewhere. Therefore we overwrite downstream table for such cases as well.
+
         result.out_tables = [view_urn]
         if result.column_lineage:
             for col_result in result.column_lineage:
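The new create_schema_resolver/create_and_cache_schema_resolver helpers centralize the resolver construction that create_lineage_sql_parsed_result previously inlined. A hedged sketch of using the cached variant together with sqlglot_lineage; the server URL and table names are placeholders.

```python
from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig
from datahub.sql_parsing.sqlglot_lineage import (
    create_and_cache_schema_resolver,
    sqlglot_lineage,
)

graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))

# Reuses one schema-aware resolver per (platform, env, graph, ...) combination.
resolver = create_and_cache_schema_resolver(
    platform="snowflake", env="PROD", graph=graph
)

result = sqlglot_lineage(
    "INSERT INTO db.schema.target SELECT id, name FROM db.schema.source",
    schema_resolver=resolver,
)
print(result.in_tables, result.out_tables)
```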
datahub/sql_parsing/sqlglot_utils.py
CHANGED

@@ -121,7 +121,7 @@ _BASIC_NORMALIZATION_RULES = {
     # Remove /* */ comments.
     re.compile(r"/\*.*?\*/", re.DOTALL): "",
     # Remove -- comments.
-    re.compile(r"--.*$"): "",
+    re.compile(r"--.*$", re.MULTILINE): "",
     # Replace all runs of whitespace with a single space.
     re.compile(r"\s+"): " ",
     # Remove leading and trailing whitespace and trailing semicolons.
@@ -131,10 +131,16 @@ _BASIC_NORMALIZATION_RULES = {
     # Replace anything that looks like a string with a placeholder.
     re.compile(r"'[^']*'"): "?",
     # Replace sequences of IN/VALUES with a single placeholder.
-
+    # The r" ?" makes it more robust to uneven spacing.
+    re.compile(r"\b(IN|VALUES)\s*\( ?\?(?:, ?\?)* ?\)", re.IGNORECASE): r"\1 (?)",
     # Normalize parenthesis spacing.
     re.compile(r"\( "): "(",
     re.compile(r" \)"): ")",
+    # Fix up spaces before commas in column lists.
+    # e.g. "col1 , col2" -> "col1, col2"
+    # e.g. "col1,col2" -> "col1, col2"
+    re.compile(r"\b ,"): ",",
+    re.compile(r"\b,\b"): ", ",
 }
 _TABLE_NAME_NORMALIZATION_RULES = {
     # Replace UUID-like strings with a placeholder (both - and _ variants).
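To see what the added normalization rules do, here is a small self-contained sketch that applies a subset of the rules from this diff (line comments, whitespace collapsing, string literals, IN/VALUES collapsing, comma spacing) to one query. It is an illustration only, not the module's full normalization pipeline.

```python
import re

# Subset of _BASIC_NORMALIZATION_RULES as shown in the diff, applied in order.
rules = {
    re.compile(r"--.*$", re.MULTILINE): "",  # now strips -- comments on every line
    re.compile(r"\s+"): " ",
    re.compile(r"'[^']*'"): "?",
    re.compile(r"\b(IN|VALUES)\s*\( ?\?(?:, ?\?)* ?\)", re.IGNORECASE): r"\1 (?)",
    re.compile(r"\b ,"): ",",
    re.compile(r"\b,\b"): ", ",
}

query = """
SELECT id , name,email  -- trailing comment
FROM users
WHERE state IN ('CA', 'NY', 'TX')
"""

for pattern, replacement in rules.items():
    query = pattern.sub(replacement, query)

print(query.strip())
# e.g. "SELECT id, name, email FROM users WHERE state IN (?)"
```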
datahub/telemetry/telemetry.py
CHANGED

@@ -7,7 +7,7 @@ import sys
 import uuid
 from functools import wraps
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, TypeVar
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, TypeVar
 
 from mixpanel import Consumer, Mixpanel
 from typing_extensions import ParamSpec
@@ -16,10 +16,12 @@ import datahub as datahub_package
 from datahub.cli.config_utils import DATAHUB_ROOT_FOLDER
 from datahub.cli.env_utils import get_boolean_env_variable
 from datahub.configuration.common import ExceptionWithProps
-from datahub.ingestion.graph.client import DataHubGraph
 from datahub.metadata.schema_classes import _custom_package_path
 from datahub.utilities.perf_timer import PerfTimer
 
+if TYPE_CHECKING:
+    from datahub.ingestion.graph.client import DataHubGraph
+
 logger = logging.getLogger(__name__)
 
 DATAHUB_FOLDER = Path(DATAHUB_ROOT_FOLDER)
@@ -117,7 +119,11 @@ class Telemetry:
     tracking_init: bool = False
     sentry_enabled: bool = False
 
+    context_properties: Dict[str, Any] = {}
+
     def __init__(self):
+        self.context_properties = {}
+
         if SENTRY_DSN:
             self.sentry_enabled = True
             try:
@@ -157,6 +163,9 @@ class Telemetry:
         except Exception as e:
             logger.debug(f"Error connecting to mixpanel: {e}")
 
+        # Initialize the default properties for all events.
+        self.set_context()
+
     def update_config(self) -> bool:
         """
         Update the config file with the current client ID and enabled status.
@@ -238,18 +247,22 @@ class Telemetry:
 
         return False
 
-    def
+    def set_context(
         self,
-        server: Optional[DataHubGraph] = None,
+        server: Optional["DataHubGraph"] = None,
         properties: Optional[Dict[str, Any]] = None,
     ) -> None:
+        self.context_properties = {
+            **self._server_props(server),
+            **(properties or {}),
+        }
+
         if self.sentry_enabled:
             from sentry_sdk import set_tag
 
             properties = {
                 **_default_telemetry_properties(),
-                **self.
-                **(properties or {}),
+                **self.context_properties,
             }
 
             for key in properties:
@@ -297,7 +310,6 @@ class Telemetry:
         self,
         event_name: str,
         properties: Optional[Dict[str, Any]] = None,
-        server: Optional[DataHubGraph] = None,
     ) -> None:
         """
         Send a single telemetry event.
@@ -323,14 +335,15 @@ class Telemetry:
 
             properties = {
                 **_default_telemetry_properties(),
-                **self.
+                **self.context_properties,
                 **properties,
             }
             self.mp.track(self.client_id, event_name, properties)
         except Exception as e:
             logger.debug(f"Error reporting telemetry: {e}")
 
-
+    @classmethod
+    def _server_props(cls, server: Optional["DataHubGraph"]) -> Dict[str, str]:
         if not server:
             return {
                 "server_type": "n/a",
@@ -435,6 +448,7 @@ def with_telemetry(
                     **call_props,
                     "status": "error",
                     **_error_props(e),
+                    "code": e.code,
                 },
             )
             telemetry_instance.capture_exception(e)
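The telemetry change replaces the per-event server argument with context captured once via set_context and merged into every later event. A hedged sketch of the intended flow, assuming a reachable DataHub server; the event name and properties are invented, and the method names are taken from this diff.

```python
from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig
from datahub.telemetry.telemetry import telemetry_instance

graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))

# Store server-derived properties plus custom context once...
telemetry_instance.set_context(server=graph, properties={"run_mode": "cli"})

# ...and they are merged into every event sent afterwards.
telemetry_instance.ping("example_event", {"step": "demo"})
```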
datahub/testing/compare_metadata_json.py
CHANGED

@@ -117,7 +117,7 @@ def diff_metadata_json(
     ignore_paths: Sequence[str] = (),
     ignore_order: bool = True,
 ) -> Union[DeepDiff, MCPDiff]:
-    ignore_paths =
+    ignore_paths = [*ignore_paths, *default_exclude_paths, r"root\[\d+].delta_info"]
     try:
         if ignore_order:
             golden_map = get_aspects_by_urn(golden)