acryl-datahub 1.1.0.4rc3__py3-none-any.whl → 1.1.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub was flagged as potentially problematic by the registry.
- {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5.dist-info}/METADATA +2499 -2501
- {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5.dist-info}/RECORD +149 -131
- {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -1
- datahub/cli/check_cli.py +65 -11
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +3 -4
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +149 -227
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +50 -7
- datahub/cli/specific/assertions_cli.py +0 -4
- datahub/cli/specific/datacontract_cli.py +0 -3
- datahub/cli/specific/dataproduct_cli.py +0 -11
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +0 -2
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/emitter/rest_emitter.py +24 -8
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +332 -3
- datahub/ingestion/api/sink.py +3 -0
- datahub/ingestion/api/source.py +47 -45
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3449 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/graph/client.py +73 -30
- datahub/ingestion/run/pipeline.py +54 -2
- datahub/ingestion/sink/datahub_rest.py +12 -0
- datahub/ingestion/source/abs/source.py +1 -1
- datahub/ingestion/source/aws/glue.py +1 -1
- datahub/ingestion/source/azure/azure_common.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +32 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +45 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +7 -2
- datahub/ingestion/source/dbt/dbt_common.py +3 -1
- datahub/ingestion/source/dremio/dremio_api.py +38 -27
- datahub/ingestion/source/dremio/dremio_source.py +7 -7
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +28 -20
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/mlflow.py +11 -1
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
- datahub/ingestion/source/powerbi/powerbi.py +0 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/preset.py +2 -2
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/sigma/sigma.py +6 -1
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +27 -1
- datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +14 -2
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +119 -12
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/sql/hive_metastore.py +0 -10
- datahub/ingestion/source/sql/mssql/source.py +24 -15
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/sql_common.py +11 -0
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +997 -235
- datahub/ingestion/source/sql/vertica.py +10 -6
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
- datahub/ingestion/source/superset.py +57 -2
- datahub/ingestion/source/tableau/tableau.py +57 -37
- datahub/ingestion/source/tableau/tableau_common.py +4 -2
- datahub/ingestion/source/tableau/tableau_constant.py +0 -4
- datahub/ingestion/source/unity/proxy.py +4 -3
- datahub/ingestion/source/unity/source.py +56 -30
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +1253 -536
- datahub/metadata/_urns/urn_defs.py +1797 -1685
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +16614 -16538
- datahub/metadata/schemas/ContainerProperties.avsc +2 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +2 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/DataJobInfo.avsc +2 -0
- datahub/metadata/schemas/DataProcessKey.avsc +2 -0
- datahub/metadata/schemas/DatasetKey.avsc +4 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +2 -0
- datahub/metadata/schemas/LogicalParent.avsc +140 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +2 -0
- datahub/metadata/schemas/MLModelKey.avsc +2 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +2 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/sdk/datajob.py +39 -15
- datahub/sdk/lineage_client.py +2 -0
- datahub/sdk/main_client.py +14 -2
- datahub/sdk/search_client.py +4 -3
- datahub/specific/dataproduct.py +4 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
- datahub/sql_parsing/sqlglot_lineage.py +40 -13
- datahub/telemetry/telemetry.py +17 -11
- datahub/upgrade/upgrade.py +46 -13
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5.dist-info}/top_level.txt +0 -0
datahub/metadata/schemas/LogicalParent.avsc
@@ -0,0 +1,140 @@
+{
+  "type": "record",
+  "Aspect": {
+    "name": "logicalParent"
+  },
+  "name": "LogicalParent",
+  "namespace": "com.linkedin.pegasus2avro.logical",
+  "fields": [
+    {
+      "Relationship": {
+        "/destinationUrn": {
+          "createdActor": "parent/created/actor",
+          "createdOn": "parent/created/time",
+          "entityTypes": [
+            "dataset",
+            "schemaField"
+          ],
+          "name": "PhysicalInstanceOf",
+          "properties": "parent/properties",
+          "updatedActor": "parent/lastModified/actor",
+          "updatedOn": "parent/lastModified/time"
+        }
+      },
+      "Searchable": {
+        "/destinationUrn": {
+          "addToFilters": true,
+          "fieldName": "logicalParent",
+          "fieldType": "URN",
+          "filterNameOverride": "Physical Instance Of",
+          "hasValuesFieldName": "hasLogicalParent",
+          "queryByDefault": false
+        }
+      },
+      "type": {
+        "type": "record",
+        "name": "Edge",
+        "namespace": "com.linkedin.pegasus2avro.common",
+        "fields": [
+          {
+            "java": {
+              "class": "com.linkedin.pegasus2avro.common.urn.Urn"
+            },
+            "type": [
+              "null",
+              "string"
+            ],
+            "name": "sourceUrn",
+            "default": null,
+            "doc": "Urn of the source of this relationship edge.\nIf not specified, assumed to be the entity that this aspect belongs to.",
+            "Urn": "Urn"
+          },
+          {
+            "java": {
+              "class": "com.linkedin.pegasus2avro.common.urn.Urn"
+            },
+            "type": "string",
+            "name": "destinationUrn",
+            "doc": "Urn of the destination of this relationship edge.",
+            "Urn": "Urn"
+          },
+          {
+            "type": [
+              "null",
+              {
+                "type": "record",
+                "name": "AuditStamp",
+                "namespace": "com.linkedin.pegasus2avro.common",
+                "fields": [
+                  {
+                    "type": "long",
+                    "name": "time",
+                    "doc": "When did the resource/association/sub-resource move into the specific lifecycle stage represented by this AuditEvent."
+                  },
+                  {
+                    "java": {
+                      "class": "com.linkedin.pegasus2avro.common.urn.Urn"
+                    },
+                    "type": "string",
+                    "name": "actor",
+                    "doc": "The entity (e.g. a member URN) which will be credited for moving the resource/association/sub-resource into the specific lifecycle stage. It is also the one used to authorize the change.",
+                    "Urn": "Urn"
+                  },
+                  {
+                    "java": {
+                      "class": "com.linkedin.pegasus2avro.common.urn.Urn"
+                    },
+                    "type": [
+                      "null",
+                      "string"
+                    ],
+                    "name": "impersonator",
+                    "default": null,
+                    "doc": "The entity (e.g. a service URN) which performs the change on behalf of the Actor and must be authorized to act as the Actor.",
+                    "Urn": "Urn"
+                  },
+                  {
+                    "type": [
+                      "null",
+                      "string"
+                    ],
+                    "name": "message",
+                    "default": null,
+                    "doc": "Additional context around how DataHub was informed of the particular change. For example: was the change created by an automated process, or manually."
+                  }
+                ],
+                "doc": "Data captured on a resource/association/sub-resource level giving insight into when that resource/association/sub-resource moved into a particular lifecycle stage, and who acted to move it into that specific lifecycle stage."
+              }
+            ],
+            "name": "created",
+            "default": null,
+            "doc": "Audit stamp containing who created this relationship edge and when"
+          },
+          {
+            "type": [
+              "null",
+              "com.linkedin.pegasus2avro.common.AuditStamp"
+            ],
+            "name": "lastModified",
+            "default": null,
+            "doc": "Audit stamp containing who last modified this relationship edge and when"
+          },
+          {
+            "type": [
+              "null",
+              {
+                "type": "map",
+                "values": "string"
+              }
+            ],
+            "name": "properties",
+            "default": null,
+            "doc": "A generic properties bag that allows us to store specific information on this graph edge."
+          }
+        ],
+        "doc": "A common structure to represent all edges to entities when used inside aspects as collections\nThis ensures that all edges have common structure around audit-stamps and will support PATCH, time-travel automatically."
+      },
+      "name": "parent"
+    }
+  ]
+}
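The new logicalParent aspect wraps a single Edge pointing at the entity's logical parent. A minimal sketch of emitting it, assuming the generated Python class follows the usual <AspectName>Class naming (LogicalParentClass is not confirmed here) and using placeholder URNs:

    import datahub.metadata.schema_classes as models
    from datahub.emitter.mce_builder import make_dataset_urn
    from datahub.emitter.mcp import MetadataChangeProposalWrapper

    # Placeholder URNs: a physical table and the logical dataset it instantiates.
    physical = make_dataset_urn("snowflake", "prod_db.schema.orders_us_east")
    logical = make_dataset_urn("snowflake", "prod_db.schema.orders")

    # LogicalParentClass is assumed from the aspect name; verify against the release.
    mcp = MetadataChangeProposalWrapper(
        entityUrn=physical,
        aspect=models.LogicalParentClass(
            parent=models.EdgeClass(destinationUrn=logical),
        ),
    )
    # Any DataHub emitter (REST or Kafka) can then emit this proposal.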
@@ -60,6 +60,7 @@
   "QA": "Designates quality assurance fabrics",
   "RVW": "Designates review fabrics",
   "SANDBOX": "Designates sandbox fabrics",
+  "SBX": "Alternative spelling for sandbox",
   "SIT": "System Integration Testing",
   "STG": "Designates staging fabrics",
   "TEST": "Designates testing fabrics",

@@ -83,6 +84,7 @@
   "PRD",
   "TST",
   "SIT",
+  "SBX",
   "SANDBOX"
 ],
 "doc": "Fabric group type"

@@ -67,6 +67,7 @@
   "QA": "Designates quality assurance fabrics",
   "RVW": "Designates review fabrics",
   "SANDBOX": "Designates sandbox fabrics",
+  "SBX": "Alternative spelling for sandbox",
   "SIT": "System Integration Testing",
   "STG": "Designates staging fabrics",
   "TEST": "Designates testing fabrics",

@@ -90,6 +91,7 @@
   "PRD",
   "TST",
   "SIT",
+  "SBX",
   "SANDBOX"
 ],
 "doc": "Fabric group type"

@@ -81,6 +81,7 @@
   "QA": "Designates quality assurance fabrics",
   "RVW": "Designates review fabrics",
   "SANDBOX": "Designates sandbox fabrics",
+  "SBX": "Alternative spelling for sandbox",
   "SIT": "System Integration Testing",
   "STG": "Designates staging fabrics",
   "TEST": "Designates testing fabrics",

@@ -104,6 +105,7 @@
   "PRD",
   "TST",
   "SIT",
+  "SBX",
   "SANDBOX"
 ],
 "doc": "Fabric group type"

@@ -2430,6 +2430,7 @@
   "QA": "Designates quality assurance fabrics",
   "RVW": "Designates review fabrics",
   "SANDBOX": "Designates sandbox fabrics",
+  "SBX": "Alternative spelling for sandbox",
   "SIT": "System Integration Testing",
   "STG": "Designates staging fabrics",
   "TEST": "Designates testing fabrics",

@@ -2453,6 +2454,7 @@
   "PRD",
   "TST",
   "SIT",
+  "SBX",
   "SANDBOX"
 ],
 "doc": "Fabric group type"
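These hunks add the new SBX fabric symbol ("Alternative spelling for sandbox") to the FabricType enum embedded in several of the bundled Avro schemas listed above. A small sketch, with a placeholder dataset name, of building a URN in the new environment:

    from datahub.emitter.mce_builder import make_dataset_urn

    # After this release, "SBX" is an accepted fabric/env value.
    urn = make_dataset_urn(platform="postgres", name="sandbox_db.public.events", env="SBX")
    # urn:li:dataset:(urn:li:dataPlatform:postgres,sandbox_db.public.events,SBX)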
datahub/metadata/schemas/QuerySubjects.avsc
@@ -15,13 +15,6 @@
 "namespace": "com.linkedin.pegasus2avro.query",
 "fields": [
   {
-    "Relationship": {
-      "entityTypes": [
-        "dataset",
-        "schemaField"
-      ],
-      "name": "IsAssociatedWith"
-    },
     "Searchable": {
       "fieldName": "entities",
       "fieldType": "URN"

@@ -32,11 +25,7 @@
     "type": "string",
     "name": "entity",
     "doc": "An entity which is the subject of a query.",
-    "Urn": "Urn"
-    "entityTypes": [
-      "dataset",
-      "schemaField"
-    ]
+    "Urn": "Urn"
   }
 ],
 "doc": "A single subject of a particular query.\nIn the future, we may evolve this model to include richer details\nabout the Query Subject in relation to the query."
datahub/sdk/datajob.py CHANGED
@@ -6,6 +6,7 @@ from typing import Dict, List, Optional, Type
 
 from typing_extensions import Self
 
+import datahub.emitter.mce_builder as builder
 import datahub.metadata.schema_classes as models
 from datahub.cli.cli_utils import first_non_null
 from datahub.errors import IngestionAttributionWarning

@@ -64,7 +65,7 @@ class DataJob(
         """Get the URN type for data jobs."""
         return DataJobUrn
 
-    def __init__(
+    def __init__(  # noqa: C901
         self,
         *,
         name: str,

@@ -86,6 +87,7 @@ class DataJob(
         domain: Optional[DomainInputType] = None,
         inlets: Optional[List[DatasetUrnOrStr]] = None,
         outlets: Optional[List[DatasetUrnOrStr]] = None,
+        fine_grained_lineages: Optional[List[models.FineGrainedLineageClass]] = None,
         structured_properties: Optional[StructuredPropertyInputType] = None,
         extra_aspects: ExtraAspectsType = None,
     ):

@@ -103,12 +105,14 @@ class DataJob(
             ValueError: If neither flow nor (flow_urn and platform_instance) are provided
         """
         if flow is None:
-            if flow_urn is None
+            if flow_urn is None:
                 raise ValueError(
                     "You must provide either: 1. a DataFlow object, or 2. a DataFlowUrn (and a platform_instance config if required)"
                 )
             flow_urn = DataFlowUrn.from_string(flow_urn)
-            if flow_urn.flow_id.startswith(
+            if platform_instance and flow_urn.flow_id.startswith(
+                f"{platform_instance}."
+            ):
                 flow_name = flow_urn.flow_id[len(platform_instance) + 1 :]
             else:
                 flow_name = flow_urn.flow_id

@@ -133,8 +137,6 @@ class DataJob(
         )
         self._setdefault_aspect(job_info)
         self._ensure_datajob_props().flowUrn = str(flow.urn)
-
-        # Set properties if provided
         if description is not None:
             self.set_description(description)
         if external_url is not None:

@@ -145,8 +147,6 @@ class DataJob(
             self.set_created(created)
         if last_modified is not None:
             self.set_last_modified(last_modified)
-
-        # Set standard aspects
         if subtype is not None:
             self.set_subtype(subtype)
         if owners is not None:

@@ -159,13 +159,19 @@ class DataJob(
             self.set_terms(terms)
         if domain is not None:
             self.set_domain(domain)
+        if structured_properties is not None:
+            for key, value in structured_properties.items():
+                self.set_structured_property(property_urn=key, values=value)
         if inlets is not None:
             self.set_inlets(inlets)
         if outlets is not None:
             self.set_outlets(outlets)
-        if
-
-
+        if fine_grained_lineages is not None:
+            self.set_fine_grained_lineages(fine_grained_lineages)
+
+        if self.flow_urn.cluster.upper() in builder.ALL_ENV_TYPES:
+            env = self.flow_urn.cluster.upper()
+            self._ensure_datajob_props().env = env
 
     @classmethod
     def _new_from_graph(cls, urn: Urn, current_aspects: models.AspectBag) -> Self:

@@ -201,9 +207,7 @@ class DataJob(
     ) -> Optional[models.DataJobInputOutputClass]:
         return self._get_aspect(models.DataJobInputOutputClass)
 
-    def _ensure_datajob_inputoutput_props(
-        self,
-    ) -> models.DataJobInputOutputClass:
+    def _ensure_datajob_inputoutput_props(self) -> models.DataJobInputOutputClass:
         return self._setdefault_aspect(
             models.DataJobInputOutputClass(inputDatasets=[], outputDatasets=[])
         )

@@ -307,8 +311,6 @@ class DataJob(
             browse_path.append(
                 models.BrowsePathEntryClass(id=entry.id, urn=entry.urn)
             )
-
-        # Add the job itself to the path
         browse_path.append(models.BrowsePathEntryClass(id=flow.name, urn=str(flow.urn)))
         # Set the browse path aspect
         self._set_aspect(models.BrowsePathsV2Class(path=browse_path))

@@ -341,3 +343,25 @@ class DataJob(
         self._ensure_datajob_inputoutput_props().outputDatasets.append(
             str(outlet_urn)
         )
+
+    @property
+    def fine_grained_lineages(self) -> List[models.FineGrainedLineageClass]:
+        io_aspect = self._get_datajob_inputoutput_props()
+        return (
+            io_aspect.fineGrainedLineages
+            if io_aspect and io_aspect.fineGrainedLineages
+            else []
+        )
+
+    def set_fine_grained_lineages(
+        self, lineages: List[models.FineGrainedLineageClass]
+    ) -> None:
+        io_aspect = self._ensure_datajob_inputoutput_props()
+        if io_aspect.fineGrainedLineages is None:
+            io_aspect.fineGrainedLineages = []
+        io_aspect.fineGrainedLineages.extend(lineages)
+
+    @property
+    def env(self) -> Optional[str]:
+        """Get the environment of the data job."""
+        return str(self._ensure_datajob_props().env)
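The DataJob SDK entity now accepts column-level lineage at construction time and exposes a fine_grained_lineages property plus set_fine_grained_lineages. A hedged sketch with placeholder names (the DataFlow/DataJob arguments beyond those visible in this diff are not verified here):

    import datahub.metadata.schema_classes as models
    from datahub.metadata.urns import DatasetUrn, SchemaFieldUrn
    from datahub.sdk import DataFlow, DataJob

    upstream = DatasetUrn(platform="snowflake", name="db.schema.src_orders")
    downstream = DatasetUrn(platform="snowflake", name="db.schema.dst_orders")

    # One column-level edge: dst_orders.id is derived from src_orders.id.
    cll = models.FineGrainedLineageClass(
        upstreamType=models.FineGrainedLineageUpstreamTypeClass.FIELD_SET,
        downstreamType=models.FineGrainedLineageDownstreamTypeClass.FIELD,
        upstreams=[str(SchemaFieldUrn(str(upstream), "id"))],
        downstreams=[str(SchemaFieldUrn(str(downstream), "id"))],
    )

    flow = DataFlow(platform="airflow", name="daily_etl")
    job = DataJob(
        name="copy_orders",
        flow=flow,
        inlets=[upstream],
        outlets=[downstream],
        fine_grained_lineages=[cll],  # new keyword in this release
    )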
datahub/sdk/lineage_client.py CHANGED
@@ -478,6 +478,7 @@ class LineageClient:
         env: str = "PROD",
         default_db: Optional[str] = None,
         default_schema: Optional[str] = None,
+        override_dialect: Optional[str] = None,
     ) -> None:
         """Add lineage by parsing a SQL query."""
         from datahub.sql_parsing.sqlglot_lineage import (

@@ -493,6 +494,7 @@ class LineageClient:
             platform_instance=platform_instance,
             env=env,
             graph=self._client._graph,
+            override_dialect=override_dialect,
         )
 
         if parsed_result.debug_info.table_error:
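The new override_dialect option lets callers force a specific sqlglot dialect instead of relying on the platform's default guess. A hedged sketch against the lower-level parser helper this client imports (the exact keyword set of create_lineage_sql_parsed_result, and override_dialect being a plain dialect string, are assumptions):

    from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result

    result = create_lineage_sql_parsed_result(
        query="INSERT INTO analytics.daily_orders SELECT * FROM raw.orders",
        default_db="WAREHOUSE",
        platform="snowflake",
        platform_instance=None,
        env="PROD",
        override_dialect="snowflake",  # assumed: force the dialect rather than inferring it
    )
    print(result.in_tables, result.out_tables)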
datahub/sdk/main_client.py CHANGED
@@ -66,7 +66,12 @@ class DataHubClient:
         self._graph.test_connection()
 
     @classmethod
-    def from_env(
+    def from_env(
+        cls,
+        *,
+        client_mode: ClientMode = ClientMode.SDK,
+        datahub_component: Optional[str] = None,
+    ) -> "DataHubClient":
         """Initialize a DataHubClient from the environment variables or ~/.datahubenv file.
 
         This will first check DATAHUB_GMS_URL and DATAHUB_GMS_TOKEN. If not present,

@@ -76,6 +81,10 @@ class DataHubClient:
         If you're looking to specify the server/token in code, use the
         DataHubClient(server=..., token=...) constructor instead.
 
+        Args:
+            client_mode: [internal] The client mode to use. Defaults to "SDK".
+            datahub_component: [internal] The DataHub component name to include in the user agent.
+
         Returns:
             A DataHubClient instance.
         """

@@ -83,7 +92,10 @@ class DataHubClient:
         # Inspired by the DockerClient.from_env() method.
         # TODO: This one also reads from ~/.datahubenv, so the "from_env" name might be a bit confusing.
         # That file is part of the "environment", but is not a traditional "env variable".
-        graph = get_default_graph(
+        graph = get_default_graph(
+            client_mode=client_mode,
+            datahub_component=datahub_component,
+        )
 
         return cls(graph=graph)
 
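from_env() is now keyword-only, and both new parameters are marked internal, so typical SDK usage is unchanged:

    from datahub.sdk import DataHubClient

    # Reads DATAHUB_GMS_URL / DATAHUB_GMS_TOKEN, falling back to ~/.datahubenv.
    client = DataHubClient.from_env()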
datahub/sdk/search_client.py CHANGED
@@ -19,6 +19,7 @@ from datahub.sdk.search_filters import (
     _OrFilters,
     _StatusFilter,
 )
+from datahub.utilities.ordered_set import OrderedSet
 
 if TYPE_CHECKING:
     from datahub.sdk.main_client import DataHubClient

@@ -80,7 +81,7 @@ def compute_entity_types(
 ) -> Optional[List[str]]:
     found_filters = False
     found_positive_filters = False
-    entity_types:
+    entity_types: OrderedSet[str] = OrderedSet()
     for ands in filters:
         for clause in ands["and"]:
             if clause.field == _EntityTypeFilter.ENTITY_TYPE_FIELD:

@@ -88,7 +89,7 @@ def compute_entity_types(
                 if not clause.negated:
                     found_positive_filters = True
 
-                entity_types.
+                entity_types.update(clause.values)
 
     if not found_filters:
         # If we didn't find any filters, use None so we use the default set.

@@ -100,7 +101,7 @@ def compute_entity_types(
         # still want to use the default set.
         return None
 
-    return entity_types
+    return list(entity_types)
 
 
 class SearchClient:
datahub/specific/dataproduct.py CHANGED
@@ -9,6 +9,9 @@ from datahub.metadata.schema_classes import (
 )
 from datahub.specific.aspect_helpers.custom_properties import HasCustomPropertiesPatch
 from datahub.specific.aspect_helpers.ownership import HasOwnershipPatch
+from datahub.specific.aspect_helpers.structured_properties import (
+    HasStructuredPropertiesPatch,
+)
 from datahub.specific.aspect_helpers.tags import HasTagsPatch
 from datahub.specific.aspect_helpers.terms import HasTermsPatch
 

@@ -16,6 +19,7 @@ from datahub.specific.aspect_helpers.terms import HasTermsPatch
 class DataProductPatchBuilder(
     HasOwnershipPatch,
     HasCustomPropertiesPatch,
+    HasStructuredPropertiesPatch,
     HasTagsPatch,
     HasTermsPatch,
     MetadataPatchProposal,
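DataProductPatchBuilder now mixes in the structured-properties patch helper. A hedged sketch, assuming the mixin exposes set_structured_property like the other patch builders and using placeholder URNs:

    from datahub.specific.dataproduct import DataProductPatchBuilder

    patch = DataProductPatchBuilder("urn:li:dataProduct:customer_360")
    # set_structured_property and the property URN below are illustrative assumptions.
    patch.set_structured_property(
        "urn:li:structuredProperty:io.acryl.dataManagement.certifier",
        "urn:li:corpuser:jdoe",
    )
    for mcp in patch.build():
        print(mcp.aspectName)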
datahub/sql_parsing/sql_parsing_aggregator.py CHANGED
@@ -58,6 +58,7 @@ from datahub.sql_parsing.tool_meta_extractor import (
     ToolMetaExtractorReport,
 )
 from datahub.utilities.cooperative_timeout import CooperativeTimeoutError
+from datahub.utilities.dedup_list import deduplicate_list
 from datahub.utilities.file_backed_collections import (
     ConnectionWrapper,
     FileBackedDict,

@@ -140,6 +141,7 @@ class QueryMetadata:
 
     used_temp_tables: bool = True
 
+    extra_info: Optional[dict] = None
     origin: Optional[Urn] = None
 
     def make_created_audit_stamp(self) -> models.AuditStampClass:

@@ -263,7 +265,7 @@ class PreparsedQuery:
     query_type_props: QueryTypeProps = dataclasses.field(
         default_factory=lambda: QueryTypeProps()
     )
-    # Use this to store
+    # Use this to store additional key-value information about the query for debugging.
     extra_info: Optional[dict] = None
     origin: Optional[Urn] = None
 

@@ -948,6 +950,7 @@ class SqlParsingAggregator(Closeable):
                 column_usage=parsed.column_usage or {},
                 confidence_score=parsed.confidence_score,
                 used_temp_tables=session_has_temp_tables,
+                extra_info=parsed.extra_info,
                 origin=parsed.origin,
             )
         )

@@ -1491,9 +1494,9 @@ class SqlParsingAggregator(Closeable):
            return
 
        # If a query doesn't involve any allowed tables, skip it.
-       if
-       self.is_allowed_table(
-       ):
+       if (
+           downstream_urn is None or not self.is_allowed_table(downstream_urn)
+       ) and not any(self.is_allowed_table(urn) for urn in query.upstreams):
            self.report.num_queries_skipped_due_to_filters += 1
            return
 

@@ -1574,27 +1577,33 @@ class SqlParsingAggregator(Closeable):
 
         @dataclasses.dataclass
         class QueryLineageInfo:
-            upstreams:
-
+            upstreams: OrderedSet[
+                UrnStr
+            ]  # this is direct upstreams, with *no temp tables*
+            column_lineage: OrderedSet[ColumnLineageInfo]
             confidence_score: float
 
             def _merge_lineage_from(self, other_query: "QueryLineageInfo") -> None:
-                self.upstreams
-                self.column_lineage
+                self.upstreams.update(other_query.upstreams)
+                self.column_lineage.update(other_query.column_lineage)
                 self.confidence_score = min(
                     self.confidence_score, other_query.confidence_score
                 )
 
+        cache: Dict[str, QueryLineageInfo] = {}
+
         def _recurse_into_query(
             query: QueryMetadata, recursion_path: List[QueryId]
         ) -> QueryLineageInfo:
             if query.query_id in recursion_path:
                 # This is a cycle, so we just return the query as-is.
                 return QueryLineageInfo(
-                    upstreams=query.upstreams,
-                    column_lineage=query.column_lineage,
+                    upstreams=OrderedSet(query.upstreams),
+                    column_lineage=OrderedSet(query.column_lineage),
                     confidence_score=query.confidence_score,
                 )
+            if query.query_id in cache:
+                return cache[query.query_id]
             recursion_path = [*recursion_path, query.query_id]
             composed_of_queries.add(query.query_id)
 

@@ -1609,7 +1618,7 @@ class SqlParsingAggregator(Closeable):
                 upstream_query = self._query_map.get(upstream_query_id)
                 if (
                     upstream_query
-                    and upstream_query.query_id not in
+                    and upstream_query.query_id not in recursion_path
                 ):
                     temp_query_lineage_info = _recurse_into_query(
                         upstream_query, recursion_path

@@ -1669,11 +1678,14 @@ class SqlParsingAggregator(Closeable):
                 ]
             )
 
-
-            upstreams=
-            column_lineage=new_cll,
+            ret = QueryLineageInfo(
+                upstreams=new_upstreams,
+                column_lineage=OrderedSet(new_cll),
                 confidence_score=new_confidence_score,
             )
+            cache[query.query_id] = ret
+
+            return ret
 
         resolved_lineage_info = _recurse_into_query(base_query, [])
 

@@ -1706,15 +1718,15 @@ class SqlParsingAggregator(Closeable):
         )
 
         merged_query_text = ";\n\n".join(
-            [q.formatted_query_string for q in ordered_queries]
+            deduplicate_list([q.formatted_query_string for q in ordered_queries])
         )
 
         resolved_query = dataclasses.replace(
             base_query,
             query_id=composite_query_id,
             formatted_query_string=merged_query_text,
-            upstreams=resolved_lineage_info.upstreams,
-            column_lineage=resolved_lineage_info.column_lineage,
+            upstreams=list(resolved_lineage_info.upstreams),
+            column_lineage=list(resolved_lineage_info.column_lineage),
             confidence_score=resolved_lineage_info.confidence_score,
         )
 
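Two of the aggregator changes tighten composite-query handling: recursive lineage resolution is now memoized in a per-call cache, and repeated query strings are deduplicated before being joined into the merged query text. The helper it imports is a small order-preserving de-duplication utility:

    from datahub.utilities.dedup_list import deduplicate_list

    queries = [
        "CREATE TEMP TABLE t AS SELECT 1",
        "INSERT INTO x SELECT * FROM t",
        "CREATE TEMP TABLE t AS SELECT 1",
    ]
    # Keeps first occurrences in order: the duplicate CREATE statement is dropped.
    print(";\n\n".join(deduplicate_list(queries)))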