acryl-datahub 1.1.0.4rc2__py3-none-any.whl → 1.1.0.5__py3-none-any.whl

This diff compares the contents of publicly released package versions as they appear in their public registries, and is provided for informational purposes only.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (156)
  1. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/METADATA +2528 -2530
  2. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/RECORD +156 -138
  3. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/entry_points.txt +1 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +1 -1
  6. datahub/cli/check_cli.py +65 -11
  7. datahub/cli/cli_utils.py +63 -0
  8. datahub/cli/container_cli.py +5 -0
  9. datahub/cli/delete_cli.py +3 -4
  10. datahub/cli/docker_check.py +107 -12
  11. datahub/cli/docker_cli.py +149 -227
  12. datahub/cli/exists_cli.py +0 -2
  13. datahub/cli/get_cli.py +0 -2
  14. datahub/cli/iceberg_cli.py +5 -0
  15. datahub/cli/ingest_cli.py +3 -15
  16. datahub/cli/migrate.py +2 -0
  17. datahub/cli/put_cli.py +1 -4
  18. datahub/cli/quickstart_versioning.py +50 -7
  19. datahub/cli/specific/assertions_cli.py +0 -4
  20. datahub/cli/specific/datacontract_cli.py +0 -3
  21. datahub/cli/specific/dataproduct_cli.py +0 -11
  22. datahub/cli/specific/dataset_cli.py +1 -8
  23. datahub/cli/specific/forms_cli.py +0 -4
  24. datahub/cli/specific/group_cli.py +0 -2
  25. datahub/cli/specific/structuredproperties_cli.py +1 -4
  26. datahub/cli/specific/user_cli.py +0 -2
  27. datahub/cli/state_cli.py +0 -2
  28. datahub/cli/timeline_cli.py +0 -2
  29. datahub/emitter/rest_emitter.py +41 -8
  30. datahub/entrypoints.py +4 -3
  31. datahub/ingestion/api/decorators.py +15 -3
  32. datahub/ingestion/api/report.py +332 -3
  33. datahub/ingestion/api/sink.py +3 -0
  34. datahub/ingestion/api/source.py +47 -45
  35. datahub/ingestion/autogenerated/__init__.py +0 -0
  36. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  37. datahub/ingestion/autogenerated/lineage.json +401 -0
  38. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  39. datahub/ingestion/extractor/schema_util.py +13 -4
  40. datahub/ingestion/graph/client.py +73 -30
  41. datahub/ingestion/run/pipeline.py +54 -2
  42. datahub/ingestion/sink/datahub_rest.py +12 -0
  43. datahub/ingestion/source/abs/source.py +1 -1
  44. datahub/ingestion/source/aws/glue.py +1 -1
  45. datahub/ingestion/source/azure/azure_common.py +2 -2
  46. datahub/ingestion/source/bigquery_v2/bigquery.py +49 -23
  47. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  48. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  49. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  50. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  51. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  52. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  53. datahub/ingestion/source/common/subtypes.py +45 -0
  54. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  55. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  56. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  57. datahub/ingestion/source/dbt/dbt_cloud.py +7 -2
  58. datahub/ingestion/source/dbt/dbt_common.py +3 -1
  59. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  60. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  61. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  62. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  63. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  64. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  65. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  66. datahub/ingestion/source/ge_data_profiler.py +76 -28
  67. datahub/ingestion/source/hex/api.py +26 -1
  68. datahub/ingestion/source/identity/azure_ad.py +1 -1
  69. datahub/ingestion/source/identity/okta.py +1 -14
  70. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  71. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  72. datahub/ingestion/source/mlflow.py +11 -1
  73. datahub/ingestion/source/mock_data/__init__.py +0 -0
  74. datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
  75. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  76. datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
  77. datahub/ingestion/source/powerbi/powerbi.py +0 -5
  78. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  79. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  80. datahub/ingestion/source/preset.py +2 -2
  81. datahub/ingestion/source/redshift/redshift.py +17 -0
  82. datahub/ingestion/source/redshift/usage.py +4 -3
  83. datahub/ingestion/source/s3/report.py +4 -2
  84. datahub/ingestion/source/s3/source.py +367 -115
  85. datahub/ingestion/source/salesforce.py +6 -3
  86. datahub/ingestion/source/sigma/sigma.py +6 -1
  87. datahub/ingestion/source/slack/slack.py +2 -1
  88. datahub/ingestion/source/snowflake/snowflake_config.py +27 -1
  89. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  90. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  91. datahub/ingestion/source/snowflake/snowflake_v2.py +14 -2
  92. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  93. datahub/ingestion/source/sql/athena.py +119 -12
  94. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  95. datahub/ingestion/source/sql/hive_metastore.py +0 -10
  96. datahub/ingestion/source/sql/mssql/source.py +24 -15
  97. datahub/ingestion/source/sql/oracle.py +1 -1
  98. datahub/ingestion/source/sql/sql_common.py +11 -0
  99. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  100. datahub/ingestion/source/sql/teradata.py +997 -235
  101. datahub/ingestion/source/sql/vertica.py +10 -6
  102. datahub/ingestion/source/sql_queries.py +2 -2
  103. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  104. datahub/ingestion/source/superset.py +57 -2
  105. datahub/ingestion/source/tableau/tableau.py +57 -37
  106. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  107. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  108. datahub/ingestion/source/unity/proxy.py +4 -3
  109. datahub/ingestion/source/unity/source.py +56 -30
  110. datahub/ingestion/source/usage/clickhouse_usage.py +1 -0
  111. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  112. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  113. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  114. datahub/metadata/_internal_schema_classes.py +1253 -536
  115. datahub/metadata/_urns/urn_defs.py +1797 -1685
  116. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  117. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  118. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  119. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
  120. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  121. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  122. datahub/metadata/schema.avsc +16614 -16538
  123. datahub/metadata/schemas/ContainerProperties.avsc +2 -0
  124. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  125. datahub/metadata/schemas/DataFlowInfo.avsc +2 -0
  126. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  127. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
  128. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  129. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  130. datahub/metadata/schemas/DataJobInfo.avsc +2 -0
  131. datahub/metadata/schemas/DataProcessKey.avsc +2 -0
  132. datahub/metadata/schemas/DatasetKey.avsc +4 -1
  133. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  134. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +2 -0
  135. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  136. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -0
  137. datahub/metadata/schemas/MLModelGroupKey.avsc +2 -0
  138. datahub/metadata/schemas/MLModelKey.avsc +2 -0
  139. datahub/metadata/schemas/MetadataChangeEvent.avsc +2 -0
  140. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  141. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  142. datahub/sdk/datajob.py +39 -15
  143. datahub/sdk/lineage_client.py +2 -0
  144. datahub/sdk/main_client.py +14 -2
  145. datahub/sdk/search_client.py +4 -3
  146. datahub/specific/dataproduct.py +4 -0
  147. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  148. datahub/sql_parsing/sqlglot_lineage.py +40 -13
  149. datahub/telemetry/telemetry.py +17 -11
  150. datahub/upgrade/upgrade.py +46 -13
  151. datahub/utilities/server_config_util.py +8 -0
  152. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  153. datahub/utilities/stats_collections.py +4 -0
  154. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/WHEEL +0 -0
  155. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/licenses/LICENSE +0 -0
  156. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/top_level.txt +0 -0
datahub/metadata/schemas/LogicalParent.avsc ADDED
@@ -0,0 +1,140 @@
+ {
+ "type": "record",
+ "Aspect": {
+ "name": "logicalParent"
+ },
+ "name": "LogicalParent",
+ "namespace": "com.linkedin.pegasus2avro.logical",
+ "fields": [
+ {
+ "Relationship": {
+ "/destinationUrn": {
+ "createdActor": "parent/created/actor",
+ "createdOn": "parent/created/time",
+ "entityTypes": [
+ "dataset",
+ "schemaField"
+ ],
+ "name": "PhysicalInstanceOf",
+ "properties": "parent/properties",
+ "updatedActor": "parent/lastModified/actor",
+ "updatedOn": "parent/lastModified/time"
+ }
+ },
+ "Searchable": {
+ "/destinationUrn": {
+ "addToFilters": true,
+ "fieldName": "logicalParent",
+ "fieldType": "URN",
+ "filterNameOverride": "Physical Instance Of",
+ "hasValuesFieldName": "hasLogicalParent",
+ "queryByDefault": false
+ }
+ },
+ "type": {
+ "type": "record",
+ "name": "Edge",
+ "namespace": "com.linkedin.pegasus2avro.common",
+ "fields": [
+ {
+ "java": {
+ "class": "com.linkedin.pegasus2avro.common.urn.Urn"
+ },
+ "type": [
+ "null",
+ "string"
+ ],
+ "name": "sourceUrn",
+ "default": null,
+ "doc": "Urn of the source of this relationship edge.\nIf not specified, assumed to be the entity that this aspect belongs to.",
+ "Urn": "Urn"
+ },
+ {
+ "java": {
+ "class": "com.linkedin.pegasus2avro.common.urn.Urn"
+ },
+ "type": "string",
+ "name": "destinationUrn",
+ "doc": "Urn of the destination of this relationship edge.",
+ "Urn": "Urn"
+ },
+ {
+ "type": [
+ "null",
+ {
+ "type": "record",
+ "name": "AuditStamp",
+ "namespace": "com.linkedin.pegasus2avro.common",
+ "fields": [
+ {
+ "type": "long",
+ "name": "time",
+ "doc": "When did the resource/association/sub-resource move into the specific lifecycle stage represented by this AuditEvent."
+ },
+ {
+ "java": {
+ "class": "com.linkedin.pegasus2avro.common.urn.Urn"
+ },
+ "type": "string",
+ "name": "actor",
+ "doc": "The entity (e.g. a member URN) which will be credited for moving the resource/association/sub-resource into the specific lifecycle stage. It is also the one used to authorize the change.",
+ "Urn": "Urn"
+ },
+ {
+ "java": {
+ "class": "com.linkedin.pegasus2avro.common.urn.Urn"
+ },
+ "type": [
+ "null",
+ "string"
+ ],
+ "name": "impersonator",
+ "default": null,
+ "doc": "The entity (e.g. a service URN) which performs the change on behalf of the Actor and must be authorized to act as the Actor.",
+ "Urn": "Urn"
+ },
+ {
+ "type": [
+ "null",
+ "string"
+ ],
+ "name": "message",
+ "default": null,
+ "doc": "Additional context around how DataHub was informed of the particular change. For example: was the change created by an automated process, or manually."
+ }
+ ],
+ "doc": "Data captured on a resource/association/sub-resource level giving insight into when that resource/association/sub-resource moved into a particular lifecycle stage, and who acted to move it into that specific lifecycle stage."
+ }
+ ],
+ "name": "created",
+ "default": null,
+ "doc": "Audit stamp containing who created this relationship edge and when"
+ },
+ {
+ "type": [
+ "null",
+ "com.linkedin.pegasus2avro.common.AuditStamp"
+ ],
+ "name": "lastModified",
+ "default": null,
+ "doc": "Audit stamp containing who last modified this relationship edge and when"
+ },
+ {
+ "type": [
+ "null",
+ {
+ "type": "map",
+ "values": "string"
+ }
+ ],
+ "name": "properties",
+ "default": null,
+ "doc": "A generic properties bag that allows us to store specific information on this graph edge."
+ }
+ ],
+ "doc": "A common structure to represent all edges to entities when used inside aspects as collections\nThis ensures that all edges have common structure around audit-stamps and will support PATCH, time-travel automatically."
+ },
+ "name": "parent"
+ }
+ ]
+ }
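For orientation, a hedged sketch of what this new logicalParent aspect stores, assuming the usual Avro-to-Python codegen naming (a LogicalParentClass wrapping the common Edge record); the class name, emitter wiring, and example URNs are assumptions, not something this diff shows directly:

import datahub.metadata.schema_classes as models
from datahub.emitter.mcp import MetadataChangeProposalWrapper

# Illustrative URNs only: a schema field and the logical dataset it is a physical instance of.
field_urn = "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.orders,PROD),customer_id)"
logical_parent_urn = "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.orders_logical,PROD)"

# The aspect's single "parent" field is an Edge pointing at the logical parent entity,
# which backs the "PhysicalInstanceOf" relationship declared in the schema above.
mcp = MetadataChangeProposalWrapper(
    entityUrn=field_urn,
    aspect=models.LogicalParentClass(  # assumed generated class name
        parent=models.EdgeClass(destinationUrn=logical_parent_urn)
    ),
)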
@@ -60,6 +60,7 @@
  "QA": "Designates quality assurance fabrics",
  "RVW": "Designates review fabrics",
  "SANDBOX": "Designates sandbox fabrics",
+ "SBX": "Alternative spelling for sandbox",
  "SIT": "System Integration Testing",
  "STG": "Designates staging fabrics",
  "TEST": "Designates testing fabrics",
@@ -83,6 +84,7 @@
  "PRD",
  "TST",
  "SIT",
+ "SBX",
  "SANDBOX"
  ],
  "doc": "Fabric group type"
@@ -67,6 +67,7 @@
  "QA": "Designates quality assurance fabrics",
  "RVW": "Designates review fabrics",
  "SANDBOX": "Designates sandbox fabrics",
+ "SBX": "Alternative spelling for sandbox",
  "SIT": "System Integration Testing",
  "STG": "Designates staging fabrics",
  "TEST": "Designates testing fabrics",
@@ -90,6 +91,7 @@
  "PRD",
  "TST",
  "SIT",
+ "SBX",
  "SANDBOX"
  ],
  "doc": "Fabric group type"
@@ -81,6 +81,7 @@
  "QA": "Designates quality assurance fabrics",
  "RVW": "Designates review fabrics",
  "SANDBOX": "Designates sandbox fabrics",
+ "SBX": "Alternative spelling for sandbox",
  "SIT": "System Integration Testing",
  "STG": "Designates staging fabrics",
  "TEST": "Designates testing fabrics",
@@ -104,6 +105,7 @@
  "PRD",
  "TST",
  "SIT",
+ "SBX",
  "SANDBOX"
  ],
  "doc": "Fabric group type"
@@ -2430,6 +2430,7 @@
  "QA": "Designates quality assurance fabrics",
  "RVW": "Designates review fabrics",
  "SANDBOX": "Designates sandbox fabrics",
+ "SBX": "Alternative spelling for sandbox",
  "SIT": "System Integration Testing",
  "STG": "Designates staging fabrics",
  "TEST": "Designates testing fabrics",
@@ -2453,6 +2454,7 @@
  "PRD",
  "TST",
  "SIT",
+ "SBX",
  "SANDBOX"
  ],
  "doc": "Fabric group type"
datahub/metadata/schemas/QuerySubjects.avsc CHANGED
@@ -15,13 +15,6 @@
  "namespace": "com.linkedin.pegasus2avro.query",
  "fields": [
  {
- "Relationship": {
- "entityTypes": [
- "dataset",
- "schemaField"
- ],
- "name": "IsAssociatedWith"
- },
  "Searchable": {
  "fieldName": "entities",
  "fieldType": "URN"
@@ -32,11 +25,7 @@
  "type": "string",
  "name": "entity",
  "doc": "An entity which is the subject of a query.",
- "Urn": "Urn",
- "entityTypes": [
- "dataset",
- "schemaField"
- ]
+ "Urn": "Urn"
  }
  ],
  "doc": "A single subject of a particular query.\nIn the future, we may evolve this model to include richer details\nabout the Query Subject in relation to the query."
datahub/metadata/schemas/SchemaFieldKey.avsc CHANGED
@@ -14,7 +14,8 @@
  "documentation",
  "testResults",
  "deprecation",
- "subTypes"
+ "subTypes",
+ "logicalParent"
  ]
  },
  "name": "SchemaFieldKey",
datahub/sdk/datajob.py CHANGED
@@ -6,6 +6,7 @@ from typing import Dict, List, Optional, Type

  from typing_extensions import Self

+ import datahub.emitter.mce_builder as builder
  import datahub.metadata.schema_classes as models
  from datahub.cli.cli_utils import first_non_null
  from datahub.errors import IngestionAttributionWarning
@@ -64,7 +65,7 @@ class DataJob(
  """Get the URN type for data jobs."""
  return DataJobUrn

- def __init__(
+ def __init__( # noqa: C901
  self,
  *,
  name: str,
@@ -86,6 +87,7 @@ class DataJob(
  domain: Optional[DomainInputType] = None,
  inlets: Optional[List[DatasetUrnOrStr]] = None,
  outlets: Optional[List[DatasetUrnOrStr]] = None,
+ fine_grained_lineages: Optional[List[models.FineGrainedLineageClass]] = None,
  structured_properties: Optional[StructuredPropertyInputType] = None,
  extra_aspects: ExtraAspectsType = None,
  ):
@@ -103,12 +105,14 @@ class DataJob(
  ValueError: If neither flow nor (flow_urn and platform_instance) are provided
  """
  if flow is None:
- if flow_urn is None or platform_instance is None:
+ if flow_urn is None:
  raise ValueError(
  "You must provide either: 1. a DataFlow object, or 2. a DataFlowUrn (and a platform_instance config if required)"
  )
  flow_urn = DataFlowUrn.from_string(flow_urn)
- if flow_urn.flow_id.startswith(f"{platform_instance}."):
+ if platform_instance and flow_urn.flow_id.startswith(
+ f"{platform_instance}."
+ ):
  flow_name = flow_urn.flow_id[len(platform_instance) + 1 :]
  else:
  flow_name = flow_urn.flow_id
@@ -133,8 +137,6 @@ class DataJob(
  )
  self._setdefault_aspect(job_info)
  self._ensure_datajob_props().flowUrn = str(flow.urn)
-
- # Set properties if provided
  if description is not None:
  self.set_description(description)
  if external_url is not None:
@@ -145,8 +147,6 @@ class DataJob(
  self.set_created(created)
  if last_modified is not None:
  self.set_last_modified(last_modified)
-
- # Set standard aspects
  if subtype is not None:
  self.set_subtype(subtype)
  if owners is not None:
@@ -159,13 +159,19 @@ class DataJob(
  self.set_terms(terms)
  if domain is not None:
  self.set_domain(domain)
+ if structured_properties is not None:
+ for key, value in structured_properties.items():
+ self.set_structured_property(property_urn=key, values=value)
  if inlets is not None:
  self.set_inlets(inlets)
  if outlets is not None:
  self.set_outlets(outlets)
- if structured_properties is not None:
- for key, value in structured_properties.items():
- self.set_structured_property(property_urn=key, values=value)
+ if fine_grained_lineages is not None:
+ self.set_fine_grained_lineages(fine_grained_lineages)
+
+ if self.flow_urn.cluster.upper() in builder.ALL_ENV_TYPES:
+ env = self.flow_urn.cluster.upper()
+ self._ensure_datajob_props().env = env

  @classmethod
  def _new_from_graph(cls, urn: Urn, current_aspects: models.AspectBag) -> Self:
@@ -201,9 +207,7 @@ class DataJob(
  ) -> Optional[models.DataJobInputOutputClass]:
  return self._get_aspect(models.DataJobInputOutputClass)

- def _ensure_datajob_inputoutput_props(
- self,
- ) -> models.DataJobInputOutputClass:
+ def _ensure_datajob_inputoutput_props(self) -> models.DataJobInputOutputClass:
  return self._setdefault_aspect(
  models.DataJobInputOutputClass(inputDatasets=[], outputDatasets=[])
  )
@@ -307,8 +311,6 @@ class DataJob(
  browse_path.append(
  models.BrowsePathEntryClass(id=entry.id, urn=entry.urn)
  )
-
- # Add the job itself to the path
  browse_path.append(models.BrowsePathEntryClass(id=flow.name, urn=str(flow.urn)))
  # Set the browse path aspect
  self._set_aspect(models.BrowsePathsV2Class(path=browse_path))
@@ -341,3 +343,25 @@ class DataJob(
  self._ensure_datajob_inputoutput_props().outputDatasets.append(
  str(outlet_urn)
  )
+
+ @property
+ def fine_grained_lineages(self) -> List[models.FineGrainedLineageClass]:
+ io_aspect = self._get_datajob_inputoutput_props()
+ return (
+ io_aspect.fineGrainedLineages
+ if io_aspect and io_aspect.fineGrainedLineages
+ else []
+ )
+
+ def set_fine_grained_lineages(
+ self, lineages: List[models.FineGrainedLineageClass]
+ ) -> None:
+ io_aspect = self._ensure_datajob_inputoutput_props()
+ if io_aspect.fineGrainedLineages is None:
+ io_aspect.fineGrainedLineages = []
+ io_aspect.fineGrainedLineages.extend(lineages)
+
+ @property
+ def env(self) -> Optional[str]:
+ """Get the environment of the data job."""
+ return str(self._ensure_datajob_props().env)
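A hedged sketch of the new fine_grained_lineages constructor argument and the fine_grained_lineages/env accessors added above. The DataFlow import path and constructor arguments, and the example URNs, are assumptions based on the existing SDK rather than part of this diff:

import datahub.metadata.schema_classes as models
from datahub.emitter.mce_builder import make_dataset_urn, make_schema_field_urn
from datahub.sdk import DataFlow, DataJob

flow = DataFlow(platform="airflow", name="daily_etl", env="PROD")
upstream = make_dataset_urn("snowflake", "db.schema.src", env="PROD")
downstream = make_dataset_urn("snowflake", "db.schema.dst", env="PROD")

job = DataJob(
    name="transform_task",
    flow=flow,
    inlets=[upstream],
    outlets=[downstream],
    # Column-level lineage can now be supplied directly at construction time.
    fine_grained_lineages=[
        models.FineGrainedLineageClass(
            upstreamType=models.FineGrainedLineageUpstreamTypeClass.FIELD_SET,
            upstreams=[make_schema_field_urn(upstream, "id")],
            downstreamType=models.FineGrainedLineageDownstreamTypeClass.FIELD,
            downstreams=[make_schema_field_urn(downstream, "id")],
        )
    ],
)

# Because the flow URN's cluster ("PROD") is a recognized env type, job.env is populated.
print(job.env, len(job.fine_grained_lineages))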
datahub/sdk/lineage_client.py CHANGED
@@ -478,6 +478,7 @@ class LineageClient:
  env: str = "PROD",
  default_db: Optional[str] = None,
  default_schema: Optional[str] = None,
+ override_dialect: Optional[str] = None,
  ) -> None:
  """Add lineage by parsing a SQL query."""
  from datahub.sql_parsing.sqlglot_lineage import (
@@ -493,6 +494,7 @@ class LineageClient:
  platform_instance=platform_instance,
  env=env,
  graph=self._client._graph,
+ override_dialect=override_dialect,
  )

  if parsed_result.debug_info.table_error:
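A hedged sketch of the new override_dialect pass-through. The helper shown here (create_lineage_sql_parsed_result from datahub.sql_parsing.sqlglot_lineage) and its acceptance of override_dialect are assumptions inferred from the truncated import above; the query text and platform are illustrative only:

from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result

# override_dialect (new) forces a specific sqlglot dialect instead of deriving it
# from the platform, which helps when the query text comes from a different engine.
result = create_lineage_sql_parsed_result(
    query="INSERT INTO db.schema.dst SELECT id, name FROM db.schema.src",
    default_db="db",
    platform="snowflake",
    platform_instance=None,
    env="PROD",
    override_dialect="snowflake",
)
print(result.in_tables, result.out_tables)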
datahub/sdk/main_client.py CHANGED
@@ -66,7 +66,12 @@ class DataHubClient:
  self._graph.test_connection()

  @classmethod
- def from_env(cls) -> "DataHubClient":
+ def from_env(
+ cls,
+ *,
+ client_mode: ClientMode = ClientMode.SDK,
+ datahub_component: Optional[str] = None,
+ ) -> "DataHubClient":
  """Initialize a DataHubClient from the environment variables or ~/.datahubenv file.

  This will first check DATAHUB_GMS_URL and DATAHUB_GMS_TOKEN. If not present,
@@ -76,6 +81,10 @@ class DataHubClient:
  If you're looking to specify the server/token in code, use the
  DataHubClient(server=..., token=...) constructor instead.

+ Args:
+ client_mode: [internal] The client mode to use. Defaults to "SDK".
+ datahub_component: [internal] The DataHub component name to include in the user agent.
+
  Returns:
  A DataHubClient instance.
  """
@@ -83,7 +92,10 @@ class DataHubClient:
  # Inspired by the DockerClient.from_env() method.
  # TODO: This one also reads from ~/.datahubenv, so the "from_env" name might be a bit confusing.
  # That file is part of the "environment", but is not a traditional "env variable".
- graph = get_default_graph(ClientMode.SDK)
+ graph = get_default_graph(
+ client_mode=client_mode,
+ datahub_component=datahub_component,
+ )

  return cls(graph=graph)
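A brief usage note on the widened from_env() signature above; both new parameters are keyword-only and flagged [internal] in the docstring, so typical SDK callers are unaffected. The component name below is illustrative only:

from datahub.sdk.main_client import DataHubClient

# Reads DATAHUB_GMS_URL / DATAHUB_GMS_TOKEN, falling back to ~/.datahubenv, as before.
client = DataHubClient.from_env()

# Internal DataHub components can additionally tag the user agent of their requests,
# e.g. DataHubClient.from_env(datahub_component="managed-ingestion").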
datahub/sdk/search_client.py CHANGED
@@ -19,6 +19,7 @@ from datahub.sdk.search_filters import (
  _OrFilters,
  _StatusFilter,
  )
+ from datahub.utilities.ordered_set import OrderedSet

  if TYPE_CHECKING:
  from datahub.sdk.main_client import DataHubClient
@@ -80,7 +81,7 @@
  ) -> Optional[List[str]]:
  found_filters = False
  found_positive_filters = False
- entity_types: List[str] = []
+ entity_types: OrderedSet[str] = OrderedSet()
  for ands in filters:
  for clause in ands["and"]:
  if clause.field == _EntityTypeFilter.ENTITY_TYPE_FIELD:
@@ -88,7 +89,7 @@
  if not clause.negated:
  found_positive_filters = True

- entity_types.extend(clause.values)
+ entity_types.update(clause.values)

  if not found_filters:
  # If we didn't find any filters, use None so we use the default set.
@@ -100,7 +101,7 @@
  # still want to use the default set.
  return None

- return entity_types
+ return list(entity_types)


  class SearchClient:
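The change above swaps a plain list for an OrderedSet, so repeated entity-type filter values are deduplicated while insertion order is preserved before the result is converted back to a list. A minimal illustration (OrderedSet's set-like, order-preserving behaviour is implied by its use in this hunk):

from datahub.utilities.ordered_set import OrderedSet

entity_types: OrderedSet[str] = OrderedSet()
entity_types.update(["dataset", "chart", "dataset"])

# Duplicates are dropped and order is kept, so the helper above now returns each type once.
assert list(entity_types) == ["dataset", "chart"]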
datahub/specific/dataproduct.py CHANGED
@@ -9,6 +9,9 @@ from datahub.metadata.schema_classes import (
  )
  from datahub.specific.aspect_helpers.custom_properties import HasCustomPropertiesPatch
  from datahub.specific.aspect_helpers.ownership import HasOwnershipPatch
+ from datahub.specific.aspect_helpers.structured_properties import (
+ HasStructuredPropertiesPatch,
+ )
  from datahub.specific.aspect_helpers.tags import HasTagsPatch
  from datahub.specific.aspect_helpers.terms import HasTermsPatch

@@ -16,6 +19,7 @@ from datahub.specific.aspect_helpers.terms import HasTermsPatch
  class DataProductPatchBuilder(
  HasOwnershipPatch,
  HasCustomPropertiesPatch,
+ HasStructuredPropertiesPatch,
  HasTagsPatch,
  HasTermsPatch,
  MetadataPatchProposal,
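With HasStructuredPropertiesPatch mixed in, DataProductPatchBuilder can now emit structured-property patches. A hedged sketch: the set_structured_property() helper name is taken from the shared patch mixin used elsewhere in the library and is an assumption here, and the URNs and value are illustrative only:

from datahub.specific.dataproduct import DataProductPatchBuilder

patch = DataProductPatchBuilder("urn:li:dataProduct:customer_360")
# set_structured_property() comes from the newly mixed-in HasStructuredPropertiesPatch.
patch.set_structured_property(
    "urn:li:structuredProperty:io.acryl.retentionDays", 90
)

for mcp in patch.build():
    print(mcp.entityUrn, mcp.aspectName)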
datahub/sql_parsing/sql_parsing_aggregator.py CHANGED
@@ -58,6 +58,7 @@ from datahub.sql_parsing.tool_meta_extractor import (
  ToolMetaExtractorReport,
  )
  from datahub.utilities.cooperative_timeout import CooperativeTimeoutError
+ from datahub.utilities.dedup_list import deduplicate_list
  from datahub.utilities.file_backed_collections import (
  ConnectionWrapper,
  FileBackedDict,
@@ -140,6 +141,7 @@ class QueryMetadata:

  used_temp_tables: bool = True

+ extra_info: Optional[dict] = None
  origin: Optional[Urn] = None

  def make_created_audit_stamp(self) -> models.AuditStampClass:
@@ -263,7 +265,7 @@ class PreparsedQuery:
  query_type_props: QueryTypeProps = dataclasses.field(
  default_factory=lambda: QueryTypeProps()
  )
- # Use this to store addtitional key-value information about query for debugging
+ # Use this to store additional key-value information about the query for debugging.
  extra_info: Optional[dict] = None
  origin: Optional[Urn] = None

@@ -948,6 +950,7 @@ class SqlParsingAggregator(Closeable):
  column_usage=parsed.column_usage or {},
  confidence_score=parsed.confidence_score,
  used_temp_tables=session_has_temp_tables,
+ extra_info=parsed.extra_info,
  origin=parsed.origin,
  )
  )
@@ -1491,9 +1494,9 @@ class SqlParsingAggregator(Closeable):
  return

  # If a query doesn't involve any allowed tables, skip it.
- if downstream_urn is None and not any(
- self.is_allowed_table(urn) for urn in query.upstreams
- ):
+ if (
+ downstream_urn is None or not self.is_allowed_table(downstream_urn)
+ ) and not any(self.is_allowed_table(urn) for urn in query.upstreams):
  self.report.num_queries_skipped_due_to_filters += 1
  return

@@ -1574,27 +1577,33 @@ class SqlParsingAggregator(Closeable):

  @dataclasses.dataclass
  class QueryLineageInfo:
- upstreams: List[UrnStr] # this is direct upstreams, with *no temp tables*
- column_lineage: List[ColumnLineageInfo]
+ upstreams: OrderedSet[
+ UrnStr
+ ] # this is direct upstreams, with *no temp tables*
+ column_lineage: OrderedSet[ColumnLineageInfo]
  confidence_score: float

  def _merge_lineage_from(self, other_query: "QueryLineageInfo") -> None:
- self.upstreams += other_query.upstreams
- self.column_lineage += other_query.column_lineage
+ self.upstreams.update(other_query.upstreams)
+ self.column_lineage.update(other_query.column_lineage)
  self.confidence_score = min(
  self.confidence_score, other_query.confidence_score
  )

+ cache: Dict[str, QueryLineageInfo] = {}
+

  def _recurse_into_query(
  query: QueryMetadata, recursion_path: List[QueryId]
  ) -> QueryLineageInfo:
  if query.query_id in recursion_path:
  # This is a cycle, so we just return the query as-is.
  return QueryLineageInfo(
- upstreams=query.upstreams,
- column_lineage=query.column_lineage,
+ upstreams=OrderedSet(query.upstreams),
+ column_lineage=OrderedSet(query.column_lineage),
  confidence_score=query.confidence_score,
  )
+ if query.query_id in cache:
+ return cache[query.query_id]
  recursion_path = [*recursion_path, query.query_id]
  composed_of_queries.add(query.query_id)

@@ -1609,7 +1618,7 @@ class SqlParsingAggregator(Closeable):
  upstream_query = self._query_map.get(upstream_query_id)
  if (
  upstream_query
- and upstream_query.query_id not in composed_of_queries
+ and upstream_query.query_id not in recursion_path
  ):
  temp_query_lineage_info = _recurse_into_query(
  upstream_query, recursion_path
@@ -1669,11 +1678,14 @@ class SqlParsingAggregator(Closeable):
  ]
  )

- return QueryLineageInfo(
- upstreams=list(new_upstreams),
- column_lineage=new_cll,
+ ret = QueryLineageInfo(
+ upstreams=new_upstreams,
+ column_lineage=OrderedSet(new_cll),
  confidence_score=new_confidence_score,
  )
+ cache[query.query_id] = ret
+
+ return ret

  resolved_lineage_info = _recurse_into_query(base_query, [])

@@ -1706,15 +1718,15 @@ class SqlParsingAggregator(Closeable):
  )

  merged_query_text = ";\n\n".join(
- [q.formatted_query_string for q in ordered_queries]
+ deduplicate_list([q.formatted_query_string for q in ordered_queries])
  )

  resolved_query = dataclasses.replace(
  base_query,
  query_id=composite_query_id,
  formatted_query_string=merged_query_text,
- upstreams=resolved_lineage_info.upstreams,
- column_lineage=resolved_lineage_info.column_lineage,
+ upstreams=list(resolved_lineage_info.upstreams),
+ column_lineage=list(resolved_lineage_info.column_lineage),
  confidence_score=resolved_lineage_info.confidence_score,
  )
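The composite query text is now built with deduplicate_list (imported at the top of this file's diff), which keeps the first occurrence of each statement while preserving order; a minimal illustration with made-up statements:

from datahub.utilities.dedup_list import deduplicate_list

statements = [
    "CREATE TEMP TABLE t AS SELECT 1",
    "INSERT INTO out SELECT * FROM t",
    "CREATE TEMP TABLE t AS SELECT 1",
]

# The repeated statement appears only once in the merged composite query text.
merged_query_text = ";\n\n".join(deduplicate_list(statements))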