acryl-datahub 0.14.1.13rc8__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Files changed (139)
  1. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2506 -2456
  2. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +136 -131
  3. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
  6. datahub/cli/cli_utils.py +2 -0
  7. datahub/cli/delete_cli.py +103 -24
  8. datahub/cli/ingest_cli.py +110 -0
  9. datahub/cli/put_cli.py +1 -1
  10. datahub/cli/specific/dataproduct_cli.py +1 -1
  11. datahub/cli/specific/structuredproperties_cli.py +2 -1
  12. datahub/configuration/common.py +3 -3
  13. datahub/configuration/git.py +7 -1
  14. datahub/configuration/kafka_consumer_config.py +31 -1
  15. datahub/emitter/mcp_patch_builder.py +43 -0
  16. datahub/emitter/rest_emitter.py +17 -4
  17. datahub/ingestion/api/incremental_properties_helper.py +69 -0
  18. datahub/ingestion/api/source.py +6 -1
  19. datahub/ingestion/api/source_helpers.py +4 -2
  20. datahub/ingestion/graph/client.py +2 -0
  21. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
  22. datahub/ingestion/run/pipeline.py +6 -5
  23. datahub/ingestion/run/pipeline_config.py +6 -0
  24. datahub/ingestion/sink/datahub_rest.py +15 -4
  25. datahub/ingestion/source/abs/source.py +4 -0
  26. datahub/ingestion/source/aws/aws_common.py +13 -1
  27. datahub/ingestion/source/aws/sagemaker.py +8 -0
  28. datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
  29. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
  30. datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
  31. datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
  32. datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
  33. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  34. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +0 -1
  35. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +0 -21
  36. datahub/ingestion/source/bigquery_v2/profiler.py +0 -6
  37. datahub/ingestion/source/common/subtypes.py +2 -0
  38. datahub/ingestion/source/csv_enricher.py +1 -1
  39. datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
  40. datahub/ingestion/source/datahub/datahub_source.py +8 -1
  41. datahub/ingestion/source/dbt/dbt_common.py +7 -61
  42. datahub/ingestion/source/dremio/dremio_api.py +204 -86
  43. datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
  44. datahub/ingestion/source/dremio/dremio_config.py +5 -0
  45. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
  46. datahub/ingestion/source/dremio/dremio_entities.py +4 -0
  47. datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
  48. datahub/ingestion/source/dremio/dremio_source.py +7 -2
  49. datahub/ingestion/source/elastic_search.py +1 -1
  50. datahub/ingestion/source/feast.py +97 -6
  51. datahub/ingestion/source/gc/datahub_gc.py +46 -35
  52. datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
  53. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
  54. datahub/ingestion/source/ge_data_profiler.py +46 -9
  55. datahub/ingestion/source/ge_profiling_config.py +5 -0
  56. datahub/ingestion/source/iceberg/iceberg.py +12 -5
  57. datahub/ingestion/source/kafka/kafka.py +39 -19
  58. datahub/ingestion/source/kafka/kafka_connect.py +81 -51
  59. datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
  60. datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
  61. datahub/ingestion/source/looker/view_upstream.py +65 -30
  62. datahub/ingestion/source/metadata/business_glossary.py +35 -18
  63. datahub/ingestion/source/mode.py +0 -23
  64. datahub/ingestion/source/neo4j/__init__.py +0 -0
  65. datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
  66. datahub/ingestion/source/powerbi/__init__.py +0 -1
  67. datahub/ingestion/source/powerbi/config.py +3 -3
  68. datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
  69. datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
  70. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
  71. datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
  72. datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
  73. datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
  74. datahub/ingestion/source/powerbi/powerbi.py +12 -6
  75. datahub/ingestion/source/preset.py +1 -0
  76. datahub/ingestion/source/pulsar.py +21 -2
  77. datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
  78. datahub/ingestion/source/redash.py +13 -63
  79. datahub/ingestion/source/redshift/config.py +1 -0
  80. datahub/ingestion/source/redshift/redshift.py +3 -0
  81. datahub/ingestion/source/s3/source.py +2 -3
  82. datahub/ingestion/source/sigma/data_classes.py +1 -0
  83. datahub/ingestion/source/sigma/sigma.py +101 -43
  84. datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
  85. datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
  86. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
  87. datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
  88. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  89. datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
  90. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
  91. datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
  92. datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
  93. datahub/ingestion/source/sql/athena.py +46 -22
  94. datahub/ingestion/source/sql/mssql/source.py +18 -6
  95. datahub/ingestion/source/sql/sql_common.py +34 -21
  96. datahub/ingestion/source/sql/sql_report.py +1 -0
  97. datahub/ingestion/source/sql/sql_types.py +85 -8
  98. datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
  99. datahub/ingestion/source/superset.py +215 -65
  100. datahub/ingestion/source/tableau/tableau.py +237 -76
  101. datahub/ingestion/source/tableau/tableau_common.py +12 -6
  102. datahub/ingestion/source/tableau/tableau_constant.py +2 -0
  103. datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
  104. datahub/ingestion/source/tableau/tableau_validation.py +48 -0
  105. datahub/ingestion/source/unity/proxy_types.py +1 -0
  106. datahub/ingestion/source/unity/source.py +4 -0
  107. datahub/ingestion/source/unity/usage.py +20 -11
  108. datahub/ingestion/transformer/add_dataset_tags.py +1 -1
  109. datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
  110. datahub/integrations/assertion/common.py +1 -1
  111. datahub/lite/duckdb_lite.py +12 -17
  112. datahub/metadata/_schema_classes.py +512 -392
  113. datahub/metadata/_urns/urn_defs.py +1355 -1355
  114. datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
  115. datahub/metadata/schema.avsc +17222 -17499
  116. datahub/metadata/schemas/FormInfo.avsc +4 -0
  117. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
  118. datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
  119. datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
  120. datahub/specific/chart.py +0 -39
  121. datahub/specific/dashboard.py +0 -39
  122. datahub/specific/datajob.py +7 -57
  123. datahub/sql_parsing/schema_resolver.py +23 -0
  124. datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
  125. datahub/sql_parsing/sqlglot_lineage.py +55 -14
  126. datahub/sql_parsing/sqlglot_utils.py +8 -2
  127. datahub/telemetry/telemetry.py +23 -9
  128. datahub/testing/compare_metadata_json.py +1 -1
  129. datahub/testing/doctest.py +12 -0
  130. datahub/utilities/file_backed_collections.py +35 -2
  131. datahub/utilities/partition_executor.py +1 -1
  132. datahub/utilities/urn_encoder.py +2 -1
  133. datahub/utilities/urns/_urn_base.py +1 -1
  134. datahub/utilities/urns/structured_properties_urn.py +1 -1
  135. datahub/utilities/sql_lineage_parser_impl.py +0 -160
  136. datahub/utilities/sql_parser.py +0 -94
  137. datahub/utilities/sql_parser_base.py +0 -21
  138. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
  139. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0
datahub/metadata/schemas/FormInfo.avsc CHANGED
@@ -97,6 +97,10 @@
  "namespace": "com.linkedin.pegasus2avro.form",
  "fields": [
  {
+ "Searchable": {
+ "fieldName": "structuredPropertyPromptUrns",
+ "fieldType": "URN"
+ },
  "java": {
  "class": "com.linkedin.pegasus2avro.common.urn.Urn"
  },
datahub/metadata/schemas/StructuredPropertyDefinition.avsc CHANGED
@@ -359,7 +359,7 @@
  ],
  "name": "lastModified",
  "default": null,
- "doc": "Created Audit stamp"
+ "doc": "Last Modified Audit stamp"
  }
  ]
  }
datahub/metadata/schemas/StructuredPropertyKey.avsc CHANGED
@@ -6,6 +6,7 @@
  "entityCategory": "core",
  "entityAspects": [
  "propertyDefinition",
+ "structuredPropertySettings",
  "institutionalMemory",
  "status"
  ],
datahub/metadata/schemas/StructuredPropertySettings.avsc ADDED
@@ -0,0 +1,114 @@
+ {
+ "type": "record",
+ "Aspect": {
+ "name": "structuredPropertySettings"
+ },
+ "name": "StructuredPropertySettings",
+ "namespace": "com.linkedin.pegasus2avro.structured",
+ "fields": [
+ {
+ "Searchable": {
+ "fieldType": "BOOLEAN"
+ },
+ "type": "boolean",
+ "name": "isHidden",
+ "default": false,
+ "doc": "Whether or not this asset should be hidden in the main application"
+ },
+ {
+ "Searchable": {
+ "fieldType": "BOOLEAN"
+ },
+ "type": "boolean",
+ "name": "showInSearchFilters",
+ "default": false,
+ "doc": "Whether or not this asset should be displayed as a search filter"
+ },
+ {
+ "Searchable": {
+ "fieldType": "BOOLEAN"
+ },
+ "type": "boolean",
+ "name": "showInAssetSummary",
+ "default": false,
+ "doc": "Whether or not this asset should be displayed in the asset sidebar"
+ },
+ {
+ "Searchable": {
+ "fieldType": "BOOLEAN"
+ },
+ "type": "boolean",
+ "name": "showAsAssetBadge",
+ "default": false,
+ "doc": "Whether or not this asset should be displayed as an asset badge on other\nasset's headers"
+ },
+ {
+ "Searchable": {
+ "fieldType": "BOOLEAN"
+ },
+ "type": "boolean",
+ "name": "showInColumnsTable",
+ "default": false,
+ "doc": "Whether or not this asset should be displayed as a column in the schema field table\nin a Dataset's \"Columns\" tab."
+ },
+ {
+ "Searchable": {
+ "/time": {
+ "fieldName": "lastModifiedSettings",
+ "fieldType": "DATETIME"
+ }
+ },
+ "type": [
+ "null",
+ {
+ "type": "record",
+ "name": "AuditStamp",
+ "namespace": "com.linkedin.pegasus2avro.common",
+ "fields": [
+ {
+ "type": "long",
+ "name": "time",
+ "doc": "When did the resource/association/sub-resource move into the specific lifecycle stage represented by this AuditEvent."
+ },
+ {
+ "java": {
+ "class": "com.linkedin.pegasus2avro.common.urn.Urn"
+ },
+ "type": "string",
+ "name": "actor",
+ "doc": "The entity (e.g. a member URN) which will be credited for moving the resource/association/sub-resource into the specific lifecycle stage. It is also the one used to authorize the change.",
+ "Urn": "Urn"
+ },
+ {
+ "java": {
+ "class": "com.linkedin.pegasus2avro.common.urn.Urn"
+ },
+ "type": [
+ "null",
+ "string"
+ ],
+ "name": "impersonator",
+ "default": null,
+ "doc": "The entity (e.g. a service URN) which performs the change on behalf of the Actor and must be authorized to act as the Actor.",
+ "Urn": "Urn"
+ },
+ {
+ "type": [
+ "null",
+ "string"
+ ],
+ "name": "message",
+ "default": null,
+ "doc": "Additional context around how DataHub was informed of the particular change. For example: was the change created by an automated process, or manually."
+ }
+ ],
+ "doc": "Data captured on a resource/association/sub-resource level giving insight into when that resource/association/sub-resource moved into a particular lifecycle stage, and who acted to move it into that specific lifecycle stage."
+ }
+ ],
+ "name": "lastModified",
+ "default": null,
+ "doc": "Last Modified Audit stamp"
+ }
+ ],
+ "doc": "Settings specific to a structured property entity"
+ }
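The new structuredPropertySettings aspect above controls how a structured property surfaces in the UI. A minimal emission sketch in Python, assuming the regenerated SDK exposes it as StructuredPropertySettingsClass (the changes to datahub/metadata/_schema_classes.py and the pegasus2avro structured __init__ point that way); the property URN is illustrative only:

# Hedged sketch: StructuredPropertySettingsClass is the assumed generated class name.
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import StructuredPropertySettingsClass

settings = StructuredPropertySettingsClass(
    isHidden=False,
    showInSearchFilters=True,
    showAsAssetBadge=True,
)
mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:structuredProperty:io.acryl.privacy.retentionTime",  # illustrative URN
    aspect=settings,
)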
datahub/specific/chart.py CHANGED
@@ -1,10 +1,8 @@
- import time
  from typing import Dict, List, Optional, Union

  from datahub.emitter.mcp_patch_builder import MetadataPatchProposal
  from datahub.metadata.schema_classes import (
  AccessLevelClass,
- AuditStampClass,
  ChangeAuditStampsClass,
  ChartInfoClass as ChartInfo,
  ChartTypeClass,
@@ -47,43 +45,6 @@ class ChartPatchBuilder(MetadataPatchProposal):
  )
  self.ownership_patch_helper = OwnershipPatchHelper(self)

- def _mint_auditstamp(self, message: Optional[str] = None) -> AuditStampClass:
- """
- Creates an AuditStampClass instance with the current timestamp and other default values.
-
- Args:
- message: The message associated with the audit stamp (optional).
-
- Returns:
- An instance of AuditStampClass.
- """
- return AuditStampClass(
- time=int(time.time() * 1000.0),
- actor="urn:li:corpuser:datahub",
- message=message,
- )
-
- def _ensure_urn_type(
- self, entity_type: str, edges: List[Edge], context: str
- ) -> None:
- """
- Ensures that the destination URNs in the given edges have the specified entity type.
-
- Args:
- entity_type: The entity type to check against.
- edges: A list of Edge objects.
- context: The context or description of the operation.
-
- Raises:
- ValueError: If any of the destination URNs is not of the specified entity type.
- """
- for e in edges:
- urn = Urn.create_from_string(e.destinationUrn)
- if not urn.get_type() == entity_type:
- raise ValueError(
- f"{context}: {e.destinationUrn} is not of type {entity_type}"
- )
-
  def add_owner(self, owner: Owner) -> "ChartPatchBuilder":
  """
  Adds an owner to the ChartPatchBuilder.
datahub/specific/dashboard.py CHANGED
@@ -1,10 +1,8 @@
- import time
  from typing import Dict, List, Optional, Union

  from datahub.emitter.mcp_patch_builder import MetadataPatchProposal
  from datahub.metadata.schema_classes import (
  AccessLevelClass,
- AuditStampClass,
  ChangeAuditStampsClass,
  DashboardInfoClass as DashboardInfo,
  EdgeClass as Edge,
@@ -46,43 +44,6 @@ class DashboardPatchBuilder(MetadataPatchProposal):
  )
  self.ownership_patch_helper = OwnershipPatchHelper(self)

- def _mint_auditstamp(self, message: Optional[str] = None) -> AuditStampClass:
- """
- Creates an AuditStampClass instance with the current timestamp and other default values.
-
- Args:
- message: The message associated with the audit stamp (optional).
-
- Returns:
- An instance of AuditStampClass.
- """
- return AuditStampClass(
- time=int(time.time() * 1000.0),
- actor="urn:li:corpuser:datahub",
- message=message,
- )
-
- def _ensure_urn_type(
- self, entity_type: str, edges: List[Edge], context: str
- ) -> None:
- """
- Ensures that the destination URNs in the given edges have the specified entity type.
-
- Args:
- entity_type: The entity type to check against.
- edges: A list of Edge objects.
- context: The context or description of the operation.
-
- Raises:
- ValueError: If any of the destination URNs is not of the specified entity type.
- """
- for e in edges:
- urn = Urn.create_from_string(e.destinationUrn)
- if not urn.get_type() == entity_type:
- raise ValueError(
- f"{context}: {e.destinationUrn} is not of type {entity_type}"
- )
-
  def add_owner(self, owner: Owner) -> "DashboardPatchBuilder":
  """
  Adds an owner to the DashboardPatchBuilder.
datahub/specific/datajob.py CHANGED
@@ -1,9 +1,7 @@
- import time
  from typing import Dict, List, Optional, Union

  from datahub.emitter.mcp_patch_builder import MetadataPatchProposal
  from datahub.metadata.schema_classes import (
- AuditStampClass,
  DataJobInfoClass as DataJobInfo,
  DataJobInputOutputClass as DataJobInputOutput,
  EdgeClass as Edge,
@@ -16,10 +14,9 @@ from datahub.metadata.schema_classes import (
  SystemMetadataClass,
  TagAssociationClass as Tag,
  )
+ from datahub.metadata.urns import SchemaFieldUrn, TagUrn, Urn
  from datahub.specific.custom_properties import CustomPropertiesPatchHelper
  from datahub.specific.ownership import OwnershipPatchHelper
- from datahub.utilities.urns.tag_urn import TagUrn
- from datahub.utilities.urns.urn import Urn


  class DataJobPatchBuilder(MetadataPatchProposal):
@@ -45,43 +42,6 @@ class DataJobPatchBuilder(MetadataPatchProposal):
  )
  self.ownership_patch_helper = OwnershipPatchHelper(self)

- def _mint_auditstamp(self, message: Optional[str] = None) -> AuditStampClass:
- """
- Creates an AuditStampClass instance with the current timestamp and other default values.
-
- Args:
- message: The message associated with the audit stamp (optional).
-
- Returns:
- An instance of AuditStampClass.
- """
- return AuditStampClass(
- time=int(time.time() * 1000.0),
- actor="urn:li:corpuser:datahub",
- message=message,
- )
-
- def _ensure_urn_type(
- self, entity_type: str, edges: List[Edge], context: str
- ) -> None:
- """
- Ensures that the destination URNs in the given edges have the specified entity type.
-
- Args:
- entity_type: The entity type to check against.
- edges: A list of Edge objects.
- context: The context or description of the operation.
-
- Raises:
- ValueError: If any of the destination URNs is not of the specified entity type.
- """
- for e in edges:
- urn = Urn.create_from_string(e.destinationUrn)
- if not urn.get_type() == entity_type:
- raise ValueError(
- f"{context}: {e.destinationUrn} is not of type {entity_type}"
- )
-
  def add_owner(self, owner: Owner) -> "DataJobPatchBuilder":
  """
  Adds an owner to the DataJobPatchBuilder.
@@ -142,7 +102,7 @@ class DataJobPatchBuilder(MetadataPatchProposal):

  Notes:
  If `input` is an Edge object, it is used directly. If `input` is a Urn object or string,
- it is converted to an Edge object and added with default audit stamps.
+ it is converted to an Edge object and added without any audit stamps.
  """
  if isinstance(input, Edge):
  input_urn: str = input.destinationUrn
@@ -154,8 +114,6 @@ class DataJobPatchBuilder(MetadataPatchProposal):

  input_edge = Edge(
  destinationUrn=input_urn,
- created=self._mint_auditstamp(),
- lastModified=self._mint_auditstamp(),
  )

  self._ensure_urn_type("dataJob", [input_edge], "add_input_datajob")
@@ -225,7 +183,7 @@ class DataJobPatchBuilder(MetadataPatchProposal):

  Notes:
  If `input` is an Edge object, it is used directly. If `input` is a Urn object or string,
- it is converted to an Edge object and added with default audit stamps.
+ it is converted to an Edge object and added without any audit stamps.
  """
  if isinstance(input, Edge):
  input_urn: str = input.destinationUrn
@@ -237,8 +195,6 @@ class DataJobPatchBuilder(MetadataPatchProposal):

  input_edge = Edge(
  destinationUrn=input_urn,
- created=self._mint_auditstamp(),
- lastModified=self._mint_auditstamp(),
  )

  self._ensure_urn_type("dataset", [input_edge], "add_input_dataset")
@@ -310,7 +266,7 @@ class DataJobPatchBuilder(MetadataPatchProposal):

  Notes:
  If `output` is an Edge object, it is used directly. If `output` is a Urn object or string,
- it is converted to an Edge object and added with default audit stamps.
+ it is converted to an Edge object and added without any audit stamps.
  """
  if isinstance(output, Edge):
  output_urn: str = output.destinationUrn
@@ -322,15 +278,13 @@ class DataJobPatchBuilder(MetadataPatchProposal):

  output_edge = Edge(
  destinationUrn=output_urn,
- created=self._mint_auditstamp(),
- lastModified=self._mint_auditstamp(),
  )

  self._ensure_urn_type("dataset", [output_edge], "add_output_dataset")
  self._add_patch(
  DataJobInputOutput.ASPECT_NAME,
  "add",
- path=f"/outputDatasetEdges/{self.quote(str(output))}",
+ path=f"/outputDatasetEdges/{self.quote(output_urn)}",
  value=output_edge,
  )
  return self
@@ -392,9 +346,7 @@ class DataJobPatchBuilder(MetadataPatchProposal):
  ValueError: If the input is not a Schema Field urn.
  """
  input_urn = str(input)
- urn = Urn.create_from_string(input_urn)
- if not urn.get_type() == "schemaField":
- raise ValueError(f"Input {input} is not a Schema Field urn")
+ assert SchemaFieldUrn.from_string(input_urn)

  self._add_patch(
  DataJobInputOutput.ASPECT_NAME,
@@ -466,9 +418,7 @@ class DataJobPatchBuilder(MetadataPatchProposal):
  ValueError: If the output is not a Schema Field urn.
  """
  output_urn = str(output)
- urn = Urn.create_from_string(output_urn)
- if not urn.get_type() == "schemaField":
- raise ValueError(f"Input {output} is not a Schema Field urn")
+ assert SchemaFieldUrn.from_string(output_urn)

  self._add_patch(
  DataJobInputOutput.ASPECT_NAME,
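The three patch-builder diffs above (chart.py, dashboard.py, datajob.py) drop their duplicated _mint_auditstamp and _ensure_urn_type helpers; the +43 lines in datahub/emitter/mcp_patch_builder.py suggest the shared logic now lives on the MetadataPatchProposal base class. The caller-visible change is that edges created from plain URNs no longer carry synthetic audit stamps. A minimal usage sketch with illustrative URNs:

# Sketch only: the URNs are made up; the builder methods themselves appear in the diff above.
from datahub.emitter.mce_builder import make_data_job_urn, make_dataset_urn
from datahub.specific.datajob import DataJobPatchBuilder

job_urn = make_data_job_urn(orchestrator="airflow", flow_id="daily_etl", job_id="load_orders")
patch = DataJobPatchBuilder(job_urn).add_input_dataset(
    make_dataset_urn(platform="snowflake", name="analytics.public.orders", env="PROD")
)
for mcp in patch.build():  # each patch operation becomes a MetadataChangeProposal
    print(mcp.entityUrn)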
datahub/sql_parsing/schema_resolver.py CHANGED
@@ -123,6 +123,13 @@ class SchemaResolver(Closeable, SchemaResolverInterface):
  )
  return urn

+ def resolve_urn(self, urn: str) -> Tuple[str, Optional[SchemaInfo]]:
+ schema_info = self._resolve_schema_info(urn)
+ if schema_info:
+ return urn, schema_info
+
+ return urn, None
+
  def resolve_table(self, table: _TableName) -> Tuple[str, Optional[SchemaInfo]]:
  urn = self.get_urn_for_table(table)

@@ -293,3 +300,19 @@ def _convert_schema_field_list_to_info(

  def _convert_schema_aspect_to_info(schema_metadata: SchemaMetadataClass) -> SchemaInfo:
  return _convert_schema_field_list_to_info(schema_metadata.fields)
+
+
+ def match_columns_to_schema(
+ schema_info: SchemaInfo, input_columns: List[str]
+ ) -> List[str]:
+ column_from_gms: List[str] = list(schema_info.keys())  # list() to silent lint
+
+ gms_column_map: Dict[str, str] = {
+ column.lower(): column for column in column_from_gms
+ }
+
+ output_columns: List[str] = [
+ gms_column_map.get(column.lower(), column) for column in input_columns
+ ]
+
+ return output_columns
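A quick illustration of the new match_columns_to_schema helper added above: input column names are re-cased to match the casing stored in GMS, and unknown columns pass through unchanged. The schema_info dict here is made up for the example:

from datahub.sql_parsing.schema_resolver import match_columns_to_schema

schema_info = {"Id": "NUMBER", "OrderDate": "DATE"}  # SchemaInfo maps field name -> native type
print(match_columns_to_schema(schema_info, ["id", "orderdate", "amount"]))
# ['Id', 'OrderDate', 'amount']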
datahub/sql_parsing/sql_parsing_aggregator.py CHANGED
@@ -1383,8 +1383,7 @@ class SqlParsingAggregator(Closeable):
  return QueryUrn(query_id).urn()

  @classmethod
- def _composite_query_id(cls, composed_of_queries: Iterable[QueryId]) -> str:
- composed_of_queries = list(composed_of_queries)
+ def _composite_query_id(cls, composed_of_queries: List[QueryId]) -> str:
  combined = json.dumps(composed_of_queries)
  return f"composite_{generate_hash(combined)}"

datahub/sql_parsing/sqlglot_lineage.py CHANGED
@@ -1181,6 +1181,45 @@ def sqlglot_lineage(
  )


+ @functools.lru_cache(maxsize=128)
+ def create_and_cache_schema_resolver(
+ platform: str,
+ env: str,
+ graph: Optional[DataHubGraph] = None,
+ platform_instance: Optional[str] = None,
+ schema_aware: bool = True,
+ ) -> SchemaResolver:
+ return create_schema_resolver(
+ platform=platform,
+ env=env,
+ graph=graph,
+ platform_instance=platform_instance,
+ schema_aware=schema_aware,
+ )
+
+
+ def create_schema_resolver(
+ platform: str,
+ env: str,
+ graph: Optional[DataHubGraph] = None,
+ platform_instance: Optional[str] = None,
+ schema_aware: bool = True,
+ ) -> SchemaResolver:
+ if graph and schema_aware:
+ return graph._make_schema_resolver(
+ platform=platform,
+ platform_instance=platform_instance,
+ env=env,
+ )
+
+ return SchemaResolver(
+ platform=platform,
+ platform_instance=platform_instance,
+ env=env,
+ graph=None,
+ )
+
+
  def create_lineage_sql_parsed_result(
  query: str,
  default_db: Optional[str],
@@ -1191,21 +1230,17 @@
  graph: Optional[DataHubGraph] = None,
  schema_aware: bool = True,
  ) -> SqlParsingResult:
+ schema_resolver = create_schema_resolver(
+ platform=platform,
+ platform_instance=platform_instance,
+ env=env,
+ schema_aware=schema_aware,
+ graph=graph,
+ )
+
+ needs_close: bool = True
  if graph and schema_aware:
  needs_close = False
- schema_resolver = graph._make_schema_resolver(
- platform=platform,
- platform_instance=platform_instance,
- env=env,
- )
- else:
- needs_close = True
- schema_resolver = SchemaResolver(
- platform=platform,
- platform_instance=platform_instance,
- env=env,
- graph=None,
- )

  try:
  return sqlglot_lineage(
@@ -1243,13 +1278,19 @@ def infer_output_schema(result: SqlParsingResult) -> Optional[List[SchemaFieldCl
  def view_definition_lineage_helper(
  result: SqlParsingResult, view_urn: str
  ) -> SqlParsingResult:
- if result.query_type is QueryType.SELECT:
+ if result.query_type is QueryType.SELECT or (
+ result.out_tables and result.out_tables != [view_urn]
+ ):
  # Some platforms (e.g. postgres) store only <select statement> from view definition
  # `create view V as <select statement>` . For such view definitions, `result.out_tables` and
  # `result.column_lineage[].downstream` are empty in `sqlglot_lineage` response, whereas upstream
  # details and downstream column details are extracted correctly.
  # Here, we inject view V's urn in `result.out_tables` and `result.column_lineage[].downstream`
  # to get complete lineage result.
+
+ # Some platforms(e.g. mssql) may have slightly different view name in view definition than
+ # actual view name used elsewhere. Therefore we overwrite downstream table for such cases as well.
+
  result.out_tables = [view_urn]
  if result.column_lineage:
  for col_result in result.column_lineage:
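The new create_schema_resolver and create_and_cache_schema_resolver factories above centralize the graph-backed versus offline resolver choice that create_lineage_sql_parsed_result previously inlined. A small sketch of the offline path (no DataHubGraph client), where the lru_cache returns the same resolver for repeated identical arguments:

from datahub.sql_parsing.sqlglot_lineage import create_and_cache_schema_resolver

# With graph=None (or schema_aware=False) an offline SchemaResolver is returned.
offline = create_and_cache_schema_resolver(platform="snowflake", env="PROD", graph=None)
# Identical arguments hit the functools.lru_cache and return the same instance.
assert offline is create_and_cache_schema_resolver(platform="snowflake", env="PROD", graph=None)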
datahub/sql_parsing/sqlglot_utils.py CHANGED
@@ -121,7 +121,7 @@ _BASIC_NORMALIZATION_RULES = {
  # Remove /* */ comments.
  re.compile(r"/\*.*?\*/", re.DOTALL): "",
  # Remove -- comments.
- re.compile(r"--.*$"): "",
+ re.compile(r"--.*$", re.MULTILINE): "",
  # Replace all runs of whitespace with a single space.
  re.compile(r"\s+"): " ",
  # Remove leading and trailing whitespace and trailing semicolons.
@@ -131,10 +131,16 @@ _BASIC_NORMALIZATION_RULES = {
  # Replace anything that looks like a string with a placeholder.
  re.compile(r"'[^']*'"): "?",
  # Replace sequences of IN/VALUES with a single placeholder.
- re.compile(r"\b(IN|VALUES)\s*\(\?(?:, \?)*\)", re.IGNORECASE): r"\1 (?)",
+ # The r" ?" makes it more robust to uneven spacing.
+ re.compile(r"\b(IN|VALUES)\s*\( ?\?(?:, ?\?)* ?\)", re.IGNORECASE): r"\1 (?)",
  # Normalize parenthesis spacing.
  re.compile(r"\( "): "(",
  re.compile(r" \)"): ")",
+ # Fix up spaces before commas in column lists.
+ # e.g. "col1 , col2" -> "col1, col2"
+ # e.g. "col1,col2" -> "col1, col2"
+ re.compile(r"\b ,"): ",",
+ re.compile(r"\b,\b"): ", ",
  }
  _TABLE_NAME_NORMALIZATION_RULES = {
  # Replace UUID-like strings with a placeholder (both - and _ variants).
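The two comma rules added to _BASIC_NORMALIZATION_RULES above tidy spacing in column lists before queries are fingerprinted. A standalone sketch of just those two regexes (not the full normalization pipeline, which lives in datahub.sql_parsing.sqlglot_utils):

import re

comma_rules = {
    re.compile(r"\b ,"): ",",    # "col1 , col2" -> "col1, col2"
    re.compile(r"\b,\b"): ", ",  # "col1,col2"   -> "col1, col2"
}

query = "SELECT col1 , col2,col3 FROM t"
for pattern, replacement in comma_rules.items():
    query = pattern.sub(replacement, query)
print(query)  # SELECT col1, col2, col3 FROM t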
datahub/telemetry/telemetry.py CHANGED
@@ -7,7 +7,7 @@ import sys
  import uuid
  from functools import wraps
  from pathlib import Path
- from typing import Any, Callable, Dict, List, Optional, TypeVar
+ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, TypeVar

  from mixpanel import Consumer, Mixpanel
  from typing_extensions import ParamSpec
@@ -16,10 +16,12 @@ import datahub as datahub_package
  from datahub.cli.config_utils import DATAHUB_ROOT_FOLDER
  from datahub.cli.env_utils import get_boolean_env_variable
  from datahub.configuration.common import ExceptionWithProps
- from datahub.ingestion.graph.client import DataHubGraph
  from datahub.metadata.schema_classes import _custom_package_path
  from datahub.utilities.perf_timer import PerfTimer

+ if TYPE_CHECKING:
+ from datahub.ingestion.graph.client import DataHubGraph
+
  logger = logging.getLogger(__name__)

  DATAHUB_FOLDER = Path(DATAHUB_ROOT_FOLDER)
@@ -117,7 +119,11 @@ class Telemetry:
  tracking_init: bool = False
  sentry_enabled: bool = False

+ context_properties: Dict[str, Any] = {}
+
  def __init__(self):
+ self.context_properties = {}
+
  if SENTRY_DSN:
  self.sentry_enabled = True
  try:
@@ -157,6 +163,9 @@ class Telemetry:
  except Exception as e:
  logger.debug(f"Error connecting to mixpanel: {e}")

+ # Initialize the default properties for all events.
+ self.set_context()
+
  def update_config(self) -> bool:
  """
  Update the config file with the current client ID and enabled status.
@@ -238,18 +247,22 @@

  return False

- def update_capture_exception_context(
+ def set_context(
  self,
- server: Optional[DataHubGraph] = None,
+ server: Optional["DataHubGraph"] = None,
  properties: Optional[Dict[str, Any]] = None,
  ) -> None:
+ self.context_properties = {
+ **self._server_props(server),
+ **(properties or {}),
+ }
+
  if self.sentry_enabled:
  from sentry_sdk import set_tag

  properties = {
  **_default_telemetry_properties(),
- **self._server_props(server),
- **(properties or {}),
+ **self.context_properties,
  }

  for key in properties:
@@ -297,7 +310,6 @@
  self,
  event_name: str,
  properties: Optional[Dict[str, Any]] = None,
- server: Optional[DataHubGraph] = None,
  ) -> None:
  """
  Send a single telemetry event.
@@ -323,14 +335,15 @@

  properties = {
  **_default_telemetry_properties(),
- **self._server_props(server),
+ **self.context_properties,
  **properties,
  }
  self.mp.track(self.client_id, event_name, properties)
  except Exception as e:
  logger.debug(f"Error reporting telemetry: {e}")

- def _server_props(self, server: Optional[DataHubGraph]) -> Dict[str, str]:
+ @classmethod
+ def _server_props(cls, server: Optional["DataHubGraph"]) -> Dict[str, str]:
  if not server:
  return {
  "server_type": "n/a",
@@ -435,6 +448,7 @@ def with_telemetry(
  **call_props,
  "status": "error",
  **_error_props(e),
+ "code": e.code,
  },
  )
  telemetry_instance.capture_exception(e)
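The telemetry changes above replace the per-event server argument with stored context: set_context() records server and extra properties once, and every later event merges them in. A hedged usage sketch; the module-level telemetry_instance appears in this diff, but the ping() event method name is assumed rather than shown here:

from datahub.telemetry.telemetry import telemetry_instance

# Record server/context properties once (a None server yields the "n/a" defaults).
telemetry_instance.set_context(server=None, properties={"source_type": "snowflake"})
# Later events no longer take a server argument; context_properties are merged in.
telemetry_instance.ping("ingest_stats", {"status": "completed"})  # ping() name is an assumption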
datahub/testing/compare_metadata_json.py CHANGED
@@ -117,7 +117,7 @@ def diff_metadata_json(
  ignore_paths: Sequence[str] = (),
  ignore_order: bool = True,
  ) -> Union[DeepDiff, MCPDiff]:
- ignore_paths = (*ignore_paths, *default_exclude_paths, r"root\[\d+].delta_info")
+ ignore_paths = [*ignore_paths, *default_exclude_paths, r"root\[\d+].delta_info"]
  try:
  if ignore_order:
  golden_map = get_aspects_by_urn(golden)