acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.0.0.1rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (65)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/METADATA +2486 -2487
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/RECORD +64 -49
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/entry_points.txt +2 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/dataset/dataset.py +1 -28
  7. datahub/emitter/request_helper.py +19 -14
  8. datahub/ingestion/api/source.py +6 -2
  9. datahub/ingestion/api/source_helpers.py +6 -2
  10. datahub/ingestion/extractor/schema_util.py +1 -0
  11. datahub/ingestion/source/common/data_platforms.py +23 -0
  12. datahub/ingestion/source/common/gcp_credentials_config.py +6 -0
  13. datahub/ingestion/source/common/subtypes.py +15 -0
  14. datahub/ingestion/source/data_lake_common/path_spec.py +21 -1
  15. datahub/ingestion/source/dbt/dbt_common.py +6 -4
  16. datahub/ingestion/source/dbt/dbt_core.py +4 -6
  17. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  18. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  19. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  20. datahub/ingestion/source/dremio/dremio_source.py +96 -117
  21. datahub/ingestion/source/hex/__init__.py +0 -0
  22. datahub/ingestion/source/hex/api.py +394 -0
  23. datahub/ingestion/source/hex/constants.py +3 -0
  24. datahub/ingestion/source/hex/hex.py +167 -0
  25. datahub/ingestion/source/hex/mapper.py +372 -0
  26. datahub/ingestion/source/hex/model.py +68 -0
  27. datahub/ingestion/source/iceberg/iceberg.py +62 -66
  28. datahub/ingestion/source/mlflow.py +198 -7
  29. datahub/ingestion/source/mode.py +11 -1
  30. datahub/ingestion/source/openapi.py +69 -34
  31. datahub/ingestion/source/powerbi/powerbi.py +29 -23
  32. datahub/ingestion/source/s3/source.py +11 -0
  33. datahub/ingestion/source/slack/slack.py +399 -82
  34. datahub/ingestion/source/superset.py +138 -22
  35. datahub/ingestion/source/vertexai/__init__.py +0 -0
  36. datahub/ingestion/source/vertexai/vertexai.py +1055 -0
  37. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  38. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +68 -0
  39. datahub/metadata/_schema_classes.py +472 -1
  40. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  41. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  42. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  43. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  44. datahub/metadata/schema.avsc +311 -2
  45. datahub/metadata/schemas/CorpUserEditableInfo.avsc +14 -0
  46. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  47. datahub/metadata/schemas/CorpUserSettings.avsc +95 -0
  48. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  49. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  50. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  51. datahub/metadata/schemas/MetadataChangeEvent.avsc +30 -0
  52. datahub/metadata/schemas/QueryProperties.avsc +20 -0
  53. datahub/metadata/schemas/Siblings.avsc +2 -0
  54. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  55. datahub/sdk/dataset.py +122 -0
  56. datahub/sdk/entity.py +99 -3
  57. datahub/sdk/entity_client.py +27 -3
  58. datahub/sdk/main_client.py +22 -0
  59. datahub/sdk/search_filters.py +4 -4
  60. datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
  61. datahub/sql_parsing/tool_meta_extractor.py +27 -2
  62. datahub/testing/mcp_diff.py +1 -18
  63. datahub/ingestion/source/vertexai.py +0 -697
  64. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info/licenses}/LICENSE +0 -0
  65. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/top_level.txt +0 -0
@@ -30,6 +30,7 @@ from datahub.metadata.urns import (
30
30
  DatasetUrn,
31
31
  QueryUrn,
32
32
  SchemaFieldUrn,
33
+ Urn,
33
34
  )
34
35
  from datahub.sql_parsing.schema_resolver import (
35
36
  SchemaResolver,
@@ -139,6 +140,8 @@ class QueryMetadata:
139
140
 
140
141
  used_temp_tables: bool = True
141
142
 
143
+ origin: Optional[Urn] = None
144
+
142
145
  def make_created_audit_stamp(self) -> models.AuditStampClass:
143
146
  return models.AuditStampClass(
144
147
  time=make_ts_millis(self.latest_timestamp) or 0,
@@ -221,6 +224,7 @@ class PreparsedQuery:
221
224
  )
222
225
  # Use this to store addtitional key-value information about query for debugging
223
226
  extra_info: Optional[dict] = None
227
+ origin: Optional[Urn] = None
224
228
 
225
229
 
226
230
  @dataclasses.dataclass
@@ -903,6 +907,7 @@ class SqlParsingAggregator(Closeable):
903
907
  column_usage=parsed.column_usage or {},
904
908
  confidence_score=parsed.confidence_score,
905
909
  used_temp_tables=session_has_temp_tables,
910
+ origin=parsed.origin,
906
911
  )
907
912
  )
908
913
 
@@ -1464,6 +1469,7 @@ class SqlParsingAggregator(Closeable):
1464
1469
  source=models.QuerySourceClass.SYSTEM,
1465
1470
  created=query.make_created_audit_stamp(),
1466
1471
  lastModified=query.make_last_modified_audit_stamp(),
1472
+ origin=query.origin.urn() if query.origin else None,
1467
1473
  ),
1468
1474
  models.QuerySubjectsClass(
1469
1475
  subjects=[
@@ -13,7 +13,7 @@ from datahub.api.entities.platformresource.platform_resource import (
13
13
  )
14
14
  from datahub.ingestion.api.report import Report
15
15
  from datahub.ingestion.graph.client import DataHubGraph
16
- from datahub.metadata.urns import CorpGroupUrn, CorpUserUrn
16
+ from datahub.metadata.urns import CorpGroupUrn, CorpUserUrn, DataPlatformUrn, Urn
17
17
  from datahub.utilities.search_utils import LogicalOperator
18
18
  from datahub.utilities.stats_collections import int_top_k_dict
19
19
 
@@ -21,6 +21,10 @@ UrnStr = str
21
21
 
22
22
  logger = logging.getLogger(__name__)
23
23
 
24
+ MODE_PLATFORM_URN = DataPlatformUrn.from_string("urn:li:dataPlatform:mode")
25
+ LOOKER_PLATFORM_URN = DataPlatformUrn.from_string("urn:li:dataPlatform:looker")
26
+ HEX_PLATFORM_URN = DataPlatformUrn.from_string("urn:li:dataPlatform:hex")
27
+
24
28
 
25
29
  class QueryLog(Protocol):
26
30
  """Represents Query Log Entry
@@ -30,6 +34,7 @@ class QueryLog(Protocol):
30
34
  query_text: str
31
35
  user: Optional[Union[CorpUserUrn, CorpGroupUrn]]
32
36
  extra_info: Optional[dict]
37
+ origin: Optional[Urn]
33
38
 
34
39
 
35
40
  def _get_last_line(query: str) -> str:
@@ -67,6 +72,10 @@ class ToolMetaExtractor:
67
72
  "looker",
68
73
  self._extract_looker_query,
69
74
  ),
75
+ (
76
+ "hex",
77
+ self._extract_hex_query,
78
+ ),
70
79
  ]
71
80
  # maps user id (as string) to email address
72
81
  self.looker_user_mapping = looker_user_mapping
@@ -153,7 +162,7 @@ class ToolMetaExtractor:
153
162
  entry.extra_info = entry.extra_info or {}
154
163
  entry.extra_info["user_via"] = original_user
155
164
 
156
- # TODO: Generate an "origin" urn.
165
+ entry.origin = MODE_PLATFORM_URN
157
166
 
158
167
  return True
159
168
 
@@ -190,6 +199,22 @@ class ToolMetaExtractor:
190
199
  entry.extra_info = entry.extra_info or {}
191
200
  entry.extra_info["user_via"] = original_user
192
201
 
202
+ entry.origin = LOOKER_PLATFORM_URN
203
+
204
+ return True
205
+
206
+ def _extract_hex_query(self, entry: QueryLog) -> bool:
207
+ """
208
+ Returns:
209
+ bool: whether QueryLog entry is that of hex.
210
+ """
211
+ last_line = _get_last_line(entry.query_text)
212
+
213
+ if not last_line.startswith("-- Hex query metadata:"):
214
+ return False
215
+
216
+ entry.origin = HEX_PLATFORM_URN
217
+
193
218
  return True
194
219
 
195
220
  def extract_bi_metadata(self, entry: QueryLog) -> bool:
@@ -8,7 +8,6 @@ import deepdiff.serialization
8
8
  import yaml
9
9
  from deepdiff import DeepDiff
10
10
  from deepdiff.model import DiffLevel
11
- from deepdiff.operator import BaseOperator
12
11
  from typing_extensions import Literal
13
12
 
14
13
  ReportType = Literal[
@@ -59,27 +58,12 @@ class AspectForDiff:
59
58
 
60
59
  @dataclasses.dataclass
61
60
  class DeltaInfo:
62
- """Information about an MCP used to construct a diff delta.
63
-
64
- In a separate class so it can be ignored by DeepDiff via MCPDeltaInfoOperator.
65
- """
61
+ """Information about an MCP used to construct a diff delta."""
66
62
 
67
63
  idx: int # Location in list of MCEs in golden file
68
64
  original: Dict[str, Any] # Original json-serialized MCP
69
65
 
70
66
 
71
- class DeltaInfoOperator(BaseOperator):
72
- """Warning: Doesn't seem to be working right now.
73
- Ignored via an ignore path as an extra layer of defense.
74
- """
75
-
76
- def __init__(self):
77
- super().__init__(types=[DeltaInfo])
78
-
79
- def give_up_diffing(self, *args: Any, **kwargs: Any) -> bool:
80
- return True
81
-
82
-
83
67
  AspectsByUrn = Dict[str, Dict[str, List[AspectForDiff]]]
84
68
 
85
69
 
@@ -176,7 +160,6 @@ class MCPDiff:
176
160
  t2=t2,
177
161
  exclude_regex_paths=ignore_paths,
178
162
  ignore_order=True,
179
- custom_operators=[DeltaInfoOperator()],
180
163
  )
181
164
  if diff:
182
165
  aspect_changes[urn][aspect_name] = MCPAspectDiff.create(diff)