acryl-datahub 1.0.0rc4__py3-none-any.whl → 1.0.0rc6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (62)
  1. {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/METADATA +2502 -2502
  2. {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/RECORD +62 -59
  3. {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/cli/ingest_cli.py +3 -1
  6. datahub/emitter/mcp_builder.py +4 -1
  7. datahub/ingestion/api/source_helpers.py +4 -0
  8. datahub/ingestion/run/pipeline.py +109 -143
  9. datahub/ingestion/run/sink_callback.py +77 -0
  10. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -0
  11. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  12. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  13. datahub/ingestion/source/cassandra/cassandra_api.py +11 -4
  14. datahub/ingestion/source/delta_lake/config.py +8 -1
  15. datahub/ingestion/source/delta_lake/report.py +4 -2
  16. datahub/ingestion/source/delta_lake/source.py +20 -5
  17. datahub/ingestion/source/elastic_search.py +26 -6
  18. datahub/ingestion/source/feast.py +27 -8
  19. datahub/ingestion/source/file.py +1 -1
  20. datahub/ingestion/source/identity/okta.py +1 -2
  21. datahub/ingestion/source/mlflow.py +30 -7
  22. datahub/ingestion/source/mode.py +7 -2
  23. datahub/ingestion/source/neo4j/neo4j_source.py +26 -6
  24. datahub/ingestion/source/nifi.py +29 -6
  25. datahub/ingestion/source/openapi_parser.py +46 -14
  26. datahub/ingestion/source/powerbi_report_server/report_server.py +25 -6
  27. datahub/ingestion/source/pulsar.py +1 -0
  28. datahub/ingestion/source/redash.py +29 -6
  29. datahub/ingestion/source/s3/config.py +3 -1
  30. datahub/ingestion/source/salesforce.py +28 -6
  31. datahub/ingestion/source/slack/slack.py +31 -10
  32. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  33. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  34. datahub/ingestion/source/sql/oracle.py +34 -0
  35. datahub/ingestion/source_config/pulsar.py +3 -1
  36. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  37. datahub/metadata/_schema_classes.py +534 -410
  38. datahub/metadata/_urns/urn_defs.py +1670 -1670
  39. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  40. datahub/metadata/schema.avsc +17379 -17637
  41. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  42. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  43. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  44. datahub/metadata/schemas/MetadataChangeEvent.avsc +13 -0
  45. datahub/metadata/schemas/__init__.py +3 -3
  46. datahub/sdk/__init__.py +29 -12
  47. datahub/sdk/_attribution.py +4 -0
  48. datahub/sdk/_entity.py +20 -1
  49. datahub/sdk/_shared.py +163 -13
  50. datahub/sdk/_utils.py +35 -0
  51. datahub/sdk/container.py +23 -5
  52. datahub/sdk/dataset.py +109 -17
  53. datahub/sdk/main_client.py +17 -0
  54. datahub/specific/dataset.py +3 -4
  55. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  56. datahub/sql_parsing/split_statements.py +20 -13
  57. datahub/utilities/file_backed_collections.py +3 -14
  58. datahub/utilities/sentinels.py +22 -0
  59. datahub/utilities/unified_diff.py +5 -1
  60. {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/LICENSE +0 -0
  61. {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/entry_points.txt +0 -0
  62. {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/top_level.txt +0 -0
@@ -165,6 +165,19 @@
165
165
  "name": "countryCode",
166
166
  "default": null,
167
167
  "doc": "two uppercase letters country code. e.g. US"
168
+ },
169
+ {
170
+ "Searchable": {
171
+ "fieldType": "BOOLEAN",
172
+ "queryByDefault": false
173
+ },
174
+ "type": [
175
+ "boolean",
176
+ "null"
177
+ ],
178
+ "name": "system",
179
+ "default": false,
180
+ "doc": "Whether the corpUser is a system user."
168
181
  }
169
182
  ],
170
183
  "doc": "Linkedin corp user information"
@@ -16,7 +16,8 @@
16
16
  },
17
17
  {
18
18
  "Searchable": {
19
- "fieldType": "TEXT_PARTIAL"
19
+ "fieldType": "KEYWORD",
20
+ "queryByDefault": false
20
21
  },
21
22
  "type": "string",
22
23
  "name": "type",
@@ -83,7 +84,9 @@
83
84
  },
84
85
  {
85
86
  "Searchable": {
86
- "fieldName": "sourceExecutorId"
87
+ "fieldName": "sourceExecutorId",
88
+ "fieldType": "KEYWORD",
89
+ "queryByDefault": false
87
90
  },
88
91
  "type": [
89
92
  "null",
@@ -129,7 +132,9 @@
129
132
  "fields": [
130
133
  {
131
134
  "Searchable": {
132
- "fieldName": "sourceType"
135
+ "fieldName": "sourceType",
136
+ "fieldType": "KEYWORD",
137
+ "queryByDefault": false
133
138
  },
134
139
  "type": {
135
140
  "type": "enum",
@@ -108,7 +108,88 @@
108
108
  ],
109
109
  "name": "priority",
110
110
  "default": 0,
111
- "doc": "A numeric severity or priority for the incident. On the UI we will translate this into something easy to understand."
111
+ "doc": "A numeric severity or priority for the incident. On the UI we will translate this into something easy to understand.\nCurrently supported: 0 - CRITICAL, 1 - HIGH, 2 - MED, 3 - LOW\n(We probably should have modeled as an enum)"
112
+ },
113
+ {
114
+ "type": [
115
+ "null",
116
+ {
117
+ "type": "array",
118
+ "items": {
119
+ "type": "record",
120
+ "name": "IncidentAssignee",
121
+ "namespace": "com.linkedin.pegasus2avro.incident",
122
+ "fields": [
123
+ {
124
+ "Searchable": {
125
+ "addToFilters": true,
126
+ "fieldName": "assignees",
127
+ "filterNameOverride": "Assignee"
128
+ },
129
+ "java": {
130
+ "class": "com.linkedin.pegasus2avro.common.urn.Urn"
131
+ },
132
+ "type": "string",
133
+ "name": "actor",
134
+ "doc": "The user or group assigned to the incident.",
135
+ "Urn": "Urn"
136
+ },
137
+ {
138
+ "type": {
139
+ "type": "record",
140
+ "name": "AuditStamp",
141
+ "namespace": "com.linkedin.pegasus2avro.common",
142
+ "fields": [
143
+ {
144
+ "type": "long",
145
+ "name": "time",
146
+ "doc": "When did the resource/association/sub-resource move into the specific lifecycle stage represented by this AuditEvent."
147
+ },
148
+ {
149
+ "java": {
150
+ "class": "com.linkedin.pegasus2avro.common.urn.Urn"
151
+ },
152
+ "type": "string",
153
+ "name": "actor",
154
+ "doc": "The entity (e.g. a member URN) which will be credited for moving the resource/association/sub-resource into the specific lifecycle stage. It is also the one used to authorize the change.",
155
+ "Urn": "Urn"
156
+ },
157
+ {
158
+ "java": {
159
+ "class": "com.linkedin.pegasus2avro.common.urn.Urn"
160
+ },
161
+ "type": [
162
+ "null",
163
+ "string"
164
+ ],
165
+ "name": "impersonator",
166
+ "default": null,
167
+ "doc": "The entity (e.g. a service URN) which performs the change on behalf of the Actor and must be authorized to act as the Actor.",
168
+ "Urn": "Urn"
169
+ },
170
+ {
171
+ "type": [
172
+ "null",
173
+ "string"
174
+ ],
175
+ "name": "message",
176
+ "default": null,
177
+ "doc": "Additional context around how DataHub was informed of the particular change. For example: was the change created by an automated process, or manually."
178
+ }
179
+ ],
180
+ "doc": "Data captured on a resource/association/sub-resource level giving insight into when that resource/association/sub-resource moved into a particular lifecycle stage, and who acted to move it into that specific lifecycle stage."
181
+ },
182
+ "name": "assignedAt",
183
+ "doc": "The time & actor responsible for assiging the assignee."
184
+ }
185
+ ],
186
+ "doc": "The incident assignee type.\nThis is in a record so that we can add additional fields if we need to later (e.g.\nthe type of the assignee."
187
+ }
188
+ }
189
+ ],
190
+ "name": "assignees",
191
+ "default": null,
192
+ "doc": "The parties assigned with resolving the incident"
112
193
  },
113
194
  {
114
195
  "type": {
@@ -135,7 +216,38 @@
135
216
  ]
136
217
  },
137
218
  "name": "state",
138
- "doc": "The state of the incident"
219
+ "doc": "The top-level state of the incident, whether it's active or resolved."
220
+ },
221
+ {
222
+ "Searchable": {
223
+ "addToFilters": true,
224
+ "filterNameOverride": "Stage"
225
+ },
226
+ "type": [
227
+ "null",
228
+ {
229
+ "type": "enum",
230
+ "symbolDocs": {
231
+ "FIXED": "The incident is in the resolved as completed stage.",
232
+ "INVESTIGATION": "The incident root cause is being investigated.",
233
+ "NO_ACTION_REQUIRED": "The incident is in the resolved with no action required state, e.g. the\nincident was a false positive, or was expected.",
234
+ "TRIAGE": "The impact and priority of the incident is being actively assessed.",
235
+ "WORK_IN_PROGRESS": "The incident is in the remediation stage."
236
+ },
237
+ "name": "IncidentStage",
238
+ "namespace": "com.linkedin.pegasus2avro.incident",
239
+ "symbols": [
240
+ "TRIAGE",
241
+ "INVESTIGATION",
242
+ "WORK_IN_PROGRESS",
243
+ "FIXED",
244
+ "NO_ACTION_REQUIRED"
245
+ ]
246
+ }
247
+ ],
248
+ "name": "stage",
249
+ "default": null,
250
+ "doc": "The lifecycle stage for the incident - Null means no stage was assigned yet.\nIn the future, we may add CUSTOM here with a customStage string field for user-defined stages."
139
251
  },
140
252
  {
141
253
  "type": [
@@ -153,50 +265,7 @@
153
265
  "fieldType": "COUNT"
154
266
  }
155
267
  },
156
- "type": {
157
- "type": "record",
158
- "name": "AuditStamp",
159
- "namespace": "com.linkedin.pegasus2avro.common",
160
- "fields": [
161
- {
162
- "type": "long",
163
- "name": "time",
164
- "doc": "When did the resource/association/sub-resource move into the specific lifecycle stage represented by this AuditEvent."
165
- },
166
- {
167
- "java": {
168
- "class": "com.linkedin.pegasus2avro.common.urn.Urn"
169
- },
170
- "type": "string",
171
- "name": "actor",
172
- "doc": "The entity (e.g. a member URN) which will be credited for moving the resource/association/sub-resource into the specific lifecycle stage. It is also the one used to authorize the change.",
173
- "Urn": "Urn"
174
- },
175
- {
176
- "java": {
177
- "class": "com.linkedin.pegasus2avro.common.urn.Urn"
178
- },
179
- "type": [
180
- "null",
181
- "string"
182
- ],
183
- "name": "impersonator",
184
- "default": null,
185
- "doc": "The entity (e.g. a service URN) which performs the change on behalf of the Actor and must be authorized to act as the Actor.",
186
- "Urn": "Urn"
187
- },
188
- {
189
- "type": [
190
- "null",
191
- "string"
192
- ],
193
- "name": "message",
194
- "default": null,
195
- "doc": "Additional context around how DataHub was informed of the particular change. For example: was the change created by an automated process, or manually."
196
- }
197
- ],
198
- "doc": "Data captured on a resource/association/sub-resource level giving insight into when that resource/association/sub-resource moved into a particular lifecycle stage, and who acted to move it into that specific lifecycle stage."
199
- },
268
+ "type": "com.linkedin.pegasus2avro.common.AuditStamp",
200
269
  "name": "lastUpdated",
201
270
  "doc": "The time at which the request was initially created"
202
271
  }
@@ -262,6 +331,21 @@
262
331
  "default": null,
263
332
  "doc": "The source of an incident, i.e. how it was generated."
264
333
  },
334
+ {
335
+ "Searchable": {
336
+ "/time": {
337
+ "fieldName": "startedAt",
338
+ "fieldType": "COUNT"
339
+ }
340
+ },
341
+ "type": [
342
+ "null",
343
+ "long"
344
+ ],
345
+ "name": "startedAt",
346
+ "default": null,
347
+ "doc": "The time at which the incident actually started (may be before the date it was raised)."
348
+ },
265
349
  {
266
350
  "Searchable": {
267
351
  "/time": {
@@ -1619,6 +1619,19 @@
1619
1619
  "name": "countryCode",
1620
1620
  "default": null,
1621
1621
  "doc": "two uppercase letters country code. e.g. US"
1622
+ },
1623
+ {
1624
+ "Searchable": {
1625
+ "fieldType": "BOOLEAN",
1626
+ "queryByDefault": false
1627
+ },
1628
+ "type": [
1629
+ "boolean",
1630
+ "null"
1631
+ ],
1632
+ "name": "system",
1633
+ "default": false,
1634
+ "doc": "Whether the corpUser is a system user."
1622
1635
  }
1623
1636
  ],
1624
1637
  "doc": "Linkedin corp user information"
@@ -15,10 +15,10 @@ import pathlib
15
15
  def _load_schema(schema_name: str) -> str:
16
16
  return (pathlib.Path(__file__).parent / f"{schema_name}.avsc").read_text()
17
17
 
18
- def getMetadataChangeEventSchema() -> str:
19
- return _load_schema("MetadataChangeEvent")
20
-
21
18
  def getMetadataChangeProposalSchema() -> str:
22
19
  return _load_schema("MetadataChangeProposal")
23
20
 
21
+ def getMetadataChangeEventSchema() -> str:
22
+ return _load_schema("MetadataChangeEvent")
23
+
24
24
  # fmt: on
datahub/sdk/__init__.py CHANGED
@@ -1,7 +1,7 @@
1
- import warnings
1
+ import types
2
2
 
3
3
  import datahub.metadata.schema_classes as models
4
- from datahub.errors import ExperimentalWarning, SdkUsageError
4
+ from datahub.errors import SdkUsageError
5
5
  from datahub.ingestion.graph.config import DatahubClientConfig
6
6
  from datahub.metadata.urns import (
7
7
  ChartUrn,
@@ -21,13 +21,30 @@ from datahub.sdk.container import Container
21
21
  from datahub.sdk.dataset import Dataset
22
22
  from datahub.sdk.main_client import DataHubClient
23
23
 
24
- warnings.warn(
25
- "The new datahub SDK (e.g. datahub.sdk.*) is experimental. "
26
- "Our typical backwards-compatibility and stability guarantees do not apply to this code. "
27
- "When it's promoted to stable, the import path will change "
28
- "from `from datahub.sdk import ...` to `from datahub import ...`.",
29
- ExperimentalWarning,
30
- stacklevel=2,
31
- )
32
- del warnings
33
- del ExperimentalWarning
24
+ # We want to print out the warning if people do `from datahub.sdk import X`.
25
+ # But we don't want to print out warnings if they're doing a more direct
26
+ # import like `from datahub.sdk.container import Container`, since that's
27
+ # what our internal code does.
28
+ _vars = {}
29
+ for _name, _value in list(locals().items()):
30
+ if not _name.startswith("_") and (
31
+ _name == "models" or not isinstance(_value, types.ModuleType)
32
+ ):
33
+ _vars[_name] = _value
34
+ del locals()[_name]
35
+
36
+
37
+ def __getattr__(name):
38
+ import warnings
39
+
40
+ from datahub.errors import ExperimentalWarning
41
+
42
+ warnings.warn(
43
+ "The new datahub SDK (e.g. datahub.sdk.*) is experimental. "
44
+ "Our typical backwards-compatibility and stability guarantees do not apply to this code. "
45
+ "When it's promoted to stable, the import path will change "
46
+ "from `from datahub.sdk import ...` to `from datahub import ...`.",
47
+ ExperimentalWarning,
48
+ stacklevel=2,
49
+ )
50
+ return _vars[name]
@@ -5,6 +5,10 @@ from typing import Iterator
5
5
 
6
6
  from datahub.utilities.str_enum import StrEnum
7
7
 
8
+ # TODO: This attribution setup is not the final form. I expect that once we have better
9
+ # backend support for attribution and attribution-oriented patch, this will become a bit
10
+ # more sophisticated.
11
+
8
12
 
9
13
  class KnownAttribution(StrEnum):
10
14
  INGESTION = "INGESTION"
datahub/sdk/_entity.py CHANGED
@@ -1,5 +1,7 @@
1
+ from __future__ import annotations
2
+
1
3
  import abc
2
- from typing import List, Optional, Type, Union
4
+ from typing import TYPE_CHECKING, List, Optional, Type, Union
3
5
 
4
6
  from typing_extensions import Self
5
7
 
@@ -10,6 +12,12 @@ from datahub.errors import SdkUsageError
10
12
  from datahub.metadata.urns import Urn
11
13
  from datahub.utilities.urns._urn_base import _SpecificUrn
12
14
 
15
+ if TYPE_CHECKING:
16
+ from datahub.ingestion.api.workunit import MetadataWorkUnit
17
+
18
+
19
+ ExtraAspectsType = Union[None, List[AspectTypeVar]]
20
+
13
21
 
14
22
  class Entity:
15
23
  __slots__ = ("_urn", "_prev_aspects", "_aspects")
@@ -36,6 +44,8 @@ class Entity:
36
44
 
37
45
  def _init_from_graph(self, current_aspects: models.AspectBag) -> Self:
38
46
  self._prev_aspects = current_aspects
47
+
48
+ self._aspects = {}
39
49
  aspect: models._Aspect
40
50
  for aspect_name, aspect in (current_aspects or {}).items(): # type: ignore
41
51
  aspect_copy = type(aspect).from_obj(aspect.to_obj())
@@ -85,5 +95,14 @@ class Entity:
85
95
  )
86
96
  return mcps
87
97
 
98
+ def as_workunits(self) -> List[MetadataWorkUnit]:
99
+ return [mcp.as_workunit() for mcp in self._as_mcps()]
100
+
101
+ def _set_extra_aspects(self, extra_aspects: ExtraAspectsType) -> None:
102
+ # TODO: Add validation to ensure that an "extra aspect" does not conflict
103
+ # with / get overridden by a standard aspect.
104
+ for aspect in extra_aspects or []:
105
+ self._set_aspect(aspect)
106
+
88
107
  def __repr__(self) -> str:
89
108
  return f"{self.__class__.__name__}('{self.urn}')"