acryl-datahub 1.0.0rc5__py3-none-any.whl → 1.0.0rc6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (47)
  1. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/METADATA +2415 -2415
  2. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/RECORD +47 -46
  3. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/cli/ingest_cli.py +3 -1
  6. datahub/ingestion/api/source_helpers.py +4 -0
  7. datahub/ingestion/run/pipeline.py +109 -143
  8. datahub/ingestion/run/sink_callback.py +77 -0
  9. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  10. datahub/ingestion/source/cassandra/cassandra_api.py +11 -4
  11. datahub/ingestion/source/delta_lake/config.py +8 -1
  12. datahub/ingestion/source/delta_lake/report.py +4 -2
  13. datahub/ingestion/source/delta_lake/source.py +20 -5
  14. datahub/ingestion/source/elastic_search.py +26 -6
  15. datahub/ingestion/source/feast.py +27 -8
  16. datahub/ingestion/source/file.py +1 -1
  17. datahub/ingestion/source/identity/okta.py +1 -2
  18. datahub/ingestion/source/mlflow.py +30 -7
  19. datahub/ingestion/source/mode.py +7 -2
  20. datahub/ingestion/source/neo4j/neo4j_source.py +26 -6
  21. datahub/ingestion/source/nifi.py +29 -6
  22. datahub/ingestion/source/powerbi_report_server/report_server.py +25 -6
  23. datahub/ingestion/source/pulsar.py +1 -0
  24. datahub/ingestion/source/redash.py +29 -6
  25. datahub/ingestion/source/s3/config.py +3 -1
  26. datahub/ingestion/source/salesforce.py +28 -6
  27. datahub/ingestion/source/slack/slack.py +31 -10
  28. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  29. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  30. datahub/ingestion/source/sql/oracle.py +34 -0
  31. datahub/ingestion/source_config/pulsar.py +3 -1
  32. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  33. datahub/metadata/_schema_classes.py +517 -410
  34. datahub/metadata/_urns/urn_defs.py +1670 -1670
  35. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  36. datahub/metadata/schema.avsc +17362 -17638
  37. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  38. datahub/metadata/schemas/__init__.py +3 -3
  39. datahub/sdk/__init__.py +29 -12
  40. datahub/sdk/_entity.py +18 -1
  41. datahub/sdk/container.py +3 -1
  42. datahub/sdk/dataset.py +5 -3
  43. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  44. datahub/utilities/unified_diff.py +5 -1
  45. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/LICENSE +0 -0
  46. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/entry_points.txt +0 -0
  47. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/top_level.txt +0 -0
datahub/metadata/schemas/IncidentInfo.avsc CHANGED
@@ -108,7 +108,88 @@
  ],
  "name": "priority",
  "default": 0,
- "doc": "A numeric severity or priority for the incident. On the UI we will translate this into something easy to understand."
+ "doc": "A numeric severity or priority for the incident. On the UI we will translate this into something easy to understand.\nCurrently supported: 0 - CRITICAL, 1 - HIGH, 2 - MED, 3 - LOW\n(We probably should have modeled as an enum)"
+ },
+ {
+ "type": [
+ "null",
+ {
+ "type": "array",
+ "items": {
+ "type": "record",
+ "name": "IncidentAssignee",
+ "namespace": "com.linkedin.pegasus2avro.incident",
+ "fields": [
+ {
+ "Searchable": {
+ "addToFilters": true,
+ "fieldName": "assignees",
+ "filterNameOverride": "Assignee"
+ },
+ "java": {
+ "class": "com.linkedin.pegasus2avro.common.urn.Urn"
+ },
+ "type": "string",
+ "name": "actor",
+ "doc": "The user or group assigned to the incident.",
+ "Urn": "Urn"
+ },
+ {
+ "type": {
+ "type": "record",
+ "name": "AuditStamp",
+ "namespace": "com.linkedin.pegasus2avro.common",
+ "fields": [
+ {
+ "type": "long",
+ "name": "time",
+ "doc": "When did the resource/association/sub-resource move into the specific lifecycle stage represented by this AuditEvent."
+ },
+ {
+ "java": {
+ "class": "com.linkedin.pegasus2avro.common.urn.Urn"
+ },
+ "type": "string",
+ "name": "actor",
+ "doc": "The entity (e.g. a member URN) which will be credited for moving the resource/association/sub-resource into the specific lifecycle stage. It is also the one used to authorize the change.",
+ "Urn": "Urn"
+ },
+ {
+ "java": {
+ "class": "com.linkedin.pegasus2avro.common.urn.Urn"
+ },
+ "type": [
+ "null",
+ "string"
+ ],
+ "name": "impersonator",
+ "default": null,
+ "doc": "The entity (e.g. a service URN) which performs the change on behalf of the Actor and must be authorized to act as the Actor.",
+ "Urn": "Urn"
+ },
+ {
+ "type": [
+ "null",
+ "string"
+ ],
+ "name": "message",
+ "default": null,
+ "doc": "Additional context around how DataHub was informed of the particular change. For example: was the change created by an automated process, or manually."
+ }
+ ],
+ "doc": "Data captured on a resource/association/sub-resource level giving insight into when that resource/association/sub-resource moved into a particular lifecycle stage, and who acted to move it into that specific lifecycle stage."
+ },
+ "name": "assignedAt",
+ "doc": "The time & actor responsible for assiging the assignee."
+ }
+ ],
+ "doc": "The incident assignee type.\nThis is in a record so that we can add additional fields if we need to later (e.g.\nthe type of the assignee."
+ }
+ }
+ ],
+ "name": "assignees",
+ "default": null,
+ "doc": "The parties assigned with resolving the incident"
  },
  {
  "type": {
@@ -135,7 +216,38 @@
  ]
  },
  "name": "state",
- "doc": "The state of the incident"
+ "doc": "The top-level state of the incident, whether it's active or resolved."
+ },
+ {
+ "Searchable": {
+ "addToFilters": true,
+ "filterNameOverride": "Stage"
+ },
+ "type": [
+ "null",
+ {
+ "type": "enum",
+ "symbolDocs": {
+ "FIXED": "The incident is in the resolved as completed stage.",
+ "INVESTIGATION": "The incident root cause is being investigated.",
+ "NO_ACTION_REQUIRED": "The incident is in the resolved with no action required state, e.g. the\nincident was a false positive, or was expected.",
+ "TRIAGE": "The impact and priority of the incident is being actively assessed.",
+ "WORK_IN_PROGRESS": "The incident is in the remediation stage."
+ },
+ "name": "IncidentStage",
+ "namespace": "com.linkedin.pegasus2avro.incident",
+ "symbols": [
+ "TRIAGE",
+ "INVESTIGATION",
+ "WORK_IN_PROGRESS",
+ "FIXED",
+ "NO_ACTION_REQUIRED"
+ ]
+ }
+ ],
+ "name": "stage",
+ "default": null,
+ "doc": "The lifecycle stage for the incident - Null means no stage was assigned yet.\nIn the future, we may add CUSTOM here with a customStage string field for user-defined stages."
  },
  {
  "type": [
@@ -153,50 +265,7 @@
  "fieldType": "COUNT"
  }
  },
- "type": {
- "type": "record",
- "name": "AuditStamp",
- "namespace": "com.linkedin.pegasus2avro.common",
- "fields": [
- {
- "type": "long",
- "name": "time",
- "doc": "When did the resource/association/sub-resource move into the specific lifecycle stage represented by this AuditEvent."
- },
- {
- "java": {
- "class": "com.linkedin.pegasus2avro.common.urn.Urn"
- },
- "type": "string",
- "name": "actor",
- "doc": "The entity (e.g. a member URN) which will be credited for moving the resource/association/sub-resource into the specific lifecycle stage. It is also the one used to authorize the change.",
- "Urn": "Urn"
- },
- {
- "java": {
- "class": "com.linkedin.pegasus2avro.common.urn.Urn"
- },
- "type": [
- "null",
- "string"
- ],
- "name": "impersonator",
- "default": null,
- "doc": "The entity (e.g. a service URN) which performs the change on behalf of the Actor and must be authorized to act as the Actor.",
- "Urn": "Urn"
- },
- {
- "type": [
- "null",
- "string"
- ],
- "name": "message",
- "default": null,
- "doc": "Additional context around how DataHub was informed of the particular change. For example: was the change created by an automated process, or manually."
- }
- ],
- "doc": "Data captured on a resource/association/sub-resource level giving insight into when that resource/association/sub-resource moved into a particular lifecycle stage, and who acted to move it into that specific lifecycle stage."
- },
+ "type": "com.linkedin.pegasus2avro.common.AuditStamp",
  "name": "lastUpdated",
  "doc": "The time at which the request was initially created"
  }
@@ -262,6 +331,21 @@
  "default": null,
  "doc": "The source of an incident, i.e. how it was generated."
  },
+ {
+ "Searchable": {
+ "/time": {
+ "fieldName": "startedAt",
+ "fieldType": "COUNT"
+ }
+ },
+ "type": [
+ "null",
+ "long"
+ ],
+ "name": "startedAt",
+ "default": null,
+ "doc": "The time at which the incident actually started (may be before the date it was raised)."
+ },
  {
  "Searchable": {
  "/time": {
datahub/metadata/schemas/__init__.py CHANGED
@@ -15,10 +15,10 @@ import pathlib
  def _load_schema(schema_name: str) -> str:
  return (pathlib.Path(__file__).parent / f"{schema_name}.avsc").read_text()

- def getMetadataChangeEventSchema() -> str:
- return _load_schema("MetadataChangeEvent")
-
  def getMetadataChangeProposalSchema() -> str:
  return _load_schema("MetadataChangeProposal")

+ def getMetadataChangeEventSchema() -> str:
+ return _load_schema("MetadataChangeEvent")
+
  # fmt: on
datahub/sdk/__init__.py CHANGED
@@ -1,7 +1,7 @@
- import warnings
+ import types

  import datahub.metadata.schema_classes as models
- from datahub.errors import ExperimentalWarning, SdkUsageError
+ from datahub.errors import SdkUsageError
  from datahub.ingestion.graph.config import DatahubClientConfig
  from datahub.metadata.urns import (
  ChartUrn,
@@ -21,13 +21,30 @@ from datahub.sdk.container import Container
  from datahub.sdk.dataset import Dataset
  from datahub.sdk.main_client import DataHubClient

- warnings.warn(
- "The new datahub SDK (e.g. datahub.sdk.*) is experimental. "
- "Our typical backwards-compatibility and stability guarantees do not apply to this code. "
- "When it's promoted to stable, the import path will change "
- "from `from datahub.sdk import ...` to `from datahub import ...`.",
- ExperimentalWarning,
- stacklevel=2,
- )
- del warnings
- del ExperimentalWarning
+ # We want to print out the warning if people do `from datahub.sdk import X`.
+ # But we don't want to print out warnings if they're doing a more direct
+ # import like `from datahub.sdk.container import Container`, since that's
+ # what our internal code does.
+ _vars = {}
+ for _name, _value in list(locals().items()):
+ if not _name.startswith("_") and (
+ _name == "models" or not isinstance(_value, types.ModuleType)
+ ):
+ _vars[_name] = _value
+ del locals()[_name]
+
+
+ def __getattr__(name):
+ import warnings
+
+ from datahub.errors import ExperimentalWarning
+
+ warnings.warn(
+ "The new datahub SDK (e.g. datahub.sdk.*) is experimental. "
+ "Our typical backwards-compatibility and stability guarantees do not apply to this code. "
+ "When it's promoted to stable, the import path will change "
+ "from `from datahub.sdk import ...` to `from datahub import ...`.",
+ ExperimentalWarning,
+ stacklevel=2,
+ )
+ return _vars[name]
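Note (illustrative, not from the package): the rewritten datahub/sdk/__init__.py relies on a module-level __getattr__ (PEP 562). Exported names are stashed in a private dict at import time, and the ExperimentalWarning fires only when one of those names is looked up on the package, so direct submodule imports stay silent. A minimal standalone sketch of the same pattern, using generic names rather than DataHub's actual exports:

# mypkg/__init__.py: lazy-warning sketch (PEP 562), generic names only.
import warnings


def greet() -> str:
    return "hello"


# Stash the public names and remove them from the module namespace so that
# attribute access falls through to __getattr__ below.
_vars = {"greet": greet}
del greet


def __getattr__(name):
    if name not in _vars:
        raise AttributeError(name)
    warnings.warn(f"mypkg.{name} is experimental", FutureWarning, stacklevel=2)
    return _vars[name]

With this layout, `from mypkg import greet` triggers the warning, while importing a submodule directly (e.g. `from mypkg.other import something`) never goes through __getattr__.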
datahub/sdk/_entity.py CHANGED
@@ -1,5 +1,7 @@
+ from __future__ import annotations
+
  import abc
- from typing import List, Optional, Type, Union
+ from typing import TYPE_CHECKING, List, Optional, Type, Union

  from typing_extensions import Self

@@ -10,6 +12,12 @@ from datahub.errors import SdkUsageError
  from datahub.metadata.urns import Urn
  from datahub.utilities.urns._urn_base import _SpecificUrn

+ if TYPE_CHECKING:
+ from datahub.ingestion.api.workunit import MetadataWorkUnit
+
+
+ ExtraAspectsType = Union[None, List[AspectTypeVar]]
+


  class Entity:
@@ -87,5 +95,14 @@ class Entity:
  )
  return mcps

+ def as_workunits(self) -> List[MetadataWorkUnit]:
+ return [mcp.as_workunit() for mcp in self._as_mcps()]
+
+ def _set_extra_aspects(self, extra_aspects: ExtraAspectsType) -> None:
+ # TODO: Add validation to ensure that an "extra aspect" does not conflict
+ # with / get overridden by a standard aspect.
+ for aspect in extra_aspects or []:
+ self._set_aspect(aspect)
+
  def __repr__(self) -> str:
  return f"{self.__class__.__name__}('{self.urn}')"
datahub/sdk/container.py CHANGED
@@ -16,7 +16,7 @@ from datahub.metadata.urns import (
  ContainerUrn,
  Urn,
  )
- from datahub.sdk._entity import Entity
+ from datahub.sdk._entity import Entity, ExtraAspectsType
  from datahub.sdk._shared import (
  DomainInputType,
  HasContainer,
@@ -74,6 +74,7 @@ class Container(
  tags: Optional[TagsInputType] = None,
  terms: Optional[TermsInputType] = None,
  domain: Optional[DomainInputType] = None,
+ extra_aspects: ExtraAspectsType = None,
  ):
  # Hack: while the type annotations say container_key is always a ContainerKey,
  # we allow ContainerUrn to make the graph-based constructor work.
@@ -82,6 +83,7 @@
  else:
  urn = ContainerUrn.from_string(container_key.as_urn())
  super().__init__(urn)
+ self._set_extra_aspects(extra_aspects)

  # This needs to come first to ensure that the display name is registered.
  self._ensure_container_props(name=display_name)
datahub/sdk/dataset.py CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations

  import warnings
  from datetime import datetime
- from typing import Dict, List, Optional, Tuple, Type, Union
+ from typing import Dict, List, Optional, Sequence, Tuple, Type, Union

  from typing_extensions import Self, TypeAlias, assert_never

@@ -18,7 +18,7 @@ from datahub.errors import (
  from datahub.ingestion.source.sql.sql_types import resolve_sql_type
  from datahub.metadata.urns import DatasetUrn, SchemaFieldUrn, Urn
  from datahub.sdk._attribution import is_ingestion_attribution
- from datahub.sdk._entity import Entity
+ from datahub.sdk._entity import Entity, ExtraAspectsType
  from datahub.sdk._shared import (
  DatasetUrnOrStr,
  DomainInputType,
@@ -47,7 +47,7 @@ SchemaFieldInputType: TypeAlias = Union[
  models.SchemaFieldClass,
  ]
  SchemaFieldsInputType: TypeAlias = Union[
- List[SchemaFieldInputType],
+ Sequence[SchemaFieldInputType],
  models.SchemaMetadataClass,
  ]

@@ -457,6 +457,7 @@ class Dataset(
  terms: Optional[TermsInputType] = None,
  # TODO structured_properties
  domain: Optional[DomainInputType] = None,
+ extra_aspects: ExtraAspectsType = None,
  # Dataset-specific aspects.
  schema: Optional[SchemaFieldsInputType] = None,
  upstreams: Optional[models.UpstreamLineageClass] = None,
@@ -468,6 +469,7 @@
  env=env,
  )
  super().__init__(urn)
+ self._set_extra_aspects(extra_aspects)

  self._set_platform_instance(urn.platform, platform_instance)

datahub/sql_parsing/_sqlglot_patch.py CHANGED
@@ -172,17 +172,9 @@ def _patch_lineage() -> None:
  derived_tables = [
  source.expression.parent
  for source in scope.sources.values()
- @@ -254,6 +257,7 @@ def to_node(
- if dt.comments and dt.comments[0].startswith("source: ")
- }
-
- + c: exp.Column
- for c in source_columns:
- table = c.table
- source = scope.sources.get(table)
  @@ -281,8 +285,21 @@ def to_node(
- # it means this column's lineage is unknown. This can happen if the definition of a source used in a query
- # is not passed into the `sources` map.
+ # is unknown. This can happen if the definition of a source used in a query is not
+ # passed into the `sources` map.
  source = source or exp.Placeholder()
  +
  + subfields = []
datahub/utilities/unified_diff.py CHANGED
@@ -2,8 +2,12 @@ import logging
  from dataclasses import dataclass
  from typing import List, Tuple

+ from datahub.cli.env_utils import get_boolean_env_variable
+
+ _debug_diff = get_boolean_env_variable("DATAHUB_DEBUG_DIFF_PATCHER")
+
  logger = logging.getLogger(__name__)
- logger.setLevel(logging.INFO)
+ logger.setLevel(logging.DEBUG if _debug_diff else logging.INFO)

  _LOOKAROUND_LINES = 300
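Note (illustrative, not from the package): the unified_diff change above gates DEBUG logging behind the DATAHUB_DEBUG_DIFF_PATCHER environment variable, which is read once at module import. A small sketch of enabling it; the accepted truthy values are whatever get_boolean_env_variable recognizes (for example "true"):

# The flag is evaluated at import time, so set it before the module is first imported.
import os

os.environ["DATAHUB_DEBUG_DIFF_PATCHER"] = "true"

from datahub.utilities import unified_diff  # module logger is now at DEBUG level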