acryl-datahub 1.0.0rc4__py3-none-any.whl → 1.0.0rc6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (62)
  1. {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/METADATA +2502 -2502
  2. {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/RECORD +62 -59
  3. {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/cli/ingest_cli.py +3 -1
  6. datahub/emitter/mcp_builder.py +4 -1
  7. datahub/ingestion/api/source_helpers.py +4 -0
  8. datahub/ingestion/run/pipeline.py +109 -143
  9. datahub/ingestion/run/sink_callback.py +77 -0
  10. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -0
  11. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  12. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  13. datahub/ingestion/source/cassandra/cassandra_api.py +11 -4
  14. datahub/ingestion/source/delta_lake/config.py +8 -1
  15. datahub/ingestion/source/delta_lake/report.py +4 -2
  16. datahub/ingestion/source/delta_lake/source.py +20 -5
  17. datahub/ingestion/source/elastic_search.py +26 -6
  18. datahub/ingestion/source/feast.py +27 -8
  19. datahub/ingestion/source/file.py +1 -1
  20. datahub/ingestion/source/identity/okta.py +1 -2
  21. datahub/ingestion/source/mlflow.py +30 -7
  22. datahub/ingestion/source/mode.py +7 -2
  23. datahub/ingestion/source/neo4j/neo4j_source.py +26 -6
  24. datahub/ingestion/source/nifi.py +29 -6
  25. datahub/ingestion/source/openapi_parser.py +46 -14
  26. datahub/ingestion/source/powerbi_report_server/report_server.py +25 -6
  27. datahub/ingestion/source/pulsar.py +1 -0
  28. datahub/ingestion/source/redash.py +29 -6
  29. datahub/ingestion/source/s3/config.py +3 -1
  30. datahub/ingestion/source/salesforce.py +28 -6
  31. datahub/ingestion/source/slack/slack.py +31 -10
  32. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  33. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  34. datahub/ingestion/source/sql/oracle.py +34 -0
  35. datahub/ingestion/source_config/pulsar.py +3 -1
  36. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  37. datahub/metadata/_schema_classes.py +534 -410
  38. datahub/metadata/_urns/urn_defs.py +1670 -1670
  39. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  40. datahub/metadata/schema.avsc +17379 -17637
  41. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  42. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  43. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  44. datahub/metadata/schemas/MetadataChangeEvent.avsc +13 -0
  45. datahub/metadata/schemas/__init__.py +3 -3
  46. datahub/sdk/__init__.py +29 -12
  47. datahub/sdk/_attribution.py +4 -0
  48. datahub/sdk/_entity.py +20 -1
  49. datahub/sdk/_shared.py +163 -13
  50. datahub/sdk/_utils.py +35 -0
  51. datahub/sdk/container.py +23 -5
  52. datahub/sdk/dataset.py +109 -17
  53. datahub/sdk/main_client.py +17 -0
  54. datahub/specific/dataset.py +3 -4
  55. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  56. datahub/sql_parsing/split_statements.py +20 -13
  57. datahub/utilities/file_backed_collections.py +3 -14
  58. datahub/utilities/sentinels.py +22 -0
  59. datahub/utilities/unified_diff.py +5 -1
  60. {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/LICENSE +0 -0
  61. {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/entry_points.txt +0 -0
  62. {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/top_level.txt +0 -0
datahub/specific/dataset.py
@@ -15,6 +15,7 @@ from datahub.metadata.schema_classes import (
     UpstreamClass as Upstream,
     UpstreamLineageClass as UpstreamLineage,
 )
+from datahub.metadata.urns import DatasetUrn, TagUrn, Urn
 from datahub.specific.aspect_helpers.custom_properties import HasCustomPropertiesPatch
 from datahub.specific.aspect_helpers.ownership import HasOwnershipPatch
 from datahub.specific.aspect_helpers.structured_properties import (
@@ -22,8 +23,6 @@ from datahub.specific.aspect_helpers.structured_properties import (
 )
 from datahub.specific.aspect_helpers.tags import HasTagsPatch
 from datahub.specific.aspect_helpers.terms import HasTermsPatch
-from datahub.utilities.urns.tag_urn import TagUrn
-from datahub.utilities.urns.urn import Urn

 _Parent = TypeVar("_Parent", bound=MetadataPatchProposal)

@@ -104,12 +103,12 @@ class DatasetPatchBuilder(
 ):
     def __init__(
         self,
-        urn: str,
+        urn: Union[str, DatasetUrn],
         system_metadata: Optional[SystemMetadataClass] = None,
         audit_header: Optional[KafkaAuditHeaderClass] = None,
     ) -> None:
         super().__init__(
-            urn, system_metadata=system_metadata, audit_header=audit_header
+            str(urn), system_metadata=system_metadata, audit_header=audit_header
         )

     @classmethod
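The practical effect of this change is that DatasetPatchBuilder now accepts either a pre-rendered urn string or a typed DatasetUrn and coerces it with str() before handing it to MetadataPatchProposal. A minimal sketch of the widened constructor follows; the dataset and tag names are made up, DatasetUrn is assumed to accept a bare platform name, and add_tag is assumed to take a TagAssociationClass via the HasTagsPatch mixin shown in the imports above.

from datahub.metadata.schema_classes import TagAssociationClass
from datahub.metadata.urns import DatasetUrn, TagUrn
from datahub.specific.dataset import DatasetPatchBuilder

# Hypothetical dataset; previously only the string form was accepted.
typed_urn = DatasetUrn(platform="snowflake", name="analytics.orders", env="PROD")

builder = DatasetPatchBuilder(typed_urn)         # typed urn, newly accepted
# builder = DatasetPatchBuilder(str(typed_urn))  # string urn, still supported

# add_tag comes from the HasTagsPatch mixin visible in the imports above.
builder.add_tag(TagAssociationClass(tag=str(TagUrn("pii"))))

# build() renders the accumulated operations as MetadataChangeProposals.
for mcp in builder.build():
    print(mcp)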
datahub/sql_parsing/_sqlglot_patch.py
@@ -172,17 +172,9 @@ def _patch_lineage() -> None:
             derived_tables = [
                 source.expression.parent
                 for source in scope.sources.values()
-@@ -254,6 +257,7 @@ def to_node(
-             if dt.comments and dt.comments[0].startswith("source: ")
-         }
-
-+            c: exp.Column
-             for c in source_columns:
-                 table = c.table
-                 source = scope.sources.get(table)
 @@ -281,8 +285,21 @@ def to_node(
-             # it means this column's lineage is unknown. This can happen if the definition of a source used in a query
-             # is not passed into the `sources` map.
+             # is unknown. This can happen if the definition of a source used in a query is not
+             # passed into the `sources` map.
              source = source or exp.Placeholder()
 +
 +            subfields = []
@@ -8,11 +8,11 @@ END_KEYWORD = "END"
8
8
 
9
9
  CONTROL_FLOW_KEYWORDS = [
10
10
  "GO",
11
- r"BEGIN\w+TRY",
12
- r"BEGIN\w+CATCH",
11
+ r"BEGIN\s+TRY",
12
+ r"BEGIN\s+CATCH",
13
13
  "BEGIN",
14
- r"END\w+TRY",
15
- r"END\w+CATCH",
14
+ r"END\s+TRY",
15
+ r"END\s+CATCH",
16
16
  # This isn't strictly correct, but we assume that IF | (condition) | (block) should all be split up
17
17
  # This mainly ensures that IF statements don't get tacked onto the previous statement incorrectly
18
18
  "IF",
@@ -73,25 +73,31 @@ class _StatementSplitter:
73
73
  # what a given END is closing.
74
74
  self.current_case_statements = 0
75
75
 
76
- def _is_keyword_at_position(self, pos: int, keyword: str) -> bool:
76
+ def _is_keyword_at_position(self, pos: int, keyword: str) -> Tuple[bool, str]:
77
77
  """
78
78
  Check if a keyword exists at the given position using regex word boundaries.
79
79
  """
80
80
  sql = self.sql
81
81
 
82
- if pos + len(keyword) > len(sql):
83
- return False
82
+ keyword_length = len(keyword.replace(r"\s+", " "))
83
+
84
+ if pos + keyword_length > len(sql):
85
+ return False, ""
84
86
 
85
87
  # If we're not at a word boundary, we can't generate a keyword.
86
88
  if pos > 0 and not (
87
89
  bool(re.match(r"\w\W", sql[pos - 1 : pos + 1]))
88
90
  or bool(re.match(r"\W\w", sql[pos - 1 : pos + 1]))
89
91
  ):
90
- return False
92
+ return False, ""
91
93
 
92
- pattern = rf"^{re.escape(keyword)}\b"
94
+ pattern = rf"^{keyword}\b"
93
95
  match = re.match(pattern, sql[pos:], re.IGNORECASE)
94
- return bool(match)
96
+ is_match = bool(match)
97
+ actual_match = (
98
+ sql[pos:][match.start() : match.end()] if match is not None else ""
99
+ )
100
+ return is_match, actual_match
95
101
 
96
102
  def _look_ahead_for_keywords(self, keywords: List[str]) -> Tuple[bool, str, int]:
97
103
  """
@@ -99,7 +105,8 @@ class _StatementSplitter:
99
105
  """
100
106
 
101
107
  for keyword in keywords:
102
- if self._is_keyword_at_position(self.i, keyword):
108
+ is_match, keyword = self._is_keyword_at_position(self.i, keyword)
109
+ if is_match:
103
110
  return True, keyword, len(keyword)
104
111
  return False, "", 0
105
112
 
@@ -118,7 +125,7 @@ class _StatementSplitter:
118
125
 
119
126
  def process(self) -> Iterator[str]:
120
127
  if not self.sql or not self.sql.strip():
121
- return
128
+ yield from ()
122
129
 
123
130
  prev_real_char = "\0" # the most recent non-whitespace, non-comment character
124
131
  while self.i < len(self.sql):
@@ -181,7 +188,7 @@ class _StatementSplitter:
181
188
  def _process_normal(self, most_recent_real_char: str) -> Iterator[str]:
182
189
  c = self.sql[self.i]
183
190
 
184
- if self._is_keyword_at_position(self.i, CASE_KEYWORD):
191
+ if self._is_keyword_at_position(self.i, CASE_KEYWORD)[0]:
185
192
  self.current_case_statements += 1
186
193
 
187
194
  is_control_keyword, keyword, keyword_len = self._look_ahead_for_keywords(
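The substantive fix in split_statements.py is the switch from \w+ to \s+ in the BEGIN TRY / BEGIN CATCH / END TRY / END CATCH patterns: \w matches word characters, not whitespace, so those keyword pairs were never recognized. Returning the actual matched text from _is_keyword_at_position also lets the splitter advance by the real match length when the keywords are separated by more than one space. A standalone illustration using only the standard re module (the sample statement is made up):

import re

statement = "BEGIN  TRY  SELECT 1  END TRY"

# \w+ requires word characters between BEGIN and TRY, so it cannot match the
# whitespace-separated keyword pair; \s+ matches one or more whitespace characters.
print(re.match(r"BEGIN\w+TRY\b", statement, re.IGNORECASE))  # None
print(re.match(r"BEGIN\s+TRY\b", statement, re.IGNORECASE))  # matches 'BEGIN  TRY'

# The length of the text that actually matched can exceed len("BEGIN TRY"),
# which is why the new code returns the matched string rather than the pattern.
match = re.match(r"BEGIN\s+TRY\b", statement, re.IGNORECASE)
print(len(match.group(0)))  # 10 here (two spaces), not 9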
datahub/utilities/file_backed_collections.py
@@ -10,13 +10,11 @@ import tempfile
 import threading
 from dataclasses import dataclass, field
 from datetime import datetime
-from enum import Enum
 from types import TracebackType
 from typing import (
     Any,
     Callable,
     Dict,
-    Final,
     Generic,
     Iterator,
     List,
@@ -31,6 +29,7 @@ from typing import (
 )

 from datahub.ingestion.api.closeable import Closeable
+from datahub.utilities.sentinels import Unset, unset

 logger: logging.Logger = logging.getLogger(__name__)

@@ -59,16 +58,6 @@ SqliteValue = Union[int, float, str, bytes, datetime, None]
 _VT = TypeVar("_VT")


-class Unset(Enum):
-    token = 0
-
-
-# It's pretty annoying to create a true sentinel that works with typing.
-# https://peps.python.org/pep-0484/#support-for-singleton-types-in-unions
-# Can't wait for https://peps.python.org/pep-0661/
-_unset: Final = Unset.token
-
-
 class ConnectionWrapper:
     """
     Wraps a SQlite connection, allowing connection reuse across multiple FileBacked* objects.
@@ -372,7 +361,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
         self,
         /,
         key: str,
-        default: Union[_VT, Unset] = _unset,
+        default: Union[_VT, Unset] = unset,
     ) -> _VT:
         # If key is in the dictionary, this is similar to __getitem__ + mark_dirty.
         # If key is not in the dictionary, this is similar to __setitem__.
@@ -383,7 +372,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
             self.mark_dirty(key)
             return value
         except KeyError:
-            if default is _unset:
+            if default is unset:
                 raise

             self[key] = default
datahub/utilities/sentinels.py (new file)
@@ -0,0 +1,22 @@
+from enum import Enum
+from typing import Final
+
+# It's pretty annoying to create a true sentinel that works with typing.
+# This approach using enums is inspired by:
+# https://peps.python.org/pep-0484/#support-for-singleton-types-in-unions
+#
+# Can't wait for https://peps.python.org/pep-0661/
+
+
+class Unset(Enum):
+    token = 0
+
+
+unset: Final = Unset.token
+
+
+class Auto(Enum):
+    token = 0
+
+
+auto: Final = Auto.token
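The new sentinels module centralizes the enum-based sentinel trick from PEP 484 so callers can distinguish "argument not supplied" from an explicit None while staying type-checkable; file_backed_collections.py now imports Unset/unset from here instead of defining its own copy. A small usage sketch follows; the lookup function is hypothetical and only illustrates the pattern.

from typing import Optional, Union

from datahub.utilities.sentinels import Unset, unset


def lookup(key: str, default: Union[Optional[str], Unset] = unset) -> Optional[str]:
    store = {"a": "1"}
    try:
        return store[key]
    except KeyError:
        # `unset` means the caller supplied no default at all, which is
        # distinct from explicitly passing default=None.
        if default is unset:
            raise
        return default


print(lookup("a"))        # '1'
print(lookup("b", None))  # None
# lookup("b")             # raises KeyError, since no default was supplied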
datahub/utilities/unified_diff.py
@@ -2,8 +2,12 @@ import logging
 from dataclasses import dataclass
 from typing import List, Tuple

+from datahub.cli.env_utils import get_boolean_env_variable
+
+_debug_diff = get_boolean_env_variable("DATAHUB_DEBUG_DIFF_PATCHER")
+
 logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
+logger.setLevel(logging.DEBUG if _debug_diff else logging.INFO)

 _LOOKAROUND_LINES = 300
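This change gates the diff patcher's DEBUG logging behind the DATAHUB_DEBUG_DIFF_PATCHER environment variable instead of pinning the module logger at INFO. A rough sketch of the pattern, assuming get_boolean_env_variable treats values such as "true" or "1" as truthy (the real parsing lives in datahub.cli.env_utils and may differ):

import logging
import os


def _get_boolean_env_variable(name: str, default: bool = False) -> bool:
    # Local approximation of datahub.cli.env_utils.get_boolean_env_variable:
    # unset -> default; otherwise common truthy spellings enable the flag.
    value = os.environ.get(name)
    if value is None:
        return default
    return value.strip().lower() in ("true", "1", "yes", "on")


logger = logging.getLogger("datahub.utilities.unified_diff")
logger.setLevel(
    logging.DEBUG
    if _get_boolean_env_variable("DATAHUB_DEBUG_DIFF_PATCHER")
    else logging.INFO
)

In practice, exporting DATAHUB_DEBUG_DIFF_PATCHER=true before running an ingestion that exercises the diff patcher should surface the additional DEBUG output.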