acryl-datahub 1.0.0rc3__py3-none-any.whl → 1.0.0rc5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub has been flagged for review.
Files changed (30)
  1. {acryl_datahub-1.0.0rc3.dist-info → acryl_datahub-1.0.0rc5.dist-info}/METADATA +2377 -2377
  2. {acryl_datahub-1.0.0rc3.dist-info → acryl_datahub-1.0.0rc5.dist-info}/RECORD +30 -27
  3. datahub/_version.py +1 -1
  4. datahub/cli/ingest_cli.py +27 -92
  5. datahub/emitter/mcp_builder.py +4 -1
  6. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -0
  7. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  8. datahub/ingestion/source/openapi_parser.py +46 -14
  9. datahub/ingestion/source/unity/source.py +11 -1
  10. datahub/metadata/_schema_classes.py +17 -0
  11. datahub/metadata/schema.avsc +21 -3
  12. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  13. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  14. datahub/metadata/schemas/MetadataChangeEvent.avsc +13 -0
  15. datahub/sdk/_attribution.py +4 -0
  16. datahub/sdk/_entity.py +2 -0
  17. datahub/sdk/_shared.py +163 -13
  18. datahub/sdk/_utils.py +35 -0
  19. datahub/sdk/container.py +20 -4
  20. datahub/sdk/dataset.py +104 -14
  21. datahub/sdk/main_client.py +17 -0
  22. datahub/specific/dataset.py +3 -4
  23. datahub/sql_parsing/split_statements.py +20 -13
  24. datahub/utilities/file_backed_collections.py +3 -14
  25. datahub/utilities/ingest_utils.py +106 -0
  26. datahub/utilities/sentinels.py +22 -0
  27. {acryl_datahub-1.0.0rc3.dist-info → acryl_datahub-1.0.0rc5.dist-info}/LICENSE +0 -0
  28. {acryl_datahub-1.0.0rc3.dist-info → acryl_datahub-1.0.0rc5.dist-info}/WHEEL +0 -0
  29. {acryl_datahub-1.0.0rc3.dist-info → acryl_datahub-1.0.0rc5.dist-info}/entry_points.txt +0 -0
  30. {acryl_datahub-1.0.0rc3.dist-info → acryl_datahub-1.0.0rc5.dist-info}/top_level.txt +0 -0
datahub/sdk/main_client.py
@@ -41,10 +41,24 @@ class DataHubClient:
 
     @classmethod
     def from_env(cls) -> "DataHubClient":
+        """Initialize a DataHubClient from the environment variables or ~/.datahubenv file.
+
+        This will first check DATAHUB_GMS_URL and DATAHUB_GMS_TOKEN. If not present,
+        it will read credentials from ~/.datahubenv. That file can be created using
+        the `datahub init` command.
+
+        If you're looking to specify the server/token in code, use the
+        DataHubClient(server=..., token=...) constructor instead.
+
+        Returns:
+            A DataHubClient instance.
+        """
+
         # Inspired by the DockerClient.from_env() method.
         # TODO: This one also reads from ~/.datahubenv, so the "from_env" name might be a bit confusing.
         # That file is part of the "environment", but is not a traditional "env variable".
         graph = get_default_graph()
+
         return cls(graph=graph)
 
     @property
@@ -54,3 +68,6 @@ class DataHubClient:
     @property
     def resolve(self) -> ResolverClient:
         return ResolverClient(self)
+
+    # TODO: search client
+    # TODO: lineage client
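A minimal usage sketch of the two construction paths described by the new docstring. The import path follows the file location above; the server URL and token values are placeholders, not real credentials.

```python
# Sketch: the two ways to construct a DataHubClient per the docstring above.
from datahub.sdk.main_client import DataHubClient

# 1. Read DATAHUB_GMS_URL / DATAHUB_GMS_TOKEN, falling back to ~/.datahubenv
#    (which `datahub init` creates).
client = DataHubClient.from_env()

# 2. Specify the server/token explicitly in code (placeholder values).
client = DataHubClient(server="https://datahub.example.com/gms", token="<token>")
```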
datahub/specific/dataset.py
@@ -15,6 +15,7 @@ from datahub.metadata.schema_classes import (
     UpstreamClass as Upstream,
     UpstreamLineageClass as UpstreamLineage,
 )
+from datahub.metadata.urns import DatasetUrn, TagUrn, Urn
 from datahub.specific.aspect_helpers.custom_properties import HasCustomPropertiesPatch
 from datahub.specific.aspect_helpers.ownership import HasOwnershipPatch
 from datahub.specific.aspect_helpers.structured_properties import (
@@ -22,8 +23,6 @@ from datahub.specific.aspect_helpers.structured_properties import (
 )
 from datahub.specific.aspect_helpers.tags import HasTagsPatch
 from datahub.specific.aspect_helpers.terms import HasTermsPatch
-from datahub.utilities.urns.tag_urn import TagUrn
-from datahub.utilities.urns.urn import Urn
 
 _Parent = TypeVar("_Parent", bound=MetadataPatchProposal)
 
@@ -104,12 +103,12 @@ class DatasetPatchBuilder(
 ):
     def __init__(
         self,
-        urn: str,
+        urn: Union[str, DatasetUrn],
         system_metadata: Optional[SystemMetadataClass] = None,
         audit_header: Optional[KafkaAuditHeaderClass] = None,
     ) -> None:
        super().__init__(
-            urn, system_metadata=system_metadata, audit_header=audit_header
+            str(urn), system_metadata=system_metadata, audit_header=audit_header
        )
 
     @classmethod
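With the widened `urn` parameter, a patch builder can be constructed directly from a typed `DatasetUrn`. A short sketch, assuming the usual platform/name/env form of the `DatasetUrn` constructor; the dataset coordinates are illustrative only.

```python
# Sketch: DatasetPatchBuilder now accepts either a string urn or a DatasetUrn.
from datahub.metadata.urns import DatasetUrn
from datahub.specific.dataset import DatasetPatchBuilder

urn = DatasetUrn(platform="snowflake", name="analytics.orders", env="PROD")

# Equivalent after this change; the builder stringifies the urn internally.
builder_from_urn = DatasetPatchBuilder(urn)
builder_from_str = DatasetPatchBuilder(str(urn))
```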
datahub/sql_parsing/split_statements.py
@@ -8,11 +8,11 @@ END_KEYWORD = "END"
 
 CONTROL_FLOW_KEYWORDS = [
     "GO",
-    r"BEGIN\w+TRY",
-    r"BEGIN\w+CATCH",
+    r"BEGIN\s+TRY",
+    r"BEGIN\s+CATCH",
     "BEGIN",
-    r"END\w+TRY",
-    r"END\w+CATCH",
+    r"END\s+TRY",
+    r"END\s+CATCH",
     # This isn't strictly correct, but we assume that IF | (condition) | (block) should all be split up
     # This mainly ensures that IF statements don't get tacked onto the previous statement incorrectly
     "IF",
@@ -73,25 +73,31 @@ class _StatementSplitter:
         # what a given END is closing.
         self.current_case_statements = 0
 
-    def _is_keyword_at_position(self, pos: int, keyword: str) -> bool:
+    def _is_keyword_at_position(self, pos: int, keyword: str) -> Tuple[bool, str]:
         """
         Check if a keyword exists at the given position using regex word boundaries.
         """
         sql = self.sql
 
-        if pos + len(keyword) > len(sql):
-            return False
+        keyword_length = len(keyword.replace(r"\s+", " "))
+
+        if pos + keyword_length > len(sql):
+            return False, ""
 
         # If we're not at a word boundary, we can't generate a keyword.
         if pos > 0 and not (
             bool(re.match(r"\w\W", sql[pos - 1 : pos + 1]))
             or bool(re.match(r"\W\w", sql[pos - 1 : pos + 1]))
         ):
-            return False
+            return False, ""
 
-        pattern = rf"^{re.escape(keyword)}\b"
+        pattern = rf"^{keyword}\b"
         match = re.match(pattern, sql[pos:], re.IGNORECASE)
-        return bool(match)
+        is_match = bool(match)
+        actual_match = (
+            sql[pos:][match.start() : match.end()] if match is not None else ""
+        )
+        return is_match, actual_match
 
     def _look_ahead_for_keywords(self, keywords: List[str]) -> Tuple[bool, str, int]:
         """
@@ -99,7 +105,8 @@
         """
 
         for keyword in keywords:
-            if self._is_keyword_at_position(self.i, keyword):
+            is_match, keyword = self._is_keyword_at_position(self.i, keyword)
+            if is_match:
                 return True, keyword, len(keyword)
         return False, "", 0
 
@@ -118,7 +125,7 @@
 
     def process(self) -> Iterator[str]:
         if not self.sql or not self.sql.strip():
-            return
+            yield from ()
 
         prev_real_char = "\0"  # the most recent non-whitespace, non-comment character
         while self.i < len(self.sql):
@@ -181,7 +188,7 @@
     def _process_normal(self, most_recent_real_char: str) -> Iterator[str]:
         c = self.sql[self.i]
 
-        if self._is_keyword_at_position(self.i, CASE_KEYWORD):
+        if self._is_keyword_at_position(self.i, CASE_KEYWORD)[0]:
             self.current_case_statements += 1
 
         is_control_keyword, keyword, keyword_len = self._look_ahead_for_keywords(
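Two related fixes here: the keyword patterns previously used `\w+` (word characters) where whitespace was intended, so `BEGIN TRY` / `END CATCH` never matched, and `_is_keyword_at_position` escaped the pattern and only returned a boolean, so callers could not tell how much text the (possibly multi-space) keyword actually consumed. The following is a self-contained sketch that mirrors the new pattern construction; `keyword_at` is illustrative, not the datahub implementation.

```python
# Sketch of the keyword-matching behavior after this change.
import re
from typing import Tuple

def keyword_at(sql: str, pos: int, keyword: str) -> Tuple[bool, str]:
    # `keyword` may itself be a regex fragment such as r"BEGIN\s+TRY",
    # so it is interpolated directly rather than re.escape()d.
    match = re.match(rf"^{keyword}\b", sql[pos:], re.IGNORECASE)
    return (match is not None, match.group(0) if match else "")

print(keyword_at("BEGIN TRY ...", 0, r"BEGIN\w+TRY"))    # (False, '')  -- old pattern
print(keyword_at("BEGIN TRY ...", 0, r"BEGIN\s+TRY"))    # (True, 'BEGIN TRY')
print(keyword_at("BEGIN   TRY ...", 0, r"BEGIN\s+TRY"))  # (True, 'BEGIN   TRY')
```

Returning the matched text lets `_look_ahead_for_keywords` report the real consumed length even when extra whitespace sits between `BEGIN` and `TRY`.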
datahub/utilities/file_backed_collections.py
@@ -10,13 +10,11 @@ import tempfile
 import threading
 from dataclasses import dataclass, field
 from datetime import datetime
-from enum import Enum
 from types import TracebackType
 from typing import (
     Any,
     Callable,
     Dict,
-    Final,
     Generic,
     Iterator,
     List,
@@ -31,6 +29,7 @@ from typing import (
 )
 
 from datahub.ingestion.api.closeable import Closeable
+from datahub.utilities.sentinels import Unset, unset
 
 logger: logging.Logger = logging.getLogger(__name__)
 
@@ -59,16 +58,6 @@ SqliteValue = Union[int, float, str, bytes, datetime, None]
 _VT = TypeVar("_VT")
 
 
-class Unset(Enum):
-    token = 0
-
-
-# It's pretty annoying to create a true sentinel that works with typing.
-# https://peps.python.org/pep-0484/#support-for-singleton-types-in-unions
-# Can't wait for https://peps.python.org/pep-0661/
-_unset: Final = Unset.token
-
-
 class ConnectionWrapper:
     """
     Wraps a SQlite connection, allowing connection reuse across multiple FileBacked* objects.
@@ -372,7 +361,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
         self,
         /,
         key: str,
-        default: Union[_VT, Unset] = _unset,
+        default: Union[_VT, Unset] = unset,
     ) -> _VT:
         # If key is in the dictionary, this is similar to __getitem__ + mark_dirty.
         # If key is not in the dictionary, this is similar to __setitem__.
@@ -383,7 +372,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
             self.mark_dirty(key)
             return value
         except KeyError:
-            if default is _unset:
+            if default is unset:
                 raise
 
             self[key] = default
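The module-private `_unset` sentinel moves to the shared `datahub.utilities.sentinels` module; the identity check `default is unset` still distinguishes "caller supplied no default" from an explicit `default=None`. A sketch of the pattern, assuming only the new sentinels module; `lookup` is a hypothetical helper, not a FileBackedDict method.

```python
# Sketch of the sentinel-default pattern used above.
from typing import Dict, Union

from datahub.utilities.sentinels import Unset, unset

def lookup(d: Dict[str, object], key: str, default: Union[object, Unset] = unset) -> object:
    try:
        return d[key]
    except KeyError:
        if default is unset:  # no default supplied -> re-raise, even though None is a valid default
            raise
        return default

data: Dict[str, object] = {"a": 1}
print(lookup(data, "a"))        # 1
print(lookup(data, "b", None))  # None is a legitimate explicit default
# lookup(data, "b")             # would raise KeyError
```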
datahub/utilities/ingest_utils.py (new file)
@@ -0,0 +1,106 @@
+import json
+import logging
+from typing import Optional
+
+import click
+
+from datahub.configuration.common import ConfigModel
+from datahub.configuration.config_loader import load_config_file
+from datahub.emitter.mce_builder import datahub_guid
+
+logger = logging.getLogger(__name__)
+
+
+def _make_ingestion_urn(name: str) -> str:
+    guid = datahub_guid(
+        {
+            "name": name,
+        }
+    )
+    return f"urn:li:dataHubIngestionSource:deploy-{guid}"
+
+
+class DeployOptions(ConfigModel):
+    name: str
+    schedule: Optional[str] = None
+    time_zone: str = "UTC"
+    cli_version: Optional[str] = None
+    executor_id: str = "default"
+
+
+def deploy_source_vars(
+    name: Optional[str],
+    config: str,
+    urn: Optional[str],
+    executor_id: str,
+    cli_version: Optional[str],
+    schedule: Optional[str],
+    time_zone: str,
+    extra_pip: Optional[str],
+    debug: bool = False,
+) -> dict:
+    pipeline_config = load_config_file(
+        config,
+        allow_stdin=True,
+        allow_remote=True,
+        resolve_env_vars=False,
+    )
+
+    deploy_options_raw = pipeline_config.pop("deployment", None)
+    if deploy_options_raw is not None:
+        deploy_options = DeployOptions.parse_obj(deploy_options_raw)
+
+        if name:
+            logger.info(f"Overriding deployment name {deploy_options.name} with {name}")
+            deploy_options.name = name
+    else:
+        if not name:
+            raise click.UsageError(
+                "Either --name must be set or deployment_name specified in the config"
+            )
+        deploy_options = DeployOptions(name=name)
+
+    # Use remaining CLI args to override deploy_options
+    if schedule:
+        deploy_options.schedule = schedule
+    if time_zone:
+        deploy_options.time_zone = time_zone
+    if cli_version:
+        deploy_options.cli_version = cli_version
+    if executor_id:
+        deploy_options.executor_id = executor_id
+
+    logger.info(f"Using {repr(deploy_options)}")
+
+    if not urn:
+        # When urn/name is not specified, we will generate a unique urn based on the deployment name.
+        urn = _make_ingestion_urn(deploy_options.name)
+        logger.info(f"Using recipe urn: {urn}")
+
+    variables: dict = {
+        "urn": urn,
+        "input": {
+            "name": deploy_options.name,
+            "type": pipeline_config["source"]["type"],
+            "config": {
+                "recipe": json.dumps(pipeline_config),
+                "executorId": deploy_options.executor_id,
+                "debugMode": debug,
+                "version": deploy_options.cli_version,
+            },
+        },
+    }
+
+    if deploy_options.schedule is not None:
+        variables["input"]["schedule"] = {
+            "interval": deploy_options.schedule,
+            "timezone": deploy_options.time_zone,
+        }
+    if extra_pip is not None:
+        extra_args_list = (
+            variables.get("input", {}).get("config", {}).get("extraArgs", [])
+        )
+        extra_args_list.append({"key": "extra_pip_requirements", "value": extra_pip})
+        variables["input"]["config"]["extraArgs"] = extra_args_list
+
+    return variables
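This helper appears to absorb logic previously in `datahub/cli/ingest_cli.py` (note the -92/+27 change there). A hedged usage sketch: the recipe path, source name, and schedule below are illustrative, and the returned dict is presumably what the CLI passes as variables to the ingestion-source create/update call.

```python
# Sketch: building deploy variables from a local recipe file (hypothetical path/values).
from datahub.utilities.ingest_utils import deploy_source_vars

variables = deploy_source_vars(
    name="snowflake-prod",            # overrides any deployment.name in the recipe
    config="./snowflake_recipe.yml",  # hypothetical recipe file
    urn=None,                         # a deterministic urn is derived from the name
    executor_id="default",
    cli_version=None,
    schedule="0 5 * * *",             # optional cron schedule
    time_zone="UTC",
    extra_pip=None,
    debug=False,
)
print(variables["urn"])  # urn:li:dataHubIngestionSource:deploy-<guid>
```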
datahub/utilities/sentinels.py (new file)
@@ -0,0 +1,22 @@
+from enum import Enum
+from typing import Final
+
+# It's pretty annoying to create a true sentinel that works with typing.
+# This approach using enums is inspired by:
+# https://peps.python.org/pep-0484/#support-for-singleton-types-in-unions
+#
+# Can't wait for https://peps.python.org/pep-0661/
+
+
+class Unset(Enum):
+    token = 0
+
+
+unset: Final = Unset.token
+
+
+class Auto(Enum):
+    token = 0
+
+
+auto: Final = Auto.token
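Alongside `unset`, the module adds an `auto` sentinel for parameters whose default means "decide automatically", distinct from both `None` and any real value. A sketch under that assumption; `fetch_rows` and its parameters are hypothetical, not a datahub API.

```python
# Sketch: using the `auto` sentinel for an "auto-detect" default.
from typing import List, Union

from datahub.utilities.sentinels import Auto, auto

def fetch_rows(query: str, page_size: Union[int, None, Auto] = auto) -> List[str]:
    if page_size is auto:
        page_size = 1000  # pick a sensible value automatically
    # page_size=None can now mean "no pagination", which is distinguishable
    # from "not specified".
    return [f"{query} (page_size={page_size})"]

print(fetch_rows("SELECT 1"))                  # auto-detected page size
print(fetch_rows("SELECT 1", page_size=None))  # explicit "no pagination"
```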