pyspiral 0.2.5__pp310-pypy310_pp73-macosx_10_13_x86_64.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (81) hide show
  1. pyspiral-0.2.5.dist-info/METADATA +48 -0
  2. pyspiral-0.2.5.dist-info/RECORD +81 -0
  3. pyspiral-0.2.5.dist-info/WHEEL +4 -0
  4. pyspiral-0.2.5.dist-info/entry_points.txt +2 -0
  5. spiral/__init__.py +11 -0
  6. spiral/_lib.pypy310-pp73-darwin.so +0 -0
  7. spiral/adbc.py +386 -0
  8. spiral/api/__init__.py +221 -0
  9. spiral/api/admin.py +29 -0
  10. spiral/api/filesystems.py +125 -0
  11. spiral/api/organizations.py +90 -0
  12. spiral/api/projects.py +160 -0
  13. spiral/api/tables.py +94 -0
  14. spiral/api/tokens.py +56 -0
  15. spiral/api/workloads.py +45 -0
  16. spiral/arrow.py +209 -0
  17. spiral/authn/__init__.py +0 -0
  18. spiral/authn/authn.py +89 -0
  19. spiral/authn/device.py +206 -0
  20. spiral/authn/github_.py +33 -0
  21. spiral/authn/modal_.py +18 -0
  22. spiral/catalog.py +78 -0
  23. spiral/cli/__init__.py +82 -0
  24. spiral/cli/__main__.py +4 -0
  25. spiral/cli/admin.py +21 -0
  26. spiral/cli/app.py +48 -0
  27. spiral/cli/console.py +95 -0
  28. spiral/cli/fs.py +47 -0
  29. spiral/cli/login.py +13 -0
  30. spiral/cli/org.py +90 -0
  31. spiral/cli/printer.py +45 -0
  32. spiral/cli/project.py +107 -0
  33. spiral/cli/state.py +3 -0
  34. spiral/cli/table.py +20 -0
  35. spiral/cli/token.py +27 -0
  36. spiral/cli/types.py +53 -0
  37. spiral/cli/workload.py +59 -0
  38. spiral/config.py +26 -0
  39. spiral/core/__init__.py +0 -0
  40. spiral/core/core/__init__.pyi +53 -0
  41. spiral/core/manifests/__init__.pyi +53 -0
  42. spiral/core/metastore/__init__.pyi +91 -0
  43. spiral/core/spec/__init__.pyi +257 -0
  44. spiral/dataset.py +239 -0
  45. spiral/debug.py +251 -0
  46. spiral/expressions/__init__.py +222 -0
  47. spiral/expressions/base.py +149 -0
  48. spiral/expressions/http.py +86 -0
  49. spiral/expressions/io.py +100 -0
  50. spiral/expressions/list_.py +68 -0
  51. spiral/expressions/refs.py +44 -0
  52. spiral/expressions/str_.py +39 -0
  53. spiral/expressions/struct.py +57 -0
  54. spiral/expressions/tiff.py +223 -0
  55. spiral/expressions/udf.py +46 -0
  56. spiral/grpc_.py +32 -0
  57. spiral/project.py +137 -0
  58. spiral/proto/_/__init__.py +0 -0
  59. spiral/proto/_/arrow/__init__.py +0 -0
  60. spiral/proto/_/arrow/flight/__init__.py +0 -0
  61. spiral/proto/_/arrow/flight/protocol/__init__.py +0 -0
  62. spiral/proto/_/arrow/flight/protocol/sql/__init__.py +1990 -0
  63. spiral/proto/_/scandal/__init__.py +223 -0
  64. spiral/proto/_/spfs/__init__.py +36 -0
  65. spiral/proto/_/spiral/__init__.py +0 -0
  66. spiral/proto/_/spiral/table/__init__.py +225 -0
  67. spiral/proto/_/spiraldb/__init__.py +0 -0
  68. spiral/proto/_/spiraldb/metastore/__init__.py +499 -0
  69. spiral/proto/__init__.py +0 -0
  70. spiral/proto/scandal/__init__.py +45 -0
  71. spiral/proto/spiral/__init__.py +0 -0
  72. spiral/proto/spiral/table/__init__.py +96 -0
  73. spiral/proto/substrait/__init__.py +3399 -0
  74. spiral/proto/substrait/extensions/__init__.py +115 -0
  75. spiral/proto/util.py +41 -0
  76. spiral/py.typed +0 -0
  77. spiral/scan_.py +168 -0
  78. spiral/settings.py +157 -0
  79. spiral/substrait_.py +275 -0
  80. spiral/table.py +157 -0
  81. spiral/types_.py +6 -0
@@ -0,0 +1,115 @@
1
+ # Generated by the protocol buffer compiler. DO NOT EDIT!
2
+ # sources: substrait/extensions/extensions.proto
3
+ # plugin: python-betterproto
4
+ # This file has been @generated
5
+
6
+ from dataclasses import dataclass
7
+ from typing import List
8
+
9
+ import betterproto
10
+ import betterproto.lib.google.protobuf as betterproto_lib_google_protobuf
11
+
12
+
13
# Generated betterproto message: pairs a plan-local anchor number with the URI
# of an extension YAML. Other messages in this module point back at the anchor
# through their `extension_uri_reference` field.
@dataclass(eq=False, repr=False)
class SimpleExtensionUri(betterproto.Message):
    extension_uri_anchor: int = betterproto.uint32_field(1)
    """
    A surrogate key used in the context of a single plan used to reference the
    URI associated with an extension.
    """

    uri: str = betterproto.string_field(2)
    """
    The URI where this extension YAML can be retrieved. This is the "namespace"
    of this extension.
    """
26
+
27
+
28
@dataclass(eq=False, repr=False)
class SimpleExtensionDeclaration(betterproto.Message):
    """
    Describes a mapping between a specific extension entity and the uri where
    that extension can be found.
    """

    # All three fields share group="mapping_type".
    # NOTE(review): in betterproto a shared `group` presumably marks members of
    # one protobuf oneof, so at most one of them is expected to be set — confirm
    # against the betterproto documentation.
    extension_type: "SimpleExtensionDeclarationExtensionType" = (
        betterproto.message_field(1, group="mapping_type")
    )
    extension_type_variation: "SimpleExtensionDeclarationExtensionTypeVariation" = (
        betterproto.message_field(2, group="mapping_type")
    )
    extension_function: "SimpleExtensionDeclarationExtensionFunction" = (
        betterproto.message_field(3, group="mapping_type")
    )
44
+
45
+
46
@dataclass(eq=False, repr=False)
class SimpleExtensionDeclarationExtensionType(betterproto.Message):
    """Describes a Type"""

    # Field numbers 1-3 are the protobuf wire tags; this is generated code, so
    # changes belong in the .proto source rather than here.
    extension_uri_reference: int = betterproto.uint32_field(1)
    """
    references the extension_uri_anchor defined for a specific extension URI.
    """

    type_anchor: int = betterproto.uint32_field(2)
    """
    A surrogate key used in the context of a single plan to reference a
    specific extension type
    """

    name: str = betterproto.string_field(3)
    """the name of the type in the defined extension YAML."""
63
+
64
+
65
# Generated betterproto message: declares a type variation by name, tied to an
# extension URI through `extension_uri_reference` and addressable within a plan
# through `type_variation_anchor`.
@dataclass(eq=False, repr=False)
class SimpleExtensionDeclarationExtensionTypeVariation(betterproto.Message):
    extension_uri_reference: int = betterproto.uint32_field(1)
    """
    references the extension_uri_anchor defined for a specific extension URI.
    """

    type_variation_anchor: int = betterproto.uint32_field(2)
    """
    A surrogate key used in the context of a single plan to reference a
    specific type variation
    """

    name: str = betterproto.string_field(3)
    """the name of the type in the defined extension YAML."""
80
+
81
+
82
# Generated betterproto message: declares a function by its compound signature
# name, tied to an extension URI through `extension_uri_reference` and
# addressable within a plan through `function_anchor`.
@dataclass(eq=False, repr=False)
class SimpleExtensionDeclarationExtensionFunction(betterproto.Message):
    extension_uri_reference: int = betterproto.uint32_field(1)
    """
    references the extension_uri_anchor defined for a specific extension URI.
    """

    function_anchor: int = betterproto.uint32_field(2)
    """
    A surrogate key used in the context of a single plan to reference a
    specific function
    """

    name: str = betterproto.string_field(3)
    """A function signature compound name"""
97
+
98
+
99
@dataclass(eq=False, repr=False)
class AdvancedExtension(betterproto.Message):
    """
    A generic object that can be used to embed additional extension information
    into the serialized substrait plan.
    """

    # `optimization` is a list of protobuf Any payloads (field 1), whereas
    # `enhancement` below is a single Any payload (field 2).
    optimization: List[
        "betterproto_lib_google_protobuf.Any"
    ] = betterproto.message_field(1)
    """
    An optimization is helpful information that don't influence semantics. May
    be ignored by a consumer.
    """

    enhancement: "betterproto_lib_google_protobuf.Any" = betterproto.message_field(2)
    """An enhancement alter semantics. Cannot be ignored by a consumer."""
spiral/proto/util.py ADDED
@@ -0,0 +1,41 @@
1
+ import betterproto
2
+ from betterproto.grpc.grpclib_server import ServiceBase
3
+
4
+
5
def patch_protos(proto_module, our_module_globals):
    """Check that every proto type is re-exported, then install local overrides.

    Scans ``proto_module`` for betterproto message, enum, stub and service
    classes. If any of them is absent from ``our_module_globals``, a
    ready-to-paste import statement and ``__all__`` list are printed and a
    ``ValueError`` is raised. Otherwise, every name whose object in
    ``our_module_globals`` differs from the generated one is written back onto
    ``proto_module``.

    Args:
        proto_module: the generated proto module to inspect and patch.
        our_module_globals: the ``globals()`` mapping of the re-exporting module.

    Raises:
        ValueError: if at least one proto type is not present in
            ``our_module_globals``.
    """
    proto_bases = (betterproto.Message, betterproto.Enum, betterproto.ServiceStub, ServiceBase)

    # name -> True when our module supplies a different object under that name.
    overrides_by_name = {}
    missing = set()
    for name in dir(proto_module):
        candidate = getattr(proto_module, name)
        if not (isinstance(candidate, type) and issubclass(candidate, proto_bases)):
            continue
        if name not in our_module_globals:
            missing.add(name)
            overrides_by_name[name] = False
            continue
        overrides_by_name[name] = our_module_globals[name] is not candidate

    if missing:
        # Emit the boilerplate the developer has to paste into the module.
        print(f"from {proto_module.__name__} import (")
        for name, is_override in overrides_by_name.items():
            print(f" {name} as {name}_," if is_override else f" {name},")
        print(")")
        print("\n")
        print("__all__ = [")
        for name in overrides_by_name:
            print(f' "{name}",')
        print("]")

        raise ValueError(f"Missing types that need to be re-exported: {missing}")

    # Patch any local subclasses back into the original module so the gRPC client will use them
    for name, is_override in overrides_by_name.items():
        if is_override:
            setattr(proto_module, name, our_module_globals[name])
spiral/py.typed ADDED
File without changes
spiral/scan_.py ADDED
@@ -0,0 +1,168 @@
1
+ from collections.abc import Iterator
2
+ from datetime import datetime
3
+ from typing import TYPE_CHECKING, Any
4
+
5
+ import pyarrow as pa
6
+ from opentelemetry import trace
7
+
8
+ from spiral.core.core import TableScan
9
+ from spiral.core.spec import KeyRange, Schema
10
+ from spiral.expressions.base import ExprLike
11
+
12
+ if TYPE_CHECKING:
13
+ import dask.dataframe as dd
14
+ import pandas as pd
15
+ import polars as pl
16
+ from datasets import iterable_dataset
17
+
18
+ tracer = trace.get_tracer("pyspiral.client.scan")
19
+
20
+
21
def scan(
    *projections: ExprLike,
    where: ExprLike | None = None,
    asof: datetime | int | str | None = None,
    exclude_keys: bool = False,
    # TODO(marko): Support config.
    # config: Config | None = None,
) -> "Scan":
    """Starts a read transaction on the spiral.

    Args:
        projections: a set of expressions that return struct arrays.
        where: a query expression to apply to the data.
        asof: only data written before the given timestamp will be returned, caveats around compaction.
        exclude_keys: whether to exclude the key columns in the scan result, defaults to False.

    Returns:
        A `Scan` wrapping the `TableScan` built from the merged projection and
        the optional filter.
    """
    from spiral import expressions as se

    # Combine all projections into a single struct.
    projection = se.merge(*projections)

    # Decide on the filter with an explicit None check. Testing the truthiness
    # of the lifted expression would depend on whatever `__bool__`/`__len__`
    # the expression type defines and could silently drop a valid filter.
    filter_expr = se.lift(where).__expr__ if where is not None else None

    return Scan(
        TableScan(
            projection.__expr__,
            filter=filter_expr,
            asof=asof,
            exclude_keys=exclude_keys,
        ),
        # config=config,
    )
53
+
54
+
55
class Scan:
    """Python-side handle around a core `TableScan`, with readers into several formats."""

    def __init__(
        self,
        scan: TableScan,
        # TODO(marko): Support config.
        # config: Config | None = None,
    ):
        # NOTE(ngates): this API is a little weird. e.g. if the query doesn't define an asof, it is resolved
        #  when we wrap it into a core.Scan. Should we expose a Query object in the Python API that's reusable
        #  and will re-resolve the asof? Or should we just expose a scan that fixes the asof at construction time?
        self._scan = scan

    @property
    def metrics(self) -> dict[str, Any]:
        """Metrics reported by the underlying scan."""
        scan_metrics = self._scan.metrics()
        return scan_metrics

    @property
    def schema(self) -> Schema:
        """Schema reported by the underlying scan."""
        scan_schema = self._scan.schema()
        return scan_schema

    def is_empty(self) -> bool:
        """Check if the Spiral is empty for the given key range.

        **IMPORTANT**: False negatives are possible, but false positives are not,
            i.e. is_empty can return False and scan can return zero rows.
        """
        empty = self._scan.is_empty()
        return empty

    def to_record_batches(self, key_table: pa.Table | pa.RecordBatchReader | None = None) -> pa.RecordBatchReader:
        """Read as a stream of RecordBatches.

        Args:
            key_table: a table of keys to "take" (including aux columns for cell-push-down).

        Raises:
            NotImplementedError: if `key_table` is a `pa.RecordBatchReader`.
        """
        if isinstance(key_table, pa.RecordBatchReader):
            raise NotImplementedError("RecordBatchReader is not supported as key_table")

        if key_table is None:
            return self._scan.to_record_batches(aux_table=None)

        # Non-key columns get a "#" (auxiliary) prefix so they cannot clash with the scan schema.
        key_names = list(self._scan.key_schema().to_arrow().names)
        renames = {
            column: (column if column in key_names else f"#{column}") for column in key_table.schema.names
        }
        return self._scan.to_record_batches(aux_table=key_table.rename_columns(renames))

    def to_table(self) -> pa.Table:
        """Drain the record-batch stream into a single PyArrow Table."""
        reader = self.to_record_batches()
        return reader.read_all()

    def to_dask(self) -> "dd.DataFrame":
        """Read into a Dask DataFrame.

        Requires the `dask` package to be installed.
        """
        import dask.dataframe as dd
        import pandas as pd

        def _read_key_range(key_range: KeyRange) -> pd.DataFrame:
            # TODO(ngates): we need a way to preserve the existing asofs? Should we copy CoreScan instead of Query?
            raise NotImplementedError()

        # One partition per key range returned by split().
        key_ranges = self.split()
        return dd.from_map(_read_key_range, key_ranges)

    def to_pandas(self) -> "pd.DataFrame":
        """Read into a Pandas DataFrame.

        Requires the `pandas` package to be installed.
        """
        table = self.to_table()
        return table.to_pandas()

    def to_polars(self) -> "pl.DataFrame":
        """Read into a Polars DataFrame.

        Requires the `polars` package to be installed.
        """
        import polars as pl

        # TODO(ngates): PR PyArrow to support lazy datasets
        reader = self.to_record_batches()
        return pl.from_arrow(reader)

    def to_pytorch(self) -> "iterable_dataset.IterableDataset":
        """Returns an iterable dataset that can be used to build a `pytorch.DataLoader`.

        Requires the `datasets` package to be installed.
        """
        from datasets.iterable_dataset import ArrowExamplesIterable, IterableDataset

        def _generate_tables(**kwargs) -> Iterator[tuple[int, pa.Table]]:
            reader = self.to_record_batches()

            # This key is unused when training with IterableDataset.
            # Default implementation returns shard id, e.g. parquet row group id.
            for index, batch in enumerate(reader):
                yield index, pa.Table.from_batches([batch], reader.schema)

        # NOTE: Type annotation Callable[..., tuple[str, pa.Table]] is wrong. The return value must be iterable.
        examples = ArrowExamplesIterable(generate_tables_fn=_generate_tables, kwargs={})
        return IterableDataset(ex_iterable=examples)

    def split(self) -> list[KeyRange]:
        """Return the key ranges produced by the underlying scan's `split()`."""
        key_ranges = self._scan.split()
        return key_ranges

    def debug(self):
        """Visualize the scan through `spiral.debug.show_scan`."""
        # Visualizes the scan, mainly for debugging purposes.
        # NOTE: This is not part of the API and may disappear at any moment.
        from spiral.debug import show_scan

        show_scan(self._scan)
spiral/settings.py ADDED
@@ -0,0 +1,157 @@
1
+ import functools
2
+ import hashlib
3
+ import os
4
+ from collections.abc import Callable, Mapping
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ import jwt
9
+ import typer
10
+ from pydantic import BaseModel, Field
11
+ from pydantic_settings import (
12
+ BaseSettings,
13
+ InitSettingsSource,
14
+ PydanticBaseSettingsSource,
15
+ SettingsConfigDict,
16
+ TomlConfigSettingsSource,
17
+ )
18
+
19
+ from spiral.api import SpiralAPI
20
+ from spiral.authn.authn import (
21
+ DeviceAuthProvider,
22
+ EnvironmentAuthn,
23
+ FallbackAuthn,
24
+ TokenAuthn,
25
+ TokenExchangeProvider,
26
+ )
27
+ from spiral.authn.device import DeviceAuth
28
+ from spiral.authn.github_ import GitHubActionsProvider
29
+ from spiral.authn.modal_ import ModalProvider
30
+
31
+ DEV = "PYTEST_VERSION" in os.environ or bool(os.environ.get("SPIRAL_DEV", None))
32
+ APP_DIR = Path(typer.get_app_dir("pyspiral"))
33
+ LOG_DIR = APP_DIR / "logs"
34
+ CONFIG_FILE = APP_DIR / "config.toml"
35
+
36
+
37
# Settings for device authentication: the auth domain and the OAuth client id.
# Instances are immutable (frozen=True).
class AuthSettings(BaseSettings):
    model_config = SettingsConfigDict(frozen=True)

    domain: str = "https://device.spiraldb.com"
    # The default client id is chosen once at import time from the module-level DEV flag.
    client_id: str = "client_01J1CRS967RFQY7JSE8XZ44ATR" if DEV else "client_01J1CRS9MGF103FKTKZDCNJVM3"
42
+
43
+
44
# Wrapper around a raw JWT string with lazy access to its decoded claims.
class AccessToken(BaseModel):
    token: str

    @functools.cached_property
    def payload(self) -> dict:
        """Return the decoded JWT claims, computed once per instance.

        The token is decoded with signature verification disabled, so the
        returned claims are unauthenticated and must not be used on their own
        to make trust decisions.
        """
        return jwt.decode(self.token, options={"verify_signature": False})
50
+
51
+
52
class SpiralDBSettings(BaseSettings):
    """Connection settings for the SpiralDB service.

    Defaults are chosen at import time from the module-level ``DEV`` flag:
    a plain-HTTP localhost endpoint in development, the TLS-protected hosted
    API otherwise. Instances are immutable (``frozen=True``).
    """

    model_config = SettingsConfigDict(frozen=True)

    host: str = "localhost" if DEV else "api.spiraldb.com"
    port: int = 4279 if DEV else 443
    ssl: bool = not DEV
    auth: AuthSettings = Field(default_factory=AuthSettings)
    token: str | None = None

    @property
    def uri(self) -> str:
        """HTTP(S) base URL of the service, e.g. ``https://host:port``."""
        scheme = "https" if self.ssl else "http"
        return f"{scheme}://{self.host}:{self.port}"

    @property
    def uri_scandal(self) -> str:
        """gRPC URL of the Scandal API (``grpc+tls`` when ``ssl`` is set, else ``grpc``)."""
        # TODO(marko): Scandal will be a different service. For now, gRPC API is hosted on the SpiralDB service.
        scheme = "grpc+tls" if self.ssl else "grpc"
        return f"{scheme}://{self.host}:{self.port}"

    def device_auth(self) -> DeviceAuth:
        """Build a `DeviceAuth` whose credentials file is specific to this domain and client id.

        The file lives at ``APP_DIR/<md5 of "domain/client_id">/auth.json``.
        """
        cache_key = f"{self.auth.domain}/{self.auth.client_id}".encode()
        # MD5 only derives a stable directory name here; it is not a security
        # control. Declaring that keeps the call usable on FIPS-restricted
        # builds, where a bare hashlib.md5() can raise. The digest is unchanged.
        digest = hashlib.md5(cache_key, usedforsecurity=False).hexdigest()
        auth_file = APP_DIR / digest / "auth.json"
        return DeviceAuth(auth_file=auth_file, domain=self.auth.domain, client_id=self.auth.client_id)
75
+
76
+
77
# Top-level client settings. Values are read from environment variables
# (prefix "SPIRAL__", nested fields separated by "__"), dotenv, the TOML file
# at CONFIG_FILE and constructor arguments; see settings_customise_sources for
# the order. Instances are immutable (frozen=True).
class Settings(BaseSettings):
    model_config = SettingsConfigDict(
        toml_file=CONFIG_FILE,
        env_nested_delimiter="__",
        env_prefix="SPIRAL__",
        frozen=True,
    )

    spiraldb: SpiralDBSettings = Field(default_factory=SpiralDBSettings)

    @functools.cached_property
    def api(self) -> "SpiralAPI":
        """Return a `SpiralAPI` client for `spiraldb.uri`, created once per instance."""
        # NOTE(review): SpiralAPI is also imported at module level, so this
        # function-scope import looks redundant — confirm before removing.
        from spiral.api import SpiralAPI

        return SpiralAPI(self.authn, base_url=self.spiraldb.uri)

    @functools.cached_property
    def authn(self):
        """Return the authentication strategy, created once per instance.

        An explicitly configured `spiraldb.token` is used directly through
        `TokenAuthn`. Otherwise a `FallbackAuthn` is built over the providers
        listed below.
        """
        if self.spiraldb.token:
            return TokenAuthn(self.spiraldb.token)

        # NOTE(review): presumably FallbackAuthn tries the providers in list
        # order and uses the first one that yields credentials — confirm in
        # spiral.authn.authn.
        return FallbackAuthn(
            [
                GitHubActionsProvider(),
                ModalProvider(),
                # TODO(marko): GitHub and Modal should also be behind token exchange.
                TokenExchangeProvider(EnvironmentAuthn(), self.spiraldb.uri),
                DeviceAuthProvider(self.spiraldb.device_auth()),
            ]
        )

    @classmethod
    def settings_customise_sources(
        cls,
        settings_cls: type[BaseSettings],
        env_settings: PydanticBaseSettingsSource,
        dotenv_settings: PydanticBaseSettingsSource,
        init_settings: InitSettingsSource,
        **kwargs,
    ) -> tuple[PydanticBaseSettingsSource, ...]:
        """Return the settings sources as (env, dotenv, TOML file, init kwargs).

        Any further sources supplied by pydantic-settings arrive in `kwargs`
        and are not included in the returned tuple.
        """
        # NOTE(review): pydantic-settings presumably gives earlier entries
        # higher priority, which would make constructor arguments the lowest —
        # confirm that ordering is intended.
        return env_settings, dotenv_settings, TomlConfigSettingsSource(settings_cls), init_settings
118
+
119
+
120
+ class _LazyDict(Mapping):
121
+ def __init__(self, lazy_values: dict[str, Any | Callable[[], Any]]):
122
+ self._lazy_values = lazy_values
123
+
124
+ def _dict(self):
125
+ return {k: v() if callable(v) else v for k, v in self._lazy_values.items()}
126
+
127
+ def __getitem__(self, key, /):
128
+ return self._lazy_values[key]()
129
+
130
+ def get(self, key, /, default=None):
131
+ return self._lazy_values.get(key, lambda: default)()
132
+
133
+ def items(self):
134
+ return self._dict().items()
135
+
136
+ def keys(self):
137
+ return self._lazy_values.keys()
138
+
139
+ def values(self):
140
+ return self._dict().values()
141
+
142
+ def __contains__(self, key, /):
143
+ return key in self._lazy_values
144
+
145
+ def __eq__(self, other, /):
146
+ return False
147
+
148
+ def __iter__(self):
149
+ return iter(self._dict())
150
+
151
+ def __len__(self):
152
+ return len(self._lazy_values)
153
+
154
+
155
@functools.cache
def settings() -> Settings:
    """Return the process-wide `Settings` instance.

    `functools.cache` memoizes the call, so `Settings()` is constructed on the
    first call and the same object is returned on every later call.
    """
    return Settings()