pyspiral 0.1.0__cp310-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. pyspiral-0.1.0.dist-info/METADATA +48 -0
  2. pyspiral-0.1.0.dist-info/RECORD +81 -0
  3. pyspiral-0.1.0.dist-info/WHEEL +4 -0
  4. pyspiral-0.1.0.dist-info/entry_points.txt +2 -0
  5. spiral/__init__.py +11 -0
  6. spiral/_lib.abi3.so +0 -0
  7. spiral/adbc.py +386 -0
  8. spiral/api/__init__.py +221 -0
  9. spiral/api/admin.py +29 -0
  10. spiral/api/filesystems.py +125 -0
  11. spiral/api/organizations.py +90 -0
  12. spiral/api/projects.py +160 -0
  13. spiral/api/tables.py +94 -0
  14. spiral/api/tokens.py +56 -0
  15. spiral/api/workloads.py +45 -0
  16. spiral/arrow.py +209 -0
  17. spiral/authn/__init__.py +0 -0
  18. spiral/authn/authn.py +89 -0
  19. spiral/authn/device.py +206 -0
  20. spiral/authn/github_.py +33 -0
  21. spiral/authn/modal_.py +18 -0
  22. spiral/catalog.py +78 -0
  23. spiral/cli/__init__.py +82 -0
  24. spiral/cli/__main__.py +4 -0
  25. spiral/cli/admin.py +21 -0
  26. spiral/cli/app.py +48 -0
  27. spiral/cli/console.py +95 -0
  28. spiral/cli/fs.py +47 -0
  29. spiral/cli/login.py +13 -0
  30. spiral/cli/org.py +90 -0
  31. spiral/cli/printer.py +45 -0
  32. spiral/cli/project.py +107 -0
  33. spiral/cli/state.py +3 -0
  34. spiral/cli/table.py +20 -0
  35. spiral/cli/token.py +27 -0
  36. spiral/cli/types.py +53 -0
  37. spiral/cli/workload.py +59 -0
  38. spiral/config.py +26 -0
  39. spiral/core/__init__.py +0 -0
  40. spiral/core/core/__init__.pyi +53 -0
  41. spiral/core/manifests/__init__.pyi +53 -0
  42. spiral/core/metastore/__init__.pyi +91 -0
  43. spiral/core/spec/__init__.pyi +257 -0
  44. spiral/dataset.py +239 -0
  45. spiral/debug.py +251 -0
  46. spiral/expressions/__init__.py +222 -0
  47. spiral/expressions/base.py +149 -0
  48. spiral/expressions/http.py +86 -0
  49. spiral/expressions/io.py +100 -0
  50. spiral/expressions/list_.py +68 -0
  51. spiral/expressions/refs.py +44 -0
  52. spiral/expressions/str_.py +39 -0
  53. spiral/expressions/struct.py +57 -0
  54. spiral/expressions/tiff.py +223 -0
  55. spiral/expressions/udf.py +46 -0
  56. spiral/grpc_.py +32 -0
  57. spiral/project.py +137 -0
  58. spiral/proto/_/__init__.py +0 -0
  59. spiral/proto/_/arrow/__init__.py +0 -0
  60. spiral/proto/_/arrow/flight/__init__.py +0 -0
  61. spiral/proto/_/arrow/flight/protocol/__init__.py +0 -0
  62. spiral/proto/_/arrow/flight/protocol/sql/__init__.py +1990 -0
  63. spiral/proto/_/scandal/__init__.py +223 -0
  64. spiral/proto/_/spfs/__init__.py +36 -0
  65. spiral/proto/_/spiral/__init__.py +0 -0
  66. spiral/proto/_/spiral/table/__init__.py +225 -0
  67. spiral/proto/_/spiraldb/__init__.py +0 -0
  68. spiral/proto/_/spiraldb/metastore/__init__.py +499 -0
  69. spiral/proto/__init__.py +0 -0
  70. spiral/proto/scandal/__init__.py +45 -0
  71. spiral/proto/spiral/__init__.py +0 -0
  72. spiral/proto/spiral/table/__init__.py +96 -0
  73. spiral/proto/substrait/__init__.py +3399 -0
  74. spiral/proto/substrait/extensions/__init__.py +115 -0
  75. spiral/proto/util.py +41 -0
  76. spiral/py.typed +0 -0
  77. spiral/scan_.py +168 -0
  78. spiral/settings.py +157 -0
  79. spiral/substrait_.py +275 -0
  80. spiral/table.py +157 -0
  81. spiral/types_.py +6 -0
@@ -0,0 +1,115 @@
1
+ # Generated by the protocol buffer compiler. DO NOT EDIT!
2
+ # sources: substrait/extensions/extensions.proto
3
+ # plugin: python-betterproto
4
+ # This file has been @generated
5
+
6
+ from dataclasses import dataclass
7
+ from typing import List
8
+
9
+ import betterproto
10
+ import betterproto.lib.google.protobuf as betterproto_lib_google_protobuf
11
+
12
+
13
@dataclass(eq=False, repr=False)
class SimpleExtensionUri(betterproto.Message):
    """Associates a plan-local anchor with the URI of a simple extension YAML."""

    extension_uri_anchor: int = betterproto.uint32_field(1)
    """
    A surrogate key used in the context of a single plan used to reference the
    URI associated with an extension.
    """

    uri: str = betterproto.string_field(2)
    """
    The URI where this extension YAML can be retrieved. This is the "namespace"
    of this extension.
    """
26
+
27
+
28
@dataclass(eq=False, repr=False)
class SimpleExtensionDeclaration(betterproto.Message):
    """
    Describes a mapping between a specific extension entity and the uri where
    that extension can be found.
    """

    # The three fields below belong to the betterproto oneof group
    # "mapping_type": at most one of them is set for a given declaration.
    extension_type: "SimpleExtensionDeclarationExtensionType" = (
        betterproto.message_field(1, group="mapping_type")
    )
    extension_type_variation: "SimpleExtensionDeclarationExtensionTypeVariation" = (
        betterproto.message_field(2, group="mapping_type")
    )
    extension_function: "SimpleExtensionDeclarationExtensionFunction" = (
        betterproto.message_field(3, group="mapping_type")
    )
44
+
45
+
46
@dataclass(eq=False, repr=False)
class SimpleExtensionDeclarationExtensionType(betterproto.Message):
    """Describes a Type declared in an extension YAML."""

    extension_uri_reference: int = betterproto.uint32_field(1)
    """
    references the extension_uri_anchor defined for a specific extension URI.
    """

    type_anchor: int = betterproto.uint32_field(2)
    """
    A surrogate key used in the context of a single plan to reference a
    specific extension type
    """

    name: str = betterproto.string_field(3)
    """the name of the type in the defined extension YAML."""
63
+
64
+
65
@dataclass(eq=False, repr=False)
class SimpleExtensionDeclarationExtensionTypeVariation(betterproto.Message):
    """Describes a type variation declared in an extension YAML."""

    extension_uri_reference: int = betterproto.uint32_field(1)
    """
    references the extension_uri_anchor defined for a specific extension URI.
    """

    type_variation_anchor: int = betterproto.uint32_field(2)
    """
    A surrogate key used in the context of a single plan to reference a
    specific type variation
    """

    name: str = betterproto.string_field(3)
    """the name of the type in the defined extension YAML."""
80
+
81
+
82
@dataclass(eq=False, repr=False)
class SimpleExtensionDeclarationExtensionFunction(betterproto.Message):
    """Describes a function declared in an extension YAML."""

    extension_uri_reference: int = betterproto.uint32_field(1)
    """
    references the extension_uri_anchor defined for a specific extension URI.
    """

    function_anchor: int = betterproto.uint32_field(2)
    """
    A surrogate key used in the context of a single plan to reference a
    specific function
    """

    name: str = betterproto.string_field(3)
    """A function signature compound name"""
97
+
98
+
99
@dataclass(eq=False, repr=False)
class AdvancedExtension(betterproto.Message):
    """
    A generic object that can be used to embed additional extension information
    into the serialized substrait plan.
    """

    # Repeated google.protobuf.Any payloads (field 1).
    optimization: List["betterproto_lib_google_protobuf.Any"] = betterproto.message_field(1)
    """
    An optimization is helpful information that doesn't influence semantics. May
    be ignored by a consumer.
    """

    enhancement: "betterproto_lib_google_protobuf.Any" = betterproto.message_field(2)
    """An enhancement alters semantics. Cannot be ignored by a consumer."""
spiral/proto/util.py ADDED
@@ -0,0 +1,41 @@
1
+ import betterproto
2
+ from betterproto.grpc.grpclib_server import ServiceBase
3
+
4
+
5
def patch_protos(proto_module, our_module_globals):
    """Validate and wire up re-exports of betterproto types from a generated module.

    Scans ``proto_module`` for betterproto messages, enums, service stubs and
    service bases. Every such type must already be re-exported by the calling
    module (present in ``our_module_globals``); if any are missing, a
    ready-to-paste ``from ... import (...)`` block and ``__all__`` list are
    printed and a ``ValueError`` is raised. Local subclasses (names bound to a
    *different* object than the generated one) are patched back into
    ``proto_module`` so the gRPC client will use them.

    Args:
        proto_module: the generated betterproto module to scan.
        our_module_globals: ``globals()`` of the re-exporting module.

    Raises:
        ValueError: if any generated type is not re-exported by the caller.
    """
    betterproto_types = (betterproto.Message, betterproto.Enum, betterproto.ServiceStub, ServiceBase)

    proto_overrides = {}
    missing = set()
    for ident in dir(proto_module):
        var = getattr(proto_module, ident)
        if isinstance(var, type) and issubclass(var, betterproto_types):
            if ident in our_module_globals:
                # An override is the same name bound to a different object
                # (identity check, not equality).
                proto_overrides[ident] = our_module_globals[ident] is not var
            else:
                proto_overrides[ident] = False
                missing.add(ident)

    if missing:
        # Print a ready-to-paste import block so the developer can fix the
        # re-exporting module, then fail loudly.
        print(f"from {proto_module.__name__} import (")
        for ident, override in proto_overrides.items():
            if override:
                print(f" {ident} as {ident}_,")
            else:
                print(f" {ident},")
        print(")")
        print("\n")
        print("__all__ = [")
        for ident in proto_overrides:
            print(f' "{ident}",')
        print("]")

        raise ValueError(f"Missing types that need to be re-exported: {missing}")

    # Patch any local subclasses back into the original module so the gRPC client will use them
    for ident, override in proto_overrides.items():
        if override:
            setattr(proto_module, ident, our_module_globals[ident])
spiral/py.typed ADDED
File without changes
spiral/scan_.py ADDED
@@ -0,0 +1,168 @@
1
+ from collections.abc import Iterator
2
+ from datetime import datetime
3
+ from typing import TYPE_CHECKING, Any
4
+
5
+ import pyarrow as pa
6
+ from opentelemetry import trace
7
+
8
+ from spiral.core.core import TableScan
9
+ from spiral.core.spec import KeyRange, Schema
10
+ from spiral.expressions.base import ExprLike
11
+
12
+ if TYPE_CHECKING:
13
+ import dask.dataframe as dd
14
+ import pandas as pd
15
+ import polars as pl
16
+ from datasets import iterable_dataset
17
+
18
# Module-level OpenTelemetry tracer; the name identifies the client scan component.
tracer = trace.get_tracer("pyspiral.client.scan")
19
+
20
+
21
def scan(
    *projections: ExprLike,
    where: ExprLike | None = None,
    # Fixed: the default is None, so the annotation must include None
    # (implicit Optional is deprecated, PEP 484).
    asof: datetime | int | str | None = None,
    exclude_keys: bool = False,
    # TODO(marko): Support config.
    # config: Config | None = None,
) -> "Scan":
    """Starts a read transaction on the spiral.

    Args:
        projections: a set of expressions that return struct arrays.
        where: a query expression to apply to the data.
        asof: only data written before the given timestamp will be returned,
            caveats around compaction. None (the default) reads the latest data.
        exclude_keys: whether to exclude the key columns in the scan result, defaults to False.

    Returns:
        A `Scan` wrapping the resolved core table scan.
    """
    from spiral import expressions as se

    # Combine all projections into a single struct.
    projection = se.merge(*projections)
    if where is not None:
        where = se.lift(where)

    return Scan(
        TableScan(
            projection.__expr__,
            # Explicit None check: do not rely on the lifted expression's truthiness.
            filter=where.__expr__ if where is not None else None,
            asof=asof,
            exclude_keys=exclude_keys,
        ),
        # config=config,
    )
53
+
54
+
55
class Scan:
    """Scan object.

    Wraps a core `TableScan` and exposes conversions to Arrow, Pandas, Polars,
    Dask and Hugging Face `datasets` representations.
    """

    def __init__(
        self,
        scan: TableScan,
        # TODO(marko): Support config.
        # config: Config | None = None,
    ):
        # NOTE(ngates): this API is a little weird. e.g. if the query doesn't define an asof, it is resolved
        # when we wrap it into a core.Scan. Should we expose a Query object in the Python API that's reusable
        # and will re-resolve the asof? Or should we just expose a scan that fixes the asof at construction time?
        self._scan = scan

    @property
    def metrics(self) -> dict[str, Any]:
        """Returns metrics about the scan."""
        return self._scan.metrics()

    @property
    def schema(self) -> Schema:
        """Returns the schema of the scan."""
        return self._scan.schema()

    def is_empty(self) -> bool:
        """Check if the Spiral is empty for the given key range.

        **IMPORTANT**: False negatives are possible, but false positives are not,
        i.e. is_empty can return False and scan can return zero rows.
        """
        return self._scan.is_empty()

    def to_record_batches(self, key_table: pa.Table | pa.RecordBatchReader | None = None) -> pa.RecordBatchReader:
        """Read as a stream of RecordBatches.

        Args:
            key_table: a table of keys to "take" (including aux columns for cell-push-down).

        Raises:
            NotImplementedError: if `key_table` is a RecordBatchReader.
        """
        if isinstance(key_table, pa.RecordBatchReader):
            raise NotImplementedError("RecordBatchReader is not supported as key_table")

        # Prefix non-key columns in the key table with # (auxiliary) to avoid conflicts with the scan schema.
        if key_table is not None:
            key_columns = list(self._scan.key_schema().to_arrow().names)
            key_table = key_table.rename_columns(
                {name: f"#{name}" if name not in key_columns else name for name in key_table.schema.names}
            )

        return self._scan.to_record_batches(aux_table=key_table)

    def to_table(self) -> pa.Table:
        """Read into a single PyArrow Table."""
        return self.to_record_batches().read_all()

    def to_dask(self) -> "dd.DataFrame":
        """Read into a Dask DataFrame.

        Requires the `dask` package to be installed.

        NOTE(review): `_read_key_range` is not implemented yet, so the returned
        DataFrame currently fails at compute time.
        """
        import dask.dataframe as dd
        import pandas as pd

        def _read_key_range(key_range: KeyRange) -> pd.DataFrame:
            # TODO(ngates): we need a way to preserve the existing asofs? Should we copy CoreScan instead of Query?
            raise NotImplementedError()

        # Fetch a set of partition ranges
        return dd.from_map(_read_key_range, self.split())

    def to_pandas(self) -> "pd.DataFrame":
        """Read into a Pandas DataFrame.

        Requires the `pandas` package to be installed.
        """
        return self.to_table().to_pandas()

    def to_polars(self) -> "pl.DataFrame":
        """Read into a Polars DataFrame.

        Requires the `polars` package to be installed.
        """
        import polars as pl

        # TODO(ngates): PR PyArrow to support lazy datasets
        return pl.from_arrow(self.to_record_batches())

    def to_pytorch(self) -> "iterable_dataset.IterableDataset":
        """Returns an iterable dataset that can be used to build a `pytorch.DataLoader`.

        Requires the `datasets` package to be installed.
        """
        from datasets.iterable_dataset import ArrowExamplesIterable, IterableDataset

        def _generate_tables(**kwargs) -> Iterator[tuple[int, pa.Table]]:
            stream = self.to_record_batches()

            # This key is unused when training with IterableDataset.
            # Default implementation returns shard id, e.g. parquet row group id.
            for i, rb in enumerate(stream):
                yield i, pa.Table.from_batches([rb], stream.schema)

        # NOTE: Type annotation Callable[..., tuple[str, pa.Table]] is wrong. The return value must be iterable.
        ex_iterable = ArrowExamplesIterable(generate_tables_fn=_generate_tables, kwargs={})
        return IterableDataset(ex_iterable=ex_iterable)

    def split(self) -> list[KeyRange]:
        """Split the scan into key ranges (used by `to_dask` for partitioning)."""
        return self._scan.split()

    def debug(self) -> None:
        # Visualizes the scan, mainly for debugging purposes.
        # NOTE: This is not part of the API and may disappear at any moment.
        from spiral.debug import show_scan

        show_scan(self._scan)
spiral/settings.py ADDED
@@ -0,0 +1,157 @@
1
+ import functools
2
+ import hashlib
3
+ import os
4
+ from collections.abc import Callable, Mapping
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ import jwt
9
+ import typer
10
+ from pydantic import BaseModel, Field
11
+ from pydantic_settings import (
12
+ BaseSettings,
13
+ InitSettingsSource,
14
+ PydanticBaseSettingsSource,
15
+ SettingsConfigDict,
16
+ TomlConfigSettingsSource,
17
+ )
18
+
19
+ from spiral.api import SpiralAPI
20
+ from spiral.authn.authn import (
21
+ DeviceAuthProvider,
22
+ EnvironmentAuthn,
23
+ FallbackAuthn,
24
+ TokenAuthn,
25
+ TokenExchangeProvider,
26
+ )
27
+ from spiral.authn.device import DeviceAuth
28
+ from spiral.authn.github_ import GitHubActionsProvider
29
+ from spiral.authn.modal_ import ModalProvider
30
+
31
+ DEV = "PYTEST_VERSION" in os.environ or bool(os.environ.get("SPIRAL_DEV", None))
32
+ APP_DIR = Path(typer.get_app_dir("pyspiral"))
33
+ LOG_DIR = APP_DIR / "logs"
34
+ CONFIG_FILE = APP_DIR / "config.toml"
35
+
36
+
37
class AuthSettings(BaseSettings):
    """Settings for the device authentication service (domain and client id).

    The client id differs between the dev and production environments.
    """

    model_config = SettingsConfigDict(frozen=True)

    domain: str = "https://device.spiraldb.com"
    client_id: str = "client_01J1CRS967RFQY7JSE8XZ44ATR" if DEV else "client_01J1CRS9MGF103FKTKZDCNJVM3"
42
+
43
+
44
class AccessToken(BaseModel):
    """A raw access token plus lazily decoded JWT claims."""

    token: str

    @functools.cached_property
    def payload(self) -> dict:
        # Signature verification is deliberately skipped — this only reads the
        # claims locally. NOTE(review): confirm it is never used for authz decisions.
        return jwt.decode(self.token, options={"verify_signature": False})
50
+
51
+
52
class SpiralDBSettings(BaseSettings):
    """Connection settings for the SpiralDB service (host, port, TLS, auth)."""

    model_config = SettingsConfigDict(frozen=True)

    host: str = "localhost" if DEV else "api.spiraldb.com"
    port: int = 4279 if DEV else 443
    ssl: bool = not DEV
    auth: AuthSettings = Field(default_factory=AuthSettings)
    # Optional pre-issued token; when set, it takes precedence over all other
    # auth providers (see Settings.authn).
    token: str | None = None

    @property
    def uri(self) -> str:
        # Base HTTP(S) URI for the REST API.
        return f"{'https' if self.ssl else 'http'}://{self.host}:{self.port}"

    @property
    def uri_scandal(self) -> str:
        # TODO(marko): Scandal will be a different service. For now, gRPC API is hosted on the SpiralDB service.
        return f"{'grpc+tls' if self.ssl else 'grpc'}://{self.host}:{self.port}"

    def device_auth(self) -> DeviceAuth:
        """Build a DeviceAuth whose state file is namespaced by auth domain + client id."""
        # md5 here only derives a stable directory name — not a security use.
        auth_file = (
            APP_DIR / hashlib.md5(f"{self.auth.domain}/{self.auth.client_id}".encode()).hexdigest() / "auth.json"
        )
        return DeviceAuth(auth_file=auth_file, domain=self.auth.domain, client_id=self.auth.client_id)
75
+
76
+
77
class Settings(BaseSettings):
    """Top-level pyspiral settings.

    Values load from environment variables (prefix ``SPIRAL__``, nested keys
    delimited by ``__``), a dotenv file, the config.toml file, and init kwargs;
    the precedence order is defined in ``settings_customise_sources``.
    """

    model_config = SettingsConfigDict(
        toml_file=CONFIG_FILE,
        env_nested_delimiter="__",
        env_prefix="SPIRAL__",
        frozen=True,
    )

    spiraldb: SpiralDBSettings = Field(default_factory=SpiralDBSettings)

    @functools.cached_property
    def api(self) -> "SpiralAPI":
        """API client bound to the configured base URL (constructed lazily, once)."""
        from spiral.api import SpiralAPI

        return SpiralAPI(self.authn, base_url=self.spiraldb.uri)

    @functools.cached_property
    def authn(self):
        """Resolve the authentication provider.

        An explicitly configured token wins outright; otherwise a fallback
        chain of providers is used (presumably tried in order — see FallbackAuthn).
        """
        if self.spiraldb.token:
            return TokenAuthn(self.spiraldb.token)

        return FallbackAuthn(
            [
                GitHubActionsProvider(),
                ModalProvider(),
                # TODO(marko): Github and Modal should also be behind token exchanged.
                TokenExchangeProvider(EnvironmentAuthn(), self.spiraldb.uri),
                DeviceAuthProvider(self.spiraldb.device_auth()),
            ]
        )

    @classmethod
    def settings_customise_sources(
        cls,
        settings_cls: type[BaseSettings],
        env_settings: PydanticBaseSettingsSource,
        dotenv_settings: PydanticBaseSettingsSource,
        init_settings: InitSettingsSource,
        **kwargs,
    ) -> tuple[PydanticBaseSettingsSource, ...]:
        # The returned order defines precedence (earlier sources win):
        # env vars, then dotenv, then config.toml, then init kwargs.
        return env_settings, dotenv_settings, TomlConfigSettingsSource(settings_cls), init_settings
118
+
119
+
120
+ class _LazyDict(Mapping):
121
+ def __init__(self, lazy_values: dict[str, Any | Callable[[], Any]]):
122
+ self._lazy_values = lazy_values
123
+
124
+ def _dict(self):
125
+ return {k: v() if callable(v) else v for k, v in self._lazy_values.items()}
126
+
127
+ def __getitem__(self, key, /):
128
+ return self._lazy_values[key]()
129
+
130
+ def get(self, key, /, default=None):
131
+ return self._lazy_values.get(key, lambda: default)()
132
+
133
+ def items(self):
134
+ return self._dict().items()
135
+
136
+ def keys(self):
137
+ return self._lazy_values.keys()
138
+
139
+ def values(self):
140
+ return self._dict().values()
141
+
142
+ def __contains__(self, key, /):
143
+ return key in self._lazy_values
144
+
145
+ def __eq__(self, other, /):
146
+ return False
147
+
148
+ def __iter__(self):
149
+ return iter(self._dict())
150
+
151
+ def __len__(self):
152
+ return len(self._lazy_values)
153
+
154
+
155
@functools.cache
def settings() -> Settings:
    """Return the process-wide Settings instance (constructed once, then cached)."""
    return Settings()