pyspiral-0.2.5-cp310-abi3-macosx_11_0_arm64.whl → pyspiral-0.4.0-cp310-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. {pyspiral-0.2.5.dist-info → pyspiral-0.4.0.dist-info}/METADATA +12 -14
  2. pyspiral-0.4.0.dist-info/RECORD +98 -0
  3. {pyspiral-0.2.5.dist-info → pyspiral-0.4.0.dist-info}/WHEEL +1 -1
  4. spiral/__init__.py +6 -7
  5. spiral/_lib.abi3.so +0 -0
  6. spiral/adbc.py +21 -14
  7. spiral/api/__init__.py +15 -172
  8. spiral/api/admin.py +12 -26
  9. spiral/api/client.py +160 -0
  10. spiral/api/filesystems.py +100 -72
  11. spiral/api/organizations.py +45 -58
  12. spiral/api/projects.py +171 -134
  13. spiral/api/telemetry.py +19 -0
  14. spiral/api/types.py +20 -0
  15. spiral/api/workloads.py +32 -25
  16. spiral/{arrow.py → arrow_.py} +12 -0
  17. spiral/cli/__init__.py +2 -5
  18. spiral/cli/admin.py +7 -12
  19. spiral/cli/app.py +23 -6
  20. spiral/cli/console.py +1 -1
  21. spiral/cli/fs.py +83 -18
  22. spiral/cli/iceberg/__init__.py +7 -0
  23. spiral/cli/iceberg/namespaces.py +47 -0
  24. spiral/cli/iceberg/tables.py +60 -0
  25. spiral/cli/indexes/__init__.py +19 -0
  26. spiral/cli/login.py +14 -5
  27. spiral/cli/orgs.py +90 -0
  28. spiral/cli/printer.py +9 -1
  29. spiral/cli/projects.py +136 -0
  30. spiral/cli/state.py +2 -0
  31. spiral/cli/tables/__init__.py +121 -0
  32. spiral/cli/telemetry.py +18 -0
  33. spiral/cli/types.py +8 -10
  34. spiral/cli/{workload.py → workloads.py} +11 -11
  35. spiral/{catalog.py → client.py} +22 -21
  36. spiral/core/client/__init__.pyi +117 -0
  37. spiral/core/index/__init__.pyi +15 -0
  38. spiral/core/table/__init__.pyi +108 -0
  39. spiral/core/{manifests → table/manifests}/__init__.pyi +5 -23
  40. spiral/core/table/metastore/__init__.pyi +62 -0
  41. spiral/core/{spec → table/spec}/__init__.pyi +49 -92
  42. spiral/datetime_.py +27 -0
  43. spiral/expressions/__init__.py +40 -17
  44. spiral/expressions/base.py +5 -5
  45. spiral/expressions/list_.py +1 -1
  46. spiral/expressions/mp4.py +62 -0
  47. spiral/expressions/png.py +18 -0
  48. spiral/expressions/qoi.py +18 -0
  49. spiral/expressions/refs.py +23 -9
  50. spiral/expressions/struct.py +7 -5
  51. spiral/expressions/text.py +62 -0
  52. spiral/expressions/tiff.py +88 -88
  53. spiral/expressions/udf.py +3 -3
  54. spiral/iceberg/__init__.py +3 -0
  55. spiral/iceberg/client.py +33 -0
  56. spiral/indexes/__init__.py +5 -0
  57. spiral/indexes/client.py +137 -0
  58. spiral/indexes/index.py +34 -0
  59. spiral/indexes/scan.py +22 -0
  60. spiral/project.py +19 -110
  61. spiral/{proto → protogen}/_/scandal/__init__.py +32 -77
  62. spiral/protogen/_/spiral/table/__init__.py +22 -0
  63. spiral/protogen/substrait/__init__.py +3399 -0
  64. spiral/protogen/substrait/extensions/__init__.py +115 -0
  65. spiral/server.py +17 -0
  66. spiral/settings.py +31 -87
  67. spiral/substrait_.py +10 -6
  68. spiral/tables/__init__.py +12 -0
  69. spiral/tables/client.py +130 -0
  70. spiral/{dataset.py → tables/dataset.py} +36 -25
  71. spiral/tables/debug/manifests.py +70 -0
  72. spiral/tables/debug/metrics.py +56 -0
  73. spiral/{debug.py → tables/debug/scan.py} +6 -9
  74. spiral/tables/maintenance.py +12 -0
  75. spiral/tables/scan.py +193 -0
  76. spiral/tables/snapshot.py +78 -0
  77. spiral/tables/table.py +157 -0
  78. spiral/tables/transaction.py +52 -0
  79. pyspiral-0.2.5.dist-info/RECORD +0 -81
  80. spiral/api/tables.py +0 -94
  81. spiral/api/tokens.py +0 -56
  82. spiral/authn/authn.py +0 -89
  83. spiral/authn/device.py +0 -206
  84. spiral/authn/github_.py +0 -33
  85. spiral/authn/modal_.py +0 -18
  86. spiral/cli/org.py +0 -90
  87. spiral/cli/project.py +0 -107
  88. spiral/cli/table.py +0 -20
  89. spiral/cli/token.py +0 -27
  90. spiral/config.py +0 -26
  91. spiral/core/core/__init__.pyi +0 -53
  92. spiral/core/metastore/__init__.pyi +0 -91
  93. spiral/proto/_/spfs/__init__.py +0 -36
  94. spiral/proto/_/spiral/table/__init__.py +0 -225
  95. spiral/proto/_/spiraldb/metastore/__init__.py +0 -499
  96. spiral/proto/__init__.py +0 -0
  97. spiral/proto/scandal/__init__.py +0 -45
  98. spiral/proto/spiral/__init__.py +0 -0
  99. spiral/proto/spiral/table/__init__.py +0 -96
  100. spiral/scan_.py +0 -168
  101. spiral/table.py +0 -157
  102. {pyspiral-0.2.5.dist-info → pyspiral-0.4.0.dist-info}/entry_points.txt +0 -0
  103. /spiral/{authn/__init__.py → core/__init__.pyi} +0 -0
  104. /spiral/{core → protogen/_}/__init__.py +0 -0
  105. /spiral/{proto/_ → protogen/_/arrow}/__init__.py +0 -0
  106. /spiral/{proto/_/arrow → protogen/_/arrow/flight}/__init__.py +0 -0
  107. /spiral/{proto/_/arrow/flight → protogen/_/arrow/flight/protocol}/__init__.py +0 -0
  108. /spiral/{proto → protogen}/_/arrow/flight/protocol/sql/__init__.py +0 -0
  109. /spiral/{proto/_/arrow/flight/protocol → protogen/_/spiral}/__init__.py +0 -0
  110. /spiral/{proto → protogen/_}/substrait/__init__.py +0 -0
  111. /spiral/{proto → protogen/_}/substrait/extensions/__init__.py +0 -0
  112. /spiral/{proto/_/spiral → protogen}/__init__.py +0 -0
  113. /spiral/{proto → protogen}/util.py +0 -0
  114. /spiral/{proto/_/spiraldb → tables/debug}/__init__.py +0 -0
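Taken together, the renames above describe a package restructuring: the generated spiral.proto tree becomes spiral.protogen, the native bindings under spiral/core are regrouped into client, index, and table submodules, and the top-level catalog.py, scan_.py, table.py, and dataset.py modules are consolidated into a new spiral.tables package, with spiral.indexes and spiral.iceberg added alongside. As a rough module-level migration map, read off the listing above rather than verified against the 0.4.0 wheel:

import spiral.client          # was spiral.catalog
import spiral.arrow_          # was spiral.arrow
import spiral.tables.scan     # replaces the deleted spiral.scan_
import spiral.tables.table    # replaces the deleted spiral.table
import spiral.tables.dataset  # was spiral.dataset
import spiral.protogen        # was spiral.proto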
spiral/scan_.py DELETED
@@ -1,168 +0,0 @@
-from collections.abc import Iterator
-from datetime import datetime
-from typing import TYPE_CHECKING, Any
-
-import pyarrow as pa
-from opentelemetry import trace
-
-from spiral.core.core import TableScan
-from spiral.core.spec import KeyRange, Schema
-from spiral.expressions.base import ExprLike
-
-if TYPE_CHECKING:
-    import dask.dataframe as dd
-    import pandas as pd
-    import polars as pl
-    from datasets import iterable_dataset
-
-tracer = trace.get_tracer("pyspiral.client.scan")
-
-
-def scan(
-    *projections: ExprLike,
-    where: ExprLike | None = None,
-    asof: datetime | int | str = None,
-    exclude_keys: bool = False,
-    # TODO(marko): Support config.
-    # config: Config | None = None,
-) -> "Scan":
-    """Starts a read transaction on the spiral.
-
-    Args:
-        projections: a set of expressions that return struct arrays.
-        where: a query expression to apply to the data.
-        asof: only data written before the given timestamp will be returned, caveats around compaction.
-        exclude_keys: whether to exclude the key columns in the scan result, defaults to False.
-    """
-    from spiral import expressions as se
-
-    # Combine all projections into a single struct.
-    projection = se.merge(*projections)
-    if where is not None:
-        where = se.lift(where)
-
-    return Scan(
-        TableScan(
-            projection.__expr__,
-            filter=where.__expr__ if where else None,
-            asof=asof,
-            exclude_keys=exclude_keys,
-        ),
-        # config=config,
-    )
-
-
-class Scan:
-    """Scan object."""
-
-    def __init__(
-        self,
-        scan: TableScan,
-        # TODO(marko): Support config.
-        # config: Config | None = None,
-    ):
-        # NOTE(ngates): this API is a little weird. e.g. if the query doesn't define an asof, it is resolved
-        # when we wrap it into a core.Scan. Should we expose a Query object in the Python API that's reusable
-        # and will re-resolve the asof? Or should we just expose a scan that fixes the asof at construction time?
-        self._scan = scan
-
-    @property
-    def metrics(self) -> dict[str, Any]:
-        """Returns metrics about the scan."""
-        return self._scan.metrics()
-
-    @property
-    def schema(self) -> Schema:
-        """Returns the schema of the scan."""
-        return self._scan.schema()
-
-    def is_empty(self) -> bool:
-        """Check if the Spiral is empty for the given key range.
-
-        **IMPORTANT**: False negatives are possible, but false positives are not,
-        i.e. is_empty can return False and scan can return zero rows.
-        """
-        return self._scan.is_empty()
-
-    def to_record_batches(self, key_table: pa.Table | pa.RecordBatchReader | None = None) -> pa.RecordBatchReader:
-        """Read as a stream of RecordBatches.
-
-        Args:
-            key_table: a table of keys to "take" (including aux columns for cell-push-down).
-        """
-        if isinstance(key_table, pa.RecordBatchReader):
-            raise NotImplementedError("RecordBatchReader is not supported as key_table")
-
-        # Prefix non-key columns in the key table with # (auxiliary) to avoid conflicts with the scan schema.
-        if key_table is not None:
-            key_columns = list(self._scan.key_schema().to_arrow().names)
-            key_table = key_table.rename_columns(
-                {name: f"#{name}" if name not in key_columns else name for name in key_table.schema.names}
-            )
-
-        return self._scan.to_record_batches(aux_table=key_table)
-
-    def to_table(self) -> pa.Table:
-        """Read into a single PyArrow Table."""
-        return self.to_record_batches().read_all()
-
-    def to_dask(self) -> "dd.DataFrame":
-        """Read into a Dask DataFrame.
-
-        Requires the `dask` package to be installed.
-        """
-        import dask.dataframe as dd
-        import pandas as pd
-
-        def _read_key_range(key_range: KeyRange) -> pd.DataFrame:
-            # TODO(ngates): we need a way to preserve the existing asofs? Should we copy CoreScan instead of Query?
-            raise NotImplementedError()
-
-        # Fetch a set of partition ranges
-        return dd.from_map(_read_key_range, self.split())
-
-    def to_pandas(self) -> "pd.DataFrame":
-        """Read into a Pandas DataFrame.
-
-        Requires the `pandas` package to be installed.
-        """
-        return self.to_table().to_pandas()
-
-    def to_polars(self) -> "pl.DataFrame":
-        """Read into a Polars DataFrame.
-
-        Requires the `polars` package to be installed.
-        """
-        import polars as pl
-
-        # TODO(ngates): PR PyArrow to support lazy datasets
-        return pl.from_arrow(self.to_record_batches())
-
-    def to_pytorch(self) -> "iterable_dataset.IterableDataset":
-        """Returns an iterable dataset that can be used to build a `pytorch.DataLoader`.
-
-        Requires the `datasets` package to be installed.
-        """
-        from datasets.iterable_dataset import ArrowExamplesIterable, IterableDataset
-
-        def _generate_tables(**kwargs) -> Iterator[tuple[int, pa.Table]]:
-            stream = self.to_record_batches()
-
-            # This key is unused when training with IterableDataset.
-            # Default implementation returns shard id, e.g. parquet row group id.
-            for i, rb in enumerate(stream):
-                yield i, pa.Table.from_batches([rb], stream.schema)
-
-        # NOTE: Type annotation Callable[..., tuple[str, pa.Table]] is wrong. The return value must be iterable.
-        ex_iterable = ArrowExamplesIterable(generate_tables_fn=_generate_tables, kwargs={})
-        return IterableDataset(ex_iterable=ex_iterable)
-
-    def split(self) -> list[KeyRange]:
-        return self._scan.split()
-
-    def debug(self):
-        # Visualizes the scan, mainly for debugging purposes.
-        # NOTE: This is not part of the API and may disappear at any moment.
-        from spiral.debug import show_scan
-
-        show_scan(self._scan)
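For reference, a minimal usage sketch of the Scan API removed here, assembled from the docstrings above. It assumes `t` is a 0.2.5 spiral.table.Table (a struct-valued expression); in 0.4.0 the equivalent functionality lives in spiral/tables/scan.py.

from spiral.scan_ import scan

s = scan(t, exclude_keys=False)     # any struct-valued expressions work as projections
print(s.schema)                     # resolved scan schema
if not s.is_empty():                # note: False here still does not guarantee rows
    reader = s.to_record_batches()  # streaming pyarrow.RecordBatchReader
    tbl = s.to_table()              # or materialize a single pyarrow.Table
ranges = s.split()                  # KeyRanges for partitioned/distributed reads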
spiral/table.py DELETED
@@ -1,157 +0,0 @@
-from datetime import datetime
-from typing import TYPE_CHECKING, Literal
-
-import pyarrow as pa
-
-from spiral import expressions as se
-from spiral.config import FILE_FORMAT, Config
-from spiral.core.core import Table as CoreTable
-from spiral.core.core import flush_wal, write
-from spiral.expressions.base import Expr, ExprLike
-
-if TYPE_CHECKING:
-    import duckdb
-    import polars as pl
-    import pyarrow.dataset
-
-    from spiral.scan_ import Scan
-
-
-class Table(Expr):
-    """API for interacting with a SpiralDB's Table.
-
-    Different catalog implementations should ultimately construct a Table object.
-    """
-
-    def __init__(
-        self,
-        table: CoreTable,
-        name: str | None = None,
-    ):
-        super().__init__(table.__expr__)
-
-        self._table = table
-        self._name = name or self._table.id
-        self._key_schema = self._table.key_schema.to_arrow()
-        self._key_columns = set(self._key_schema.names)
-
-    @property
-    def table_id(self) -> str:
-        return self._table.id
-
-    @property
-    def last_modified_at(self) -> int:
-        return self._table.get_wal(asof=None).last_modified_at
-
-    def __str__(self):
-        return self._name
-
-    def __repr__(self):
-        return f'Table("{self._name}")'
-
-    def __getitem__(self, item: str) -> Expr:
-        from spiral import expressions as se
-
-        if item in self._key_columns:
-            return se.var(name=item)
-
-        return super().__getitem__(item)
-
-    def select(self, *paths: str, exclude: list[str] = None) -> "Expr":
-        # Override an expression select in the root column group to split between keys and columns.
-        if exclude is not None:
-            if set(exclude) & self._key_columns:
-                raise ValueError(
-                    "Cannot use 'exclude' arg with key columns. Use 'exclude_keys' and an explicit select of keys."
-                )
-
-        key_paths = set(paths) & self._key_columns
-        other_paths = set(paths) - key_paths
-        if not key_paths:
-            return super().select(*paths, exclude=exclude)
-
-        from spiral import expressions as se
-
-        return se.merge(se.pack({key: se.var(key) for key in key_paths}), super().select(*other_paths, exclude=exclude))
-
-    @property
-    def key_schema(self) -> pa.Schema:
-        """Returns the key schema of the table."""
-        return self._key_schema
-
-    @property
-    def schema(self) -> pa.Schema:
-        """Returns the FULL schema of the table.
-
-        NOTE: This can be expensive for large tables.
-        """
-        return self._table.get_schema(asof=None)
-
-    def to_dataset(self) -> "pyarrow.dataset.Dataset":
-        """Returns a PyArrow Dataset representing the table."""
-        from .dataset import TableDataset
-
-        return TableDataset(self)
-
-    def to_polars(self) -> "pl.LazyFrame":
-        """Returns a Polars LazyFrame for the Spiral table."""
-        import polars as pl
-
-        return pl.scan_pyarrow_dataset(self.to_dataset())
-
-    def to_duckdb(self) -> "duckdb.DuckDBPyRelation":
-        """Returns a DuckDB relation for the Spiral table."""
-        import duckdb
-
-        return duckdb.from_arrow(self.to_dataset())
-
-    def scan(
-        self,
-        *projections: ExprLike,
-        where: ExprLike | None = None,
-        asof: datetime | int | str = None,
-        exclude_keys: bool = False,
-        # TODO(marko): Support config.
-        # config: Config | None = None,
-    ) -> "Scan":
-        """Reads the table. If projections are not provided, the entire table is read.
-
-        See `spiral.scan` for more information.
-        """
-        from spiral.scan_ import scan
-
-        if not projections:
-            projections = [self]
-
-        return scan(
-            *projections,
-            where=where,
-            asof=asof,
-            exclude_keys=exclude_keys,
-            # config=config,
-        )
-
-    # NOTE: "vortex" is valid format. We don't want that visible in the API docs.
-    def write(
-        self,
-        expr: ExprLike,
-        *,
-        format: Literal["parquet"] | None = None,
-        # TODO(joe): support group_by, and config
-        config: Config | None = None,
-    ) -> None:
-        """Write an item to the table inside a single transaction.
-
-        :param expr: The expression to write. Must evaluate to a struct array.
-        :param format: the format to write the data in. Defaults to "parquet".
-        :param config: The configuration to use for this write.
-        """
-        write(
-            self._table,
-            se.lift(expr).__expr__,
-            format=format or FILE_FORMAT,
-            partition_size=config.partition_file_min_size if config else None,
-        )
-        # Flush the WAL if configured.
-        if config is not None and config.flush_wal_on_write:
-            flush_wal(self._table, manifest_format=format or FILE_FORMAT)
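And a similarly hedged sketch of the removed Table API, following its docstrings. How `table` is constructed is elided here (it came from a catalog implementation), the key column name "key" is illustrative, and `se.lift` accepting a plain PyArrow table is an assumption based on the ExprLike alias rather than something this diff confirms.

import pyarrow as pa

table.write(
    pa.table({"key": [1, 2], "value": ["a", "b"]}),  # assumed liftable to a struct expression
    format="parquet",                                # the documented default
)
result = table.scan(table["value"]).to_table()       # projection delegates to spiral.scan_
lazy = table.to_polars()                             # LazyFrame over the PyArrow Dataset view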