pyspiral-0.3.1-cp310-abi3-macosx_11_0_arm64.whl → pyspiral-0.4.1-cp310-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109)
  1. {pyspiral-0.3.1.dist-info → pyspiral-0.4.1.dist-info}/METADATA +9 -13
  2. pyspiral-0.4.1.dist-info/RECORD +98 -0
  3. {pyspiral-0.3.1.dist-info → pyspiral-0.4.1.dist-info}/WHEEL +1 -1
  4. spiral/__init__.py +6 -9
  5. spiral/_lib.abi3.so +0 -0
  6. spiral/adbc.py +21 -14
  7. spiral/api/__init__.py +14 -175
  8. spiral/api/admin.py +12 -26
  9. spiral/api/client.py +160 -0
  10. spiral/api/filesystems.py +100 -72
  11. spiral/api/organizations.py +45 -58
  12. spiral/api/projects.py +171 -134
  13. spiral/api/telemetry.py +19 -0
  14. spiral/api/types.py +20 -0
  15. spiral/api/workloads.py +32 -25
  16. spiral/{arrow.py → arrow_.py} +12 -0
  17. spiral/cli/__init__.py +2 -5
  18. spiral/cli/admin.py +7 -12
  19. spiral/cli/app.py +23 -6
  20. spiral/cli/console.py +1 -1
  21. spiral/cli/fs.py +82 -17
  22. spiral/cli/iceberg/__init__.py +7 -0
  23. spiral/cli/iceberg/namespaces.py +47 -0
  24. spiral/cli/iceberg/tables.py +60 -0
  25. spiral/cli/indexes/__init__.py +19 -0
  26. spiral/cli/login.py +14 -5
  27. spiral/cli/orgs.py +90 -0
  28. spiral/cli/printer.py +9 -1
  29. spiral/cli/projects.py +136 -0
  30. spiral/cli/state.py +2 -0
  31. spiral/cli/tables/__init__.py +121 -0
  32. spiral/cli/telemetry.py +18 -0
  33. spiral/cli/types.py +8 -10
  34. spiral/cli/{workload.py → workloads.py} +11 -11
  35. spiral/{catalog.py → client.py} +23 -37
  36. spiral/core/client/__init__.pyi +117 -0
  37. spiral/core/index/__init__.pyi +15 -0
  38. spiral/core/{core → table}/__init__.pyi +44 -17
  39. spiral/core/{manifests → table/manifests}/__init__.pyi +5 -23
  40. spiral/core/table/metastore/__init__.pyi +62 -0
  41. spiral/core/{spec → table/spec}/__init__.pyi +41 -66
  42. spiral/datetime_.py +27 -0
  43. spiral/expressions/__init__.py +26 -18
  44. spiral/expressions/base.py +5 -5
  45. spiral/expressions/list_.py +1 -1
  46. spiral/expressions/mp4.py +2 -9
  47. spiral/expressions/png.py +1 -1
  48. spiral/expressions/qoi.py +1 -1
  49. spiral/expressions/refs.py +3 -9
  50. spiral/expressions/struct.py +7 -5
  51. spiral/expressions/text.py +62 -0
  52. spiral/expressions/udf.py +3 -3
  53. spiral/iceberg/__init__.py +3 -0
  54. spiral/iceberg/client.py +33 -0
  55. spiral/indexes/__init__.py +5 -0
  56. spiral/indexes/client.py +137 -0
  57. spiral/indexes/index.py +34 -0
  58. spiral/indexes/scan.py +22 -0
  59. spiral/project.py +19 -110
  60. spiral/{proto → protogen}/_/scandal/__init__.py +23 -135
  61. spiral/protogen/_/spiral/table/__init__.py +22 -0
  62. spiral/protogen/substrait/__init__.py +3399 -0
  63. spiral/protogen/substrait/extensions/__init__.py +115 -0
  64. spiral/server.py +17 -0
  65. spiral/settings.py +29 -91
  66. spiral/substrait_.py +9 -5
  67. spiral/tables/__init__.py +12 -0
  68. spiral/tables/client.py +130 -0
  69. spiral/{dataset.py → tables/dataset.py} +9 -199
  70. spiral/tables/debug/manifests.py +70 -0
  71. spiral/tables/debug/metrics.py +56 -0
  72. spiral/{debug.py → tables/debug/scan.py} +6 -9
  73. spiral/{maintenance.py → tables/maintenance.py} +1 -1
  74. spiral/{scan_.py → tables/scan.py} +63 -89
  75. spiral/tables/snapshot.py +78 -0
  76. spiral/{table.py → tables/table.py} +59 -73
  77. spiral/{txn.py → tables/transaction.py} +7 -3
  78. pyspiral-0.3.1.dist-info/RECORD +0 -85
  79. spiral/api/tables.py +0 -91
  80. spiral/api/tokens.py +0 -56
  81. spiral/authn/authn.py +0 -89
  82. spiral/authn/device.py +0 -206
  83. spiral/authn/github_.py +0 -33
  84. spiral/authn/modal_.py +0 -18
  85. spiral/cli/org.py +0 -90
  86. spiral/cli/project.py +0 -109
  87. spiral/cli/table.py +0 -20
  88. spiral/cli/token.py +0 -27
  89. spiral/core/metastore/__init__.pyi +0 -91
  90. spiral/proto/_/spfs/__init__.py +0 -36
  91. spiral/proto/_/spiral/table/__init__.py +0 -276
  92. spiral/proto/_/spiraldb/metastore/__init__.py +0 -499
  93. spiral/proto/__init__.py +0 -0
  94. spiral/proto/scandal/__init__.py +0 -45
  95. spiral/proto/spiral/__init__.py +0 -0
  96. spiral/proto/spiral/table/__init__.py +0 -96
  97. {pyspiral-0.3.1.dist-info → pyspiral-0.4.1.dist-info}/entry_points.txt +0 -0
  98. /spiral/{authn/__init__.py → core/__init__.pyi} +0 -0
  99. /spiral/{core → protogen/_}/__init__.py +0 -0
  100. /spiral/{proto/_ → protogen/_/arrow}/__init__.py +0 -0
  101. /spiral/{proto/_/arrow → protogen/_/arrow/flight}/__init__.py +0 -0
  102. /spiral/{proto/_/arrow/flight → protogen/_/arrow/flight/protocol}/__init__.py +0 -0
  103. /spiral/{proto → protogen}/_/arrow/flight/protocol/sql/__init__.py +0 -0
  104. /spiral/{proto/_/arrow/flight/protocol → protogen/_/spiral}/__init__.py +0 -0
  105. /spiral/{proto → protogen/_}/substrait/__init__.py +0 -0
  106. /spiral/{proto → protogen/_}/substrait/extensions/__init__.py +0 -0
  107. /spiral/{proto/_/spiral → protogen}/__init__.py +0 -0
  108. /spiral/{proto → protogen}/util.py +0 -0
  109. /spiral/{proto/_/spiraldb → tables/debug}/__init__.py +0 -0
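
Most of this release is a package reorganization: the table-related modules move under `spiral.tables`, the Rust bindings move from `spiral.core.core`/`spiral.core.spec`/`spiral.core.manifests` to `spiral.core.table.*`, and generated protobuf code moves from `spiral/proto` to `spiral/protogen`. As a quick orientation, the import paths shift roughly as follows; this is a sketch assembled from the renames above and the import hunks below, not an exhaustive map of the 0.4.1 public API.

# pyspiral 0.3.1 (old layout)
# from spiral import Scan, Table
# from spiral.core.core import TableScan
# from spiral.core.spec import Key, KeyRange

# pyspiral 0.4.1 (new layout, as used by the hunks below)
from spiral.tables import Scan, Snapshot       # spiral/tables/scan.py, spiral/tables/snapshot.py
from spiral.core.table import TableScan        # was spiral.core.core
from spiral.core.table.spec import Key         # was spiral.core.spec
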
spiral/{dataset.py → tables/dataset.py}
@@ -4,15 +4,14 @@ import pyarrow as pa
 import pyarrow.compute as pc
 import pyarrow.dataset as ds
 
-from spiral import Scan, Table
+from spiral.tables import Scan, Snapshot
 
 
 class TableDataset(ds.Dataset):
-    def __init__(self, table: Table):
-        self._table = table
-        # Once table is converted to a dataset, used pinned snapshot.
-        self._asof = table.last_modified_at
-        self._schema: pa.Schema = table._table.get_schema(asof=self._asof).to_arrow()
+    def __init__(self, snapshot: Snapshot):
+        self._snapshot = snapshot
+        self._table = snapshot.table
+        self._schema: pa.Schema = self._snapshot._snapshot.table.get_schema(asof=self._snapshot.asof).to_arrow()
 
         # We don't actually initialize a Dataset, we just implement enough of the API
         # to fool both DuckDB and Polars.
@@ -104,23 +103,22 @@ class TableDataset(ds.Dataset):
         use_threads: bool = True,
         memory_pool: pa.MemoryPool = None,
     ) -> "TableScanner":
-        from .substrait_ import SubstraitConverter
+        from spiral.substrait_ import SubstraitConverter
 
         # Extract the substrait expression so we can convert it to a Spiral expression
         if filter is not None:
-            filter = SubstraitConverter(self._table, self._schema, self._table.key_schema).convert(
+            filter = SubstraitConverter(self._table, self._schema, self._table.key_schema.to_arrow()).convert(
                 filter.to_substrait(self._schema, allow_arrow_extensions=True),
             )
 
         scan = (
-            self._table.scan(
+            self._snapshot.scan(
                 {c: self._table[c] for c in columns},
                 where=filter,
                 exclude_keys=True,
-                asof=self._asof,
             )
             if columns
-            else self._table.scan(where=filter, asof=self._asof)
+            else self._snapshot.scan(where=filter)
         )
         self._last_scan = scan
 
@@ -250,191 +248,3 @@ class TableScanner(ds.Scanner):
 
     def to_table(self):
         return self.to_reader().read_all()
-
-
-class ScanDataset(ds.Dataset):
-    def __init__(
-        self,
-        scan: Scan,
-        key_table: pa.Table | pa.RecordBatchReader | None = None,
-    ):
-        self._scan = scan
-        self._schema: pa.Schema = scan.schema.to_arrow()
-        self._key_table = key_table
-
-        # We don't actually initialize a Dataset, we just implement enough of the API
-        # to fool both DuckDB and Polars.
-        # super().__init__()
-
-    @property
-    def schema(self) -> pa.Schema:
-        return self._schema
-
-    def count_rows(
-        self,
-        filter: pc.Expression | None = None,
-        batch_size: int | None = None,
-        batch_readahead: int | None = None,
-        fragment_readahead: int | None = None,
-        fragment_scan_options: ds.FragmentScanOptions | None = None,
-        use_threads: bool = True,
-        memory_pool: pa.MemoryPool = None,
-    ):
-        return self.scanner(
-            None,
-            filter,
-            batch_size,
-            batch_readahead,
-            fragment_readahead,
-            fragment_scan_options,
-            use_threads,
-            memory_pool,
-        ).count_rows()
-
-    def filter(self, expression: pc.Expression) -> "TableDataset":
-        raise NotImplementedError("filter not implemented")
-
-    def get_fragments(self, filter: pc.Expression | None = None):
-        """TODO(ngates): perhaps we should return ranges as per our split API?"""
-        raise NotImplementedError("get_fragments not implemented")
-
-    def head(
-        self,
-        num_rows: int,
-        columns: list[str] | None = None,
-        filter: pc.Expression | None = None,
-        batch_size: int | None = None,
-        batch_readahead: int | None = None,
-        fragment_readahead: int | None = None,
-        fragment_scan_options: ds.FragmentScanOptions | None = None,
-        use_threads: bool = True,
-        memory_pool: pa.MemoryPool = None,
-    ):
-        return self.scanner(
-            columns,
-            filter,
-            batch_size,
-            batch_readahead,
-            fragment_readahead,
-            fragment_scan_options,
-            use_threads,
-            memory_pool,
-        ).head(num_rows)
-
-    def join(
-        self,
-        right_dataset,
-        keys,
-        right_keys=None,
-        join_type=None,
-        left_suffix=None,
-        right_suffix=None,
-        coalesce_keys=True,
-        use_threads=True,
-    ):
-        raise NotImplementedError("join not implemented")
-
-    def join_asof(self, right_dataset, on, by, tolerance, right_on=None, right_by=None):
-        raise NotImplementedError("join_asof not implemented")
-
-    def replace_schema(self, schema: pa.Schema) -> "TableDataset":
-        raise NotImplementedError("replace_schema not implemented")
-
-    def scanner(
-        self,
-        columns: list[str] | None = None,
-        filter: pc.Expression | None = None,
-        batch_size: int | None = None,
-        batch_readahead: int | None = None,
-        fragment_readahead: int | None = None,
-        fragment_scan_options: ds.FragmentScanOptions | None = None,
-        use_threads: bool = True,
-        memory_pool: pa.MemoryPool = None,
-    ) -> "TableScanner":
-        if columns is not None:
-            columns = set(columns)
-            names = set(self.schema.names)
-            if len(columns - names) != 0 or len(names - columns) != 0:
-                raise NotImplementedError("columns", columns, self.schema)
-        if filter is not None:
-            raise NotImplementedError("filter")
-        if batch_size is not None:
-            raise NotImplementedError("batch_size")
-        if batch_readahead is not None:
-            raise NotImplementedError("batch_readahead")
-        if fragment_readahead is not None:
-            raise NotImplementedError("fragment_readahead")
-        if fragment_scan_options is not None:
-            raise NotImplementedError("fragment_scan_options")
-
-        return TableScanner(self._scan, key_table=self._key_table)
-
-    def sort_by(self, sorting, **kwargs):
-        raise NotImplementedError("sort_by not implemented")
-
-    def take(
-        self,
-        indices: pa.Array | Any,
-        columns: list[str] | None = None,
-        filter: pc.Expression | None = None,
-        batch_size: int | None = None,
-        batch_readahead: int | None = None,
-        fragment_readahead: int | None = None,
-        fragment_scan_options: ds.FragmentScanOptions | None = None,
-        use_threads: bool = True,
-        memory_pool: pa.MemoryPool = None,
-    ):
-        return self.scanner(
-            columns,
-            filter,
-            batch_size,
-            batch_readahead,
-            fragment_readahead,
-            fragment_scan_options,
-            use_threads,
-            memory_pool,
-        ).take(indices)
-
-    def to_batches(
-        self,
-        columns: list[str] | None = None,
-        filter: pc.Expression | None = None,
-        batch_size: int | None = None,
-        batch_readahead: int | None = None,
-        fragment_readahead: int | None = None,
-        fragment_scan_options: ds.FragmentScanOptions | None = None,
-        use_threads: bool = True,
-        memory_pool: pa.MemoryPool = None,
-    ):
-        return self.scanner(
-            columns,
-            filter,
-            batch_size,
-            batch_readahead,
-            fragment_readahead,
-            fragment_scan_options,
-            use_threads,
-            memory_pool,
-        ).to_batches()
-
-    def to_table(
-        self,
-        columns=None,
-        filter: pc.Expression | None = None,
-        batch_size: int | None = None,
-        batch_readahead: int | None = None,
-        fragment_readahead: int | None = None,
-        fragment_scan_options: ds.FragmentScanOptions | None = None,
-        use_threads: bool = True,
-        memory_pool: pa.MemoryPool = None,
-    ):
-        return self.scanner(
-            columns,
-            filter,
-            batch_size,
-            batch_readahead,
-            fragment_readahead,
-            fragment_scan_options,
-            use_threads,
-            memory_pool,
-        ).to_table()
spiral/tables/debug/manifests.py (new file)
@@ -0,0 +1,70 @@
+from spiral import datetime_
+from spiral.core.table import TableScan
+from spiral.core.table.manifests import FragmentManifest
+from spiral.tables.debug.metrics import _format_bytes
+
+
+def display_manifests(scan: TableScan):
+    """Display all manifests in a scan."""
+    if len(scan.table_ids()) != 1:
+        raise NotImplementedError("Multiple table scans are not supported.")
+    table_id = scan.table_ids()[0]
+
+    key_space_manifest: FragmentManifest = scan.key_space_scan(table_id).manifest
+    _table_of_fragments(
+        key_space_manifest,
+        title="Key Space manifest",
+    )
+
+    for column_group in scan.column_groups():
+        column_group_manifest: FragmentManifest = scan.column_group_scan(column_group).manifest
+        _table_of_fragments(
+            column_group_manifest,
+            title=f"Column Group manifest for {str(column_group)}",
+        )
+
+
+def _table_of_fragments(manifest: FragmentManifest, title: str):
+    """Display fragments in a formatted table."""
+    # Calculate summary statistics
+    total_size = sum(fragment.size_bytes for fragment in manifest)
+    total_metadata_size = sum(len(fragment.format_metadata or b"") for fragment in manifest)
+    fragment_count = len(manifest)
+    avg_size = total_size / fragment_count if fragment_count > 0 else 0
+
+    # Print title and summary
+    print(f"\n\n{title}")
+    print(
+        f"{fragment_count} fragments, "
+        f"total: {_format_bytes(total_size)}, "
+        f"avg: {_format_bytes(int(avg_size))}, "
+        f"metadata: {_format_bytes(total_metadata_size)}"
+    )
+    print("=" * 120)
+
+    # Print header
+    print(
+        f"{'ID':<30} {'Size (Metadata)':<20} {'Format':<10} {'Key Span':<10} "
+        f"{'Level':<5} {'Committed At':<20} {'Compacted At':<20}"
+    )
+    print("=" * 120)
+
+    # Print each fragment
+    for fragment in manifest:
+        committed_str = str(datetime_.from_timestamp_micros(fragment.committed_at)) if fragment.committed_at else "N/A"
+        compacted_str = str(datetime_.from_timestamp_micros(fragment.compacted_at)) if fragment.compacted_at else "N/A"
+
+        size_with_metadata = (
+            f"{_format_bytes(fragment.size_bytes)} ({_format_bytes(len(fragment.format_metadata or b''))})"
+        )
+        key_span = f"{fragment.key_span.begin}..{fragment.key_span.end}"
+
+        print(
+            f"{fragment.id:<30} "
+            f"{size_with_metadata:<20} "
+            f"{str(fragment.format):<10} "
+            f"{key_span:<10} "
+            f"{str(fragment.level):<5} "
+            f"{committed_str:<20} "
+            f"{compacted_str:<20}"
+        )
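
The new `display_manifests` helper takes the core `TableScan` handle rather than the high-level `Scan` wrapper; later in this diff, `Scan._dump_manifests()` is added as the convenience entry point that does exactly this. A minimal sketch of calling it directly (the wrapper function name here is illustrative, not part of the package):

from spiral.core.table import TableScan
from spiral.tables.debug.manifests import display_manifests


def dump_manifests(core_scan: TableScan) -> None:
    # Mirrors what Scan._dump_manifests() does further down in this diff:
    # prints the key-space manifest plus one table per column-group manifest.
    display_manifests(core_scan)
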
spiral/tables/debug/metrics.py (new file)
@@ -0,0 +1,56 @@
+from typing import Any
+
+
+def display_metrics(metrics: dict[str, Any]) -> None:
+    """Display metrics in a formatted table."""
+    print(
+        f"{'Metric':<40} {'Type':<10} {'Count':<8} {'Avg':<12} {'Min':<12} "
+        f"{'Max':<12} {'P95':<12} {'P99':<12} {'StdDev':<12}"
+    )
+    print("=" * 140)
+
+    for metric_name, data in sorted(metrics.items()):
+        metric_type = data["type"]
+        count = data["count"]
+        avg = _format_value(data["avg"], metric_type, metric_name)
+        min_val = _format_value(data["min"], metric_type, metric_name)
+        max_val = _format_value(data["max"], metric_type, metric_name)
+        p95 = _format_value(data["p95"], metric_type, metric_name)
+        p99 = _format_value(data["p99"], metric_type, metric_name)
+        stddev = _format_value(data["stddev"], metric_type, metric_name)
+
+        print(
+            f"{metric_name:<40} {metric_type:<10} {count:<8} {avg:<12} {min_val:<12} "
+            f"{max_val:<12} {p95:<12} {p99:<12} {stddev:<12}"
+        )
+
+
+def _format_duration(nanoseconds: float) -> str:
+    """Convert nanoseconds to human-readable duration."""
+    if nanoseconds >= 1_000_000_000:
+        return f"{nanoseconds / 1_000_000_000:.2f}s"
+    elif nanoseconds >= 1_000_000:
+        return f"{nanoseconds / 1_000_000:.2f}ms"
+    elif nanoseconds >= 1_000:
+        return f"{nanoseconds / 1_000:.2f}μs"
+    else:
+        return f"{nanoseconds:.0f}ns"
+
+
+def _format_bytes(bytes_value: float) -> str:
+    """Convert bytes to human-readable size."""
+    for unit in ["B", "KB", "MB", "GB"]:
+        if bytes_value < 1024:
+            return f"{bytes_value:.1f}{unit}"
+        bytes_value /= 1024
+    return f"{bytes_value:.1f}TB"
+
+
+def _format_value(value: float, metric_type: str, metric_name: str) -> str:
+    """Format a value based on metric type and name."""
+    if metric_type == "timer" or "duration" in metric_name:
+        return _format_duration(value)
+    elif "bytes" in metric_name:
+        return _format_bytes(value)
+    else:
+        return f"{value:,.0f}"
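
For reference, `display_metrics` expects a mapping from metric name to a summary dict; the key set below (`type`, `count`, `avg`, `min`, `max`, `p95`, `p99`, `stddev`) is inferred from how the function indexes each entry, and the metric names are made up for illustration. In practice the dict comes from `Scan.metrics`, which the new `Scan._dump_metrics()` helper (added at the end of this diff) feeds into this function.

from spiral.tables.debug.metrics import display_metrics

# Hypothetical payload: timer values are in nanoseconds, "*bytes*" values in bytes.
metrics = {
    "scan.read.duration": {
        "type": "timer", "count": 12, "avg": 3_200_000, "min": 900_000,
        "max": 9_800_000, "p95": 8_100_000, "p99": 9_500_000, "stddev": 2_400_000,
    },
    "scan.fragment.bytes": {
        "type": "histogram", "count": 12, "avg": 4_200_000, "min": 1_100_000,
        "max": 9_900_000, "p95": 9_000_000, "p99": 9_700_000, "stddev": 2_500_000,
    },
}

# Timer-like metrics render as durations (e.g. 3.20ms); names containing "bytes" render as sizes (e.g. 4.0MB).
display_metrics(metrics)
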
spiral/{debug.py → tables/debug/scan.py}
@@ -1,8 +1,8 @@
 from datetime import datetime
 
-from spiral.core.core import TableScan
-from spiral.core.manifests import FragmentFile, FragmentManifest
-from spiral.core.spec import Key, KeyRange
+from spiral.core.table import TableScan
+from spiral.core.table.manifests import FragmentFile, FragmentManifest
+from spiral.core.table.spec import Key
 from spiral.types_ import Timestamp
 
 
@@ -30,7 +30,7 @@ def show_scan(scan: TableScan):
         for i in range(len(cg_manifest)):
             fragment_file = cg_manifest[i]
             key_points.add(fragment_file.key_extent.min)
-            key_points.add(fragment_file.key_extent.max)
+            key_points.add(fragment_file.key_extent.max)
 
         # Make sure split points exist in all key points.
         for s in splits[:-1]:  # Don't take the last end.
@@ -44,9 +44,7 @@ def show_scan(scan: TableScan):
         show_manifest(cg_scan.manifest, scope=".".join(cg.path[1:]), key_points=key_points, splits=splits)
 
 
-def show_manifest(
-    manifest: FragmentManifest, scope: str = None, key_points: list[Key] = None, splits: list[KeyRange] = None
-):
+def show_manifest(manifest: FragmentManifest, scope: str = None, key_points: list[Key] = None, splits: list = None):
     try:
         import matplotlib.patches as patches
         import matplotlib.pyplot as plt
@@ -157,10 +155,9 @@ def _get_fragment_legend(manifest_file: FragmentFile):
            f"key_min: {manifest_file.key_extent.min}",
            f"key_max: {manifest_file.key_extent.max}",
            f"format: {manifest_file.format}",
-            f"level: {manifest_file.fs_level}",
+            f"level: {manifest_file.level}",
            f"committed_at: {_format_timestamp(manifest_file.committed_at)}",
            f"compacted_at: {_format_timestamp(manifest_file.compacted_at)}",
-            f"fs_id: {manifest_file.fs_id}",
            f"ks_id: {manifest_file.ks_id}",
        ]
    )
spiral/{maintenance.py → tables/maintenance.py}
@@ -1,4 +1,4 @@
-from spiral.core.core import TableMaintenance
+from spiral.core.table import TableMaintenance
 
 
 class Maintenance:
spiral/{scan_.py → tables/scan.py}
@@ -1,57 +1,19 @@
 from collections.abc import Iterator
-from datetime import datetime
 from typing import TYPE_CHECKING, Any
 
 import pyarrow as pa
-from opentelemetry import trace
+from datasets import DatasetInfo, Features
 
-from spiral.core.core import TableScan
-from spiral.core.spec import KeyRange, Schema
-from spiral.expressions.base import ExprLike
+from spiral.core.table import KeyRange, TableScan
+from spiral.core.table.spec import Schema
+from spiral.settings import CI, DEV
 
 if TYPE_CHECKING:
     import dask.dataframe as dd
     import pandas as pd
     import polars as pl
-    import pyarrow
-    import pyarrow.dataset
     from datasets import iterable_dataset
 
-tracer = trace.get_tracer("pyspiral.client.scan")
-
-
-def scan(
-    *projections: ExprLike,
-    where: ExprLike | None = None,
-    asof: datetime | int | str = None,
-    exclude_keys: bool = False,
-) -> "Scan":
-    """Starts a read transaction on the spiral.
-
-    Args:
-        projections: a set of expressions that return struct arrays.
-        where: a query expression to apply to the data.
-        asof: only data written before the given timestamp will be returned, caveats around compaction.
-        exclude_keys: whether to exclude the key columns in the scan result, defaults to False.
-            Note that if a projection includes a key column, it will be included in the result.
-    """
-    from spiral import expressions as se
-
-    # Combine all projections into a single struct.
-    projection = se.merge(*projections)
-    if where is not None:
-        where = se.lift(where)
-
-    return Scan(
-        TableScan(
-            projection.__expr__,
-            filter=where.__expr__ if where else None,
-            asof=asof,
-            exclude_keys=exclude_keys,
-        ),
-        # config=config,
-    )
-
 
 
 class Scan:
     """Scan object."""
@@ -83,20 +45,6 @@ class Scan:
         """
         return self._scan.is_empty()
 
-    def to_dataset(
-        self,
-        key_table: pa.Table | pa.RecordBatchReader | None = None,
-    ) -> "pyarrow.dataset.Dataset":
-        """Returns a PyArrow Dataset representing the scan.
-
-        Args:
-            key_table: a table of keys to "take" (including aux columns for cell-push-down).
-                If None, the scan will be executed without a key table.
-        """
-        from .dataset import ScanDataset
-
-        return ScanDataset(self, key_table=key_table)
-
     def to_record_batches(
         self,
         key_table: pa.Table | pa.RecordBatchReader | None = None,
@@ -133,6 +81,11 @@ class Scan:
             key_table: a table of keys to "take" (including aux columns for cell-push-down).
                 If None, the scan will be executed without a key table.
         """
+        # NOTE: Evaluates fully on Rust side which improved debuggability.
+        if DEV and not CI and key_table is None:
+            rb = self._scan.to_record_batch()
+            return pa.Table.from_batches([rb])
+
         return self.to_record_batches(key_table=key_table).read_all()
 
     def to_dask(self) -> "dd.DataFrame":
@@ -150,70 +103,91 @@ class Scan:
         # Fetch a set of partition ranges
         return dd.from_map(_read_key_range, self.split())
 
-    def to_pandas(
-        self,
-        key_table: pa.Table | pa.RecordBatchReader | None = None,
-    ) -> "pd.DataFrame":
+    def to_pandas(self) -> "pd.DataFrame":
         """Read into a Pandas DataFrame.
 
         Requires the `pandas` package to be installed.
-
-        Args:
-            key_table: a table of keys to "take" (including aux columns for cell-push-down).
-                If None, the scan will be executed without a key table.
         """
-        return self.to_table(key_table=key_table).to_pandas()
+        return self.to_table().to_pandas()
 
-    def to_polars(self, key_table: pa.Table | pa.RecordBatchReader | None = None) -> "pl.LazyFrame":
-        """Read into a Polars LazyFrame.
+    def to_polars(self) -> "pl.DataFrame":
+        """Read into a Polars DataFrame.
 
         Requires the `polars` package to be installed.
-
-        Args:
-            key_table: a table of keys to "take" (including aux columns for cell-push-down).
-                If None, the scan will be executed without a key table.
         """
         import polars as pl
 
-        return pl.scan_pyarrow_dataset(self.to_dataset(key_table=key_table))
+        # TODO(marko): This should support lazy dataframe.
+        return pl.from_arrow(self.to_record_batches())
 
     def to_pytorch(
         self,
-        key_table: pa.Table | pa.RecordBatchReader | None = None,
         batch_readahead: int | None = None,
+        shuffle_batch_size: int | None = None,
+        shuffle_pool_num_rows: int | None = None,
     ) -> "iterable_dataset.IterableDataset":
-        """Returns an iterable dataset that can be used to build a `pytorch.DataLoader`.
-
-        Requires the `datasets` package to be installed.
+        """Returns an iterable dataset that can be used to build a PyTorch DataLoader.
 
         Args:
-            key_table: a table of keys to "take" (including aux columns for cell-push-down).
-                If None, the scan will be executed without a key table.
-            batch_readahead: the number of batches to prefetch in the background.
+            batch_readahead: Number of batches to prefetch in the background.
+            shuffle_batch_size: read granularity of number of rows for a shuffled scan. If left as
+                None along with shuffle_pool_num_rows=None, shuffling is disabled.
+            shuffle_pool_num_rows: Pool size for shuffling batches.
         """
         from datasets.iterable_dataset import ArrowExamplesIterable, IterableDataset
 
         def _generate_tables(**kwargs) -> Iterator[tuple[int, pa.Table]]:
-            # Use batch size 1 when iterating samples, unless batch reader is already used.
-            stream = self.to_record_batches(
-                key_table, batch_size=1 if isinstance(key_table, pa.Table) else None, batch_readahead=batch_readahead
-            )
+            if shuffle_batch_size is None and shuffle_pool_num_rows is None:
+                stream = self.to_record_batches(
+                    batch_readahead=batch_readahead,
+                )
+            else:
+                stream = self._scan.to_shuffled_record_batches(
+                    batch_readahead, shuffle_batch_size, shuffle_pool_num_rows
+                )
 
             # This key is unused when training with IterableDataset.
             # Default implementation returns shard id, e.g. parquet row group id.
             for i, rb in enumerate(stream):
                 yield i, pa.Table.from_batches([rb], stream.schema)
 
-        # NOTE: Type annotation Callable[..., tuple[str, pa.Table]] is wrong. The return value must be iterable.
+        def _hf_compatible_schema(schema: pa.Schema) -> pa.Schema:
+            """
+            Replace string-view columns in the schema with strings. We do use this converted schema
+            as Features in the returned Dataset.
+            Remove this method once we have https://github.com/huggingface/datasets/pull/7718
+            """
+            new_fields = [
+                pa.field(field.name, pa.string(), nullable=field.nullable, metadata=field.metadata)
+                if field.type == pa.string_view()
+                else field
+                for field in schema
+            ]
+            return pa.schema(new_fields)
+
+        # NOTE: generate_tables_fn type annotations are wrong, return type must be an iterable of tuples.
         ex_iterable = ArrowExamplesIterable(generate_tables_fn=_generate_tables, kwargs={})
-        return IterableDataset(ex_iterable=ex_iterable)
+        info = DatasetInfo(features=Features.from_arrow_schema(_hf_compatible_schema(self.schema.to_arrow())))
+        return IterableDataset(ex_iterable=ex_iterable, info=info)
 
-    def split(self) -> list[KeyRange]:
+    def _split(self) -> list[KeyRange]:
+        # Splits the scan into a set of key ranges.
         return self._scan.split()
 
-    def debug(self):
+    def _debug(self):
         # Visualizes the scan, mainly for debugging purposes.
-        # NOTE: This is not part of the API and may disappear at any moment.
-        from spiral.debug import show_scan
+        from spiral.tables.debug.scan import show_scan
 
         show_scan(self._scan)
+
+    def _dump_manifests(self):
+        # Print manifests in a human-readable format.
+        from spiral.tables.debug.manifests import display_manifests
+
+        display_manifests(self._scan)
+
+    def _dump_metrics(self):
+        # Print metrics in a human-readable format.
+        from spiral.tables.debug.metrics import display_metrics
+
+        display_metrics(self.metrics)
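
Taken together, the `Scan` read surface in 0.4.1 drops the `key_table` argument from `to_pandas`, `to_polars`, and `to_pytorch`, returns an eager Polars `DataFrame` instead of a `LazyFrame`, and adds opt-in shuffling for PyTorch training via the new `to_shuffled_record_batches` path. A minimal sketch of the new PyTorch usage (assumes the `torch` and `datasets` packages are installed; how the `Scan` itself is constructed is outside this diff):

from torch.utils.data import DataLoader

from spiral.tables import Scan


def make_loader(scan: Scan) -> DataLoader:
    # With both shuffle_* arguments set, the iterator uses the shuffled stream;
    # leaving them as None keeps the plain to_record_batches() path.
    ds = scan.to_pytorch(
        batch_readahead=4,
        shuffle_batch_size=1_024,      # rows per shuffled read
        shuffle_pool_num_rows=65_536,  # size of the in-memory shuffle pool
    )
    return DataLoader(ds, batch_size=32)
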