deltacat 2.0.0b9__py3-none-any.whl → 2.0.0b11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175)
  1. deltacat/__init__.py +41 -16
  2. deltacat/api.py +478 -123
  3. deltacat/aws/s3u.py +2 -2
  4. deltacat/benchmarking/benchmark_engine.py +4 -2
  5. deltacat/benchmarking/conftest.py +1 -1
  6. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  7. deltacat/catalog/__init__.py +62 -5
  8. deltacat/catalog/main/impl.py +26 -10
  9. deltacat/catalog/model/catalog.py +165 -109
  10. deltacat/catalog/model/properties.py +25 -24
  11. deltacat/compute/__init__.py +14 -0
  12. deltacat/compute/converter/constants.py +5 -0
  13. deltacat/compute/converter/converter_session.py +78 -36
  14. deltacat/compute/converter/model/convert_input.py +24 -4
  15. deltacat/compute/converter/model/convert_result.py +61 -0
  16. deltacat/compute/converter/model/converter_session_params.py +52 -10
  17. deltacat/compute/converter/pyiceberg/overrides.py +181 -62
  18. deltacat/compute/converter/steps/convert.py +84 -36
  19. deltacat/compute/converter/steps/dedupe.py +25 -4
  20. deltacat/compute/converter/utils/convert_task_options.py +42 -13
  21. deltacat/compute/converter/utils/iceberg_columns.py +5 -0
  22. deltacat/compute/converter/utils/io.py +82 -11
  23. deltacat/compute/converter/utils/s3u.py +13 -4
  24. deltacat/compute/jobs/client.py +406 -0
  25. deltacat/constants.py +5 -6
  26. deltacat/env.py +10 -0
  27. deltacat/examples/basic_logging.py +6 -6
  28. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  29. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  30. deltacat/examples/hello_world.py +4 -2
  31. deltacat/examples/indexer/indexer.py +163 -0
  32. deltacat/examples/indexer/job_runner.py +198 -0
  33. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  34. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  35. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +27 -9
  36. deltacat/{storage → experimental/storage}/iceberg/iceberg_scan_planner.py +1 -1
  37. deltacat/{storage → experimental/storage}/iceberg/impl.py +1 -1
  38. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  39. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  40. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -9
  41. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  42. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  43. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  44. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  45. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  46. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  47. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  48. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  49. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -1
  50. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  51. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  52. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  53. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  54. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  55. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  56. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  57. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  58. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  59. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +4 -4
  60. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  61. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  62. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  63. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  64. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  65. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  66. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  67. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  68. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  69. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  70. deltacat/io/__init__.py +13 -0
  71. deltacat/io/dataset/__init__.py +0 -0
  72. deltacat/io/dataset/deltacat_dataset.py +91 -0
  73. deltacat/io/datasink/__init__.py +0 -0
  74. deltacat/io/datasink/deltacat_datasink.py +207 -0
  75. deltacat/io/datasource/__init__.py +0 -0
  76. deltacat/io/datasource/deltacat_datasource.py +580 -0
  77. deltacat/io/reader/__init__.py +0 -0
  78. deltacat/io/reader/deltacat_read_api.py +172 -0
  79. deltacat/storage/__init__.py +2 -0
  80. deltacat/storage/model/expression/__init__.py +47 -0
  81. deltacat/storage/model/expression/expression.py +656 -0
  82. deltacat/storage/model/expression/visitor.py +248 -0
  83. deltacat/storage/model/metafile.py +74 -42
  84. deltacat/storage/model/scan/push_down.py +32 -5
  85. deltacat/storage/model/shard.py +6 -2
  86. deltacat/storage/model/types.py +5 -3
  87. deltacat/tests/_io/reader/__init__.py +0 -0
  88. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  89. deltacat/tests/catalog/data/__init__.py +0 -0
  90. deltacat/tests/catalog/main/__init__.py +0 -0
  91. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  92. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +436 -0
  93. deltacat/tests/catalog/model/__init__.py +0 -0
  94. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  95. deltacat/tests/catalog/test_catalogs.py +52 -98
  96. deltacat/tests/catalog/test_default_catalog_impl.py +1 -2
  97. deltacat/tests/compute/converter/test_convert_session.py +209 -46
  98. deltacat/tests/daft/__init__.py +0 -0
  99. deltacat/tests/daft/test_model.py +97 -0
  100. deltacat/tests/experimental/__init__.py +0 -0
  101. deltacat/tests/experimental/catalog/__init__.py +0 -0
  102. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  103. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  104. deltacat/tests/experimental/daft/__init__.py +0 -0
  105. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  106. deltacat/tests/experimental/storage/__init__.py +0 -0
  107. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  108. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  109. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  110. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  111. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  112. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  113. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  114. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  115. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  116. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  117. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  118. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  119. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  120. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  121. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  122. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  123. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  124. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  125. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  126. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  127. deltacat/tests/local_deltacat_storage/__init__.py +1 -0
  128. deltacat/tests/storage/model/test_expression.py +327 -0
  129. deltacat/tests/storage/model/test_shard.py +3 -1
  130. deltacat/tests/test_deltacat_api.py +50 -9
  131. deltacat/types/media.py +141 -43
  132. deltacat/types/tables.py +35 -7
  133. deltacat/utils/daft.py +531 -5
  134. deltacat/utils/export.py +3 -1
  135. deltacat/utils/filesystem.py +39 -9
  136. deltacat/utils/polars.py +128 -0
  137. deltacat/utils/pyarrow.py +151 -15
  138. deltacat/utils/ray_utils/concurrency.py +1 -1
  139. deltacat/utils/ray_utils/runtime.py +56 -4
  140. deltacat/utils/url.py +1284 -0
  141. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/METADATA +11 -9
  142. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/RECORD +168 -123
  143. deltacat/catalog/iceberg/__init__.py +0 -4
  144. deltacat/daft/daft_scan.py +0 -111
  145. deltacat/daft/model.py +0 -258
  146. deltacat/examples/common/fixtures.py +0 -15
  147. deltacat/storage/rivulet/__init__.py +0 -11
  148. deltacat/storage/rivulet/feather/__init__.py +0 -5
  149. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  150. /deltacat/{daft → compute/jobs}/__init__.py +0 -0
  151. /deltacat/examples/{common → experimental}/__init__.py +0 -0
  152. /deltacat/examples/{iceberg → experimental/iceberg}/__init__.py +0 -0
  153. /deltacat/{storage/iceberg → examples/indexer}/__init__.py +0 -0
  154. /deltacat/{storage/rivulet/arrow → examples/indexer/aws}/__init__.py +0 -0
  155. /deltacat/{storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  156. /deltacat/{storage/rivulet/metastore → experimental/catalog}/__init__.py +0 -0
  157. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  158. /deltacat/{storage/rivulet/reader → experimental/storage}/__init__.py +0 -0
  159. /deltacat/{storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  160. /deltacat/{storage → experimental/storage}/iceberg/model.py +0 -0
  161. /deltacat/{storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  162. /deltacat/{tests/storage/rivulet → experimental/storage/rivulet/fs}/__init__.py +0 -0
  163. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  164. /deltacat/{tests/storage/rivulet/fs → experimental/storage/rivulet/metastore}/__init__.py +0 -0
  165. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  166. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  167. /deltacat/{storage → experimental/storage}/rivulet/parquet/data_reader.py +0 -0
  168. /deltacat/{tests/storage/rivulet/schema → experimental/storage/rivulet/reader}/__init__.py +0 -0
  169. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  170. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/schema}/__init__.py +0 -0
  171. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  172. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  173. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/LICENSE +0 -0
  174. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/WHEEL +0 -0
  175. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/top_level.txt +0 -0
@@ -3,10 +3,10 @@ from typing import List
3
3
  import pyarrow as pa
4
4
  from pyarrow import feather
5
5
 
6
- from deltacat.storage.rivulet.metastore.sst import SSTableRow
7
- from deltacat.storage.rivulet import Schema
8
- from deltacat.storage.rivulet.arrow.serializer import ArrowSerializer
9
- from deltacat.storage.rivulet.fs.file_provider import FileProvider
6
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
7
+ from deltacat.experimental.storage.rivulet import Schema
8
+ from deltacat.experimental.storage.rivulet.arrow.serializer import ArrowSerializer
9
+ from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
10
10
 
11
11
 
12
12
  class FeatherDataSerializer(ArrowSerializer):
@@ -3,9 +3,9 @@ import time
3
3
  from typing import List, Generator
4
4
 
5
5
  from deltacat.storage.model.partition import PartitionLocator
6
- from deltacat.storage.rivulet.fs.file_store import FileStore
7
- from deltacat.storage.rivulet.fs.input_file import InputFile
8
- from deltacat.storage.rivulet.fs.output_file import OutputFile
6
+ from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
7
+ from deltacat.experimental.storage.rivulet.fs.input_file import InputFile
8
+ from deltacat.experimental.storage.rivulet.fs.output_file import OutputFile
9
9
  from deltacat.utils.metafile_locator import _find_partition_path
10
10
 
11
11
 
@@ -4,8 +4,8 @@ from pyarrow.fs import FileSystem, FileType, FileSelector
4
4
  # TODO(deltacat): Rely on deltacat implementation to resolve path and filesystem.
5
5
  from ray.data.datasource.path_util import _resolve_paths_and_filesystem
6
6
 
7
- from deltacat.storage.rivulet.fs.input_file import FSInputFile
8
- from deltacat.storage.rivulet.fs.output_file import FSOutputFile
7
+ from deltacat.experimental.storage.rivulet.fs.input_file import FSInputFile
8
+ from deltacat.experimental.storage.rivulet.fs.output_file import FSOutputFile
9
9
 
10
10
 
11
11
  class FileStore:
@@ -5,7 +5,7 @@ from typing import Protocol
5
5
 
6
6
  from pyarrow.fs import FileSystem, FileType
7
7
 
8
- from deltacat.storage.rivulet.fs.input_file import FSInputFile, InputFile
8
+ from deltacat.experimental.storage.rivulet.fs.input_file import FSInputFile, InputFile
9
9
 
10
10
 
11
11
  class OutputStream(Protocol): # pragma: no cover
@@ -1,9 +1,9 @@
1
1
  from dataclasses import dataclass
2
2
  from typing import List, Callable, Any, Protocol
3
3
 
4
- from deltacat.storage.rivulet.dataset_executor import DatasetExecutor
5
- from deltacat.storage.rivulet.mvp.Table import MvpTable
6
- from deltacat.storage.rivulet import Schema
4
+ from deltacat.experimental.storage.rivulet.dataset_executor import DatasetExecutor
5
+ from deltacat.experimental.storage.rivulet import Schema
6
+ from deltacat.experimental.storage.rivulet.mvp.Table import MvpTable
7
7
 
8
8
 
9
9
  class DatasetOperation(Protocol):
@@ -99,7 +99,7 @@ class LogicalPlan:
99
99
  self.operations.append(CollectOperation())
100
100
  return self
101
101
 
102
- def execute(self, executor: DatasetExecutor) -> "MvpTable":
102
+ def execute(self, executor: DatasetExecutor) -> MvpTable:
103
103
  for operation in self.operations:
104
104
  operation.visit(executor)
105
105
  return executor.output
@@ -19,7 +19,7 @@ from deltacat.storage.model.partition import PartitionLocator
19
19
  from deltacat.storage.model.transaction import TransactionOperationList
20
20
 
21
21
  from deltacat.storage.model.types import StreamFormat
22
- from deltacat.storage.rivulet import Schema
22
+ from deltacat.experimental.storage.rivulet import Schema
23
23
 
24
24
  StreamPosition = int
25
25
  """The stream position for creating a consistent ordering of manifests."""
@@ -4,9 +4,9 @@ import json
4
4
  from itertools import zip_longest
5
5
  from typing import List
6
6
 
7
- from deltacat.storage.rivulet.fs.input_file import InputFile
8
- from deltacat.storage.rivulet.fs.output_file import OutputFile
9
- from deltacat.storage.rivulet.metastore.sst import (
7
+ from deltacat.experimental.storage.rivulet.fs.input_file import InputFile
8
+ from deltacat.experimental.storage.rivulet.fs.output_file import OutputFile
9
+ from deltacat.experimental.storage.rivulet.metastore.sst import (
10
10
  SSTWriter,
11
11
  SSTableRow,
12
12
  SSTReader,
@@ -1,8 +1,8 @@
1
1
  from dataclasses import dataclass
2
2
  from typing import Protocol, Any, List
3
3
 
4
- from deltacat.storage.rivulet.fs.input_file import InputFile
5
- from deltacat.storage.rivulet.fs.output_file import OutputFile
4
+ from deltacat.experimental.storage.rivulet.fs.input_file import InputFile
5
+ from deltacat.experimental.storage.rivulet.fs.output_file import OutputFile
6
6
 
7
7
 
8
8
  @dataclass(frozen=True)
@@ -8,9 +8,9 @@ from typing import Any, Dict, Set, List, FrozenSet, Iterable, TypeVar, NamedTupl
8
8
 
9
9
  from intervaltree import Interval, IntervalTree
10
10
 
11
- from deltacat.storage.rivulet.metastore.delta import DeltaContext
12
- from deltacat.storage.rivulet.metastore.sst import SSTable, SSTableRow
13
- from deltacat.storage.rivulet import Schema
11
+ from deltacat.experimental.storage.rivulet.metastore.delta import DeltaContext
12
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTable, SSTableRow
13
+ from deltacat.experimental.storage.rivulet import Schema
14
14
 
15
15
  T = TypeVar("T")
16
16
 
@@ -0,0 +1,7 @@
1
+ # TODO later on this will be moved to a dedicated package
2
+ from deltacat.experimental.storage.rivulet.parquet.file_reader import ParquetFileReader
3
+ from deltacat.experimental.storage.rivulet.reader.reader_type_registrar import (
4
+ FileReaderRegistrar,
5
+ )
6
+
7
+ FileReaderRegistrar.register_reader("parquet", ParquetFileReader)
@@ -4,15 +4,17 @@ from typing import Optional
4
4
 
5
5
  from pyarrow import RecordBatch
6
6
 
7
- from deltacat.storage.rivulet.fs.file_provider import FileProvider
8
- from deltacat.storage.rivulet.metastore.sst import SSTableRow
9
- from deltacat.storage.rivulet.reader.data_reader import (
7
+ from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
8
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
9
+ from deltacat.experimental.storage.rivulet.reader.data_reader import (
10
10
  RowAndKey,
11
11
  FileReader,
12
12
  FILE_FORMAT,
13
13
  )
14
- from deltacat.storage.rivulet.reader.pyarrow_data_reader import RecordBatchRowIndex
15
- from deltacat.storage.rivulet.schema.schema import Schema
14
+ from deltacat.experimental.storage.rivulet.reader.pyarrow_data_reader import (
15
+ RecordBatchRowIndex,
16
+ )
17
+ from deltacat.experimental.storage.rivulet.schema.schema import Schema
16
18
  import pyarrow.parquet as pq
17
19
  import pyarrow as pa
18
20
 
@@ -3,11 +3,11 @@ from typing import List, Any
3
3
  import pyarrow as pa
4
4
  from pyarrow.parquet import FileMetaData
5
5
 
6
- from deltacat.storage.rivulet.metastore.sst import SSTableRow
7
- from deltacat.storage.rivulet import Schema
8
- from deltacat.storage.rivulet.arrow.serializer import ArrowSerializer
6
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
7
+ from deltacat.experimental.storage.rivulet import Schema
8
+ from deltacat.experimental.storage.rivulet.arrow.serializer import ArrowSerializer
9
9
 
10
- from deltacat.storage.rivulet.fs.file_provider import FileProvider
10
+ from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
11
11
 
12
12
 
13
13
  class ParquetDataSerializer(ArrowSerializer):
@@ -15,19 +15,30 @@ from typing import (
15
15
  AbstractSet,
16
16
  )
17
17
 
18
- from deltacat.storage.rivulet.metastore.delta import DeltaContext
19
- from deltacat.storage.rivulet.metastore.sst import SSTableRow
20
- from deltacat.storage.rivulet.metastore.sst_interval_tree import (
18
+ from deltacat.experimental.storage.rivulet.metastore.delta import DeltaContext
19
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
20
+ from deltacat.experimental.storage.rivulet.metastore.sst_interval_tree import (
21
21
  OrderedBlockGroups,
22
22
  BlockGroup,
23
23
  Block,
24
24
  )
25
- from deltacat.storage.rivulet.reader.data_reader import RowAndKey, FileReader
26
- from deltacat.storage.rivulet.reader.dataset_metastore import DatasetMetastore
27
- from deltacat.storage.rivulet.reader.pyarrow_data_reader import ArrowDataReader
28
- from deltacat.storage.rivulet.reader.query_expression import QueryExpression
29
- from deltacat.storage.rivulet.reader.reader_type_registrar import FileReaderRegistrar
30
- from deltacat.storage.rivulet import Schema
25
+ from deltacat.experimental.storage.rivulet.reader.data_reader import (
26
+ RowAndKey,
27
+ FileReader,
28
+ )
29
+ from deltacat.experimental.storage.rivulet.reader.dataset_metastore import (
30
+ DatasetMetastore,
31
+ )
32
+ from deltacat.experimental.storage.rivulet.reader.pyarrow_data_reader import (
33
+ ArrowDataReader,
34
+ )
35
+ from deltacat.experimental.storage.rivulet.reader.query_expression import (
36
+ QueryExpression,
37
+ )
38
+ from deltacat.experimental.storage.rivulet.reader.reader_type_registrar import (
39
+ FileReaderRegistrar,
40
+ )
41
+ from deltacat.experimental.storage.rivulet import Schema
31
42
  from deltacat import logs
32
43
 
33
44
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
@@ -13,9 +13,9 @@ from typing import (
13
13
  Optional,
14
14
  )
15
15
 
16
- from deltacat.storage.rivulet.fs.file_provider import FileProvider
17
- from deltacat.storage.rivulet.metastore.sst import SSTableRow
18
- from deltacat.storage.rivulet.schema.schema import Schema
16
+ from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
17
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
18
+ from deltacat.experimental.storage.rivulet.schema.schema import Schema
19
19
 
20
20
  FILE_FORMAT = TypeVar("FILE_FORMAT")
21
21
  MEMORY_FORMAT = TypeVar("MEMORY_FORMAT")
@@ -3,9 +3,11 @@ from typing import Generator, Dict, Optional
3
3
  import pyarrow as pa
4
4
 
5
5
  from deltacat.storage.model.shard import Shard
6
- from deltacat.storage.rivulet.reader.dataset_reader import DatasetReader
7
- from deltacat.storage.rivulet.reader.query_expression import QueryExpression
8
- from deltacat.storage.rivulet import Schema
6
+ from deltacat.experimental.storage.rivulet.reader.dataset_reader import DatasetReader
7
+ from deltacat.experimental.storage.rivulet.reader.query_expression import (
8
+ QueryExpression,
9
+ )
10
+ from deltacat.experimental.storage.rivulet import Schema
9
11
 
10
12
 
11
13
  class DataScan:
@@ -7,16 +7,16 @@ import pyarrow.fs
7
7
 
8
8
  from deltacat.storage import Delta
9
9
  from deltacat.storage.model.partition import PartitionLocator
10
- from deltacat.storage.rivulet.fs.file_provider import FileProvider
10
+ from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
11
11
  from deltacat.utils.filesystem import resolve_path_and_filesystem
12
- from deltacat.storage.rivulet.metastore.json_sst import JsonSstReader
13
- from deltacat.storage.rivulet.metastore.delta import (
12
+ from deltacat.experimental.storage.rivulet.metastore.json_sst import JsonSstReader
13
+ from deltacat.experimental.storage.rivulet.metastore.delta import (
14
14
  ManifestIO,
15
15
  DeltaContext,
16
16
  RivuletDelta,
17
17
  DeltacatManifestIO,
18
18
  )
19
- from deltacat.storage.rivulet.metastore.sst import SSTReader, SSTable
19
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTReader, SSTable
20
20
  from deltacat.utils.metafile_locator import _find_table_path
21
21
  from deltacat import logs
22
22
 
@@ -2,18 +2,20 @@ import logging
2
2
  from typing import Generator, Optional, Set, Type, TypeVar, Any
3
3
 
4
4
  from deltacat.storage.model.shard import Shard
5
- from deltacat.storage.rivulet.metastore.sst import SSTableRow, SSTable
6
- from deltacat.storage.rivulet.metastore.sst_interval_tree import (
5
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow, SSTable
6
+ from deltacat.experimental.storage.rivulet.metastore.sst_interval_tree import (
7
7
  BlockIntervalTree,
8
8
  OrderedBlockGroups,
9
9
  )
10
- from deltacat.storage.rivulet.reader.block_scanner import BlockScanner
11
- from deltacat.storage.rivulet.reader.dataset_metastore import (
10
+ from deltacat.experimental.storage.rivulet.reader.block_scanner import BlockScanner
11
+ from deltacat.experimental.storage.rivulet.reader.dataset_metastore import (
12
12
  DatasetMetastore,
13
13
  ManifestAccessor,
14
14
  )
15
- from deltacat.storage.rivulet.reader.query_expression import QueryExpression
16
- from deltacat.storage.rivulet import Schema
15
+ from deltacat.experimental.storage.rivulet.reader.query_expression import (
16
+ QueryExpression,
17
+ )
18
+ from deltacat.experimental.storage.rivulet import Schema
17
19
 
18
20
  # The type of data returned to reader
19
21
  T = TypeVar("T")
@@ -4,7 +4,10 @@ from typing import Generator, Dict, Type, NamedTuple, List
4
4
 
5
5
  from pyarrow import RecordBatch
6
6
 
7
- from deltacat.storage.rivulet.reader.data_reader import DataReader, MEMORY_FORMAT
7
+ from deltacat.experimental.storage.rivulet.reader.data_reader import (
8
+ DataReader,
9
+ MEMORY_FORMAT,
10
+ )
8
11
  import pyarrow as pa
9
12
 
10
13
 
@@ -1,9 +1,9 @@
1
- from deltacat.storage.rivulet.fs.file_provider import FileProvider
2
- from deltacat.storage.rivulet.metastore.sst import SSTableRow
3
- from deltacat.storage.rivulet.reader.data_reader import FileReader
1
+ from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
2
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
3
+ from deltacat.experimental.storage.rivulet.reader.data_reader import FileReader
4
4
  from typing import Type, Dict
5
5
 
6
- from deltacat.storage.rivulet.schema.schema import Schema
6
+ from deltacat.experimental.storage.rivulet.schema.schema import Schema
7
7
 
8
8
 
9
9
  class FileReaderRegistrar:
@@ -5,7 +5,7 @@ from typing import MutableMapping, Dict, Iterable, Tuple, Optional
5
5
 
6
6
  import pyarrow as pa
7
7
 
8
- from deltacat.storage.rivulet.schema.datatype import Datatype
8
+ from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
9
9
 
10
10
 
11
11
  @dataclass(frozen=True)
@@ -1,6 +1,6 @@
1
1
  from typing import Protocol, Iterable, List, Union, Any, Dict
2
2
 
3
- from deltacat.storage.rivulet.metastore.sst import SSTableRow
3
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
4
4
  import pyarrow as pa
5
5
 
6
6
  MEMTABLE_DATA = Union[Iterable[Dict[str, Any]], pa.Table]
@@ -1,11 +1,15 @@
1
1
  from __future__ import annotations
2
2
 
3
- from deltacat.storage.rivulet.parquet.serializer import ParquetDataSerializer
4
- from deltacat.storage.rivulet import Schema
5
- from deltacat.storage.rivulet.serializer import DataSerializer
6
- from deltacat.storage.rivulet.fs.file_provider import FileProvider
3
+ from deltacat.experimental.storage.rivulet.parquet.serializer import (
4
+ ParquetDataSerializer,
5
+ )
6
+ from deltacat.experimental.storage.rivulet import Schema
7
+ from deltacat.experimental.storage.rivulet.serializer import DataSerializer
8
+ from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
7
9
 
8
- from deltacat.storage.rivulet.feather.serializer import FeatherDataSerializer
10
+ from deltacat.experimental.storage.rivulet.feather.serializer import (
11
+ FeatherDataSerializer,
12
+ )
9
13
 
10
14
 
11
15
  class DataSerializerFactory:
@@ -0,0 +1,129 @@
1
+ from __future__ import annotations
2
+ from typing import Generic, List, Union, Iterable
3
+ from deltacat.storage.model.shard import T, Shard, ShardingStrategy
4
+ from deltacat.experimental.storage.rivulet.reader.dataset_metastore import (
5
+ DatasetMetastore,
6
+ )
7
+
8
+
9
class RangeShard(Shard, Generic[T]):
    """
    Represents a range-based shard with minimum and maximum keys.

    param: min_key: The minimum key for the shard.
    param: max_key: The maximum key for the shard.
    """

    def __init__(self, min_key: T, max_key: T):
        self.min_key = min_key
        self.max_key = max_key

    def __repr__(self) -> str:
        return f"Shard(type=range, min_key={self.min_key}, max_key={self.max_key})"

    @staticmethod
    def split(
        global_min: Union[int, str], global_max: Union[int, str], num_shards: int
    ) -> List[RangeShard]:
        """
        Splits a range into `num_shards` shards.
        Currently supports splitting ranges of integers and strings.

        Every shard after the first has its start key moved one position past
        the interpolated boundary, so that adjacent shards do not both contain
        the boundary key:
          - int keys: start becomes the previous shard's max_key + 1.
          - str keys: the last character of the interpolated start is advanced
            by one code point.

        Note: If global_min == global_max or num_shards <= 1, a single shard is returned,
        num_shards is ignored.

        :param global_min: The minimum key for the entire range (int or str).
        :param global_max: The maximum key for the entire range (int or str).
        :param num_shards: The number of shards to create.
        :return: A list of RangeShard objects.
        :raises ValueError: If global_min and global_max are not both int or both str.
        """
        if global_min == global_max or num_shards <= 1:
            return [RangeShard(global_min, global_max)]

        # Determine which interpolation function to use based on the type of min/max
        if isinstance(global_min, int) and isinstance(global_max, int):
            interpolate = RangeShard._interpolate_numeric
        elif isinstance(global_min, str) and isinstance(global_max, str):
            interpolate = RangeShard._interpolate_str
        else:
            raise ValueError(
                "Unsupported combination of types for global_min and global_max."
            )

        shards: List[RangeShard] = []
        for i in range(num_shards):
            start = interpolate(global_min, global_max, i, num_shards)
            end = interpolate(global_min, global_max, i + 1, num_shards)

            if i > 0:
                if isinstance(start, int):
                    start = shards[-1].max_key + 1
                elif isinstance(start, str) and start:
                    # This branch must test for str: testing for int a second
                    # time would make it unreachable and leave string shards
                    # sharing their boundary key. The truthiness check skips an
                    # empty start (possible after _interpolate_str strips
                    # trailing whitespace), which has no last character to bump.
                    char_list = list(start)
                    char_list[-1] = chr(ord(char_list[-1]) + 1)
                    start = "".join(char_list)

            shards.append(RangeShard(start, end))

        return shards

    @staticmethod
    def _interpolate_numeric(start: int, end: int, step: int, total_steps: int) -> int:
        """
        Integer interpolation using integer (floor) division.

        param: start (int): The starting number.
        param: end (int): The ending number.
        param: step (int): The current step in the interpolation (0-based).
        param: total_steps (int): The total number of interpolation steps.

        returns: int: The interpolated integer.
        """
        return start + (end - start) * step // total_steps

    @staticmethod
    def _interpolate_str(start: str, end: str, step: int, total_steps: int) -> str:
        """
        Interpolates between two strings character by character.

        The shorter string is right-padded with spaces to the length of the
        longer one, each character position is interpolated independently on
        its ordinal value (rounded to the nearest integer), and trailing
        whitespace is stripped from the result.

        param: start (str): The starting string.
        param: end (str): The ending string.
        param: step (int): The current step in the interpolation (0-based).
        param: total_steps (int): The total number of interpolation steps.

        returns: str: The interpolated string.
        """
        max_len = max(len(start), len(end))

        # Pad strings to the same length with spaces (lowest printable ASCII character).
        start = start.ljust(max_len, " ")
        end = end.ljust(max_len, " ")

        # Interpolate character by character based on ordinal values.
        interpolated_chars = [
            chr(round(ord(s) + (ord(e) - ord(s)) * step / total_steps))
            for s, e in zip(start, end)
        ]

        # rstrip() removes the padding, and any other trailing whitespace too.
        return "".join(interpolated_chars).rstrip()
109
+
110
+
111
class RangeShardingStrategy(ShardingStrategy, Generic[T]):
    """
    Sharding strategy that carves a dataset's key range into range shards.

    method: shards: Builds RangeShard objects covering the metastore's key range.
    """

    def shards(
        self, num_shards: int, metastore: DatasetMetastore
    ) -> Iterable[RangeShard[T]]:
        """
        Produces range shards spanning the minimum and maximum keys reported
        by the metastore.

        param: num_shards: The number of shards requested from RangeShard.split.
        param: metastore: The dataset metastore queried for its min/max keys.
        returns: The RangeShard objects returned by RangeShard.split.
        """
        # Names chosen to avoid shadowing the built-in min() and max().
        lowest_key, highest_key = metastore.get_min_max_keys()
        return RangeShard.split(lowest_key, highest_key, num_shards)
@@ -6,15 +6,26 @@ from typing import Any, List, Set, Protocol, TypeVar, Dict, Iterable
6
6
 
7
7
  from pyarrow import RecordBatch, Table
8
8
  from deltacat.storage.model.partition import PartitionLocator
9
- from deltacat.storage.rivulet.metastore.delta import ManifestIO, DeltacatManifestIO
10
-
11
- from deltacat.storage.rivulet import Schema
12
- from deltacat.storage.rivulet.metastore.json_sst import JsonSstWriter
13
- from deltacat.storage.rivulet.serializer import MEMTABLE_DATA, DataSerializer
14
- from deltacat.storage.rivulet.serializer_factory import DataSerializerFactory
15
- from deltacat.storage.rivulet.writer.dataset_writer import DatasetWriter, DATA
16
- from deltacat.storage.rivulet.metastore.sst import SSTWriter
17
- from deltacat.storage.rivulet.fs.file_provider import FileProvider
9
+ from deltacat.experimental.storage.rivulet.metastore.delta import (
10
+ ManifestIO,
11
+ DeltacatManifestIO,
12
+ )
13
+
14
+ from deltacat.experimental.storage.rivulet import Schema
15
+ from deltacat.experimental.storage.rivulet.metastore.json_sst import JsonSstWriter
16
+ from deltacat.experimental.storage.rivulet.serializer import (
17
+ MEMTABLE_DATA,
18
+ DataSerializer,
19
+ )
20
+ from deltacat.experimental.storage.rivulet.serializer_factory import (
21
+ DataSerializerFactory,
22
+ )
23
+ from deltacat.experimental.storage.rivulet.writer.dataset_writer import (
24
+ DatasetWriter,
25
+ DATA,
26
+ )
27
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTWriter
28
+ from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
18
29
 
19
30
  INPUT_ROW = TypeVar("INPUT_ROW")
20
31
 
deltacat/io/__init__.py CHANGED
@@ -0,0 +1,13 @@
1
# Public entry points of the deltacat.io package.
from deltacat.io.reader.deltacat_read_api import read_deltacat
from deltacat.io.datasource.deltacat_datasource import (
    DeltacatReadType,
    METAFILE_DATA_COLUMN_NAME,
    METAFILE_TYPE_COLUMN_NAME,
)

# Names exported by ``from deltacat.io import *``.
__all__ = [
    "read_deltacat",
    "DeltacatReadType",
    "METAFILE_DATA_COLUMN_NAME",
    "METAFILE_TYPE_COLUMN_NAME",
]
File without changes
@@ -0,0 +1,91 @@
1
+ # Allow classes to use self-referencing Type hints in Python 3.7.
2
+ from __future__ import annotations
3
+
4
+ from typing import Any, Callable, Dict, Optional, cast
5
+
6
+ import pyarrow as pa
7
+ from ray.data import Dataset
8
+
9
+ from deltacat.utils.url import DeltaCatUrl
10
+ from deltacat.io.datasink.deltacat_datasink import DeltaCatDatasink
11
+
12
+
13
class DeltaCatDataset(Dataset):
    """Ray ``Dataset`` subclass that adds a ``write_deltacat`` method."""

    @staticmethod
    def from_dataset(dataset: Dataset) -> DeltaCatDataset:
        """Turn an existing ``Dataset`` into a ``DeltaCatDataset`` in place.

        The object's ``__class__`` is reassigned, so no copy is made: the
        returned value is the same object that was passed in, and every other
        reference to it also becomes a ``DeltaCatDataset``.

        Args:
            dataset: The dataset to convert.

        Returns:
            The same ``dataset`` object, typed as ``DeltaCatDataset``.
        """
        # cast to DeltaCatDataset in-place since it only adds new methods
        dataset.__class__ = DeltaCatDataset
        return cast(DeltaCatDataset, dataset)

    def write_deltacat(
        self,
        url: DeltaCatUrl,
        *,
        # if the source dataset only contains DeltaCAT metadata, then only
        # copy the metadata to the destination... if it contains external
        # source file paths, then register them in a new Delta.
        metadata_only: bool = False,
        # merge all deltas as part of the write operation
        copy_on_write: Optional[bool] = False,
        filesystem: Optional[pa.fs.S3FileSystem] = None,
        try_create_dir: bool = True,
        arrow_open_stream_args: Optional[Dict[str, Any]] = None,
        arrow_parquet_args_fn: Callable[[], Dict[str, Any]] = lambda: {},
        min_rows_per_file: Optional[int] = None,
        ray_remote_args: Optional[Dict[str, Any]] = None,
        concurrency: Optional[int] = None,
        **arrow_parquet_args,
    ) -> None:
        """Writes the dataset to files and commits DeltaCAT metadata indexing
        the files written.

        This is only supported for datasets convertible to Arrow records.
        To control the number of files, use ``.repartition()``.

        The format of the output files will be
        {uuid}_{block_idx}.{extension}, where ``uuid`` is a unique id for the
        dataset.

        The DeltaCAT manifest will be written to ``f"{path}/manifest"``.

        Examples:
            >>> ds = DeltaCatDataset.from_dataset(ds)
            >>> ds.write_deltacat(url)  # url: DeltaCatUrl of the destination

        Time complexity: O(dataset size / parallelism)

        Args:
            url: DeltaCAT URL of the root location where materialized files
                and the DeltaCAT manifest will be written.
            metadata_only: If the source dataset only contains DeltaCAT
                metadata, then only copy the metadata to the destination. If
                it contains external source file paths, then register them in
                a new Delta.
            copy_on_write: Merge all deltas as part of the write operation.
            filesystem: The filesystem implementation to write to. This
                should be a PyArrow S3FileSystem.
            try_create_dir: Try to create all directories in destination path
                if True. Does nothing if all directories already exist.
            arrow_open_stream_args: kwargs passed to
                pyarrow.fs.S3FileSystem.open_output_stream
            arrow_parquet_args_fn: Callable that returns a dictionary of write
                arguments to use when writing each block to a file. Overrides
                any duplicate keys from arrow_parquet_args. This should be used
                instead of arrow_parquet_args if any of your write arguments
                cannot be pickled, or if you'd like to lazily resolve the write
                arguments for each dataset block.
            min_rows_per_file: Forwarded unchanged to ``DeltaCatDatasink``.
            ray_remote_args: Forwarded unchanged to ``Dataset.write_datasink``.
            concurrency: Forwarded unchanged to ``Dataset.write_datasink``.
            arrow_parquet_args: Options to pass to
                pyarrow.parquet.write_table(), which is used to write out each
                block to a file.
        """
        datasink = DeltaCatDatasink(
            url,
            metadata_only=metadata_only,
            copy_on_write=copy_on_write,
            arrow_parquet_args_fn=arrow_parquet_args_fn,
            arrow_parquet_args=arrow_parquet_args,
            min_rows_per_file=min_rows_per_file,
            filesystem=filesystem,
            try_create_dir=try_create_dir,
            open_stream_args=arrow_open_stream_args,
            # _uuid is not defined in this class; presumably it is the id
            # maintained by ray.data.Dataset - confirm against the Ray version
            # in use.
            dataset_uuid=self._uuid,
        )
        # The actual write is delegated to Ray's generic datasink machinery.
        self.write_datasink(
            datasink,
            ray_remote_args=ray_remote_args,
            concurrency=concurrency,
        )
File without changes