deltacat 2.0.0b9__py3-none-any.whl → 2.0.0b11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175) hide show
  1. deltacat/__init__.py +41 -16
  2. deltacat/api.py +478 -123
  3. deltacat/aws/s3u.py +2 -2
  4. deltacat/benchmarking/benchmark_engine.py +4 -2
  5. deltacat/benchmarking/conftest.py +1 -1
  6. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  7. deltacat/catalog/__init__.py +62 -5
  8. deltacat/catalog/main/impl.py +26 -10
  9. deltacat/catalog/model/catalog.py +165 -109
  10. deltacat/catalog/model/properties.py +25 -24
  11. deltacat/compute/__init__.py +14 -0
  12. deltacat/compute/converter/constants.py +5 -0
  13. deltacat/compute/converter/converter_session.py +78 -36
  14. deltacat/compute/converter/model/convert_input.py +24 -4
  15. deltacat/compute/converter/model/convert_result.py +61 -0
  16. deltacat/compute/converter/model/converter_session_params.py +52 -10
  17. deltacat/compute/converter/pyiceberg/overrides.py +181 -62
  18. deltacat/compute/converter/steps/convert.py +84 -36
  19. deltacat/compute/converter/steps/dedupe.py +25 -4
  20. deltacat/compute/converter/utils/convert_task_options.py +42 -13
  21. deltacat/compute/converter/utils/iceberg_columns.py +5 -0
  22. deltacat/compute/converter/utils/io.py +82 -11
  23. deltacat/compute/converter/utils/s3u.py +13 -4
  24. deltacat/compute/jobs/client.py +406 -0
  25. deltacat/constants.py +5 -6
  26. deltacat/env.py +10 -0
  27. deltacat/examples/basic_logging.py +6 -6
  28. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  29. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  30. deltacat/examples/hello_world.py +4 -2
  31. deltacat/examples/indexer/indexer.py +163 -0
  32. deltacat/examples/indexer/job_runner.py +198 -0
  33. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  34. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  35. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +27 -9
  36. deltacat/{storage → experimental/storage}/iceberg/iceberg_scan_planner.py +1 -1
  37. deltacat/{storage → experimental/storage}/iceberg/impl.py +1 -1
  38. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  39. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  40. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -9
  41. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  42. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  43. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  44. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  45. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  46. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  47. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  48. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  49. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -1
  50. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  51. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  52. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  53. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  54. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  55. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  56. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  57. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  58. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  59. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +4 -4
  60. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  61. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  62. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  63. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  64. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  65. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  66. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  67. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  68. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  69. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  70. deltacat/io/__init__.py +13 -0
  71. deltacat/io/dataset/__init__.py +0 -0
  72. deltacat/io/dataset/deltacat_dataset.py +91 -0
  73. deltacat/io/datasink/__init__.py +0 -0
  74. deltacat/io/datasink/deltacat_datasink.py +207 -0
  75. deltacat/io/datasource/__init__.py +0 -0
  76. deltacat/io/datasource/deltacat_datasource.py +580 -0
  77. deltacat/io/reader/__init__.py +0 -0
  78. deltacat/io/reader/deltacat_read_api.py +172 -0
  79. deltacat/storage/__init__.py +2 -0
  80. deltacat/storage/model/expression/__init__.py +47 -0
  81. deltacat/storage/model/expression/expression.py +656 -0
  82. deltacat/storage/model/expression/visitor.py +248 -0
  83. deltacat/storage/model/metafile.py +74 -42
  84. deltacat/storage/model/scan/push_down.py +32 -5
  85. deltacat/storage/model/shard.py +6 -2
  86. deltacat/storage/model/types.py +5 -3
  87. deltacat/tests/_io/reader/__init__.py +0 -0
  88. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  89. deltacat/tests/catalog/data/__init__.py +0 -0
  90. deltacat/tests/catalog/main/__init__.py +0 -0
  91. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  92. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +436 -0
  93. deltacat/tests/catalog/model/__init__.py +0 -0
  94. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  95. deltacat/tests/catalog/test_catalogs.py +52 -98
  96. deltacat/tests/catalog/test_default_catalog_impl.py +1 -2
  97. deltacat/tests/compute/converter/test_convert_session.py +209 -46
  98. deltacat/tests/daft/__init__.py +0 -0
  99. deltacat/tests/daft/test_model.py +97 -0
  100. deltacat/tests/experimental/__init__.py +0 -0
  101. deltacat/tests/experimental/catalog/__init__.py +0 -0
  102. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  103. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  104. deltacat/tests/experimental/daft/__init__.py +0 -0
  105. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  106. deltacat/tests/experimental/storage/__init__.py +0 -0
  107. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  108. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  109. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  110. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  111. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  112. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  113. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  114. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  115. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  116. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  117. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  118. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  119. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  120. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  121. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  122. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  123. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  124. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  125. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  126. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  127. deltacat/tests/local_deltacat_storage/__init__.py +1 -0
  128. deltacat/tests/storage/model/test_expression.py +327 -0
  129. deltacat/tests/storage/model/test_shard.py +3 -1
  130. deltacat/tests/test_deltacat_api.py +50 -9
  131. deltacat/types/media.py +141 -43
  132. deltacat/types/tables.py +35 -7
  133. deltacat/utils/daft.py +531 -5
  134. deltacat/utils/export.py +3 -1
  135. deltacat/utils/filesystem.py +39 -9
  136. deltacat/utils/polars.py +128 -0
  137. deltacat/utils/pyarrow.py +151 -15
  138. deltacat/utils/ray_utils/concurrency.py +1 -1
  139. deltacat/utils/ray_utils/runtime.py +56 -4
  140. deltacat/utils/url.py +1284 -0
  141. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/METADATA +11 -9
  142. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/RECORD +168 -123
  143. deltacat/catalog/iceberg/__init__.py +0 -4
  144. deltacat/daft/daft_scan.py +0 -111
  145. deltacat/daft/model.py +0 -258
  146. deltacat/examples/common/fixtures.py +0 -15
  147. deltacat/storage/rivulet/__init__.py +0 -11
  148. deltacat/storage/rivulet/feather/__init__.py +0 -5
  149. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  150. /deltacat/{daft → compute/jobs}/__init__.py +0 -0
  151. /deltacat/examples/{common → experimental}/__init__.py +0 -0
  152. /deltacat/examples/{iceberg → experimental/iceberg}/__init__.py +0 -0
  153. /deltacat/{storage/iceberg → examples/indexer}/__init__.py +0 -0
  154. /deltacat/{storage/rivulet/arrow → examples/indexer/aws}/__init__.py +0 -0
  155. /deltacat/{storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  156. /deltacat/{storage/rivulet/metastore → experimental/catalog}/__init__.py +0 -0
  157. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  158. /deltacat/{storage/rivulet/reader → experimental/storage}/__init__.py +0 -0
  159. /deltacat/{storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  160. /deltacat/{storage → experimental/storage}/iceberg/model.py +0 -0
  161. /deltacat/{storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  162. /deltacat/{tests/storage/rivulet → experimental/storage/rivulet/fs}/__init__.py +0 -0
  163. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  164. /deltacat/{tests/storage/rivulet/fs → experimental/storage/rivulet/metastore}/__init__.py +0 -0
  165. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  166. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  167. /deltacat/{storage → experimental/storage}/rivulet/parquet/data_reader.py +0 -0
  168. /deltacat/{tests/storage/rivulet/schema → experimental/storage/rivulet/reader}/__init__.py +0 -0
  169. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  170. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/schema}/__init__.py +0 -0
  171. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  172. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  173. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/LICENSE +0 -0
  174. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/WHEEL +0 -0
  175. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/top_level.txt +0 -0
@@ -1,29 +1,39 @@
1
1
  import shutil
2
2
  import tempfile
3
+
3
4
  import deltacat as dc
5
+ from deltacat.constants import METAFILE_FORMAT_MSGPACK
6
+ from deltacat import Namespace, DeltaCatUrl, DatasetType
7
+ from deltacat.storage import Metafile
8
+
9
+ from deltacat.io import (
10
+ METAFILE_TYPE_COLUMN_NAME,
11
+ METAFILE_DATA_COLUMN_NAME,
12
+ )
4
13
 
5
14
 
6
15
  class TestDeltaCAT:
7
16
  @classmethod
8
- def setup_class(cls):
17
+ def setup_method(cls):
9
18
  cls.temp_dir_1 = tempfile.mkdtemp()
10
19
  cls.temp_dir_2 = tempfile.mkdtemp()
11
20
  # Initialize DeltaCAT with two local catalogs.
12
- dc.put("test_catalog_1", root=cls.temp_dir_1)
13
- dc.put("test_catalog_2", root=cls.temp_dir_2)
21
+ dc.init()
22
+ dc.put(DeltaCatUrl("dc://test_catalog_1"), root=cls.temp_dir_1)
23
+ dc.put(DeltaCatUrl("dc://test_catalog_2"), root=cls.temp_dir_2)
14
24
 
15
25
  @classmethod
16
- def teardown_class(cls):
26
+ def teardown_method(cls):
17
27
  shutil.rmtree(cls.temp_dir_1)
18
28
  shutil.rmtree(cls.temp_dir_2)
19
29
 
20
30
  def test_cross_catalog_namespace_copy(self):
21
31
  # Given two empty DeltaCAT catalogs.
22
32
  # When a namespace is copied across catalogs.
23
- namespace_src = dc.put("test_catalog_1/test_namespace")
33
+ namespace_src = dc.put(DeltaCatUrl("dc://test_catalog_1/test_namespace"))
24
34
  namespace_dst = dc.copy(
25
- "test_catalog_1/test_namespace",
26
- "test_catalog_2",
35
+ DeltaCatUrl("dc://test_catalog_1/test_namespace"),
36
+ DeltaCatUrl("dc://test_catalog_2/test_namespace"),
27
37
  )
28
38
  # Expect the catalog namespace created in each catalog
29
39
  # method to be equivalent and equal to the source namespace.
@@ -33,7 +43,38 @@ class TestDeltaCAT:
33
43
  # When each catalog namespace is fetched explicitly
34
44
  # Expect them to be equivalent but not equal
35
45
  # (due to different metafile IDs).
36
- actual_namespace_src = dc.get("test_catalog_1/test_namespace")
37
- actual_namespace_dst = dc.get("test_catalog_2/test_namespace")
46
+ actual_namespace_src = dc.get(DeltaCatUrl("dc://test_catalog_1/test_namespace"))
47
+ actual_namespace_dst = dc.get(DeltaCatUrl("dc://test_catalog_2/test_namespace"))
38
48
  assert actual_namespace_src.equivalent_to(actual_namespace_dst)
39
49
  assert not actual_namespace_src == actual_namespace_dst
50
+
51
def test_catalog_listing_shallow_local_metafiles(self):
    """A namespace put into a catalog appears in that catalog's listing."""
    # Given two empty DeltaCAT catalogs, put a namespace into the first one.
    created: Namespace = dc.put(DeltaCatUrl("dc://test_catalog_1/test_namespace"))
    # Then list the catalog and look for an equivalent namespace.
    listed = dc.list(DeltaCatUrl("dc://test_catalog_1"))
    found = any(created.equivalent_to(entry) for entry in listed)
    assert found
62
+
63
def test_catalog_listing_shallow_ray_dataset(self):
    """
    A namespace put into a catalog is returned when the catalog is listed
    as a Ray dataset, both as a deserializable metafile and with the
    expected metafile type name.
    """
    # Given two empty DeltaCAT catalogs.
    # When a namespace is put in the catalog.
    namespace_src: Namespace = dc.put(
        DeltaCatUrl("dc://test_catalog_1/test_namespace")
    )
    # Expect the namespace to be listed.
    dataset = dc.list(
        DeltaCatUrl("dc://test_catalog_1"),
        dataset_type=DatasetType.RAY_DATASET,
    )
    # Take the first row exactly once so that both assertions below inspect
    # the same record and the dataset is not executed twice.
    rows = dataset.take(1)
    assert rows, "expected at least one row in the catalog listing"
    row = rows[0]
    actual_namespace = Metafile.deserialize(
        serialized=row[METAFILE_DATA_COLUMN_NAME],
        meta_format=METAFILE_FORMAT_MSGPACK,
    )
    assert actual_namespace.equivalent_to(namespace_src)
    namespace_type = row[METAFILE_TYPE_COLUMN_NAME]
    assert namespace_type == "Namespace"
deltacat/types/media.py CHANGED
@@ -1,30 +1,48 @@
1
1
  from enum import Enum
2
- from typing import Dict, Set
2
+ from typing import Set
3
3
 
4
4
 
5
5
  class ContentType(str, Enum):
6
- # See also:
7
- # https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.17
8
- # https://www.iana.org/assignments/media-types/media-types.xhtml
6
+ """
7
+ Enumeration used to resolve the entity-body Media Type (formerly known as
8
+ MIME type) in an HTTP request.
9
+
10
+ https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.17
11
+
12
+ https://www.iana.org/assignments/media-types/media-types.xhtml
13
+ """
9
14
 
10
15
  # IANA registered types
16
+ AVRO = "application/avro"
17
+ BINARY = "application/octet-stream"
11
18
  CSV = "text/csv"
19
+ HDF = "application/x-hdf"
20
+ HTML = "text/html"
12
21
  JSON = "application/json"
22
+ TEXT = "text/plain"
23
+ WEBDATASET = "application/x-web-dataset"
24
+ XML = "text/xml"
13
25
 
14
26
  # unregistered types
15
- TSV = "text/tsv"
16
- PSV = "text/psv"
17
- PARQUET = "application/parquet"
18
- ORC = "application/orc"
19
27
  FEATHER = "application/feather"
20
- UNESCAPED_TSV = "application/x-amzn-unescaped-tsv"
21
28
  ION = "application/x-amzn-ion"
29
+ ORC = "application/orc"
30
+ PARQUET = "application/parquet"
31
+ PSV = "text/psv"
32
+ TSV = "text/tsv"
33
+ UNESCAPED_TSV = "application/x-amzn-unescaped-tsv"
22
34
 
23
35
 
24
36
  class ContentEncoding(str, Enum):
25
- # See also:
26
- # https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.11
27
- # http://www.iana.org/assignments/http-parameters/http-parameters.xhtml#content-coding
37
+ """
38
+ Enumeration used as a modifier for :class:`deltacat.types.media.ContentType`
39
+ to indicate that additional encodings have been applied to the entity-body
40
+ Media Type in an HTTP request.
41
+
42
+ https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.11
43
+
44
+ http://www.iana.org/assignments/http-parameters/http-parameters.xhtml#content-coding
45
+ """
28
46
 
29
47
  # IANA registered encodings
30
48
  GZIP = "gzip"
@@ -37,27 +55,6 @@ class ContentEncoding(str, Enum):
37
55
  SNAPPY = "snappy"
38
56
 
39
57
 
40
- class TableType(str, Enum):
41
- PYARROW = "pyarrow"
42
- PANDAS = "pandas"
43
- NUMPY = "numpy"
44
- PYARROW_PARQUET = "pyarrow_parquet"
45
-
46
-
47
- class DistributedDatasetType(str, Enum):
48
- DAFT = "daft"
49
- RAY_DATASET = "ray_dataset"
50
-
51
-
52
- class SchemaType(str, Enum):
53
- ARROW = "arrow"
54
-
55
-
56
- class StorageType(str, Enum):
57
- LOCAL = "local"
58
- DISTRIBUTED = "distributed"
59
-
60
-
61
58
  DELIMITED_TEXT_CONTENT_TYPES: Set[str] = {
62
59
  ContentType.UNESCAPED_TSV.value,
63
60
  ContentType.TSV.value,
@@ -73,6 +70,7 @@ TABULAR_CONTENT_TYPES: Set[str] = {
73
70
  ContentType.PARQUET.value,
74
71
  ContentType.ORC.value,
75
72
  ContentType.FEATHER.value,
73
+ ContentType.AVRO.value,
76
74
  }
77
75
 
78
76
  EXPLICIT_COMPRESSION_CONTENT_TYPES: Set[str] = {
@@ -83,13 +81,113 @@ EXPLICIT_COMPRESSION_CONTENT_TYPES: Set[str] = {
83
81
  ContentType.JSON.value,
84
82
  }
85
83
 
86
- CONTENT_TYPE_TO_USER_KWARGS_KEY: Dict[str, str] = {
87
- ContentType.UNESCAPED_TSV.value: "unescaped_tsv",
88
- ContentType.TSV.value: "csv",
89
- ContentType.CSV.value: "csv",
90
- ContentType.PSV.value: "csv",
91
- ContentType.PARQUET.value: "parquet",
92
- ContentType.FEATHER.value: "feather",
93
- ContentType.ORC.value: "orc",
94
- ContentType.JSON.value: "json",
95
- }
84
+
85
class DatasetType(str, Enum):
    """
    Enumeration used to identify the in-memory local or distributed dataset
    to be used for file IO, queries, and data transformation. Typically used
    together with :class:`deltacat.types.media.DatastoreType` to resolve the
    compute layer that will be responsible for reading, transforming, and
    writing data to a given datastore.
    """

    # local
    NUMPY = "numpy"  # numpy.ndarray
    PANDAS = "pandas"  # pandas.DataFrame
    POLARS = "polars"  # polars.DataFrame
    PYARROW = "pyarrow"  # pyarrow.Table
    PYARROW_PARQUET = "pyarrow_parquet"  # pyarrow.parquet.ParquetFile

    # distributed
    DAFT = "daft"  # daft.DataFrame
    RAY_DATASET = "ray_dataset"  # ray.data.Dataset

    @staticmethod
    def distributed():
        """Return the set of members listed above as distributed."""
        return {
            DatasetType.DAFT,
            DatasetType.RAY_DATASET,
        }

    @staticmethod
    def local():
        """Return the set of members listed above as local."""
        return {
            DatasetType.NUMPY,
            DatasetType.PANDAS,
            DatasetType.POLARS,
            DatasetType.PYARROW,
            DatasetType.PYARROW_PARQUET,
        }


# deprecated by DatasetType - populated dynamically for backwards compatibility
# Members are taken in DatasetType definition order (iteration order of the
# set returned by local() is not guaranteed), and type=str keeps the str
# mixin so that members still compare equal to their plain string values.
TableType = Enum(
    "TableType",
    {d.name: d.value for d in DatasetType if d in DatasetType.local()},
    type=str,
)

# deprecated by DatasetType - populated dynamically for backwards compatibility
# Same ordering and str-mixin considerations as TableType above.
DistributedDatasetType = Enum(
    "DistributedDatasetType",
    {d.name: d.value for d in DatasetType if d in DatasetType.distributed()},
    type=str,
)
134
+
135
+
136
class StorageType(str, Enum):
    """
    Deprecated by DatasetType.local() and DatasetType.distributed().

    Kept for backwards compatibility.
    """

    LOCAL = "local"
    DISTRIBUTED = "distributed"
141
+
142
+
143
class DatastoreType(str, Enum):
    """
    Enumeration used to identify the type of reader required to connect to
    and correctly interpret data stored at a given path.

    Typically used together with :class:`deltacat.types.media.DatasetType`
    to resolve a reader or writer for that data store.

    Although some enum values overlap with those in
    :class:`deltacat.types.media.ContentType`, each enum serves a different
    purpose. :class:`deltacat.types.media.ContentType` resolves a file's MIME
    type, and may be used together with datastores that support storing
    different file types to describe the specific file type read/written
    from/to that datastore (e.g., Iceberg, Hudi, Delta Lake, Audio, Images,
    Video, etc.)
    """

    # DeltaCAT Catalog Datasets
    DELTACAT = "dc"
    DELTACAT_NAMESPACE = "namespace"
    DELTACAT_TABLE = "table"
    DELTACAT_TABLE_VERSION = "tableversion"
    DELTACAT_STREAM = "stream"
    DELTACAT_PARTITION = "partition"
    DELTACAT_DELTA = "delta"

    # External Datasets
    AUDIO = "audio"
    AVRO = "avro"
    BIGQUERY = "bigquery"
    BINARY = "binary"
    CSV = "csv"
    CLICKHOUSE = "clickhouse"
    DATABRICKS_TABLES = "databricks"
    DELTA_LAKE = "deltalake"
    DELTA_SHARING = "deltasharing"
    FEATHER = "feather"
    HDF = "hdf"
    HTML = "html"
    HUDI = "hudi"
    ICEBERG = "iceberg"
    IMAGES = "images"
    JSON = "json"
    LANCE = "lance"
    MONGO = "mongodb"
    NUMPY = "numpy"
    ORC = "orc"
    PARQUET = "parquet"
    TEXT = "text"
    TFRECORDS = "tfrecords"
    VIDEOS = "videos"
    WARC = "warc"
    WEBDATASET = "webdataset"
    XML = "xml"
deltacat/types/tables.py CHANGED
@@ -3,9 +3,10 @@ from typing import Callable, Dict, Type, Union
3
3
 
4
4
  import numpy as np
5
5
  import pandas as pd
6
+ import polars as pl
6
7
  import pyarrow as pa
7
8
  import pyarrow.parquet as papq
8
- from ray.data.dataset import Dataset
9
+ from ray.data.dataset import Dataset as RayDataset
9
10
  from ray.data.read_api import (
10
11
  from_arrow,
11
12
  from_arrow_refs,
@@ -18,11 +19,12 @@ import deltacat.storage as dcs
18
19
  from deltacat.types.media import TableType, DistributedDatasetType
19
20
  from deltacat.utils import numpy as np_utils
20
21
  from deltacat.utils import pandas as pd_utils
22
+ from deltacat.utils import polars as pl_utils
21
23
  from deltacat.utils import pyarrow as pa_utils
22
24
  from deltacat.utils import daft as daft_utils
23
25
  from deltacat.utils.ray_utils import dataset as ds_utils
24
26
 
25
- TABLE_TYPE_TO_READER_FUNC: Dict[int, Callable] = {
27
+ TABLE_TYPE_TO_S3_READER_FUNC: Dict[int, Callable] = {
26
28
  TableType.PYARROW_PARQUET.value: pa_utils.s3_file_to_parquet,
27
29
  TableType.PYARROW.value: pa_utils.s3_file_to_table,
28
30
  TableType.PANDAS.value: pd_utils.s3_file_to_dataframe,
@@ -34,8 +36,9 @@ TABLE_CLASS_TO_WRITER_FUNC: Dict[
34
36
  ] = {
35
37
  pa.Table: pa_utils.table_to_file,
36
38
  pd.DataFrame: pd_utils.dataframe_to_file,
39
+ pl.DataFrame: pl_utils.dataframe_to_file,
37
40
  np.ndarray: np_utils.ndarray_to_file,
38
- Dataset: ds_utils.dataset_to_file,
41
+ RayDataset: ds_utils.dataset_to_file,
39
42
  }
40
43
 
41
44
  TABLE_CLASS_TO_SLICER_FUNC: Dict[
@@ -43,8 +46,9 @@ TABLE_CLASS_TO_SLICER_FUNC: Dict[
43
46
  ] = {
44
47
  pa.Table: pa_utils.slice_table,
45
48
  pd.DataFrame: pd_utils.slice_dataframe,
49
+ pl.DataFrame: pl_utils.slice_table,
46
50
  np.ndarray: np_utils.slice_ndarray,
47
- Dataset: ds_utils.slice_dataset,
51
+ RayDataset: ds_utils.slice_dataset,
48
52
  }
49
53
 
50
54
  TABLE_CLASS_TO_SIZE_FUNC: Dict[
@@ -53,13 +57,27 @@ TABLE_CLASS_TO_SIZE_FUNC: Dict[
53
57
  pa.Table: pa_utils.table_size,
54
58
  papq.ParquetFile: pa_utils.parquet_file_size,
55
59
  pd.DataFrame: pd_utils.dataframe_size,
60
+ pl.DataFrame: pl_utils.dataframe_size,
56
61
  np.ndarray: np_utils.ndarray_size,
57
- Dataset: ds_utils.dataset_size,
62
+ RayDataset: ds_utils.dataset_size,
63
+ }
64
+
65
+ TABLE_CLASS_TO_PYARROW_FUNC: Dict[
66
+ Type[Union[dcs.LocalTable, dcs.DistributedDataset]], Callable
67
+ ] = {
68
+ pa.Table: lambda table, **kwargs: table,
69
+ papq.ParquetFile: lambda table, **kwargs: table.read(**kwargs),
70
+ pd.DataFrame: lambda table, **kwargs: pa.Table.from_pandas(table, **kwargs),
71
+ pl.DataFrame: lambda table, **kwargs: pl.DataFrame.to_arrow(table, **kwargs),
72
+ np.ndarray: lambda table, **kwargs: pa.Table.from_arrays(
73
+ [pa.array(table[:, i]) for i in range(table.shape[1])]
74
+ ),
58
75
  }
59
76
 
60
77
  TABLE_CLASS_TO_TABLE_TYPE: Dict[Type[dcs.LocalTable], str] = {
61
78
  pa.Table: TableType.PYARROW.value,
62
79
  papq.ParquetFile: TableType.PYARROW_PARQUET.value,
80
+ pl.DataFrame: TableType.POLARS.value,
63
81
  pd.DataFrame: TableType.PANDAS.value,
64
82
  np.ndarray: TableType.NUMPY.value,
65
83
  }
@@ -78,7 +96,6 @@ TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS: Dict[str, Callable] = {
78
96
  TableType.PANDAS.value: from_pandas_refs,
79
97
  }
80
98
 
81
-
82
99
  DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC: Dict[int, Callable] = {
83
100
  DistributedDatasetType.DAFT.value: daft_utils.s3_files_to_dataframe
84
101
  }
@@ -106,7 +123,18 @@ class TableWriteMode(str, Enum):
106
123
 
107
124
 
108
125
  def get_table_length(table: Union[dcs.LocalTable, dcs.DistributedDataset]) -> int:
109
- return len(table) if not isinstance(table, Dataset) else table.count()
126
+ return len(table) if not isinstance(table, RayDataset) else table.count()
127
+
128
+
129
def get_table_size(table: Union[dcs.LocalTable, dcs.DistributedDataset]) -> int:
    """
    Return the size of the given table, as computed by the size function
    registered for its exact type in ``TABLE_CLASS_TO_SIZE_FUNC``.

    Args:
        table: Local table or distributed dataset to measure.

    Returns:
        The value returned by the size function registered for
        ``type(table)``.

    Raises:
        ValueError: If no size function is registered for ``type(table)``.
    """
    table_size_func = TABLE_CLASS_TO_SIZE_FUNC.get(type(table))
    if table_size_func is None:
        # Call keys() so the message lists the registered table types rather
        # than the repr of the bound `dict.keys` method.
        msg = (
            f"No size function found for table type: {type(table)}.\n"
            f"Known table types: {list(TABLE_CLASS_TO_SIZE_FUNC.keys())}"
        )
        raise ValueError(msg)
    return table_size_func(table)
110
138
 
111
139
 
112
140
  def get_table_writer(table: Union[dcs.LocalTable, dcs.DistributedDataset]) -> Callable: