deltacat 2.0.0b3__py3-none-any.whl → 2.0.0b6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deltacat/__init__.py CHANGED
@@ -67,7 +67,7 @@ if importlib.util.find_spec("pyiceberg") is not None:
67
67
 
68
68
  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
69
69
 
70
- __version__ = "2.0.0b3"
70
+ __version__ = "2.0.0b6"
71
71
 
72
72
 
73
73
  __all__ = [
@@ -2,10 +2,13 @@ import logging
2
2
 
3
3
  from typing import Any, Dict, List, Optional, Union
4
4
 
5
- from daft import DataFrame
5
+ from daft import DataFrame, context
6
+ from daft.daft import ScanOperatorHandle, StorageConfig
7
+ from daft.logical.builder import LogicalPlanBuilder
6
8
 
7
9
  from deltacat import logs
8
10
  from deltacat.catalog.model.table_definition import TableDefinition
11
+ from deltacat.daft.daft_scan import DeltaCatScanOperator
9
12
  from deltacat.exceptions import TableAlreadyExistsError
10
13
  from deltacat.storage.iceberg.iceberg_scan_planner import IcebergScanPlanner
11
14
  from deltacat.storage.iceberg.model import PartitionSchemeMapper, SchemaMapper
@@ -144,7 +147,17 @@ def read_table(
144
147
  table: str, *args, namespace: Optional[str] = None, **kwargs
145
148
  ) -> DistributedDataset:
146
149
  """Read a table into a distributed dataset."""
147
- raise NotImplementedError("read_table not implemented")
150
+ # TODO: more proper IO configuration
151
+ io_config = context.get_context().daft_planning_config.default_io_config
152
+ multithreaded_io = context.get_context().get_or_create_runner().name != "ray"
153
+
154
+ storage_config = StorageConfig(multithreaded_io, io_config)
155
+
156
+ dc_table = get_table(name=table, namespace=namespace, **kwargs)
157
+ dc_scan_operator = DeltaCatScanOperator(dc_table, storage_config)
158
+ handle = ScanOperatorHandle.from_python_scan_operator(dc_scan_operator)
159
+ builder = LogicalPlanBuilder.from_tabular_scan(scan_operator=handle)
160
+ return DataFrame(builder)
148
161
 
149
162
 
150
163
  def alter_table(
@@ -5,12 +5,11 @@ from typing import Iterator, List
5
5
  from pyarrow.fs import FileSystem
6
6
 
7
7
  from pyiceberg.io.pyarrow import (
8
- fill_parquet_file_metadata,
8
+ data_file_statistics_from_parquet_metadata,
9
9
  compute_statistics_plan,
10
10
  parquet_path_to_id_mapping,
11
11
  )
12
- from pyiceberg.table import Table, _MergingSnapshotProducer
13
- from pyiceberg.table.snapshots import Operation
12
+ from pyiceberg.table import Table
14
13
  from pyiceberg.manifest import DataFile, DataFileContent, FileFormat
15
14
  from pyiceberg.types import StructType, NestedField, IntegerType
16
15
  from pyiceberg.typedef import Record
@@ -24,11 +23,10 @@ def append(table: Table, paths: List[str]) -> None:
24
23
  # raise ValueError("Cannot write to tables with a sort-order")
25
24
 
26
25
  data_files = write_file(table, paths)
27
- merge = _MergingSnapshotProducer(operation=Operation.APPEND, table=table)
28
- for data_file in data_files:
29
- merge.append_data_file(data_file)
30
-
31
- merge.commit()
26
+ with table.transaction() as txn:
27
+ with txn.update_snapshot().fast_append() as snapshot_update:
28
+ for data_file in data_files:
29
+ snapshot_update.append_data_file(data_file)
32
30
 
33
31
 
34
32
  def write_file(table: Table, paths: Iterator[str]) -> Iterator[DataFile]:
@@ -41,6 +39,11 @@ def write_file(table: Table, paths: Iterator[str]) -> Iterator[DataFile]:
41
39
  fs_path = fs_tuple[1]
42
40
  with fs.open_input_file(fs_path) as native_file:
43
41
  parquet_metadata = pq.read_metadata(native_file)
42
+ statistics = data_file_statistics_from_parquet_metadata(
43
+ parquet_metadata=parquet_metadata,
44
+ stats_columns=compute_statistics_plan(table.schema(), table.properties),
45
+ parquet_column_mapping=parquet_path_to_id_mapping(table.schema()),
46
+ )
44
47
  data_file = DataFile(
45
48
  content=DataFileContent.DATA,
46
49
  file_path=file_path,
@@ -63,12 +66,7 @@ def write_file(table: Table, paths: Iterator[str]) -> Iterator[DataFile]:
63
66
  spec_id=table.spec().spec_id,
64
67
  equality_ids=None,
65
68
  key_metadata=None,
66
- )
67
- fill_parquet_file_metadata(
68
- data_file=data_file,
69
- parquet_metadata=parquet_metadata,
70
- stats_columns=compute_statistics_plan(table.schema(), table.properties),
71
- parquet_column_mapping=parquet_path_to_id_mapping(table.schema()),
69
+ **statistics.to_serialized_dict(),
72
70
  )
73
71
  data_files.append(data_file)
74
72
  return data_files
@@ -709,7 +709,7 @@ def _get_deltas_from_partition_filter(
709
709
 
710
710
  def _get_storage(**kwargs):
711
711
  """
712
- Returns the implementation of `deltacat.storage.interface` to use with this catalog.
712
+ Returns the implementation of `deltacat.storage.interface` to use with this catalog
713
713
 
714
714
  This is configured in the `CatalogProperties` stored during initialization and passed through `delegate.py`
715
715
  """
@@ -1,11 +1,15 @@
1
1
  import os
2
2
  import logging
3
3
 
4
+ import uuid
4
5
  import daft
6
+ from pyiceberg.catalog import CatalogType
7
+
5
8
  import deltacat as dc
6
9
 
7
10
  from deltacat import logs
8
11
  from deltacat import IcebergCatalog
12
+ from deltacat.catalog.iceberg import IcebergCatalogConfig
9
13
  from deltacat.examples.common.fixtures import (
10
14
  store_cli_args_in_os_environ,
11
15
  )
@@ -30,6 +34,24 @@ driver_logger = logs.configure_application_logger(logging.getLogger(__name__))
30
34
 
31
35
 
32
36
  def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
37
+ """
38
+ This is an e2e example that
39
+ 1. creates a DeltaCAT Table (backed by an Iceberg Table) in Glue
40
+ 2. writes data into the DeltaCAT Table
41
+ 3. reads data from the DeltaCAT Table using Daft
42
+
43
+ To run the script:
44
+ 1. prepare an AWS Account
45
+ 1. prepare an S3 location where the data will be written to, which will be used in Step 3.
46
+ 2. prepare an IAM Role that has access to the S3 location and Glue
47
+ 2. retrieve the IAM Role AWS Credential and cache locally in ~/.aws/credentials
48
+ 3. run below command to execute the example
49
+ ```
50
+ make venv && source venv/bin/activate
51
+ python -m deltacat.examples.iceberg.iceberg_bucket_writer --warehouse=s3://<YOUR_S3_LOCATION>
52
+ ```
53
+
54
+ """
33
55
  # create any runtime environment required to run the example
34
56
  runtime_env = create_ray_runtime_environment()
35
57
 
@@ -38,6 +60,7 @@ def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
38
60
  # Only the `iceberg` data catalog is provided so it will become the default.
39
61
  # If initializing multiple catalogs, use the `default_catalog_name` param
40
62
  # to specify which catalog should be the default.
63
+
41
64
  dc.init(
42
65
  catalogs={
43
66
  # the name of the DeltaCAT catalog is "iceberg"
@@ -49,11 +72,13 @@ def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
49
72
  name="example-iceberg-catalog",
50
73
  # for additional properties see:
51
74
  # https://py.iceberg.apache.org/configuration/
52
- properties={
53
- "type": "glue",
54
- "region_name": "us-east-1",
55
- "warehouse": warehouse,
56
- },
75
+ config=IcebergCatalogConfig(
76
+ type=CatalogType.GLUE,
77
+ properties={
78
+ "warehouse": warehouse,
79
+ "region_name": "us-east-1",
80
+ },
81
+ ),
57
82
  )
58
83
  },
59
84
  # pass the runtime environment into ray.init()
@@ -89,10 +114,10 @@ def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
89
114
  }
90
115
  )
91
116
 
92
- # write to a table named `test_namespace.test_table_bucketed`
117
+ # write to a table named `test_namespace.test_table_bucketed-<SUFFIX>`
93
118
  # we don't need to specify which catalog to create this table in since
94
119
  # only the "iceberg" catalog is available
95
- table_name = "test_table_bucketed"
120
+ table_name = f"test_table_bucketed-{uuid.uuid4().hex[:8]}"
96
121
  namespace = "test_namespace"
97
122
  print(f"Creating Glue Table: {namespace}.{table_name}")
98
123
  dc.write_to_table(
@@ -106,9 +131,40 @@ def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
106
131
  )
107
132
 
108
133
  print(f"Getting Glue Table: {namespace}.{table_name}")
109
- table_definition = dc.get_table(table_name, namespace)
134
+ table_definition = dc.get_table(name=table_name, namespace=namespace)
110
135
  print(f"Retrieved Glue Table: {table_definition}")
111
136
 
137
+ # Read Data from DeltaCAT Table (backed by Iceberg) using Daft
138
+ daft_dataframe = dc.read_table(table=table_name, namespace=namespace)
139
+
140
+ daft_dataframe.where(df["bid"] > 200.0).show()
141
+ # Expected result:
142
+ # ╭────────┬─────────┬─────────╮
143
+ # │ symbol ┆ bid ┆ ask │
144
+ # │ --- ┆ --- ┆ --- │
145
+ # │ Utf8 ┆ Float64 ┆ Float64 │
146
+ # ╞════════╪═════════╪═════════╡
147
+ # │ meta ┆ 392.03 ┆ 392.09 │
148
+ # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
149
+ # │ msft ┆ 403.25 ┆ 403.27 │
150
+ # ╰────────┴─────────┴─────────╯
151
+
152
+ daft_dataframe.select("symbol").show()
153
+ # Expected result:
154
+ # ╭────────╮
155
+ # │ symbol │
156
+ # │ --- │
157
+ # │ Utf8 │
158
+ # ╞════════╡
159
+ # │ meta │
160
+ # ├╌╌╌╌╌╌╌╌┤
161
+ # │ amzn │
162
+ # ├╌╌╌╌╌╌╌╌┤
163
+ # │ goog │
164
+ # ├╌╌╌╌╌╌╌╌┤
165
+ # │ msft │
166
+ # ╰────────╯
167
+
112
168
 
113
169
  if __name__ == "__main__":
114
170
  example_script_args = [
@@ -121,15 +177,6 @@ if __name__ == "__main__":
121
177
  "type": str,
122
178
  },
123
179
  ),
124
- (
125
- [
126
- "--STAGE",
127
- ],
128
- {
129
- "help": "Example runtime environment stage (e.g. dev, alpha, beta, prod).",
130
- "type": str,
131
- },
132
- ),
133
180
  ]
134
181
 
135
182
  # store any CLI args in the runtime environment
File without changes
@@ -0,0 +1,4 @@
1
+ """Daft integration package for DeltaCAT.
2
+
3
+ This package provides integration between DeltaCAT and Daft.
4
+ """
@@ -0,0 +1,229 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Tuple, Optional
4
+
5
+ from deltacat.catalog.model.catalog import Catalog as DCCatalog
6
+ from deltacat.catalog.model.table_definition import TableDefinition
7
+
8
+ from daft.catalog import Catalog, Identifier, Table
9
+ from daft.dataframe import DataFrame
10
+ from daft.logical.schema import Schema
11
+ from deltacat.constants import DEFAULT_NAMESPACE
12
+
13
+
14
+ class DaftCatalog(Catalog):
15
+ """
16
+ Wrapper class to create a Daft catalog from a DeltaCAT catalog.
17
+
18
+ The initialization of DeltaCAT and Daft catalogs is managed in `deltacat.catalog.catalog.py`. The user
19
+ is just expected to initialize catalogs through the DeltaCAT public interface (init / put_catalog).
20
+
21
+ TODO (mccember) in follow up PR we need to consider how to keep the DeltaCAT Catalogs class and Daft session in sync,
22
+ and the user-facing entrypoint to get a Daft catalog
23
+
24
+ This class itself expects a `Catalog` and will invoke the underlying implementation
25
+ similar to `deltacat.catalog.delegate.py`, like:
26
+ catalog.impl.create_namespace(namespace, inner=catalog.inner)
27
+
28
+ We cannot route calls through the higher level catalog registry / delegate since this wrapper class is at a lower
29
+ layer and does not manage registering catalogs.
30
+ """
31
+
32
+ def __init__(self, catalog: DCCatalog, name: str):
33
+ """
34
+ Initialize given DeltaCAT catalog. This catalog is also registered with DeltaCAT (via deltacat.put_catalog) given the provided Name
35
+
36
+ :param catalog: DeltaCAT Catalog object. If None, the catalog will be fetched from `deltacat.Catalogs`
37
+ given the catalog name.
38
+
39
+ :param name: Name of DeltaCAT catalog. If the name is not yet registered with `deltacat.Catalogs`,
40
+ it will be registered upon creation to ensure that the DeltaCAT and Daft catalogs keep in sync.
41
+
42
+ :param kwargs: Additional keyword arguments passed to deltacat.get_catalog or deltacat.put_catalog,
43
+ such as 'namespace' for tests.
44
+ """
45
+ self.dc_catalog = catalog
46
+ self._name = name
47
+
48
+ @property
49
+ def name(self) -> str:
50
+ return self._name
51
+
52
+ ###
53
+ # create_*
54
+ ###
55
+ def create_namespace(self, identifier: Identifier | str):
56
+ """Create a new namespace in the catalog."""
57
+ if isinstance(identifier, Identifier):
58
+ identifier = str(identifier)
59
+ self.dc_catalog.impl.create_namespace(identifier, inner=self.dc_catalog.inner)
60
+
61
+ def create_table(
62
+ self, identifier: Identifier | str, source: Schema | DataFrame, **kwargs
63
+ ) -> Table:
64
+ """
65
+ Create a DeltaCAT table via Daft catalog API
66
+
67
+ End users calling create_table through the daft table API may provide kwargs which will be plumbed through
68
+ to deltacat create_table. For the full list of accepted keyword arguments, see the deltacat create_table documentation.
69
+
70
+ Note: as of 4/22, Daft create_table does not yet support kwargs. Tracked at: https://github.com/Eventual-Inc/Daft/issues/4195
71
+
72
+ :param identifier: Daft table identifier. Sequence of strings of the format (table) or (namespace, table)
73
+ or (namespace, table, table version). If this is a string, it is a dot delimited string of the same format.
74
+ Identifiers can be created either like Identifier("namespace", "table", "version") OR
75
+ Identifier.from_str("namespace.table.version")
76
+
77
+ :param source: a TableSource, either a Daft DataFrame or a Daft Schema (any other source type raises an exception)
78
+ """
79
+ if isinstance(source, DataFrame):
80
+ return self._create_table_from_df(identifier, source)
81
+ elif isinstance(source, Schema):
82
+ return self._create_table_from_schema(identifier, source)
83
+ else:
84
+ raise Exception(
85
+ f"Expected table source to be Schema or DataFrame. Found: {type(source)}"
86
+ )
87
+
88
+ def _create_table_from_df(
89
+ self, ident: Identifier | str, source: DataFrame, **kwargs
90
+ ) -> Table:
91
+ """
92
+ Create a table from a DataFrame.
93
+ """
94
+ t = self._create_table_from_schema(ident, source.schema(), **kwargs)
95
+ # TODO (mccember) append data upon creation
96
+ return t
97
+
98
+ def _create_table_from_schema(
99
+ self, ident: Identifier | str, source: Schema, **kwargs
100
+ ) -> Table:
101
+ """
102
+ Create a table from a schema.
103
+ """
104
+ namespace, name, version = self._extract_namespace_name_version(ident)
105
+
106
+ # Convert the Daft schema to a DeltaCAT schema
107
+ # This is a simplified version, would need to be enhanced for production
108
+ deltacat_schema = self._convert_schema_to_deltacat(source)
109
+
110
+ # Create the table in DeltaCAT
111
+ table_def = self.dc_catalog.impl.create_table(
112
+ name,
113
+ namespace=namespace,
114
+ version=version,
115
+ schema=deltacat_schema,
116
+ inner=self.dc_catalog.inner,
117
+ **kwargs,
118
+ )
119
+
120
+ return DaftTable._from_obj(table_def)
121
+
122
+ ###
123
+ # drop_*
124
+ ###
125
+
126
+ def drop_namespace(self, identifier: Identifier | str):
127
+ raise NotImplementedError()
128
+
129
+ def drop_table(self, identifier: Identifier | str):
130
+ raise NotImplementedError()
131
+
132
+ ###
133
+ # get_*
134
+ ###
135
+
136
+ def get_table(self, identifier: Identifier | str, **kwargs) -> Table:
137
+ namespace, table, version = self._extract_namespace_name_version(identifier)
138
+
139
+ table_def = self.dc_catalog.impl.get_table(
140
+ table,
141
+ namespace=namespace,
142
+ table_version=version,
143
+ inner=self.dc_catalog.inner,
144
+ **kwargs,
145
+ )
146
+
147
+ if not table_def:
148
+ raise ValueError(f"Table {identifier} not found")
149
+
150
+ return DaftTable._from_obj(table_def)
151
+
152
+ ###
153
+ # list_*
154
+ ###
155
+
156
+ def list_namespaces(self, pattern: str | None = None) -> list[Identifier]:
157
+ raise NotImplementedError("Not implemented")
158
+
159
+ def list_tables(self, pattern: str | None = None) -> list[str]:
160
+ raise NotImplementedError("Not implemented")
161
+
162
+ def _extract_namespace_name_version(
163
+ self, ident: Identifier | str
164
+ ) -> Tuple[str, str, Optional[str]]:
165
+ """
166
+ Extract namespace, name, and version from identifier.
167
+
168
+ Returns a 3-tuple. If no namespace is provided, uses the DeltaCAT default namespace.
169
+ """
170
+ default_namespace = DEFAULT_NAMESPACE
171
+
172
+ if isinstance(ident, str):
173
+ ident = Identifier.from_str(ident)
174
+
175
+ if isinstance(ident, Identifier):
176
+ if len(ident) == 1:
177
+ return (default_namespace, ident[0], None)
178
+ elif len(ident) == 2:
179
+ return (ident[0], ident[1], None)
180
+ elif len(ident) == 3:
181
+ return (ident[0], ident[1], ident[2])
182
+ else:
183
+ raise ValueError(
184
+ f"Expected table identifier to be in format (table) or (namespace, table)"
185
+ f"or (namespace, table, version). Found: {ident}"
186
+ )
187
+
188
+ def _convert_schema_to_deltacat(self, schema: Schema):
189
+ """Convert Daft schema to DeltaCAT schema.
190
+ For now, just use PyArrow schema as intermediary
191
+ TODO look into how enhancements on schema can be propagated between Daft<=>DeltaCAT
192
+ """
193
+ from deltacat.storage.model.schema import Schema as DeltaCATSchema
194
+
195
+ return DeltaCATSchema.of(schema=schema.to_pyarrow_schema())
196
+
197
+
198
+ class DaftTable(Table):
199
+ """
200
+ Wrapper class to create a Daft table from a DeltaCAT table
201
+ """
202
+
203
+ _inner: TableDefinition
204
+
205
+ _read_options = set()
206
+ _write_options = set()
207
+
208
+ def __init__(self, inner: TableDefinition):
209
+ self._inner = inner
210
+
211
+ @property
212
+ def name(self) -> str:
213
+ """Return the table name."""
214
+ return self._inner.table_version.table_name
215
+
216
+ @staticmethod
217
+ def _from_obj(obj: object) -> DaftTable:
218
+ """Returns a DaftTable if the given object can be adapted to one."""
219
+ if isinstance(obj, TableDefinition):
220
+ t = DaftTable.__new__(DaftTable)
221
+ t._inner = obj
222
+ return t
223
+ raise ValueError(f"Unsupported DeltaCAT table type: {type(obj)}")
224
+
225
+ def read(self, **options) -> DataFrame:
226
+ raise NotImplementedError("Not implemented")
227
+
228
+ def write(self, df: DataFrame | object, mode: str = "append", **options):
229
+ raise NotImplementedError("Not implemented")
@@ -11,6 +11,8 @@ from deltacat import logs
11
11
 
12
12
  from deltacat.storage.model.schema import FieldLocator
13
13
 
14
+ import json
15
+
14
16
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
15
17
 
16
18
 
@@ -192,6 +194,20 @@ class Manifest(dict):
192
194
  manifest = Manifest._build_manifest(meta, entries, author, uuid)
193
195
  return manifest
194
196
 
197
+ @staticmethod
198
+ def from_json(json_string: str) -> Manifest:
199
+ parsed_dict = json.loads(json_string)
200
+ return Manifest.of(
201
+ entries=ManifestEntryList.of(
202
+ [
203
+ ManifestEntry.from_dict(entry)
204
+ for entry in parsed_dict.get("entries", [])
205
+ ]
206
+ ),
207
+ author=ManifestAuthor.from_dict(parsed_dict.get("author")),
208
+ uuid=parsed_dict.get("id"),
209
+ )
210
+
195
211
  @staticmethod
196
212
  def merge_manifests(
197
213
  manifests: List[Manifest], author: Optional[ManifestAuthor] = None
@@ -264,6 +280,23 @@ class ManifestMeta(dict):
264
280
  manifest_meta["entry_params"] = entry_params
265
281
  return manifest_meta
266
282
 
283
+ @staticmethod
284
+ def from_dict(obj: dict) -> Optional[ManifestMeta]:
285
+ if obj is None:
286
+ return None
287
+
288
+ return ManifestMeta.of(
289
+ record_count=obj.get("record_count"),
290
+ content_length=obj.get("content_length"),
291
+ content_type=obj.get("content_type"),
292
+ content_encoding=obj.get("content_encoding"),
293
+ source_content_length=obj.get("source_content_length"),
294
+ credentials=obj.get("credentials"),
295
+ content_type_parameters=obj.get("content_type_parameters"),
296
+ entry_type=obj.get("entry_type"),
297
+ entry_params=obj.get("entry_params"),
298
+ )
299
+
267
300
  @property
268
301
  def record_count(self) -> Optional[int]:
269
302
  return self.get("record_count")
@@ -358,6 +391,16 @@ class ManifestEntry(dict):
358
391
  manifest_entry = ManifestEntry.of(url, manifest_entry_meta)
359
392
  return manifest_entry
360
393
 
394
+ @staticmethod
395
+ def from_dict(obj: dict) -> ManifestEntry:
396
+ return ManifestEntry.of(
397
+ url=obj.get("url"),
398
+ uri=obj.get("uri"),
399
+ meta=ManifestMeta.from_dict(obj.get("meta")),
400
+ mandatory=obj.get("mandatory", True),
401
+ uuid=obj.get("id"),
402
+ )
403
+
361
404
  @property
362
405
  def uri(self) -> Optional[str]:
363
406
  return self.get("uri")
@@ -392,6 +435,12 @@ class ManifestAuthor(dict):
392
435
  manifest_author["version"] = version
393
436
  return manifest_author
394
437
 
438
+ @staticmethod
439
+ def from_dict(obj: dict) -> Optional[ManifestAuthor]:
440
+ if obj is None:
441
+ return None
442
+ return ManifestAuthor.of(obj.get("name"), obj.get("version"))
443
+
395
444
  @property
396
445
  def name(self) -> Optional[str]:
397
446
  return self.get("name")
@@ -0,0 +1,129 @@
1
+ import json
2
+
3
+ import pytest
4
+
5
+ from deltacat.storage.model.manifest import Manifest, ManifestEntry
6
+
7
+
8
+ @pytest.fixture
9
+ def manifest_a():
10
+ return """
11
+ {
12
+ "entries":[
13
+ {
14
+ "uri":"s3://test_bucket/file1.tsv.gz",
15
+ "mandatory":true,
16
+ "meta":{
17
+ "record_count":0,
18
+ "content_length":123,
19
+ "source_content_length":0,
20
+ "content_type":"application/x-amzn-unescaped-tsv",
21
+ "content_encoding":"gzip"
22
+ }
23
+ },
24
+ {
25
+ "uri":"s3://test_bucket/file2.tsv.gz",
26
+ "mandatory":true,
27
+ "meta":{
28
+ "record_count":0,
29
+ "content_length":456,
30
+ "source_content_length":0,
31
+ "content_type":"application/x-amzn-unescaped-tsv",
32
+ "content_encoding":"gzip"
33
+ }
34
+ }
35
+ ],
36
+ "meta":{
37
+ "record_count":0,
38
+ "content_length":579,
39
+ "source_content_length":0,
40
+ "content_type":"application/x-amzn-unescaped-tsv",
41
+ "content_encoding":"gzip"
42
+ },
43
+ "id":"052f62c0-5082-4935-9937-18a705156123",
44
+ "author":{
45
+ "name":"Dave",
46
+ "version":"1.0"
47
+ }
48
+ }
49
+ """
50
+
51
+
52
+ @pytest.fixture
53
+ def manifest_no_author():
54
+ return """
55
+ {
56
+ "entries":[
57
+ {
58
+ "uri":"s3://test_bucket/file1.tsv.gz",
59
+ "mandatory":true,
60
+ "meta":{
61
+ "record_count":0,
62
+ "content_length":123,
63
+ "source_content_length":0,
64
+ "content_type":"application/x-amzn-unescaped-tsv",
65
+ "content_encoding":"gzip"
66
+ }
67
+ },
68
+ {
69
+ "uri":"s3://test_bucket/file2.tsv.gz",
70
+ "mandatory":true,
71
+ "meta":{
72
+ "record_count":0,
73
+ "content_length":456,
74
+ "source_content_length":0,
75
+ "content_type":"application/x-amzn-unescaped-tsv",
76
+ "content_encoding":"gzip"
77
+ }
78
+ }
79
+ ],
80
+ "meta":{
81
+ "record_count":0,
82
+ "content_length":579,
83
+ "source_content_length":0,
84
+ "content_type":"application/x-amzn-unescaped-tsv",
85
+ "content_encoding":"gzip"
86
+ },
87
+ "id":"052f62c0-5082-4935-9937-18a705156123"
88
+ }
89
+ """
90
+
91
+
92
+ @pytest.fixture()
93
+ def manifest_entry_no_meta():
94
+ return """
95
+ {
96
+ "uri":"s3://test_bucket/file1.tsv.gz",
97
+ "mandatory":true
98
+ }
99
+ """
100
+
101
+
102
+ def test_manifest_from_json(manifest_a):
103
+ manifest = Manifest.from_json(manifest_a)
104
+
105
+ assert manifest.entries is not None
106
+ assert len(manifest.entries) == 2
107
+ assert manifest.entries[0].uri == "s3://test_bucket/file1.tsv.gz"
108
+ assert manifest.entries[0].meta.record_count == 0
109
+ assert manifest.meta.content_length == 579
110
+ assert manifest.author.name == "Dave"
111
+
112
+
113
+ def test_manifest_from_json_no_author(manifest_no_author):
114
+ manifest = Manifest.from_json(manifest_no_author)
115
+
116
+ assert manifest.entries is not None
117
+ assert len(manifest.entries) == 2
118
+ assert manifest.entries[0].uri == "s3://test_bucket/file1.tsv.gz"
119
+ assert manifest.entries[0].meta is not None
120
+ assert manifest.author is None
121
+
122
+
123
+ def test_manifest_entry_from_dict_no_meta(manifest_entry_no_meta):
124
+ entry = ManifestEntry.from_dict(json.loads(manifest_entry_no_meta))
125
+
126
+ assert entry is not None
127
+ assert entry.meta is None
128
+ assert entry.uri == "s3://test_bucket/file1.tsv.gz"
129
+ assert entry.mandatory is True
deltacat/utils/daft.py CHANGED
@@ -2,7 +2,7 @@ import logging
2
2
  from typing import Optional, List, Any, Dict, Callable
3
3
  import daft
4
4
  import ray
5
- from daft.table import read_parquet_into_pyarrow
5
+ from daft.recordbatch import read_parquet_into_pyarrow
6
6
  from daft import TimeUnit, DataFrame
7
7
  from daft.io import IOConfig, S3Config
8
8
  import pyarrow as pa
@@ -10,7 +10,6 @@ import pyarrow as pa
10
10
  from deltacat import logs
11
11
  from deltacat.utils.common import ReadKwargsProvider
12
12
  from deltacat.utils.schema import coerce_pyarrow_table_to_schema
13
-
14
13
  from deltacat.types.media import ContentType, ContentEncoding
15
14
  from deltacat.aws.constants import (
16
15
  BOTO_MAX_RETRIES,
@@ -72,9 +71,7 @@ def s3_files_to_dataframe(
72
71
  f"Preparing to read S3 object from {len(uris)} files into daft dataframe"
73
72
  )
74
73
 
75
- df, latency = timed_invocation(
76
- daft.read_parquet, path=uris, io_config=io_config, use_native_downloader=True
77
- )
74
+ df, latency = timed_invocation(daft.read_parquet, path=uris, io_config=io_config)
78
75
 
79
76
  logger.debug(f"Time to create daft dataframe from {len(uris)} files is {latency}s")
80
77
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deltacat
3
- Version: 2.0.0b3
3
+ Version: 2.0.0b6
4
4
  Summary: A portable, scalable, fast, and Pythonic Data Lakehouse for AI.
5
5
  Home-page: https://github.com/ray-project/deltacat
6
6
  Author: Ray Team
@@ -17,11 +17,11 @@ Description-Content-Type: text/markdown
17
17
  License-File: LICENSE
18
18
  Requires-Dist: aws-embedded-metrics==3.2.0
19
19
  Requires-Dist: boto3~=1.34
20
- Requires-Dist: getdaft==0.3.6
20
+ Requires-Dist: getdaft>=0.4.11
21
21
  Requires-Dist: intervaltree==3.1.0
22
22
  Requires-Dist: numpy==1.21.5
23
23
  Requires-Dist: pandas==1.3.5
24
- Requires-Dist: pyarrow==17.0.0
24
+ Requires-Dist: pyarrow==16.0.0
25
25
  Requires-Dist: pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3
26
26
  Requires-Dist: pymemcache==4.0.0
27
27
  Requires-Dist: ray>=2.20.0
@@ -1,4 +1,4 @@
1
- deltacat/__init__.py,sha256=GCLov4iY1E1wvwH6d8j0edbjmuyEHRWEvGEJ2Zs6UHo,2474
1
+ deltacat/__init__.py,sha256=gMoUv3PoGXHlkitSzr_bWcMYUfc5o1nn8-LdHDzLStU,2474
2
2
  deltacat/annotations.py,sha256=9lBi34DpIV_RPjCCK2Aiz_6nMyd-e-_CfQ1XtdRQQlM,1196
3
3
  deltacat/api.py,sha256=fYKurVlM97VKb_fh7kJ1rDcl-VAAuSflxPeqrsUt1u8,5257
4
4
  deltacat/constants.py,sha256=_JfHTRktDTM70Nls-LMnSmLeCRG17UwQYCmY6gQSGBg,3482
@@ -24,10 +24,10 @@ deltacat/catalog/delegate.py,sha256=x3jj_T61gyExuAnbDqhU6smbaAbIN4UxrVMZuBEOg0A,
24
24
  deltacat/catalog/interface.py,sha256=YB-qNBFsWupqyWJuHr7eQ-_MshhZZ5HpLphoZ64yn2g,12244
25
25
  deltacat/catalog/iceberg/__init__.py,sha256=LOENcLTQQlu_694MvRhMd2TQDLzwfg2vz0D8DuVO3M8,190
26
26
  deltacat/catalog/iceberg/iceberg_catalog_config.py,sha256=LfHxv8pk-YmTRQy5LvKFzwSqZ8ek2Y6v0KY7xihhIN0,786
27
- deltacat/catalog/iceberg/impl.py,sha256=hFAX0QGfWq25t9miYHACye_t_3fxUAmQXpQ9kf3w_xQ,13591
28
- deltacat/catalog/iceberg/overrides.py,sha256=HGev1Us2zJpavAoClCCMHrf6sQ8fG0poSxyLEJOB-Ss,2668
27
+ deltacat/catalog/iceberg/impl.py,sha256=c_ONnLLyh8Vyqo5PusQSHySQ92iM4Qgk-rucHMfdd7s,14288
28
+ deltacat/catalog/iceberg/overrides.py,sha256=WmM2mxf7ihDl8anb5GzBxo5-sxBkot8ZSRTxDpaauRA,2687
29
29
  deltacat/catalog/main/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
- deltacat/catalog/main/impl.py,sha256=y7sya4BVfYMbp0-smgs_00cktw7QHkJxXTWADSr0W3s,23093
30
+ deltacat/catalog/main/impl.py,sha256=E9gCPaARJAaiIS2HTdXXz0-GwTjOaWIBX2TK2MsL194,23092
31
31
  deltacat/catalog/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
32
32
  deltacat/catalog/model/catalog.py,sha256=zGuNl1Czqbk2QQx9qGpMWCTK9ay4b3tm3SJzNkLlw-I,10198
33
33
  deltacat/catalog/model/properties.py,sha256=wdXjd39-JEj-zZLL5pH6wyIXAdpph-CD7yEIF96Wn-A,4110
@@ -136,8 +136,11 @@ deltacat/examples/hello_world.py,sha256=hXpMUvJINB2qWTpV3QFPlRNu0uE31BvEs2sLyQ3C
136
136
  deltacat/examples/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
137
137
  deltacat/examples/common/fixtures.py,sha256=MS0Hz1c__f9Axm3JgTajfWuMVeDAQmFmZ7KB7vz_1q4,430
138
138
  deltacat/examples/iceberg/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
139
- deltacat/examples/iceberg/iceberg_bucket_writer.py,sha256=9i78x8WBgp-vvMBsvbCWkcRo6oEZ8SDtGfjMlNXAO30,4521
139
+ deltacat/examples/iceberg/iceberg_bucket_writer.py,sha256=PdJG3jXcgPVds4UanfyNWB1egv-Os7LnZCPhdgv9Yyk,6586
140
140
  deltacat/examples/iceberg/iceberg_reader.py,sha256=mlF-277vT04at-2jibAjgRJG6Y-zle_NNy1-pXwS2YQ,5023
141
+ deltacat/experimental/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
142
+ deltacat/experimental/daft/__init__.py,sha256=0d1SsgjbDher8TKgS0gSBBdy5TGi01fewiwpG0BMwck,108
143
+ deltacat/experimental/daft/daft_catalog.py,sha256=112wDqqzdtxmtZVwiZW59MektbRsFMjSRgqYHrUOuok,8396
141
144
  deltacat/io/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
142
145
  deltacat/io/file_object_store.py,sha256=YoNL3Qla8uLOHaWnyBmIgotjSGAy3Td3Tumah0kk73Y,1868
143
146
  deltacat/io/memcached_object_store.py,sha256=C96t77-4BQe0XZ4vC76Ygi2o1POUoMN4t4BiyPmulz0,10997
@@ -158,7 +161,7 @@ deltacat/storage/model/delta.py,sha256=PhkjME0dItGgPd37SrQbI8VjQcIaYW2OfIq0KJKgD
158
161
  deltacat/storage/model/interop.py,sha256=CzXdu1NuJF5ER3IjQJztkNECD6MRDwbmMezlfN4SRH0,536
159
162
  deltacat/storage/model/list_result.py,sha256=5DpRAu-c0M48cHtKdTRPSgQiq2nCWfjAY8LOVqp5wxI,2703
160
163
  deltacat/storage/model/locator.py,sha256=Q16y-eGSQSZpDPKDYQhOjSA9c5ajwg1jLw_13MIB4SM,4707
161
- deltacat/storage/model/manifest.py,sha256=iV53LLQY83pDv9YwUqlyzjfLiqFHWuJf9J0dZdR7yO4,15153
164
+ deltacat/storage/model/manifest.py,sha256=3I4Vohd-PnEQ5NdQu9yN3jvFchqnzb8hQ3bq6w_tO4E,16808
162
165
  deltacat/storage/model/metafile.py,sha256=UVWPvvYvA0tj_pM8ig7NKfVFrVWU4l3eDP7I2n9Upeg,53404
163
166
  deltacat/storage/model/namespace.py,sha256=gLli1V64O9RHIf-FesmqWA29Wi7P1kwt01uz5sDdJR0,2409
164
167
  deltacat/storage/model/partition.py,sha256=qNCvc74o_4pmFVL-FCyKCZMH4lHSjRO560sb3vaF_H0,20759
@@ -277,6 +280,7 @@ deltacat/tests/storage/main/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMp
277
280
  deltacat/tests/storage/main/test_main_storage.py,sha256=9dtsAcp9GZ4XQ5-8XhKnAcFF7upowJpTIuqZUB2EYig,58124
278
281
  deltacat/tests/storage/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
279
282
  deltacat/tests/storage/model/test_delete_parameters.py,sha256=RcNRMIed0zUzkX9tRXDoYPXHb7721OEt8viY9tpWXZM,822
283
+ deltacat/tests/storage/model/test_manifest.py,sha256=udp9YUNvIBpnT-NutjMaF25abEQOXEcPkQm8Aay_UCs,3733
280
284
  deltacat/tests/storage/model/test_metafile_io.py,sha256=116U9aNJPzR0JS6iadJyyx0_4KyAi3D47WCNbndag6o,101639
281
285
  deltacat/tests/storage/model/test_schema.py,sha256=5m4BscbxbbOiry-lDI8j4vQcnvkG2Y-f0ZfshncPiSI,9599
282
286
  deltacat/tests/storage/model/test_shard.py,sha256=6QBr-ws3zQkJjjGyB7QEOhtNC5ql0cdjOPB2wxGNW3Q,755
@@ -323,7 +327,7 @@ deltacat/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
323
327
  deltacat/utils/arguments.py,sha256=5y1Xz4HSAD8M8Jt83i6gOEKoYjy_fMQe1V43IhIE4hY,1191
324
328
  deltacat/utils/cloudpickle.py,sha256=XE7YDmQe56ksfl3NdYZkzOAhbHSuhNcBZGOehQpgZr0,1187
325
329
  deltacat/utils/common.py,sha256=RG_-enXNpLKaYrqyx1ne2lL10lxN9vK7F631oJP6SE8,1375
326
- deltacat/utils/daft.py,sha256=nd4XBKcZTFYxf_VH9jm-wqqbrIujKAeisCt2vVbW2BA,5807
330
+ deltacat/utils/daft.py,sha256=RsOGzxI6ltsRcH6SfbK6PDBEaKyLZaUisCBXBlUvjbI,5770
327
331
  deltacat/utils/export.py,sha256=As5aiwOw9vLxtfolPLU0yak6W2RVR0rkuaYQ5YCy49U,1952
328
332
  deltacat/utils/filesystem.py,sha256=DthBgrVGzIcsQcGnyD3QYEQIpkYFxB19XmpF9DfCaeo,11709
329
333
  deltacat/utils/metafile_locator.py,sha256=_3yEW9n49jiEBuXHZmUKsFdYx6RxWWuS-Mu2gs_a1bw,2933
@@ -342,8 +346,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
342
346
  deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
343
347
  deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
344
348
  deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
345
- deltacat-2.0.0b3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
346
- deltacat-2.0.0b3.dist-info/METADATA,sha256=mRoST3kb94Civ8ipex9LlT7_BQ1Sz2vMbukcv10AT6g,2808
347
- deltacat-2.0.0b3.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
348
- deltacat-2.0.0b3.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
349
- deltacat-2.0.0b3.dist-info/RECORD,,
349
+ deltacat-2.0.0b6.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
350
+ deltacat-2.0.0b6.dist-info/METADATA,sha256=USf1wawH_OzPK19QEAqDWvrHDYhymxpP6I4RHD97YsQ,2809
351
+ deltacat-2.0.0b6.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
352
+ deltacat-2.0.0b6.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
353
+ deltacat-2.0.0b6.dist-info/RECORD,,