deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,7 @@
1
1
  # Allow classes to use self-referencing Type hints in Python 3.7.
2
2
  from __future__ import annotations
3
3
 
4
+ import json
4
5
  import posixpath
5
6
  from typing import Any, Dict, List, Optional
6
7
 
@@ -48,8 +49,7 @@ class Delta(Metafile):
48
49
  ) -> Delta:
49
50
  """
50
51
  Creates a Delta metadata model with the given Delta Locator, Delta Type,
51
- manifest metadata, properties, manifest, and previous delta stream
52
- position.
52
+ manifest metadata, properties, manifest, and previous delta stream position.
53
53
  """
54
54
  delta = Delta()
55
55
  delta.locator = locator
@@ -275,6 +275,13 @@ class Delta(Metafile):
275
275
  return delta_locator.partition_values
276
276
  return None
277
277
 
278
+ @property
279
+ def partition_values_json(self) -> Optional[str]:
280
+ partition_values = (
281
+ self.partition_values if self.partition_values is not None else None
282
+ )
283
+ return json.dumps(partition_values)
284
+
278
285
  @property
279
286
  def stream_position(self) -> Optional[int]:
280
287
  delta_locator = self.locator
@@ -282,6 +289,13 @@ class Delta(Metafile):
282
289
  return delta_locator.stream_position
283
290
  return None
284
291
 
292
+ def url(self, catalog_name: Optional[str] = None) -> str:
293
+ return (
294
+ f"dc://{catalog_name}/{self.namespace}/{self.table_name}/{self.table_version}/{self.stream_format}/{self.partition_values_json}/{self.stream_position}/"
295
+ if catalog_name
296
+ else f"table://{self.namespace}/{self.table_name}/{self.table_version}/{self.stream_format}/{self.partition_values_json}/{self.stream_position}/"
297
+ )
298
+
285
299
  def to_serializable(self) -> Delta:
286
300
  serializable = self
287
301
  if serializable.table_locator:
@@ -378,7 +392,17 @@ class DeltaLocator(Locator, dict):
378
392
  partition_values,
379
393
  partition_id,
380
394
  )
381
- if partition_values and partition_id
395
+ if any(
396
+ [
397
+ partition_id,
398
+ partition_values,
399
+ stream_id,
400
+ stream_format,
401
+ table_name,
402
+ table_version,
403
+ namespace,
404
+ ]
405
+ )
382
406
  else None
383
407
  )
384
408
  return DeltaLocator.of(
@@ -90,29 +90,23 @@ class Locator:
90
90
  def canonical_string(self, separator: str = DEFAULT_NAME_SEPARATOR) -> str:
91
91
  """
92
92
  Returns a unique string for the given locator that can be used
93
- for equality checks (i.e. two locators are equal if they have
94
- the same canonical string).
93
+ for equality checks between objects with the same parent.
95
94
  """
96
- parts = []
97
- parent_hexdigest = self.parent.hexdigest() if self.parent else None
98
- if parent_hexdigest:
99
- parts.append(parent_hexdigest)
100
- parts.extend(self.name.parts())
101
- return separator.join([str(part) for part in parts])
95
+ return separator.join([str(part) for part in self.name.parts()])
102
96
 
103
97
  def digest(self) -> bytes:
104
98
  """
105
99
  Return a digest of the given locator that can be used for
106
- equality checks (i.e. two locators are equal if they have the
107
- same digest) and uniform random hash distribution.
100
+ equality checks between objects with the same parent and uniform
101
+ random hash distribution.
108
102
  """
109
103
  return sha1_digest(self.canonical_string().encode("utf-8"))
110
104
 
111
105
  def hexdigest(self) -> str:
112
106
  """
113
107
  Returns a hexdigest of the given locator suitable
114
- for use in equality (i.e. two locators are equal if they have the same
115
- hexdigest) and inclusion in URLs.
108
+ for equality checks between objects with the same parent and
109
+ inclusion in URLs.
116
110
  """
117
111
  return sha1_hexdigest(self.canonical_string().encode("utf-8"))
118
112
 
@@ -4,14 +4,26 @@ import logging
4
4
  import itertools
5
5
 
6
6
  from enum import Enum
7
- from typing import Optional, List, Dict, Any
7
+ from typing import Optional, List, Dict, Any, TYPE_CHECKING
8
8
  from uuid import uuid4
9
9
 
10
+ if TYPE_CHECKING:
11
+ from deltacat.storage.model.schema import FieldLocator
12
+
10
13
  from deltacat import logs
11
14
 
12
- from deltacat.storage.model.schema import FieldLocator
15
+ from deltacat.types.media import (
16
+ ContentType,
17
+ ContentEncoding,
18
+ EXT_TO_CONTENT_TYPE,
19
+ EXT_TO_CONTENT_ENCODING,
20
+ )
13
21
 
14
22
  import json
23
+ import pyarrow as pa
24
+ import posixpath
25
+
26
+ from deltacat.utils.filesystem import get_file_info
15
27
 
16
28
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
17
29
 
@@ -64,7 +76,7 @@ class EntryParams(dict):
64
76
 
65
77
  @staticmethod
66
78
  def of(
67
- equality_field_locators: Optional[List[FieldLocator]] = None,
79
+ equality_field_locators: Optional[List["FieldLocator"]] = None,
68
80
  ) -> EntryParams:
69
81
  params = EntryParams()
70
82
  if equality_field_locators is not None:
@@ -72,7 +84,7 @@ class EntryParams(dict):
72
84
  return params
73
85
 
74
86
  @property
75
- def equality_field_locators(self) -> Optional[List[FieldLocator]]:
87
+ def equality_field_locators(self) -> Optional[List["FieldLocator"]]:
76
88
  return self.get("equality_field_locators")
77
89
 
78
90
 
@@ -118,11 +130,35 @@ class Manifest(dict):
118
130
  content_encoding = None
119
131
  credentials = None
120
132
  content_type_params = None
133
+ schema_id = None
134
+ sort_scheme_id = None
121
135
  if entries:
122
136
  content_type = entries[0].meta.content_type
123
137
  content_encoding = entries[0].meta.content_encoding
124
138
  credentials = entries[0].meta.credentials
125
139
  content_type_params = entries[0].meta.content_type_parameters
140
+
141
+ # Keep the latest schema ID
142
+ # Schema IDs are >= 0, and schema evolution always increments the last schema ID
143
+ entry_schema_ids = [
144
+ entry.meta.schema_id if entry.meta.schema_id is not None else -1
145
+ for entry in entries
146
+ ]
147
+ max_schema_id = max(entry_schema_ids) if entry_schema_ids else -1
148
+ schema_id = max_schema_id if max_schema_id >= 0 else None
149
+
150
+ # Handle sort_scheme_id: set to None if entries have multiple different sort_scheme_ids
151
+ entry_sort_scheme_ids = set(
152
+ entry.meta.sort_scheme_id
153
+ for entry in entries
154
+ if entry.meta.sort_scheme_id is not None
155
+ )
156
+ sort_scheme_id = (
157
+ list(entry_sort_scheme_ids)[0]
158
+ if len(entry_sort_scheme_ids) == 1
159
+ else None
160
+ )
161
+
126
162
  for entry in entries:
127
163
  meta = entry.meta
128
164
  if meta.content_type != content_type:
@@ -130,7 +166,7 @@ class Manifest(dict):
130
166
  if meta.content_encoding != content_encoding:
131
167
  content_encoding = None
132
168
  entry_content_type = meta.content_type
133
- if entry_content_type != content_type:
169
+ if content_type and entry_content_type != content_type:
134
170
  msg = (
135
171
  f"Expected all manifest entries to have content "
136
172
  f"type '{content_type}' but found "
@@ -138,7 +174,7 @@ class Manifest(dict):
138
174
  )
139
175
  raise ValueError(msg)
140
176
  entry_content_encoding = meta.get("content_encoding", None)
141
- if entry_content_encoding != content_encoding:
177
+ if content_encoding and entry_content_encoding != content_encoding:
142
178
  msg = (
143
179
  f"Expected all manifest entries to have content "
144
180
  f"encoding '{content_encoding}' but found "
@@ -190,6 +226,8 @@ class Manifest(dict):
190
226
  content_type_parameters=content_type_params,
191
227
  entry_type=entry_type,
192
228
  entry_params=entry_params,
229
+ schema_id=schema_id,
230
+ sort_scheme_id=sort_scheme_id,
193
231
  )
194
232
  manifest = Manifest._build_manifest(meta, entries, author, uuid)
195
233
  return manifest
@@ -256,6 +294,8 @@ class ManifestMeta(dict):
256
294
  content_type_parameters: Optional[List[Dict[str, str]]] = None,
257
295
  entry_type: Optional[EntryType] = None,
258
296
  entry_params: Optional[EntryParams] = None,
297
+ schema_id: Optional[int] = None,
298
+ sort_scheme_id: Optional[str] = None,
259
299
  ) -> ManifestMeta:
260
300
  manifest_meta = ManifestMeta()
261
301
  if record_count is not None:
@@ -278,6 +318,10 @@ class ManifestMeta(dict):
278
318
  )
279
319
  if entry_params is not None:
280
320
  manifest_meta["entry_params"] = entry_params
321
+ if schema_id is not None:
322
+ manifest_meta["schema_id"] = schema_id
323
+ if sort_scheme_id is not None:
324
+ manifest_meta["sort_scheme_id"] = sort_scheme_id
281
325
  return manifest_meta
282
326
 
283
327
  @staticmethod
@@ -295,6 +339,8 @@ class ManifestMeta(dict):
295
339
  content_type_parameters=obj.get("content_type_parameters"),
296
340
  entry_type=obj.get("entry_type"),
297
341
  entry_params=obj.get("entry_params"),
342
+ schema_id=obj.get("schema_id"),
343
+ sort_scheme_id=obj.get("sort_scheme_id"),
298
344
  )
299
345
 
300
346
  @property
@@ -343,6 +389,14 @@ class ManifestMeta(dict):
343
389
  self["entry_params"] = val = EntryParams(val)
344
390
  return val
345
391
 
392
+ @property
393
+ def schema_id(self) -> Optional[int]:
394
+ return self.get("schema_id")
395
+
396
+ @property
397
+ def sort_scheme_id(self) -> Optional[str]:
398
+ return self.get("sort_scheme_id")
399
+
346
400
 
347
401
  class ManifestEntry(dict):
348
402
  @staticmethod
@@ -375,6 +429,10 @@ class ManifestEntry(dict):
375
429
  url: str,
376
430
  record_count: int,
377
431
  source_content_length: Optional[int] = None,
432
+ credentials: Optional[Dict[str, str]] = None,
433
+ content_type_parameters: Optional[List[Dict[str, str]]] = None,
434
+ entry_type: Optional[EntryType] = None,
435
+ entry_params: Optional[EntryParams] = None,
378
436
  **s3_client_kwargs,
379
437
  ) -> ManifestEntry:
380
438
  from deltacat.aws import s3u as s3_utils
@@ -387,6 +445,10 @@ class ManifestEntry(dict):
387
445
  content_type=s3_obj["ContentType"],
388
446
  content_encoding=s3_obj["ContentEncoding"],
389
447
  source_content_length=source_content_length,
448
+ credentials=credentials,
449
+ content_type_parameters=content_type_parameters,
450
+ entry_type=entry_type,
451
+ entry_params=entry_params,
390
452
  )
391
453
  manifest_entry = ManifestEntry.of(url, manifest_entry_meta)
392
454
  return manifest_entry
@@ -401,6 +463,116 @@ class ManifestEntry(dict):
401
463
  uuid=obj.get("id"),
402
464
  )
403
465
 
466
+ @staticmethod
467
+ def from_path(
468
+ path: str,
469
+ filesystem: pa.fs.FileSystem,
470
+ record_count: int,
471
+ source_content_length: Optional[int] = None,
472
+ content_type: Optional[str] = None,
473
+ content_encoding: Optional[str] = None,
474
+ credentials: Optional[Dict[str, str]] = None,
475
+ content_type_parameters: Optional[List[Dict[str, str]]] = None,
476
+ entry_type: Optional[EntryType] = None,
477
+ entry_params: Optional[EntryParams] = None,
478
+ schema_id: Optional[int] = None,
479
+ sort_scheme_id: Optional[str] = None,
480
+ ) -> ManifestEntry:
481
+ """
482
+ Creates a manifest entry from a path using a pyarrow filesystem.
483
+
484
+ Args:
485
+ path: Path to the file
486
+ filesystem: PyArrow filesystem to use for accessing the file
487
+ record_count: Number of records in the file
488
+ source_content_length: Optional original content length in-memory
489
+ before writing to disk.
490
+ content_type: Optional content type override. If not provided, will
491
+ be derived from file extension.
492
+ content_encoding: Optional content encoding override. If not
493
+ provided, will be derived from file extension.
494
+ credentials: Optional credentials required to read this manifest entry.
495
+ content_type_parameters: Optional content type parameters.
496
+ entry_type: Optional entry type of this manifest entry. Defaults to DATA.
497
+ entry_params: Optional entry type parameters.
498
+ schema_id: Schema ID used to write this manifest entry.
499
+ sort_scheme_id: Sort scheme ID used to write this manifest entry.
500
+
501
+ Returns:
502
+ A ManifestEntry instance
503
+ """
504
+ file_info = get_file_info(path, filesystem)
505
+ if file_info.type != pa.fs.FileType.File:
506
+ raise FileNotFoundError(f"Path does not point to a file: {path}")
507
+
508
+ # Extract extensions from right to left
509
+ # First split will get potential encoding extension
510
+ base_path, ext1 = posixpath.splitext(path)
511
+
512
+ # Initialize with defaults for no extensions
513
+ derived_content_type = ContentType.BINARY
514
+ derived_content_encoding = ContentEncoding.IDENTITY
515
+
516
+ # Only proceed with extension checks if we found at least one extension
517
+ if ext1:
518
+ # Check if the first extension is a known encoding
519
+ derived_content_encoding = EXT_TO_CONTENT_ENCODING.get(
520
+ ext1,
521
+ ContentEncoding.IDENTITY,
522
+ )
523
+
524
+ # Get second extension only if first was an encoding
525
+ if derived_content_encoding != ContentEncoding.IDENTITY:
526
+ # Second split will get potential content type extension
527
+ _, ext2 = posixpath.splitext(base_path)
528
+ if ext2:
529
+ derived_content_type = EXT_TO_CONTENT_TYPE.get(
530
+ ext2,
531
+ ContentType.BINARY,
532
+ )
533
+ else:
534
+ # First extension wasn't an encoding, check if it's a
535
+ # content type
536
+ derived_content_type = EXT_TO_CONTENT_TYPE.get(
537
+ ext1,
538
+ ContentType.BINARY,
539
+ )
540
+
541
+ if (
542
+ derived_content_type == ContentType.BINARY
543
+ and derived_content_encoding != ContentEncoding.IDENTITY
544
+ ):
545
+ logger.debug(
546
+ f"Found encoding {derived_content_encoding.value} but no "
547
+ f"content type for {path}, assuming binary"
548
+ )
549
+
550
+ # Use provided values if available, otherwise use derived values
551
+ final_content_type = (
552
+ content_type if content_type is not None else derived_content_type.value
553
+ )
554
+ final_content_encoding = (
555
+ content_encoding
556
+ if content_encoding is not None
557
+ else derived_content_encoding.value
558
+ )
559
+
560
+ manifest_entry_meta = ManifestMeta.of(
561
+ record_count=record_count,
562
+ content_length=file_info.size,
563
+ content_type=final_content_type,
564
+ content_encoding=final_content_encoding,
565
+ source_content_length=source_content_length,
566
+ credentials=credentials,
567
+ content_type_parameters=content_type_parameters,
568
+ entry_type=entry_type,
569
+ entry_params=entry_params,
570
+ schema_id=schema_id,
571
+ sort_scheme_id=sort_scheme_id,
572
+ )
573
+ manifest_entry = ManifestEntry.of(path, manifest_entry_meta)
574
+ return manifest_entry
575
+
404
576
  @property
405
577
  def uri(self) -> Optional[str]:
406
578
  return self.get("uri")
@@ -465,3 +637,7 @@ class ManifestEntryList(List[ManifestEntry]):
465
637
  if val is not None and not isinstance(val, ManifestEntry):
466
638
  self[item] = val = ManifestEntry(val)
467
639
  return val
640
+
641
+ def __iter__(self):
642
+ for i in range(len(self)):
643
+ yield self[i] # This triggers __getitem__ conversion