deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324) hide show
  1. deltacat/__init__.py +117 -18
  2. deltacat/api.py +536 -126
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +1 -19
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2444 -282
  12. deltacat/catalog/model/catalog.py +208 -113
  13. deltacat/catalog/model/properties.py +63 -24
  14. deltacat/compute/__init__.py +14 -0
  15. deltacat/compute/compactor/compaction_session.py +97 -75
  16. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  17. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  18. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  19. deltacat/compute/compactor/repartition_session.py +8 -21
  20. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  21. deltacat/compute/compactor/steps/materialize.py +9 -7
  22. deltacat/compute/compactor/steps/repartition.py +12 -11
  23. deltacat/compute/compactor/utils/io.py +6 -5
  24. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  25. deltacat/compute/compactor/utils/system_columns.py +3 -1
  26. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  27. deltacat/compute/compactor_v2/constants.py +30 -1
  28. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  30. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  31. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  32. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  33. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  34. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  35. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  36. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  37. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  38. deltacat/compute/compactor_v2/utils/io.py +11 -4
  39. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  40. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  41. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  42. deltacat/compute/converter/constants.py +5 -0
  43. deltacat/compute/converter/converter_session.py +207 -52
  44. deltacat/compute/converter/model/convert_input.py +43 -16
  45. deltacat/compute/converter/model/convert_input_files.py +33 -16
  46. deltacat/compute/converter/model/convert_result.py +80 -0
  47. deltacat/compute/converter/model/converter_session_params.py +64 -19
  48. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  49. deltacat/compute/converter/pyiceberg/overrides.py +193 -65
  50. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  51. deltacat/compute/converter/steps/convert.py +230 -75
  52. deltacat/compute/converter/steps/dedupe.py +46 -12
  53. deltacat/compute/converter/utils/convert_task_options.py +66 -22
  54. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  55. deltacat/compute/converter/utils/iceberg_columns.py +13 -8
  56. deltacat/compute/converter/utils/io.py +173 -13
  57. deltacat/compute/converter/utils/s3u.py +42 -27
  58. deltacat/compute/janitor.py +205 -0
  59. deltacat/compute/jobs/client.py +417 -0
  60. deltacat/compute/resource_estimation/delta.py +38 -6
  61. deltacat/compute/resource_estimation/model.py +8 -0
  62. deltacat/constants.py +49 -6
  63. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  64. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  65. deltacat/env.py +10 -0
  66. deltacat/examples/basic_logging.py +6 -6
  67. deltacat/examples/compactor/aws/__init__.py +1 -0
  68. deltacat/examples/compactor/bootstrap.py +863 -0
  69. deltacat/examples/compactor/compactor.py +373 -0
  70. deltacat/examples/compactor/explorer.py +473 -0
  71. deltacat/examples/compactor/gcp/__init__.py +1 -0
  72. deltacat/examples/compactor/job_runner.py +439 -0
  73. deltacat/examples/compactor/utils/__init__.py +1 -0
  74. deltacat/examples/compactor/utils/common.py +261 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  80. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  81. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
  82. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  83. deltacat/examples/hello_world.py +4 -2
  84. deltacat/examples/indexer/indexer.py +163 -0
  85. deltacat/examples/indexer/job_runner.py +198 -0
  86. deltacat/exceptions.py +66 -4
  87. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  88. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  89. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
  90. deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
  91. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  92. deltacat/experimental/converter_agent/__init__.py +0 -0
  93. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  94. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  95. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  96. deltacat/experimental/daft/__init__.py +4 -0
  97. deltacat/experimental/daft/daft_catalog.py +229 -0
  98. deltacat/experimental/storage/__init__.py +0 -0
  99. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  100. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  101. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  102. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  103. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  104. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  105. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  107. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  108. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  109. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  110. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  111. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  112. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  113. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  114. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  115. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  116. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  117. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  118. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  119. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  120. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  121. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  122. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  123. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  124. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  125. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  126. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  127. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  128. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  129. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  130. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  131. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  132. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  133. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  134. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  135. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  136. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  137. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  138. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  139. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  140. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  141. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  142. deltacat/io/__init__.py +13 -0
  143. deltacat/io/dataset/__init__.py +0 -0
  144. deltacat/io/dataset/deltacat_dataset.py +91 -0
  145. deltacat/io/datasink/__init__.py +0 -0
  146. deltacat/io/datasink/deltacat_datasink.py +207 -0
  147. deltacat/io/datasource/__init__.py +0 -0
  148. deltacat/io/datasource/deltacat_datasource.py +579 -0
  149. deltacat/io/reader/__init__.py +0 -0
  150. deltacat/io/reader/deltacat_read_api.py +172 -0
  151. deltacat/storage/__init__.py +22 -2
  152. deltacat/storage/interface.py +54 -32
  153. deltacat/storage/main/impl.py +1494 -541
  154. deltacat/storage/model/delta.py +27 -3
  155. deltacat/storage/model/expression/__init__.py +47 -0
  156. deltacat/storage/model/expression/expression.py +656 -0
  157. deltacat/storage/model/expression/visitor.py +248 -0
  158. deltacat/storage/model/locator.py +6 -12
  159. deltacat/storage/model/manifest.py +231 -6
  160. deltacat/storage/model/metafile.py +224 -119
  161. deltacat/storage/model/namespace.py +8 -1
  162. deltacat/storage/model/partition.py +117 -42
  163. deltacat/storage/model/scan/push_down.py +32 -5
  164. deltacat/storage/model/schema.py +2427 -159
  165. deltacat/storage/model/shard.py +6 -2
  166. deltacat/storage/model/sort_key.py +40 -0
  167. deltacat/storage/model/stream.py +9 -2
  168. deltacat/storage/model/table.py +12 -1
  169. deltacat/storage/model/table_version.py +11 -0
  170. deltacat/storage/model/transaction.py +1184 -208
  171. deltacat/storage/model/transform.py +81 -2
  172. deltacat/storage/model/types.py +53 -29
  173. deltacat/storage/util/__init__.py +0 -0
  174. deltacat/storage/util/scan_planner.py +26 -0
  175. deltacat/tests/_io/reader/__init__.py +0 -0
  176. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  177. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  178. deltacat/tests/aws/test_s3u.py +2 -31
  179. deltacat/tests/catalog/data/__init__.py +0 -0
  180. deltacat/tests/catalog/main/__init__.py +0 -0
  181. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  182. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  183. deltacat/tests/catalog/model/__init__.py +0 -0
  184. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  185. deltacat/tests/catalog/test_catalogs.py +103 -106
  186. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  187. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  188. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  189. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  190. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  191. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  192. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  193. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  194. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  195. deltacat/tests/compute/conftest.py +8 -44
  196. deltacat/tests/compute/converter/test_convert_session.py +697 -349
  197. deltacat/tests/compute/converter/utils.py +15 -6
  198. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  199. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  200. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  201. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  202. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  203. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  204. deltacat/tests/compute/test_janitor.py +236 -0
  205. deltacat/tests/compute/test_util_common.py +716 -43
  206. deltacat/tests/compute/test_util_constant.py +0 -1
  207. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  208. deltacat/tests/daft/__init__.py +0 -0
  209. deltacat/tests/daft/test_model.py +97 -0
  210. deltacat/tests/experimental/__init__.py +1 -0
  211. deltacat/tests/experimental/catalog/__init__.py +0 -0
  212. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  213. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  214. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  215. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  216. deltacat/tests/experimental/daft/__init__.py +0 -0
  217. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  218. deltacat/tests/experimental/storage/__init__.py +0 -0
  219. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  220. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  221. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  222. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  223. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  224. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  225. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  226. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  227. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  228. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  229. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  230. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  231. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  232. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  233. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  234. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  235. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  236. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  237. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  238. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  239. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  240. deltacat/tests/storage/model/test_expression.py +327 -0
  241. deltacat/tests/storage/model/test_manifest.py +129 -0
  242. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  243. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  244. deltacat/tests/storage/model/test_schema.py +171 -0
  245. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  246. deltacat/tests/storage/model/test_shard.py +3 -1
  247. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  248. deltacat/tests/storage/model/test_transaction.py +393 -48
  249. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  250. deltacat/tests/test_deltacat_api.py +1036 -11
  251. deltacat/tests/test_exceptions.py +9 -5
  252. deltacat/tests/test_utils/pyarrow.py +52 -21
  253. deltacat/tests/test_utils/storage.py +23 -34
  254. deltacat/tests/types/__init__.py +0 -0
  255. deltacat/tests/types/test_tables.py +104 -0
  256. deltacat/tests/utils/exceptions.py +22 -0
  257. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  258. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  259. deltacat/tests/utils/test_daft.py +121 -31
  260. deltacat/tests/utils/test_numpy.py +1193 -0
  261. deltacat/tests/utils/test_pandas.py +1106 -0
  262. deltacat/tests/utils/test_polars.py +1040 -0
  263. deltacat/tests/utils/test_pyarrow.py +1370 -89
  264. deltacat/types/media.py +345 -37
  265. deltacat/types/tables.py +2344 -46
  266. deltacat/utils/arguments.py +33 -1
  267. deltacat/utils/daft.py +824 -40
  268. deltacat/utils/export.py +3 -1
  269. deltacat/utils/filesystem.py +139 -9
  270. deltacat/utils/metafile_locator.py +2 -1
  271. deltacat/utils/numpy.py +118 -26
  272. deltacat/utils/pandas.py +577 -48
  273. deltacat/utils/polars.py +759 -0
  274. deltacat/utils/pyarrow.py +1373 -192
  275. deltacat/utils/ray_utils/concurrency.py +1 -1
  276. deltacat/utils/ray_utils/dataset.py +101 -10
  277. deltacat/utils/ray_utils/runtime.py +56 -4
  278. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  279. deltacat/utils/url.py +1325 -0
  280. deltacat-2.0.0.dist-info/METADATA +1163 -0
  281. deltacat-2.0.0.dist-info/RECORD +439 -0
  282. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  283. deltacat/catalog/iceberg/__init__.py +0 -4
  284. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  285. deltacat/compute/merge_on_read/__init__.py +0 -4
  286. deltacat/compute/merge_on_read/daft.py +0 -40
  287. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  288. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  289. deltacat/examples/common/fixtures.py +0 -15
  290. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  291. deltacat/storage/rivulet/__init__.py +0 -11
  292. deltacat/storage/rivulet/feather/__init__.py +0 -5
  293. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  294. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  295. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  296. deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
  297. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  298. deltacat/utils/s3fs.py +0 -21
  299. deltacat-2.0.dist-info/METADATA +0 -65
  300. deltacat-2.0.dist-info/RECORD +0 -347
  301. /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
  302. /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
  303. /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
  304. /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
  305. /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
  306. /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
  307. /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
  308. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
  309. /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
  310. /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  311. /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
  312. /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
  313. /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  314. /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
  315. /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
  316. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
  317. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  318. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  319. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  320. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  321. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  322. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  323. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  324. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,248 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Dict, Generic, TypeVar, Callable, Optional
3
+ from functools import singledispatchmethod
4
+ import re
5
+
6
+ from deltacat.storage.model.expression import (
7
+ Expression,
8
+ Reference,
9
+ Literal,
10
+ BinaryExpression,
11
+ UnaryExpression,
12
+ In,
13
+ Between,
14
+ Like,
15
+ )
16
+
17
+
18
+ C = TypeVar("C") # Context type
19
+ R = TypeVar("R") # Return type
20
+
21
+
22
+ class ExpressionVisitor(ABC, Generic[C, R]):
23
+ """
24
+ Visitor pattern for deltacat expressions.
25
+
26
+ This base class provides two ways to implement visitors:
27
+ 1. Using a procedure dictionary (_PROCEDURES) - for simple, declarative visitors
28
+ 2. Using specialized visit_xyz methods with snake_case naming - for more control
29
+
30
+ Subclasses need only implement visit_reference and visit_literal, plus either:
31
+ - Define _PROCEDURES dictionary with functions for handling different expression types
32
+ - Implement specific visit_xyz methods (using snake_case) for individual expressions
33
+ """
34
+
35
+ # Default procedure dictionary for subclasses to override
36
+ _PROCEDURES: Dict[str, Callable] = {}
37
+
38
+ def __init__(self):
39
+ """Initialize visitor and validate required methods."""
40
+ # Pre-check for required methods
41
+ if not hasattr(self, "visit_reference") or not callable(
42
+ getattr(self, "visit_reference")
43
+ ):
44
+ raise NotImplementedError("Subclasses must implement visit_reference")
45
+ if not hasattr(self, "visit_literal") or not callable(
46
+ getattr(self, "visit_literal")
47
+ ):
48
+ raise NotImplementedError("Subclasses must implement visit_literal")
49
+ self._setup_default_procedure_handlers()
50
+
51
+ def _to_snake_case(self, name: str) -> str:
52
+ """Convert PascalCase or camelCase to snake_case."""
53
+ pattern = re.compile(r"(?<!^)(?=[A-Z])")
54
+ return pattern.sub("_", name).lower()
55
+
56
+ def _setup_default_procedure_handlers(self):
57
+ """Set up default procedure application methods if not overridden."""
58
+ if not hasattr(self, "_apply_binary") or not callable(
59
+ getattr(self, "_apply_binary")
60
+ ):
61
+ self._apply_binary = lambda proc, left, right: proc(left, right)
62
+ if not hasattr(self, "_apply_unary") or not callable(
63
+ getattr(self, "_apply_unary")
64
+ ):
65
+ self._apply_unary = lambda proc, operand: proc(operand)
66
+ if not hasattr(self, "_apply_in") or not callable(getattr(self, "_apply_in")):
67
+ self._apply_in = lambda proc, value, values: proc(value, values)
68
+ if not hasattr(self, "_apply_between") or not callable(
69
+ getattr(self, "_apply_between")
70
+ ):
71
+ self._apply_between = lambda proc, value, lower, upper: proc(
72
+ value, lower, upper
73
+ )
74
+ if not hasattr(self, "_apply_like") or not callable(
75
+ getattr(self, "_apply_like")
76
+ ):
77
+ self._apply_like = lambda proc, value, pattern: proc(value, pattern)
78
+
79
+ @singledispatchmethod
80
+ def visit(self, expr: Expression, context: Optional[C] = None) -> R:
81
+ """
82
+ Generic visit method that dispatches to specific methods based on expression type.
83
+
84
+ Args:
85
+ expr: The expression to visit
86
+ context: Optional context to pass through the visitor
87
+
88
+ Returns:
89
+ Result of visiting the expression
90
+ """
91
+ expr_type = type(expr).__name__
92
+ raise NotImplementedError(f"No visit method for type {expr_type}")
93
+
94
+ @visit.register
95
+ def _visit_reference(self, expr: Reference, context: Optional[C] = None) -> R:
96
+ """Visit a Reference expression."""
97
+ return self.visit_reference(expr, context)
98
+
99
+ @visit.register
100
+ def _visit_literal(self, expr: Literal, context: Optional[C] = None) -> R:
101
+ """Visit a Literal expression."""
102
+ return self.visit_literal(expr, context)
103
+
104
+ @visit.register
105
+ def _visit_binary(self, expr: BinaryExpression, context: Optional[C] = None) -> R:
106
+ """Visit a binary expression using method specialization or procedures."""
107
+ expr_type = type(expr).__name__
108
+
109
+ left_result = self.visit(expr.left, context)
110
+ right_result = self.visit(expr.right, context)
111
+
112
+ method_name = f"visit_{self._to_snake_case(expr_type)}"
113
+ if hasattr(self, method_name):
114
+ method = getattr(self, method_name)
115
+ return method(expr, context)
116
+
117
+ if expr_type in self._PROCEDURES:
118
+ return self._apply_binary(
119
+ self._PROCEDURES[expr_type], left_result, right_result
120
+ )
121
+
122
+ try:
123
+ return self.visit_binary_expression(
124
+ expr, left_result, right_result, context
125
+ )
126
+ except NotImplementedError:
127
+ raise NotImplementedError(f"No handler for {expr_type}")
128
+
129
+ @visit.register
130
+ def _visit_unary(self, expr: UnaryExpression, context: Optional[C] = None) -> R:
131
+ """Visit a unary expression using method specialization or procedures."""
132
+ expr_type = type(expr).__name__
133
+
134
+ operand_result = self.visit(expr.operand, context)
135
+
136
+ method_name = f"visit_{self._to_snake_case(expr_type)}"
137
+ if hasattr(self, method_name):
138
+ method = getattr(self, method_name)
139
+ return method(expr, context)
140
+
141
+ if expr_type in self._PROCEDURES:
142
+ return self._apply_unary(self._PROCEDURES[expr_type], operand_result)
143
+
144
+ try:
145
+ return self.visit_unary_expression(expr, operand_result, context)
146
+ except NotImplementedError:
147
+ raise NotImplementedError(f"No handler for {expr_type}")
148
+
149
+ @visit.register
150
+ def _visit_in(self, expr: In, context: Optional[C] = None) -> R:
151
+ """Visit an In expression."""
152
+ if hasattr(self, "visit_in"):
153
+ return self.visit_in(expr, context)
154
+
155
+ if "In" in self._PROCEDURES:
156
+ value_result = self.visit(expr.value, context)
157
+ values_results = [self.visit(v, context) for v in expr.values]
158
+ return self._apply_in(self._PROCEDURES["In"], value_result, values_results)
159
+
160
+ raise NotImplementedError("No handler for In expression")
161
+
162
+ @visit.register
163
+ def _visit_between(self, expr: Between, context: Optional[C] = None) -> R:
164
+ """Visit a Between expression."""
165
+ if hasattr(self, "visit_between"):
166
+ return self.visit_between(expr, context)
167
+
168
+ if "Between" in self._PROCEDURES:
169
+ value_result = self.visit(expr.value, context)
170
+ lower_result = self.visit(expr.lower, context)
171
+ upper_result = self.visit(expr.upper, context)
172
+ return self._apply_between(
173
+ self._PROCEDURES["Between"], value_result, lower_result, upper_result
174
+ )
175
+
176
+ raise NotImplementedError("No handler for Between expression")
177
+
178
+ @visit.register
179
+ def _visit_like(self, expr: Like, context: Optional[C] = None) -> R:
180
+ """Visit a Like expression."""
181
+ if hasattr(self, "visit_like"):
182
+ return self.visit_like(expr, context)
183
+
184
+ if "Like" in self._PROCEDURES:
185
+ value_result = self.visit(expr.value, context)
186
+ pattern_result = self.visit(expr.pattern, context)
187
+ return self._apply_like(
188
+ self._PROCEDURES["Like"], value_result, pattern_result
189
+ )
190
+
191
+ raise NotImplementedError("No handler for Like expression")
192
+
193
+ @abstractmethod
194
+ def visit_reference(self, expr: Reference, context: Optional[C] = None) -> R:
195
+ """Visit a Reference expression."""
196
+ pass
197
+
198
+ @abstractmethod
199
+ def visit_literal(self, expr: Literal, context: Optional[C] = None) -> R:
200
+ """Visit a Literal expression."""
201
+ pass
202
+
203
+ def visit_binary_expression(
204
+ self, expr: BinaryExpression, left: R, right: R, context: Optional[C] = None
205
+ ) -> R:
206
+ """Default fallback handler for binary expressions."""
207
+ raise NotImplementedError(f"No handler for {type(expr).__name__}")
208
+
209
+ def visit_unary_expression(
210
+ self, expr: UnaryExpression, operand: R, context: Optional[C] = None
211
+ ) -> R:
212
+ """Default fallback handler for unary expressions."""
213
+ raise NotImplementedError(f"No handler for {type(expr).__name__}")
214
+
215
+
216
+ class DisplayVisitor(ExpressionVisitor[Expression, str]):
217
+ """
218
+ Visitor implementation that formats expressions in standard infix notation.
219
+ For example: "a = b AND c > d" instead of "(AND (= a b) (> c d))".
220
+ """
221
+
222
+ # Map all expression types to their string formatting procedures with infix notation
223
+ _PROCEDURES = {
224
+ # Binary operations with infix notation
225
+ "Equal": lambda left, right: f"{left} = {right}",
226
+ "NotEqual": lambda left, right: f"{left} <> {right}",
227
+ "GreaterThan": lambda left, right: f"{left} > {right}",
228
+ "LessThan": lambda left, right: f"{left} < {right}",
229
+ "GreaterThanEqual": lambda left, right: f"{left} >= {right}",
230
+ "LessThanEqual": lambda left, right: f"{left} <= {right}",
231
+ "And": lambda left, right: f"({left} AND {right})",
232
+ "Or": lambda left, right: f"({left} OR {right})",
233
+ # Unary operations
234
+ "Not": lambda operand: f"NOT ({operand})",
235
+ "IsNull": lambda operand: f"({operand}) IS NULL",
236
+ # Special operations
237
+ "In": lambda value, values: f"{value} IN ({', '.join(values)})",
238
+ "Between": lambda value, lower, upper: f"{value} BETWEEN {lower} AND {upper}",
239
+ "Like": lambda value, pattern: f"{value} LIKE {pattern}",
240
+ }
241
+
242
+ def visit_reference(self, expr: Reference, context=None) -> str:
243
+ """Format a field reference."""
244
+ return expr.field
245
+
246
+ def visit_literal(self, expr: Literal, context=None) -> str:
247
+ """Format a literal value using its PyArrow representation."""
248
+ return str(expr.value)
@@ -90,29 +90,23 @@ class Locator:
90
90
  def canonical_string(self, separator: str = DEFAULT_NAME_SEPARATOR) -> str:
91
91
  """
92
92
  Returns a unique string for the given locator that can be used
93
- for equality checks (i.e. two locators are equal if they have
94
- the same canonical string).
93
+ for equality checks between objects with the same parent.
95
94
  """
96
- parts = []
97
- parent_hexdigest = self.parent.hexdigest() if self.parent else None
98
- if parent_hexdigest:
99
- parts.append(parent_hexdigest)
100
- parts.extend(self.name.parts())
101
- return separator.join([str(part) for part in parts])
95
+ return separator.join([str(part) for part in self.name.parts()])
102
96
 
103
97
  def digest(self) -> bytes:
104
98
  """
105
99
  Return a digest of the given locator that can be used for
106
- equality checks (i.e. two locators are equal if they have the
107
- same digest) and uniform random hash distribution.
100
+ equality checks between objects with the same parent and uniform
101
+ random hash distribution.
108
102
  """
109
103
  return sha1_digest(self.canonical_string().encode("utf-8"))
110
104
 
111
105
  def hexdigest(self) -> str:
112
106
  """
113
107
  Returns a hexdigest of the given locator suitable
114
- for use in equality (i.e. two locators are equal if they have the same
115
- hexdigest) and inclusion in URLs.
108
+ for equality checks between objects with the same parent and
109
+ inclusion in URLs.
116
110
  """
117
111
  return sha1_hexdigest(self.canonical_string().encode("utf-8"))
118
112
 
@@ -4,12 +4,26 @@ import logging
4
4
  import itertools
5
5
 
6
6
  from enum import Enum
7
- from typing import Optional, List, Dict, Any
7
+ from typing import Optional, List, Dict, Any, TYPE_CHECKING
8
8
  from uuid import uuid4
9
9
 
10
+ if TYPE_CHECKING:
11
+ from deltacat.storage.model.schema import FieldLocator
12
+
10
13
  from deltacat import logs
11
14
 
12
- from deltacat.storage.model.schema import FieldLocator
15
+ from deltacat.types.media import (
16
+ ContentType,
17
+ ContentEncoding,
18
+ EXT_TO_CONTENT_TYPE,
19
+ EXT_TO_CONTENT_ENCODING,
20
+ )
21
+
22
+ import json
23
+ import pyarrow as pa
24
+ import posixpath
25
+
26
+ from deltacat.utils.filesystem import get_file_info
13
27
 
14
28
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
15
29
 
@@ -62,7 +76,7 @@ class EntryParams(dict):
62
76
 
63
77
  @staticmethod
64
78
  def of(
65
- equality_field_locators: Optional[List[FieldLocator]] = None,
79
+ equality_field_locators: Optional[List["FieldLocator"]] = None,
66
80
  ) -> EntryParams:
67
81
  params = EntryParams()
68
82
  if equality_field_locators is not None:
@@ -70,7 +84,7 @@ class EntryParams(dict):
70
84
  return params
71
85
 
72
86
  @property
73
- def equality_field_locators(self) -> Optional[List[FieldLocator]]:
87
+ def equality_field_locators(self) -> Optional[List["FieldLocator"]]:
74
88
  return self.get("equality_field_locators")
75
89
 
76
90
 
@@ -116,11 +130,35 @@ class Manifest(dict):
116
130
  content_encoding = None
117
131
  credentials = None
118
132
  content_type_params = None
133
+ schema_id = None
134
+ sort_scheme_id = None
119
135
  if entries:
120
136
  content_type = entries[0].meta.content_type
121
137
  content_encoding = entries[0].meta.content_encoding
122
138
  credentials = entries[0].meta.credentials
123
139
  content_type_params = entries[0].meta.content_type_parameters
140
+
141
+ # Keep the latest schema ID
142
+ # Schema IDs are >= 0, and schema evolution always increments the last schema ID
143
+ entry_schema_ids = [
144
+ entry.meta.schema_id if entry.meta.schema_id is not None else -1
145
+ for entry in entries
146
+ ]
147
+ max_schema_id = max(entry_schema_ids) if entry_schema_ids else -1
148
+ schema_id = max_schema_id if max_schema_id >= 0 else None
149
+
150
+ # Handle sort_scheme_id: set to None if entries have multiple different sort_scheme_ids
151
+ entry_sort_scheme_ids = set(
152
+ entry.meta.sort_scheme_id
153
+ for entry in entries
154
+ if entry.meta.sort_scheme_id is not None
155
+ )
156
+ sort_scheme_id = (
157
+ list(entry_sort_scheme_ids)[0]
158
+ if len(entry_sort_scheme_ids) == 1
159
+ else None
160
+ )
161
+
124
162
  for entry in entries:
125
163
  meta = entry.meta
126
164
  if meta.content_type != content_type:
@@ -128,7 +166,7 @@ class Manifest(dict):
128
166
  if meta.content_encoding != content_encoding:
129
167
  content_encoding = None
130
168
  entry_content_type = meta.content_type
131
- if entry_content_type != content_type:
169
+ if content_type and entry_content_type != content_type:
132
170
  msg = (
133
171
  f"Expected all manifest entries to have content "
134
172
  f"type '{content_type}' but found "
@@ -136,7 +174,7 @@ class Manifest(dict):
136
174
  )
137
175
  raise ValueError(msg)
138
176
  entry_content_encoding = meta.get("content_encoding", None)
139
- if entry_content_encoding != content_encoding:
177
+ if content_encoding and entry_content_encoding != content_encoding:
140
178
  msg = (
141
179
  f"Expected all manifest entries to have content "
142
180
  f"encoding '{content_encoding}' but found "
@@ -188,10 +226,26 @@ class Manifest(dict):
188
226
  content_type_parameters=content_type_params,
189
227
  entry_type=entry_type,
190
228
  entry_params=entry_params,
229
+ schema_id=schema_id,
230
+ sort_scheme_id=sort_scheme_id,
191
231
  )
192
232
  manifest = Manifest._build_manifest(meta, entries, author, uuid)
193
233
  return manifest
194
234
 
235
@staticmethod
def from_json(json_string: str) -> Manifest:
    """Build a manifest from its JSON string form.

    Only the top-level ``"entries"``, ``"author"`` and ``"id"`` keys of the
    parsed object are read here; everything else is derived by
    ``Manifest.of`` from the rebuilt entries.

    Args:
        json_string: JSON document describing a manifest.

    Returns:
        The manifest produced by ``Manifest.of`` for the parsed entries,
        author and id.

    Raises:
        json.JSONDecodeError: If ``json_string`` is not valid JSON.
    """
    parsed_dict = json.loads(json_string)
    # "entries" may be missing or an explicit JSON null (parsed as None);
    # both are treated as "no entries" instead of failing on iteration.
    raw_entries = parsed_dict.get("entries") or []
    entries = ManifestEntryList.of(
        [ManifestEntry.from_dict(raw_entry) for raw_entry in raw_entries]
    )
    return Manifest.of(
        entries=entries,
        author=ManifestAuthor.from_dict(parsed_dict.get("author")),
        uuid=parsed_dict.get("id"),
    )
248
+
195
249
  @staticmethod
196
250
  def merge_manifests(
197
251
  manifests: List[Manifest], author: Optional[ManifestAuthor] = None
@@ -240,6 +294,8 @@ class ManifestMeta(dict):
240
294
  content_type_parameters: Optional[List[Dict[str, str]]] = None,
241
295
  entry_type: Optional[EntryType] = None,
242
296
  entry_params: Optional[EntryParams] = None,
297
+ schema_id: Optional[int] = None,
298
+ sort_scheme_id: Optional[str] = None,
243
299
  ) -> ManifestMeta:
244
300
  manifest_meta = ManifestMeta()
245
301
  if record_count is not None:
@@ -262,8 +318,31 @@ class ManifestMeta(dict):
262
318
  )
263
319
  if entry_params is not None:
264
320
  manifest_meta["entry_params"] = entry_params
321
+ if schema_id is not None:
322
+ manifest_meta["schema_id"] = schema_id
323
+ if sort_scheme_id is not None:
324
+ manifest_meta["sort_scheme_id"] = sort_scheme_id
265
325
  return manifest_meta
266
326
 
327
@staticmethod
def from_dict(obj: dict) -> Optional[ManifestMeta]:
    """Build manifest metadata from a plain dictionary.

    Args:
        obj: Dictionary to read metadata fields from, or None.

    Returns:
        None when ``obj`` is None; otherwise the result of
        ``ManifestMeta.of`` called with each known field looked up in
        ``obj`` (missing keys are passed as None).
    """
    if obj is None:
        return None

    # Each dictionary key has the same name as the ManifestMeta.of keyword
    # it feeds, so the keyword arguments can be built directly from the keys.
    field_names = (
        "record_count",
        "content_length",
        "content_type",
        "content_encoding",
        "source_content_length",
        "credentials",
        "content_type_parameters",
        "entry_type",
        "entry_params",
        "schema_id",
        "sort_scheme_id",
    )
    return ManifestMeta.of(**{name: obj.get(name) for name in field_names})
345
+
267
346
  @property
268
347
  def record_count(self) -> Optional[int]:
269
348
  return self.get("record_count")
@@ -310,6 +389,14 @@ class ManifestMeta(dict):
310
389
  self["entry_params"] = val = EntryParams(val)
311
390
  return val
312
391
 
392
@property
def schema_id(self) -> Optional[int]:
    """Return the value stored under ``"schema_id"``, or None if absent."""
    stored_schema_id = self.get("schema_id")
    return stored_schema_id
395
+
396
@property
def sort_scheme_id(self) -> Optional[str]:
    """Return the value stored under ``"sort_scheme_id"``, or None if absent."""
    stored_scheme_id = self.get("sort_scheme_id")
    return stored_scheme_id
399
+
313
400
 
314
401
  class ManifestEntry(dict):
315
402
  @staticmethod
@@ -342,6 +429,10 @@ class ManifestEntry(dict):
342
429
  url: str,
343
430
  record_count: int,
344
431
  source_content_length: Optional[int] = None,
432
+ credentials: Optional[Dict[str, str]] = None,
433
+ content_type_parameters: Optional[List[Dict[str, str]]] = None,
434
+ entry_type: Optional[EntryType] = None,
435
+ entry_params: Optional[EntryParams] = None,
345
436
  **s3_client_kwargs,
346
437
  ) -> ManifestEntry:
347
438
  from deltacat.aws import s3u as s3_utils
@@ -354,10 +445,134 @@ class ManifestEntry(dict):
354
445
  content_type=s3_obj["ContentType"],
355
446
  content_encoding=s3_obj["ContentEncoding"],
356
447
  source_content_length=source_content_length,
448
+ credentials=credentials,
449
+ content_type_parameters=content_type_parameters,
450
+ entry_type=entry_type,
451
+ entry_params=entry_params,
357
452
  )
358
453
  manifest_entry = ManifestEntry.of(url, manifest_entry_meta)
359
454
  return manifest_entry
360
455
 
456
@staticmethod
def from_dict(obj: dict) -> ManifestEntry:
    """Build a manifest entry from a plain dictionary.

    Args:
        obj: Dictionary holding the entry's ``"url"``, ``"uri"``,
            ``"meta"``, ``"mandatory"`` and ``"id"`` values. A missing
            ``"mandatory"`` key is read as True; other missing keys are
            read as None.

    Returns:
        The entry produced by ``ManifestEntry.of`` for those values, with
        ``"meta"`` first converted through ``ManifestMeta.from_dict``.
    """
    entry_url = obj.get("url")
    entry_uri = obj.get("uri")
    entry_meta = ManifestMeta.from_dict(obj.get("meta"))
    is_mandatory = obj.get("mandatory", True)
    entry_id = obj.get("id")
    return ManifestEntry.of(
        url=entry_url,
        uri=entry_uri,
        meta=entry_meta,
        mandatory=is_mandatory,
        uuid=entry_id,
    )
465
+
466
@staticmethod
def from_path(
    path: str,
    filesystem: pa.fs.FileSystem,
    record_count: int,
    source_content_length: Optional[int] = None,
    content_type: Optional[str] = None,
    content_encoding: Optional[str] = None,
    credentials: Optional[Dict[str, str]] = None,
    content_type_parameters: Optional[List[Dict[str, str]]] = None,
    entry_type: Optional[EntryType] = None,
    entry_params: Optional[EntryParams] = None,
    schema_id: Optional[int] = None,
    sort_scheme_id: Optional[str] = None,
) -> ManifestEntry:
    """
    Creates a manifest entry for a file reachable through a pyarrow
    filesystem.

    The content encoding and content type are derived from the file's
    extensions unless explicitly overridden: the last extension is first
    looked up as an encoding; if it is one, the extension before it is
    looked up as the content type, otherwise the last extension itself is.
    Unknown or missing extensions fall back to binary content with
    identity encoding.

    Args:
        path: Path to the file.
        filesystem: PyArrow filesystem used to look up the file.
        record_count: Number of records in the file.
        source_content_length: Optional original content length in-memory
            before writing to disk.
        content_type: Optional content type override. If not provided, it
            is derived from the file extension.
        content_encoding: Optional content encoding override. If not
            provided, it is derived from the file extension.
        credentials: Optional credentials required to read this manifest
            entry.
        content_type_parameters: Optional content type parameters.
        entry_type: Optional entry type of this manifest entry. Defaults
            to DATA.
        entry_params: Optional entry type parameters.
        schema_id: Schema ID used to write this manifest entry.
        sort_scheme_id: Sort scheme ID used to write this manifest entry.

    Returns:
        A ManifestEntry whose URL is ``path`` and whose content length is
        the size reported by the filesystem.

    Raises:
        FileNotFoundError: If ``path`` does not resolve to a regular file.
    """
    info = get_file_info(path, filesystem)
    if info.type != pa.fs.FileType.File:
        raise FileNotFoundError(f"Path does not point to a file: {path}")

    # Fallbacks used when the path carries no recognizable extension.
    guessed_type = ContentType.BINARY
    guessed_encoding = ContentEncoding.IDENTITY

    stem, outer_ext = posixpath.splitext(path)
    if outer_ext:
        guessed_encoding = EXT_TO_CONTENT_ENCODING.get(
            outer_ext,
            ContentEncoding.IDENTITY,
        )
        # Pick the extension that should name the content type: the one
        # before a recognized encoding extension, else the last one itself.
        if guessed_encoding != ContentEncoding.IDENTITY:
            type_ext = posixpath.splitext(stem)[1]
        else:
            type_ext = outer_ext
        if type_ext:
            guessed_type = EXT_TO_CONTENT_TYPE.get(
                type_ext,
                ContentType.BINARY,
            )

    if (
        guessed_encoding != ContentEncoding.IDENTITY
        and guessed_type == ContentType.BINARY
    ):
        logger.debug(
            f"Found encoding {guessed_encoding.value} but no "
            f"content type for {path}, assuming binary"
        )

    # Explicit overrides win; derived values only fill in what is missing.
    if content_type is None:
        content_type = guessed_type.value
    if content_encoding is None:
        content_encoding = guessed_encoding.value

    entry_meta = ManifestMeta.of(
        record_count=record_count,
        content_length=info.size,
        content_type=content_type,
        content_encoding=content_encoding,
        source_content_length=source_content_length,
        credentials=credentials,
        content_type_parameters=content_type_parameters,
        entry_type=entry_type,
        entry_params=entry_params,
        schema_id=schema_id,
        sort_scheme_id=sort_scheme_id,
    )
    return ManifestEntry.of(path, entry_meta)
575
+
361
576
  @property
362
577
  def uri(self) -> Optional[str]:
363
578
  return self.get("uri")
@@ -392,6 +607,12 @@ class ManifestAuthor(dict):
392
607
  manifest_author["version"] = version
393
608
  return manifest_author
394
609
 
610
@staticmethod
def from_dict(obj: dict) -> Optional[ManifestAuthor]:
    """Build a manifest author from a plain dictionary.

    Args:
        obj: Dictionary holding the author's ``"name"`` and ``"version"``
            values, or None.

    Returns:
        None when ``obj`` is None; otherwise the result of
        ``ManifestAuthor.of`` for the name and version found in ``obj``.
    """
    if obj is None:
        return None
    author_name = obj.get("name")
    author_version = obj.get("version")
    return ManifestAuthor.of(author_name, author_version)
615
+
395
616
  @property
396
617
  def name(self) -> Optional[str]:
397
618
  return self.get("name")
@@ -416,3 +637,7 @@ class ManifestEntryList(List[ManifestEntry]):
416
637
  if val is not None and not isinstance(val, ManifestEntry):
417
638
  self[item] = val = ManifestEntry(val)
418
639
  return val
640
+
641
def __iter__(self):
    """Iterate over the list by index rather than by raw storage.

    Every element is fetched with ``self[i]`` so that it passes through
    this class's ``__getitem__`` (and the conversion it performs) before
    being yielded.
    """
    yield from (self[position] for position in range(len(self)))