@lix-js/sdk 0.6.0-preview.1 → 0.6.0-preview.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (191) hide show
  1. package/SKILL.md +305 -320
  2. package/dist/engine-wasm/wasm/lix_engine.d.ts +5 -0
  3. package/dist/engine-wasm/wasm/lix_engine.js +9 -13
  4. package/dist/engine-wasm/wasm/lix_engine.wasm +0 -0
  5. package/dist/engine-wasm/wasm/lix_engine.wasm.d.ts +1 -0
  6. package/dist/open-lix.d.ts +103 -14
  7. package/dist/open-lix.js +3 -0
  8. package/dist/sqlite/index.js +99 -22
  9. package/dist-engine-src/README.md +18 -0
  10. package/dist-engine-src/src/backend/kv.rs +358 -0
  11. package/dist-engine-src/src/backend/mod.rs +12 -0
  12. package/dist-engine-src/src/backend/testing.rs +658 -0
  13. package/dist-engine-src/src/backend/types.rs +96 -0
  14. package/dist-engine-src/src/binary_cas/chunking.rs +31 -0
  15. package/dist-engine-src/src/binary_cas/codec.rs +346 -0
  16. package/dist-engine-src/src/binary_cas/context.rs +139 -0
  17. package/dist-engine-src/src/binary_cas/kv.rs +1063 -0
  18. package/dist-engine-src/src/binary_cas/mod.rs +11 -0
  19. package/dist-engine-src/src/binary_cas/types.rs +127 -0
  20. package/dist-engine-src/src/cel/context.rs +86 -0
  21. package/dist-engine-src/src/cel/error.rs +19 -0
  22. package/dist-engine-src/src/cel/mod.rs +8 -0
  23. package/dist-engine-src/src/cel/provider.rs +9 -0
  24. package/dist-engine-src/src/cel/runtime.rs +167 -0
  25. package/dist-engine-src/src/cel/value.rs +50 -0
  26. package/dist-engine-src/src/changelog/codec.rs +321 -0
  27. package/dist-engine-src/src/changelog/context.rs +92 -0
  28. package/dist-engine-src/src/changelog/materialization.rs +121 -0
  29. package/dist-engine-src/src/changelog/mod.rs +13 -0
  30. package/dist-engine-src/src/changelog/reader.rs +20 -0
  31. package/dist-engine-src/src/changelog/storage.rs +220 -0
  32. package/dist-engine-src/src/changelog/types.rs +38 -0
  33. package/dist-engine-src/src/commit_graph/context.rs +1588 -0
  34. package/dist-engine-src/src/commit_graph/mod.rs +12 -0
  35. package/dist-engine-src/src/commit_graph/types.rs +145 -0
  36. package/dist-engine-src/src/commit_graph/walker.rs +780 -0
  37. package/dist-engine-src/src/common/error.rs +313 -0
  38. package/dist-engine-src/src/common/fingerprint.rs +3 -0
  39. package/dist-engine-src/src/common/fs_path.rs +1336 -0
  40. package/dist-engine-src/src/common/identity.rs +135 -0
  41. package/dist-engine-src/src/common/metadata.rs +35 -0
  42. package/dist-engine-src/src/common/mod.rs +23 -0
  43. package/dist-engine-src/src/common/types.rs +105 -0
  44. package/dist-engine-src/src/common/wire.rs +222 -0
  45. package/dist-engine-src/src/engine.rs +239 -0
  46. package/dist-engine-src/src/entity_identity.rs +285 -0
  47. package/dist-engine-src/src/functions/context.rs +327 -0
  48. package/dist-engine-src/src/functions/deterministic.rs +113 -0
  49. package/dist-engine-src/src/functions/mod.rs +18 -0
  50. package/dist-engine-src/src/functions/provider.rs +130 -0
  51. package/dist-engine-src/src/functions/state.rs +363 -0
  52. package/dist-engine-src/src/functions/types.rs +37 -0
  53. package/dist-engine-src/src/init.rs +505 -0
  54. package/dist-engine-src/src/json_store/compression.rs +77 -0
  55. package/dist-engine-src/src/json_store/context.rs +129 -0
  56. package/dist-engine-src/src/json_store/encoded.rs +15 -0
  57. package/dist-engine-src/src/json_store/mod.rs +9 -0
  58. package/dist-engine-src/src/json_store/store.rs +236 -0
  59. package/dist-engine-src/src/json_store/types.rs +52 -0
  60. package/dist-engine-src/src/lib.rs +61 -0
  61. package/dist-engine-src/src/live_state/context.rs +2241 -0
  62. package/dist-engine-src/src/live_state/mod.rs +15 -0
  63. package/dist-engine-src/src/live_state/overlay.rs +75 -0
  64. package/dist-engine-src/src/live_state/reader.rs +23 -0
  65. package/dist-engine-src/src/live_state/types.rs +239 -0
  66. package/dist-engine-src/src/live_state/visibility.rs +218 -0
  67. package/dist-engine-src/src/plugin/archive.rs +441 -0
  68. package/dist-engine-src/src/plugin/component.rs +183 -0
  69. package/dist-engine-src/src/plugin/install.rs +637 -0
  70. package/dist-engine-src/src/plugin/manifest.rs +516 -0
  71. package/dist-engine-src/src/plugin/materializer.rs +477 -0
  72. package/dist-engine-src/src/plugin/mod.rs +33 -0
  73. package/dist-engine-src/src/plugin/plugin_manifest.json +119 -0
  74. package/dist-engine-src/src/plugin/storage.rs +74 -0
  75. package/dist-engine-src/src/schema/annotations/defaults.rs +280 -0
  76. package/dist-engine-src/src/schema/annotations/mod.rs +1 -0
  77. package/dist-engine-src/src/schema/builtin/lix_account.json +22 -0
  78. package/dist-engine-src/src/schema/builtin/lix_active_account.json +30 -0
  79. package/dist-engine-src/src/schema/builtin/lix_binary_blob_ref.json +30 -0
  80. package/dist-engine-src/src/schema/builtin/lix_change.json +62 -0
  81. package/dist-engine-src/src/schema/builtin/lix_change_author.json +46 -0
  82. package/dist-engine-src/src/schema/builtin/lix_change_set.json +18 -0
  83. package/dist-engine-src/src/schema/builtin/lix_change_set_element.json +75 -0
  84. package/dist-engine-src/src/schema/builtin/lix_commit.json +62 -0
  85. package/dist-engine-src/src/schema/builtin/lix_commit_edge.json +46 -0
  86. package/dist-engine-src/src/schema/builtin/lix_directory_descriptor.json +53 -0
  87. package/dist-engine-src/src/schema/builtin/lix_entity_label.json +63 -0
  88. package/dist-engine-src/src/schema/builtin/lix_file_descriptor.json +53 -0
  89. package/dist-engine-src/src/schema/builtin/lix_key_value.json +41 -0
  90. package/dist-engine-src/src/schema/builtin/lix_label.json +22 -0
  91. package/dist-engine-src/src/schema/builtin/lix_registered_schema.json +31 -0
  92. package/dist-engine-src/src/schema/builtin/lix_version_descriptor.json +35 -0
  93. package/dist-engine-src/src/schema/builtin/lix_version_ref.json +49 -0
  94. package/dist-engine-src/src/schema/builtin/mod.rs +271 -0
  95. package/dist-engine-src/src/schema/definition.json +157 -0
  96. package/dist-engine-src/src/schema/definition.rs +636 -0
  97. package/dist-engine-src/src/schema/key.rs +206 -0
  98. package/dist-engine-src/src/schema/mod.rs +20 -0
  99. package/dist-engine-src/src/schema/seed.rs +14 -0
  100. package/dist-engine-src/src/schema/tests.rs +739 -0
  101. package/dist-engine-src/src/schema_registry.rs +294 -0
  102. package/dist-engine-src/src/session/context.rs +366 -0
  103. package/dist-engine-src/src/session/create_version.rs +80 -0
  104. package/dist-engine-src/src/session/execute.rs +447 -0
  105. package/dist-engine-src/src/session/merge/analysis.rs +102 -0
  106. package/dist-engine-src/src/session/merge/apply.rs +23 -0
  107. package/dist-engine-src/src/session/merge/conflicts.rs +62 -0
  108. package/dist-engine-src/src/session/merge/mod.rs +11 -0
  109. package/dist-engine-src/src/session/merge/stats.rs +65 -0
  110. package/dist-engine-src/src/session/merge/version.rs +437 -0
  111. package/dist-engine-src/src/session/mod.rs +25 -0
  112. package/dist-engine-src/src/session/switch_version.rs +121 -0
  113. package/dist-engine-src/src/sql2/change_provider.rs +337 -0
  114. package/dist-engine-src/src/sql2/classify.rs +147 -0
  115. package/dist-engine-src/src/sql2/commit_derived_provider.rs +591 -0
  116. package/dist-engine-src/src/sql2/context.rs +307 -0
  117. package/dist-engine-src/src/sql2/directory_history_provider.rs +623 -0
  118. package/dist-engine-src/src/sql2/directory_provider.rs +2405 -0
  119. package/dist-engine-src/src/sql2/dml.rs +148 -0
  120. package/dist-engine-src/src/sql2/entity_history_provider.rs +444 -0
  121. package/dist-engine-src/src/sql2/entity_provider.rs +2700 -0
  122. package/dist-engine-src/src/sql2/error.rs +196 -0
  123. package/dist-engine-src/src/sql2/execute.rs +3379 -0
  124. package/dist-engine-src/src/sql2/file_history_provider.rs +902 -0
  125. package/dist-engine-src/src/sql2/file_provider.rs +3254 -0
  126. package/dist-engine-src/src/sql2/filesystem_planner.rs +1526 -0
  127. package/dist-engine-src/src/sql2/filesystem_predicates.rs +159 -0
  128. package/dist-engine-src/src/sql2/filesystem_visibility.rs +369 -0
  129. package/dist-engine-src/src/sql2/history_projection.rs +80 -0
  130. package/dist-engine-src/src/sql2/history_provider.rs +418 -0
  131. package/dist-engine-src/src/sql2/history_route.rs +643 -0
  132. package/dist-engine-src/src/sql2/lix_state_provider.rs +2430 -0
  133. package/dist-engine-src/src/sql2/mod.rs +43 -0
  134. package/dist-engine-src/src/sql2/read_only.rs +65 -0
  135. package/dist-engine-src/src/sql2/record_batch.rs +17 -0
  136. package/dist-engine-src/src/sql2/result_metadata.rs +29 -0
  137. package/dist-engine-src/src/sql2/runtime.rs +60 -0
  138. package/dist-engine-src/src/sql2/session.rs +135 -0
  139. package/dist-engine-src/src/sql2/udfs/common.rs +295 -0
  140. package/dist-engine-src/src/sql2/udfs/lix_active_version_commit_id.rs +53 -0
  141. package/dist-engine-src/src/sql2/udfs/lix_empty_blob.rs +47 -0
  142. package/dist-engine-src/src/sql2/udfs/lix_json.rs +100 -0
  143. package/dist-engine-src/src/sql2/udfs/lix_json_get.rs +99 -0
  144. package/dist-engine-src/src/sql2/udfs/lix_json_get_text.rs +99 -0
  145. package/dist-engine-src/src/sql2/udfs/lix_text_decode.rs +82 -0
  146. package/dist-engine-src/src/sql2/udfs/lix_text_encode.rs +85 -0
  147. package/dist-engine-src/src/sql2/udfs/lix_uuid_v7.rs +76 -0
  148. package/dist-engine-src/src/sql2/udfs/mod.rs +82 -0
  149. package/dist-engine-src/src/sql2/version_provider.rs +1187 -0
  150. package/dist-engine-src/src/sql2/version_scope.rs +394 -0
  151. package/dist-engine-src/src/sql2/write_normalization.rs +345 -0
  152. package/dist-engine-src/src/storage/context.rs +356 -0
  153. package/dist-engine-src/src/storage/mod.rs +14 -0
  154. package/dist-engine-src/src/storage/read_scope.rs +88 -0
  155. package/dist-engine-src/src/storage/types.rs +501 -0
  156. package/dist-engine-src/src/storage_bench.rs +3406 -0
  157. package/dist-engine-src/src/test_support.rs +81 -0
  158. package/dist-engine-src/src/tracked_state/by_file_index.rs +102 -0
  159. package/dist-engine-src/src/tracked_state/codec.rs +747 -0
  160. package/dist-engine-src/src/tracked_state/context.rs +983 -0
  161. package/dist-engine-src/src/tracked_state/diff.rs +494 -0
  162. package/dist-engine-src/src/tracked_state/materialization.rs +141 -0
  163. package/dist-engine-src/src/tracked_state/merge.rs +474 -0
  164. package/dist-engine-src/src/tracked_state/mod.rs +31 -0
  165. package/dist-engine-src/src/tracked_state/rebuild.rs +771 -0
  166. package/dist-engine-src/src/tracked_state/storage.rs +243 -0
  167. package/dist-engine-src/src/tracked_state/tree.rs +2744 -0
  168. package/dist-engine-src/src/tracked_state/tree_types.rs +176 -0
  169. package/dist-engine-src/src/tracked_state/types.rs +61 -0
  170. package/dist-engine-src/src/transaction/commit.rs +1224 -0
  171. package/dist-engine-src/src/transaction/context.rs +1307 -0
  172. package/dist-engine-src/src/transaction/live_state_overlay.rs +34 -0
  173. package/dist-engine-src/src/transaction/mod.rs +11 -0
  174. package/dist-engine-src/src/transaction/normalization.rs +1026 -0
  175. package/dist-engine-src/src/transaction/schema_resolver.rs +127 -0
  176. package/dist-engine-src/src/transaction/staging.rs +1436 -0
  177. package/dist-engine-src/src/transaction/types.rs +351 -0
  178. package/dist-engine-src/src/transaction/validation.rs +4811 -0
  179. package/dist-engine-src/src/untracked_state/codec.rs +363 -0
  180. package/dist-engine-src/src/untracked_state/context.rs +82 -0
  181. package/dist-engine-src/src/untracked_state/materialization.rs +157 -0
  182. package/dist-engine-src/src/untracked_state/mod.rs +17 -0
  183. package/dist-engine-src/src/untracked_state/storage.rs +348 -0
  184. package/dist-engine-src/src/untracked_state/types.rs +96 -0
  185. package/dist-engine-src/src/version/context.rs +52 -0
  186. package/dist-engine-src/src/version/mod.rs +12 -0
  187. package/dist-engine-src/src/version/refs.rs +421 -0
  188. package/dist-engine-src/src/version/stage_rows.rs +71 -0
  189. package/dist-engine-src/src/version/types.rs +21 -0
  190. package/dist-engine-src/src/wasm/mod.rs +60 -0
  191. package/package.json +68 -64
@@ -0,0 +1,3254 @@
1
+ use std::any::Any;
2
+ use std::collections::{BTreeMap, BTreeSet};
3
+ use std::sync::Arc;
4
+
5
+ use async_trait::async_trait;
6
+ use datafusion::arrow::array::{
7
+ ArrayRef, BinaryArray, BooleanArray, RecordBatchOptions, StringArray, UInt64Array,
8
+ };
9
+ use datafusion::arrow::compute::{and, filter_record_batch};
10
+ use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef};
11
+ use datafusion::arrow::record_batch::RecordBatch;
12
+ use datafusion::catalog::{Session, TableProvider};
13
+ use datafusion::common::{not_impl_err, DFSchema, DataFusionError, Result, ScalarValue};
14
+ use datafusion::datasource::TableType;
15
+ use datafusion::execution::TaskContext;
16
+ use datafusion::logical_expr::dml::InsertOp;
17
+ use datafusion::logical_expr::{Expr, TableProviderFilterPushDown};
18
+ use datafusion::physical_expr::{create_physical_expr, EquivalenceProperties, PhysicalExpr};
19
+ use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType, PlanProperties};
20
+ use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
21
+ use datafusion::physical_plan::{
22
+ DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, SendableRecordBatchStream,
23
+ };
24
+ use datafusion::prelude::SessionContext;
25
+ use futures_util::{stream, TryStreamExt};
26
+ use serde::Deserialize;
27
+
28
+ use crate::binary_cas::{BlobDataReader, BlobHash};
29
+ use crate::functions::FunctionProviderHandle;
30
+ use crate::live_state::LiveStateRow;
31
+ use crate::live_state::{
32
+ LiveStateFilter, LiveStateProjection, LiveStateReader, LiveStateScanRequest,
33
+ };
34
+ use crate::sql2::dml::{InsertExec, InsertSink};
35
+ use crate::sql2::filesystem_predicates::{
36
+ canonicalize_filesystem_path_filters, FilesystemPathKind,
37
+ };
38
+ use crate::sql2::version_scope::{
39
+ explicit_version_ids_from_dml_filters, resolve_provider_version_ids,
40
+ resolve_write_version_scope, VersionBinding,
41
+ };
42
+ use crate::sql2::write_normalization::{
43
+ is_binary_type, lix_file_data_type_error, lix_file_data_type_error_with_value,
44
+ logical_expr_is_binary_or_null, reject_non_binary_casts_for_insert_column,
45
+ scalar_is_binary_or_null, InsertCell, InsertColumnIntents, SqlCell, UpdateAssignmentValues,
46
+ UpdateCell,
47
+ };
48
+ use crate::transaction::types::StageRow;
49
+ use crate::version::VersionRefReader;
50
+ use crate::{parse_row_metadata, serialize_row_metadata, LixError, RowMetadata};
51
+
52
+ const FILE_DESCRIPTOR_SCHEMA_KEY: &str = "lix_file_descriptor";
53
+ const BLOB_REF_SCHEMA_KEY: &str = "lix_binary_blob_ref";
54
+ const DIRECTORY_DESCRIPTOR_SCHEMA_KEY: &str = "lix_directory_descriptor";
55
+
56
+ use super::filesystem_planner::{
57
+ blob_ref_row, directory_path_resolvers_from_state_rows, file_descriptor_row,
58
+ file_descriptor_write_row, filesystem_storage_scope_key, plan_file_delete,
59
+ plan_file_path_update, BlobRefRowInput, DirectoryPathResolver, FileDeleteInput,
60
+ FileDescriptorRowInput, FileDescriptorWriteIntent, FilePathWriteInput, FilesystemDeletePlan,
61
+ FilesystemRowContext,
62
+ };
63
+ use super::result_metadata::json_field;
64
+ use crate::sql2::{
65
+ SqlWriteContext, WriteAccess, WriteContextLiveStateReader, WriteContextVersionRefReader,
66
+ };
67
+ use crate::transaction::types::{
68
+ LogicalPrimaryKey, StageFileData, StageRowOrigin, StageWrite, StageWriteMode,
69
+ StageWriteOperation,
70
+ };
71
+
72
+ pub(crate) async fn register_lix_file_providers(
73
+ session: &SessionContext,
74
+ active_version_id: &str,
75
+ live_state: Arc<dyn LiveStateReader>,
76
+ version_ref: Arc<dyn VersionRefReader>,
77
+ blob_reader: Arc<dyn BlobDataReader>,
78
+ functions: FunctionProviderHandle,
79
+ ) -> Result<(), LixError> {
80
+ session
81
+ .register_table(
82
+ "lix_file_by_version",
83
+ Arc::new(LixFileProvider::by_version(
84
+ Arc::clone(&live_state),
85
+ Arc::clone(&version_ref),
86
+ Arc::clone(&blob_reader),
87
+ functions.clone(),
88
+ )),
89
+ )
90
+ .map_err(datafusion_error_to_lix_error)?;
91
+ session
92
+ .register_table(
93
+ "lix_file",
94
+ Arc::new(LixFileProvider::active_version(
95
+ active_version_id,
96
+ live_state,
97
+ version_ref,
98
+ Arc::clone(&blob_reader),
99
+ functions,
100
+ )),
101
+ )
102
+ .map_err(datafusion_error_to_lix_error)?;
103
+ Ok(())
104
+ }
105
+
106
+ pub(crate) async fn register_lix_file_write_providers(
107
+ session: &SessionContext,
108
+ write_ctx: SqlWriteContext,
109
+ ) -> Result<(), LixError> {
110
+ session
111
+ .register_table(
112
+ "lix_file_by_version",
113
+ Arc::new(LixFileProvider::by_version_with_write(write_ctx.clone())),
114
+ )
115
+ .map_err(datafusion_error_to_lix_error)?;
116
+ session
117
+ .register_table(
118
+ "lix_file",
119
+ Arc::new(LixFileProvider::active_version_with_write(write_ctx)),
120
+ )
121
+ .map_err(datafusion_error_to_lix_error)?;
122
+ Ok(())
123
+ }
124
+
125
+ pub(crate) struct LixFileProvider {
126
+ schema: SchemaRef,
127
+ live_state: Arc<dyn LiveStateReader>,
128
+ version_ref: Arc<dyn VersionRefReader>,
129
+ blob_reader: Arc<dyn BlobDataReader>,
130
+ write_access: WriteAccess,
131
+ functions: FunctionProviderHandle,
132
+ version_binding: VersionBinding,
133
+ }
134
+
135
+ impl std::fmt::Debug for LixFileProvider {
136
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
137
+ f.debug_struct("LixFileProvider").finish()
138
+ }
139
+ }
140
+
141
+ impl LixFileProvider {
142
+ pub(crate) fn active_version(
143
+ active_version_id: impl Into<String>,
144
+ live_state: Arc<dyn LiveStateReader>,
145
+ version_ref: Arc<dyn VersionRefReader>,
146
+ blob_reader: Arc<dyn BlobDataReader>,
147
+ functions: FunctionProviderHandle,
148
+ ) -> Self {
149
+ Self {
150
+ schema: lix_file_schema(),
151
+ live_state,
152
+ version_ref,
153
+ blob_reader,
154
+ write_access: WriteAccess::read_only(),
155
+ functions,
156
+ version_binding: VersionBinding::active(active_version_id),
157
+ }
158
+ }
159
+
160
+ pub(crate) fn active_version_with_write(write_ctx: SqlWriteContext) -> Self {
161
+ let active_version_id = write_ctx.active_version_id();
162
+ let functions = write_ctx.functions();
163
+ let live_state = Arc::new(WriteContextLiveStateReader::new(write_ctx.clone()));
164
+ let version_ref = Arc::new(WriteContextVersionRefReader::new(write_ctx.clone()));
165
+ let blob_reader = write_ctx.blob_reader();
166
+ Self {
167
+ schema: lix_file_schema(),
168
+ live_state,
169
+ version_ref,
170
+ blob_reader,
171
+ write_access: WriteAccess::write(write_ctx),
172
+ functions,
173
+ version_binding: VersionBinding::active(active_version_id),
174
+ }
175
+ }
176
+
177
+ pub(crate) fn by_version(
178
+ live_state: Arc<dyn LiveStateReader>,
179
+ version_ref: Arc<dyn VersionRefReader>,
180
+ blob_reader: Arc<dyn BlobDataReader>,
181
+ functions: FunctionProviderHandle,
182
+ ) -> Self {
183
+ Self {
184
+ schema: lix_file_by_version_schema(),
185
+ live_state,
186
+ version_ref,
187
+ blob_reader,
188
+ write_access: WriteAccess::read_only(),
189
+ functions,
190
+ version_binding: VersionBinding::explicit(),
191
+ }
192
+ }
193
+
194
+ pub(crate) fn by_version_with_write(write_ctx: SqlWriteContext) -> Self {
195
+ let functions = write_ctx.functions();
196
+ let live_state = Arc::new(WriteContextLiveStateReader::new(write_ctx.clone()));
197
+ let version_ref = Arc::new(WriteContextVersionRefReader::new(write_ctx.clone()));
198
+ let blob_reader = write_ctx.blob_reader();
199
+ Self {
200
+ schema: lix_file_by_version_schema(),
201
+ live_state,
202
+ version_ref,
203
+ blob_reader,
204
+ write_access: WriteAccess::write(write_ctx),
205
+ functions,
206
+ version_binding: VersionBinding::explicit(),
207
+ }
208
+ }
209
+ }
210
+
211
+ #[async_trait]
212
+ impl TableProvider for LixFileProvider {
213
+ fn as_any(&self) -> &dyn Any {
214
+ self
215
+ }
216
+
217
+ fn schema(&self) -> SchemaRef {
218
+ Arc::clone(&self.schema)
219
+ }
220
+
221
+ fn table_type(&self) -> TableType {
222
+ TableType::Base
223
+ }
224
+
225
+ fn supports_filters_pushdown(
226
+ &self,
227
+ filters: &[&Expr],
228
+ ) -> Result<Vec<TableProviderFilterPushDown>> {
229
+ Ok(filters
230
+ .iter()
231
+ .map(|_| TableProviderFilterPushDown::Exact)
232
+ .collect())
233
+ }
234
+
235
+ async fn scan(
236
+ &self,
237
+ _state: &dyn Session,
238
+ projection: Option<&Vec<usize>>,
239
+ filters: &[Expr],
240
+ limit: Option<usize>,
241
+ ) -> Result<Arc<dyn ExecutionPlan>> {
242
+ let projected_schema = projected_schema(&self.schema, projection)?;
243
+ let scan_limit = if filters.is_empty() { limit } else { None };
244
+ let mut request =
245
+ lix_file_scan_request(self.version_binding.active_version_id(), scan_limit);
246
+ if self.write_access.is_write() && matches!(self.version_binding, VersionBinding::Explicit)
247
+ {
248
+ request.filter.version_ids = explicit_version_ids_from_dml_filters(filters);
249
+ if request.filter.version_ids.is_empty() {
250
+ return Err(DataFusionError::Plan(
251
+ "DELETE FROM lix_file_by_version requires an explicit lixcol_version_id predicate"
252
+ .to_string(),
253
+ ));
254
+ }
255
+ }
256
+ request.filter.version_ids = resolve_provider_version_ids(
257
+ self.version_ref.as_ref(),
258
+ &self.version_binding,
259
+ request.filter.version_ids,
260
+ )
261
+ .await
262
+ .map_err(lix_error_to_datafusion_error)?;
263
+ let filters = canonicalize_filesystem_path_filters(filters, FilesystemPathKind::File)?;
264
+ let df_schema = DFSchema::try_from(Arc::clone(&self.schema))?;
265
+ let physical_filters = filters
266
+ .iter()
267
+ .map(|expr| create_physical_expr(expr, &df_schema, _state.execution_props()))
268
+ .collect::<Result<Vec<_>>>()?;
269
+ Ok(Arc::new(LixFileScanExec::new(
270
+ Arc::clone(&self.live_state),
271
+ Arc::clone(&self.blob_reader),
272
+ Arc::clone(&self.schema),
273
+ projected_schema,
274
+ projection.cloned(),
275
+ request,
276
+ physical_filters,
277
+ limit,
278
+ )))
279
+ }
280
+
281
+ async fn insert_into(
282
+ &self,
283
+ _state: &dyn Session,
284
+ input: Arc<dyn ExecutionPlan>,
285
+ insert_op: InsertOp,
286
+ ) -> Result<Arc<dyn ExecutionPlan>> {
287
+ if insert_op != InsertOp::Append {
288
+ return not_impl_err!("{insert_op} not implemented for lix_file yet");
289
+ }
290
+
291
+ let write_ctx = self.write_access.require_write("INSERT into lix_file")?;
292
+ let insert_column_intents = InsertColumnIntents::from_input(&input);
293
+ let include_data_writes = insert_column_intents.includes_column("data");
294
+ if include_data_writes {
295
+ reject_non_binary_casts_for_insert_column(&input, "data", "INSERT into lix_file")?;
296
+ }
297
+
298
+ let sink = LixFileInsertSink::new(
299
+ input.schema(),
300
+ write_ctx.clone(),
301
+ self.functions.clone(),
302
+ self.version_binding.clone(),
303
+ include_data_writes,
304
+ );
305
+ Ok(Arc::new(InsertExec::new(input, Arc::new(sink))))
306
+ }
307
+
308
+ async fn delete_from(
309
+ &self,
310
+ state: &dyn Session,
311
+ filters: Vec<Expr>,
312
+ ) -> Result<Arc<dyn ExecutionPlan>> {
313
+ let write_ctx = self.write_access.require_write("DELETE FROM lix_file")?;
314
+
315
+ let df_schema = DFSchema::try_from(Arc::clone(&self.schema))?;
316
+ let filters = canonicalize_filesystem_path_filters(&filters, FilesystemPathKind::File)?;
317
+ let physical_filters = filters
318
+ .iter()
319
+ .map(|expr| create_physical_expr(expr, &df_schema, state.execution_props()))
320
+ .collect::<Result<Vec<_>>>()?;
321
+ let mut request = lix_file_scan_request(self.version_binding.active_version_id(), None);
322
+ if matches!(self.version_binding, VersionBinding::Explicit) {
323
+ request.filter.version_ids = explicit_version_ids_from_dml_filters(&filters);
324
+ if request.filter.version_ids.is_empty() {
325
+ return Err(DataFusionError::Plan(
326
+ "DELETE FROM lix_file_by_version requires an explicit lixcol_version_id predicate"
327
+ .to_string(),
328
+ ));
329
+ }
330
+ }
331
+
332
+ Ok(Arc::new(LixFileDeleteExec::new(
333
+ Arc::clone(&self.blob_reader),
334
+ write_ctx.clone(),
335
+ Arc::clone(&self.schema),
336
+ self.version_binding.clone(),
337
+ request,
338
+ physical_filters,
339
+ )))
340
+ }
341
+
342
+ async fn update(
343
+ &self,
344
+ state: &dyn Session,
345
+ assignments: Vec<(String, Expr)>,
346
+ filters: Vec<Expr>,
347
+ ) -> Result<Arc<dyn ExecutionPlan>> {
348
+ let write_ctx = self.write_access.require_write("UPDATE lix_file")?;
349
+
350
+ validate_lix_file_update_assignments(&self.schema, &assignments)?;
351
+
352
+ let df_schema = DFSchema::try_from(Arc::clone(&self.schema))?;
353
+ let physical_assignments = assignments
354
+ .iter()
355
+ .map(|(column_name, expr)| {
356
+ Ok((
357
+ column_name.clone(),
358
+ create_physical_expr(expr, &df_schema, state.execution_props())?,
359
+ ))
360
+ })
361
+ .collect::<Result<Vec<_>>>()?;
362
+ let filters = canonicalize_filesystem_path_filters(&filters, FilesystemPathKind::File)?;
363
+ let physical_filters = filters
364
+ .iter()
365
+ .map(|expr| create_physical_expr(expr, &df_schema, state.execution_props()))
366
+ .collect::<Result<Vec<_>>>()?;
367
+ let request = lix_file_scan_request(self.version_binding.active_version_id(), None);
368
+
369
+ Ok(Arc::new(LixFileUpdateExec::new(
370
+ Arc::clone(&self.blob_reader),
371
+ write_ctx.clone(),
372
+ Arc::clone(&self.schema),
373
+ self.version_binding.clone(),
374
+ self.functions.clone(),
375
+ request,
376
+ physical_assignments,
377
+ physical_filters,
378
+ )))
379
+ }
380
+ }
381
+
382
+ #[allow(dead_code)]
383
+ struct LixFileInsertSink {
384
+ write_ctx: SqlWriteContext,
385
+ functions: FunctionProviderHandle,
386
+ version_binding: VersionBinding,
387
+ surface_name: &'static str,
388
+ include_data_writes: bool,
389
+ }
390
+
391
+ impl std::fmt::Debug for LixFileInsertSink {
392
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
393
+ f.debug_struct("LixFileInsertSink").finish()
394
+ }
395
+ }
396
+
397
+ impl LixFileInsertSink {
398
+ fn new(
399
+ _schema: SchemaRef,
400
+ write_ctx: SqlWriteContext,
401
+ functions: FunctionProviderHandle,
402
+ version_binding: VersionBinding,
403
+ include_data_writes: bool,
404
+ ) -> Self {
405
+ let surface_name = lix_file_surface_name(&version_binding);
406
+ Self {
407
+ write_ctx,
408
+ functions,
409
+ version_binding,
410
+ surface_name,
411
+ include_data_writes,
412
+ }
413
+ }
414
+ }
415
+
416
+ impl DisplayAs for LixFileInsertSink {
417
+ fn fmt_as(&self, t: DisplayFormatType, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
418
+ match t {
419
+ DisplayFormatType::Default | DisplayFormatType::Verbose => {
420
+ write!(f, "LixFileInsertSink")
421
+ }
422
+ DisplayFormatType::TreeRender => write!(f, "LixFileInsertSink"),
423
+ }
424
+ }
425
+ }
426
+
427
+ #[async_trait]
428
+ impl InsertSink for LixFileInsertSink {
429
+ async fn write_batches(
430
+ &self,
431
+ batches: Vec<RecordBatch>,
432
+ _context: &Arc<TaskContext>,
433
+ ) -> Result<u64> {
434
+ let mut staged = LixFileStagedBatch::default();
435
+ let mut path_resolvers = None;
436
+ for batch in batches {
437
+ if path_resolvers.is_none() {
438
+ path_resolvers = Some(
439
+ file_path_resolvers_from_live_state(
440
+ Arc::new(WriteContextLiveStateReader::new(self.write_ctx.clone())),
441
+ self.version_binding.active_version_id(),
442
+ )
443
+ .await
444
+ .map_err(lix_error_to_datafusion_error)?,
445
+ );
446
+ }
447
+ if record_batch_has_non_null_column(&batch, "path")? {
448
+ staged.extend(lix_file_insert_stage_from_batch_with_path_resolvers(
449
+ &batch,
450
+ self.version_binding.active_version_id(),
451
+ self.surface_name,
452
+ path_resolvers
453
+ .as_mut()
454
+ .expect("path resolver should be initialized"),
455
+ &mut || self.functions.call_uuid_v7(),
456
+ self.include_data_writes,
457
+ )?);
458
+ } else {
459
+ staged.extend(
460
+ lix_file_insert_stage_from_batch_with_id_generator_and_path_resolvers(
461
+ &batch,
462
+ self.version_binding.active_version_id(),
463
+ self.surface_name,
464
+ path_resolvers
465
+ .as_mut()
466
+ .expect("path resolver should be initialized"),
467
+ &mut || self.functions.call_uuid_v7(),
468
+ self.include_data_writes,
469
+ )?,
470
+ );
471
+ }
472
+ }
473
+
474
+ if !staged.state_rows.is_empty() || !staged.file_data_writes.is_empty() {
475
+ let intent = if staged.file_data_writes.is_empty() {
476
+ StageWrite::Rows {
477
+ mode: StageWriteMode::Insert,
478
+ rows: staged.state_rows,
479
+ }
480
+ } else {
481
+ StageWrite::RowsWithFileData {
482
+ mode: StageWriteMode::Insert,
483
+ rows: staged.state_rows,
484
+ file_data: staged.file_data_writes,
485
+ count: staged.count,
486
+ }
487
+ };
488
+ self.write_ctx
489
+ .stage_write(intent)
490
+ .await
491
+ .map_err(lix_error_to_datafusion_error)?;
492
+ }
493
+
494
+ Ok(staged.count)
495
+ }
496
+ }
497
+
498
+ fn lix_file_surface_name(version_binding: &VersionBinding) -> &'static str {
499
+ match version_binding {
500
+ VersionBinding::Active { .. } => "lix_file",
501
+ VersionBinding::Explicit => "lix_file_by_version",
502
+ }
503
+ }
504
+
505
+ #[allow(dead_code)]
506
+ struct LixFileDeleteExec {
507
+ blob_reader: Arc<dyn BlobDataReader>,
508
+ write_ctx: SqlWriteContext,
509
+ table_schema: SchemaRef,
510
+ version_binding: VersionBinding,
511
+ request: LiveStateScanRequest,
512
+ filters: Vec<Arc<dyn PhysicalExpr>>,
513
+ result_schema: SchemaRef,
514
+ properties: Arc<PlanProperties>,
515
+ }
516
+
517
+ impl std::fmt::Debug for LixFileDeleteExec {
518
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
519
+ f.debug_struct("LixFileDeleteExec").finish()
520
+ }
521
+ }
522
+
523
+ impl LixFileDeleteExec {
524
+ fn new(
525
+ blob_reader: Arc<dyn BlobDataReader>,
526
+ write_ctx: SqlWriteContext,
527
+ table_schema: SchemaRef,
528
+ version_binding: VersionBinding,
529
+ request: LiveStateScanRequest,
530
+ filters: Vec<Arc<dyn PhysicalExpr>>,
531
+ ) -> Self {
532
+ let result_schema = dml_count_schema();
533
+ let properties = PlanProperties::new(
534
+ EquivalenceProperties::new(Arc::clone(&result_schema)),
535
+ Partitioning::UnknownPartitioning(1),
536
+ EmissionType::Final,
537
+ Boundedness::Bounded,
538
+ );
539
+ Self {
540
+ blob_reader,
541
+ write_ctx,
542
+ table_schema,
543
+ version_binding,
544
+ request,
545
+ filters,
546
+ result_schema,
547
+ properties: Arc::new(properties),
548
+ }
549
+ }
550
+ }
551
+
552
+ impl DisplayAs for LixFileDeleteExec {
553
+ fn fmt_as(&self, t: DisplayFormatType, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
554
+ match t {
555
+ DisplayFormatType::Default | DisplayFormatType::Verbose => {
556
+ write!(f, "LixFileDeleteExec(filters={})", self.filters.len())
557
+ }
558
+ DisplayFormatType::TreeRender => write!(f, "LixFileDeleteExec"),
559
+ }
560
+ }
561
+ }
562
+
563
+ impl ExecutionPlan for LixFileDeleteExec {
564
+ fn name(&self) -> &str {
565
+ "LixFileDeleteExec"
566
+ }
567
+
568
+ fn as_any(&self) -> &dyn Any {
569
+ self
570
+ }
571
+
572
+ fn properties(&self) -> &Arc<PlanProperties> {
573
+ &self.properties
574
+ }
575
+
576
+ fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
577
+ Vec::new()
578
+ }
579
+
580
+ fn with_new_children(
581
+ self: Arc<Self>,
582
+ children: Vec<Arc<dyn ExecutionPlan>>,
583
+ ) -> Result<Arc<dyn ExecutionPlan>> {
584
+ if !children.is_empty() {
585
+ return Err(DataFusionError::Execution(
586
+ "LixFileDeleteExec does not accept children".to_string(),
587
+ ));
588
+ }
589
+ Ok(self)
590
+ }
591
+
592
+ fn execute(
593
+ &self,
594
+ partition: usize,
595
+ _context: Arc<TaskContext>,
596
+ ) -> Result<SendableRecordBatchStream> {
597
+ if partition != 0 {
598
+ return Err(DataFusionError::Execution(format!(
599
+ "LixFileDeleteExec only exposes one partition, got {partition}"
600
+ )));
601
+ }
602
+
603
+ let blob_reader = Arc::clone(&self.blob_reader);
604
+ let write_ctx = self.write_ctx.clone();
605
+ let table_schema = Arc::clone(&self.table_schema);
606
+ let version_binding = self.version_binding.clone();
607
+ let request = self.request.clone();
608
+ let filters = self.filters.clone();
609
+ let result_schema = Arc::clone(&self.result_schema);
610
+ let stream_schema = Arc::clone(&result_schema);
611
+
612
+ let stream = stream::once(async move {
613
+ let rows = write_ctx
614
+ .scan_live_state(&request)
615
+ .await
616
+ .map_err(lix_error_to_datafusion_error)?;
617
+ let blob_ref_file_ids =
618
+ blob_ref_file_ids_from_live_rows(&rows).map_err(lix_error_to_datafusion_error)?;
619
+ let source_batch = lix_file_record_batch(&table_schema, &blob_reader, rows)
620
+ .await
621
+ .map_err(lix_error_to_datafusion_error)?;
622
+ let matched_batch = filter_lix_file_batch(source_batch, &filters)?;
623
+ let staged = lix_file_delete_stage_from_batch(
624
+ &matched_batch,
625
+ version_binding.active_version_id(),
626
+ &blob_ref_file_ids,
627
+ )?;
628
+ let count = staged.count;
629
+
630
+ if count > 0 {
631
+ write_ctx
632
+ .stage_write(StageWrite::Rows {
633
+ mode: StageWriteMode::Replace,
634
+ rows: staged.state_rows,
635
+ })
636
+ .await
637
+ .map_err(lix_error_to_datafusion_error)?;
638
+ }
639
+
640
+ Ok::<_, DataFusionError>(stream::iter(vec![Ok::<RecordBatch, DataFusionError>(
641
+ dml_count_batch(Arc::clone(&stream_schema), count)?,
642
+ )]))
643
+ })
644
+ .try_flatten();
645
+
646
+ Ok(Box::pin(RecordBatchStreamAdapter::new(
647
+ result_schema,
648
+ stream,
649
+ )))
650
+ }
651
+ }
652
+
653
+ #[allow(dead_code)]
654
+ struct LixFileUpdateExec {
655
+ blob_reader: Arc<dyn BlobDataReader>,
656
+ write_ctx: SqlWriteContext,
657
+ table_schema: SchemaRef,
658
+ version_binding: VersionBinding,
659
+ functions: FunctionProviderHandle,
660
+ request: LiveStateScanRequest,
661
+ assignments: Vec<(String, Arc<dyn PhysicalExpr>)>,
662
+ filters: Vec<Arc<dyn PhysicalExpr>>,
663
+ result_schema: SchemaRef,
664
+ properties: Arc<PlanProperties>,
665
+ }
666
+
667
+ impl std::fmt::Debug for LixFileUpdateExec {
668
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
669
+ f.debug_struct("LixFileUpdateExec").finish()
670
+ }
671
+ }
672
+
673
+ impl LixFileUpdateExec {
674
+ fn new(
675
+ blob_reader: Arc<dyn BlobDataReader>,
676
+ write_ctx: SqlWriteContext,
677
+ table_schema: SchemaRef,
678
+ version_binding: VersionBinding,
679
+ functions: FunctionProviderHandle,
680
+ request: LiveStateScanRequest,
681
+ assignments: Vec<(String, Arc<dyn PhysicalExpr>)>,
682
+ filters: Vec<Arc<dyn PhysicalExpr>>,
683
+ ) -> Self {
684
+ let result_schema = dml_count_schema();
685
+ let properties = PlanProperties::new(
686
+ EquivalenceProperties::new(Arc::clone(&result_schema)),
687
+ Partitioning::UnknownPartitioning(1),
688
+ EmissionType::Final,
689
+ Boundedness::Bounded,
690
+ );
691
+ Self {
692
+ blob_reader,
693
+ write_ctx,
694
+ table_schema,
695
+ version_binding,
696
+ functions,
697
+ request,
698
+ assignments,
699
+ filters,
700
+ result_schema,
701
+ properties: Arc::new(properties),
702
+ }
703
+ }
704
+ }
705
+
706
+ impl DisplayAs for LixFileUpdateExec {
707
+ fn fmt_as(&self, t: DisplayFormatType, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
708
+ match t {
709
+ DisplayFormatType::Default | DisplayFormatType::Verbose => {
710
+ write!(
711
+ f,
712
+ "LixFileUpdateExec(assignments={}, filters={})",
713
+ self.assignments.len(),
714
+ self.filters.len()
715
+ )
716
+ }
717
+ DisplayFormatType::TreeRender => write!(f, "LixFileUpdateExec"),
718
+ }
719
+ }
720
+ }
721
+
722
+ impl ExecutionPlan for LixFileUpdateExec {
723
+ fn name(&self) -> &str {
724
+ "LixFileUpdateExec"
725
+ }
726
+
727
+ fn as_any(&self) -> &dyn Any {
728
+ self
729
+ }
730
+
731
+ fn properties(&self) -> &Arc<PlanProperties> {
732
+ &self.properties
733
+ }
734
+
735
+ fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
736
+ Vec::new()
737
+ }
738
+
739
+ fn with_new_children(
740
+ self: Arc<Self>,
741
+ children: Vec<Arc<dyn ExecutionPlan>>,
742
+ ) -> Result<Arc<dyn ExecutionPlan>> {
743
+ if !children.is_empty() {
744
+ return Err(DataFusionError::Execution(
745
+ "LixFileUpdateExec does not accept children".to_string(),
746
+ ));
747
+ }
748
+ Ok(self)
749
+ }
750
+
751
+ fn execute(
752
+ &self,
753
+ partition: usize,
754
+ _context: Arc<TaskContext>,
755
+ ) -> Result<SendableRecordBatchStream> {
756
+ if partition != 0 {
757
+ return Err(DataFusionError::Execution(format!(
758
+ "LixFileUpdateExec only exposes one partition, got {partition}"
759
+ )));
760
+ }
761
+
762
+ let blob_reader = Arc::clone(&self.blob_reader);
763
+ let write_ctx = self.write_ctx.clone();
764
+ let table_schema = Arc::clone(&self.table_schema);
765
+ let version_binding = self.version_binding.clone();
766
+ let functions = self.functions.clone();
767
+ let request = self.request.clone();
768
+ let assignments = self.assignments.clone();
769
+ let filters = self.filters.clone();
770
+ let result_schema = Arc::clone(&self.result_schema);
771
+ let stream_schema = Arc::clone(&result_schema);
772
+
773
+ let stream = stream::once(async move {
774
+ let rows = write_ctx
775
+ .scan_live_state(&request)
776
+ .await
777
+ .map_err(lix_error_to_datafusion_error)?;
778
+ let source_batch = lix_file_record_batch(&table_schema, &blob_reader, rows)
779
+ .await
780
+ .map_err(lix_error_to_datafusion_error)?;
781
+ let matched_batch = filter_lix_file_batch(source_batch, &filters)?;
782
+ let assignment_values = UpdateAssignmentValues::evaluate(&matched_batch, &assignments)?;
783
+ let update_columns = LixFileUpdateColumns::from_assignments(&assignments);
784
+ let mut path_resolvers = None;
785
+ if update_columns.path || update_columns.descriptor {
786
+ path_resolvers = Some(
787
+ file_path_resolvers_from_live_state(
788
+ Arc::new(WriteContextLiveStateReader::new(write_ctx.clone())),
789
+ version_binding.active_version_id(),
790
+ )
791
+ .await
792
+ .map_err(lix_error_to_datafusion_error)?,
793
+ );
794
+ }
795
+ let staged = lix_file_update_stage_from_batch(
796
+ &matched_batch,
797
+ &assignment_values,
798
+ version_binding.active_version_id(),
799
+ update_columns,
800
+ path_resolvers.as_mut(),
801
+ &mut || functions.call_uuid_v7(),
802
+ )?;
803
+ let count = staged.count;
804
+
805
+ if count > 0 {
806
+ let intent = if staged.file_data_writes.is_empty() {
807
+ StageWrite::Rows {
808
+ mode: StageWriteMode::Replace,
809
+ rows: staged.state_rows,
810
+ }
811
+ } else {
812
+ StageWrite::RowsWithFileData {
813
+ mode: StageWriteMode::Replace,
814
+ rows: staged.state_rows,
815
+ file_data: staged.file_data_writes,
816
+ count,
817
+ }
818
+ };
819
+ write_ctx
820
+ .stage_write(intent)
821
+ .await
822
+ .map_err(lix_error_to_datafusion_error)?;
823
+ }
824
+
825
+ Ok::<_, DataFusionError>(stream::iter(vec![Ok::<RecordBatch, DataFusionError>(
826
+ dml_count_batch(Arc::clone(&stream_schema), count)?,
827
+ )]))
828
+ })
829
+ .try_flatten();
830
+
831
+ Ok(Box::pin(RecordBatchStreamAdapter::new(
832
+ result_schema,
833
+ stream,
834
+ )))
835
+ }
836
+ }
837
+
838
+ struct LixFileScanExec {
839
+ live_state: Arc<dyn LiveStateReader>,
840
+ blob_reader: Arc<dyn BlobDataReader>,
841
+ batch_schema: SchemaRef,
842
+ output_schema: SchemaRef,
843
+ projection: Option<Vec<usize>>,
844
+ request: LiveStateScanRequest,
845
+ filters: Vec<Arc<dyn PhysicalExpr>>,
846
+ limit: Option<usize>,
847
+ properties: Arc<PlanProperties>,
848
+ }
849
+
850
+ impl std::fmt::Debug for LixFileScanExec {
851
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
852
+ f.debug_struct("LixFileScanExec").finish()
853
+ }
854
+ }
855
+
856
+ impl LixFileScanExec {
857
+ fn new(
858
+ live_state: Arc<dyn LiveStateReader>,
859
+ blob_reader: Arc<dyn BlobDataReader>,
860
+ batch_schema: SchemaRef,
861
+ output_schema: SchemaRef,
862
+ projection: Option<Vec<usize>>,
863
+ request: LiveStateScanRequest,
864
+ filters: Vec<Arc<dyn PhysicalExpr>>,
865
+ limit: Option<usize>,
866
+ ) -> Self {
867
+ let properties = PlanProperties::new(
868
+ EquivalenceProperties::new(output_schema.clone()),
869
+ Partitioning::UnknownPartitioning(1),
870
+ EmissionType::Incremental,
871
+ Boundedness::Bounded,
872
+ );
873
+ Self {
874
+ live_state,
875
+ blob_reader,
876
+ batch_schema,
877
+ output_schema,
878
+ projection,
879
+ request,
880
+ filters,
881
+ limit,
882
+ properties: Arc::new(properties),
883
+ }
884
+ }
885
+ }
886
+
887
+ impl DisplayAs for LixFileScanExec {
888
+ fn fmt_as(&self, t: DisplayFormatType, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
889
+ match t {
890
+ DisplayFormatType::Default | DisplayFormatType::Verbose => {
891
+ write!(f, "LixFileScanExec(limit={:?})", self.limit)
892
+ }
893
+ DisplayFormatType::TreeRender => write!(f, "LixFileScanExec"),
894
+ }
895
+ }
896
+ }
897
+
898
+ impl ExecutionPlan for LixFileScanExec {
899
+ fn name(&self) -> &str {
900
+ "LixFileScanExec"
901
+ }
902
+
903
+ fn as_any(&self) -> &dyn Any {
904
+ self
905
+ }
906
+
907
+ fn properties(&self) -> &Arc<PlanProperties> {
908
+ &self.properties
909
+ }
910
+
911
+ fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
912
+ Vec::new()
913
+ }
914
+
915
+ fn with_new_children(
916
+ self: Arc<Self>,
917
+ children: Vec<Arc<dyn ExecutionPlan>>,
918
+ ) -> Result<Arc<dyn ExecutionPlan>> {
919
+ if !children.is_empty() {
920
+ return Err(DataFusionError::Execution(
921
+ "LixFileScanExec does not accept children".to_string(),
922
+ ));
923
+ }
924
+ Ok(self)
925
+ }
926
+
927
+ fn execute(
928
+ &self,
929
+ partition: usize,
930
+ _context: Arc<TaskContext>,
931
+ ) -> Result<SendableRecordBatchStream> {
932
+ if partition != 0 {
933
+ return Err(DataFusionError::Execution(format!(
934
+ "LixFileScanExec only supports partition 0, got {partition}"
935
+ )));
936
+ }
937
+
938
+ let live_state = Arc::clone(&self.live_state);
939
+ let blob_reader = Arc::clone(&self.blob_reader);
940
+ let request = self.request.clone();
941
+ let filters = self.filters.clone();
942
+ let limit = self.limit;
943
+ let output_schema = Arc::clone(&self.output_schema);
944
+ let batch_schema = Arc::clone(&self.batch_schema);
945
+ let projection = self.projection.clone();
946
+ let fut = async move {
947
+ let rows = live_state.scan_rows(&request).await.map_err(|error| {
948
+ DataFusionError::Execution(format!("sql2 lix_file scan failed: {error}"))
949
+ })?;
950
+ let batch = lix_file_record_batch(&batch_schema, &blob_reader, rows)
951
+ .await
952
+ .map_err(|error| {
953
+ DataFusionError::Execution(format!("sql2 lix_file batch build failed: {error}"))
954
+ })?;
955
+ let filtered = filter_lix_file_batch(batch, &filters)?;
956
+ let projected = match projection {
957
+ Some(indices) => filtered.project(&indices).map_err(DataFusionError::from),
958
+ None => Ok(filtered),
959
+ }?;
960
+ match limit {
961
+ Some(limit) => Ok(projected.slice(0, limit.min(projected.num_rows()))),
962
+ None => Ok(projected),
963
+ }
964
+ };
965
+
966
+ Ok(Box::pin(RecordBatchStreamAdapter::new(
967
+ output_schema,
968
+ stream::once(fut).map_ok(|batch| batch),
969
+ )))
970
+ }
971
+ }
972
+
973
+ #[derive(Debug, Clone)]
974
+ struct FileDescriptorRecord {
975
+ id: String,
976
+ directory_id: Option<String>,
977
+ name: String,
978
+ hidden: bool,
979
+ live: LiveStateRow,
980
+ }
981
+
982
/// The blob-reference data retained per file while building a `lix_file` batch.
#[derive(Debug, Clone)]
struct BlobRefRecord {
    /// Hash identifying the referenced blob.
    blob_hash: String,
}
986
+
987
/// A directory descriptor collected while building a `lix_file` batch.
#[derive(Debug, Clone)]
struct DirectoryDescriptorRecord {
    /// Directory id.
    id: String,
    /// Id of the parent directory, if any.
    parent_id: Option<String>,
    /// Directory name.
    name: String,
    /// Version the descriptor belongs to.
    version_id: String,
}
994
+
995
+ #[derive(Debug, Deserialize)]
996
+ struct FileDescriptorSnapshot {
997
+ id: String,
998
+ directory_id: Option<String>,
999
+ name: String,
1000
+ hidden: bool,
1001
+ }
1002
+
1003
+ #[derive(Debug, Deserialize)]
1004
+ struct BlobRefSnapshot {
1005
+ id: String,
1006
+ blob_hash: String,
1007
+ }
1008
+
1009
+ #[derive(Debug, Deserialize)]
1010
+ struct DirectoryDescriptorSnapshot {
1011
+ id: String,
1012
+ parent_id: Option<String>,
1013
+ name: String,
1014
+ }
1015
+
1016
+ #[derive(Debug, Default)]
1017
+ struct LixFileStagedBatch {
1018
+ state_rows: Vec<StageRow>,
1019
+ file_data_writes: Vec<StageFileData>,
1020
+ count: u64,
1021
+ }
1022
+
1023
+ impl LixFileStagedBatch {
1024
+ fn extend(&mut self, other: LixFileStagedBatch) {
1025
+ self.state_rows.extend(other.state_rows);
1026
+ self.file_data_writes.extend(other.file_data_writes);
1027
+ self.count += other.count;
1028
+ }
1029
+
1030
+ fn extend_filesystem_plan(&mut self, plan: super::filesystem_planner::FilesystemWritePlan) {
1031
+ self.state_rows.extend(plan.rows);
1032
+ self.file_data_writes.extend(plan.file_data);
1033
+ self.count += plan.count;
1034
+ }
1035
+
1036
+ fn extend_filesystem_delete_plan(&mut self, plan: FilesystemDeletePlan) {
1037
+ self.state_rows.extend(plan.rows);
1038
+ self.count += plan.count;
1039
+ }
1040
+ }
1041
+
1042
/// Test helper: stages `batch` as an insert and returns only the state rows.
#[cfg(test)]
fn lix_file_write_rows_from_batch(
    batch: &RecordBatch,
    version_binding: Option<&str>,
) -> Result<Vec<StageRow>> {
    let staged = lix_file_insert_stage_from_batch(batch, version_binding)?;
    Ok(staged.state_rows)
}
1049
+
1050
+ fn lix_file_delete_stage_from_batch(
1051
+ batch: &RecordBatch,
1052
+ version_binding: Option<&str>,
1053
+ blob_ref_file_ids: &BTreeSet<String>,
1054
+ ) -> Result<LixFileStagedBatch> {
1055
+ let mut staged = LixFileStagedBatch::default();
1056
+ for row_index in 0..batch.num_rows() {
1057
+ let file_id = required_string_value(batch, row_index, "id")?;
1058
+ let context = file_row_context_from_batch(batch, row_index, version_binding)?;
1059
+ staged.extend_filesystem_delete_plan(plan_file_delete(FileDeleteInput {
1060
+ file_id: file_id.clone(),
1061
+ has_blob_ref: blob_ref_file_ids.contains(&file_id),
1062
+ context,
1063
+ }));
1064
+ }
1065
+ Ok(staged)
1066
+ }
1067
+
1068
+ fn blob_ref_file_ids_from_live_rows(
1069
+ rows: &[LiveStateRow],
1070
+ ) -> std::result::Result<BTreeSet<String>, LixError> {
1071
+ let mut file_ids = BTreeSet::new();
1072
+ for row in rows {
1073
+ if row.schema_key != BLOB_REF_SCHEMA_KEY {
1074
+ continue;
1075
+ }
1076
+ let Some(snapshot_content) = row.snapshot_content.as_deref() else {
1077
+ continue;
1078
+ };
1079
+ let snapshot: BlobRefSnapshot =
1080
+ serde_json::from_str(snapshot_content).map_err(|error| {
1081
+ LixError::new(
1082
+ "LIX_ERROR_UNKNOWN",
1083
+ format!("invalid lix_binary_blob_ref snapshot JSON: {error}"),
1084
+ )
1085
+ })?;
1086
+ file_ids.insert(snapshot.id);
1087
+ }
1088
+ Ok(file_ids)
1089
+ }
1090
+
1091
/// Test helper: stages `batch` as an insert into the `lix_file` surface with
/// read-only field rejection, descriptor writes and data writes all enabled.
#[cfg(test)]
fn lix_file_insert_stage_from_batch(
    batch: &RecordBatch,
    version_binding: Option<&str>,
) -> Result<LixFileStagedBatch> {
    let reject_read_only_fields = true;
    let include_descriptor_writes = true;
    let include_data_writes = true;
    lix_file_stage_from_batch_with_options(
        batch,
        version_binding,
        "lix_file",
        reject_read_only_fields,
        include_descriptor_writes,
        include_data_writes,
    )
}
1098
+
1099
+ fn lix_file_insert_stage_from_batch_with_id_generator_and_path_resolvers(
1100
+ batch: &RecordBatch,
1101
+ version_binding: Option<&str>,
1102
+ surface_name: &str,
1103
+ path_resolvers: &mut BTreeMap<String, DirectoryPathResolver>,
1104
+ generate_id: &mut dyn FnMut() -> String,
1105
+ include_data_writes: bool,
1106
+ ) -> Result<LixFileStagedBatch> {
1107
+ lix_file_stage_from_batch_with_options_and_path_resolvers(
1108
+ batch,
1109
+ version_binding,
1110
+ surface_name,
1111
+ true,
1112
+ true,
1113
+ include_data_writes,
1114
+ Some(path_resolvers),
1115
+ Some(generate_id),
1116
+ )
1117
+ }
1118
+
1119
+ fn lix_file_insert_stage_from_batch_with_path_resolvers(
1120
+ batch: &RecordBatch,
1121
+ version_binding: Option<&str>,
1122
+ surface_name: &str,
1123
+ path_resolvers: &mut BTreeMap<String, DirectoryPathResolver>,
1124
+ generate_directory_id: &mut dyn FnMut() -> String,
1125
+ include_data_writes: bool,
1126
+ ) -> Result<LixFileStagedBatch> {
1127
+ lix_file_stage_from_batch_with_options_and_path_resolvers(
1128
+ batch,
1129
+ version_binding,
1130
+ surface_name,
1131
+ true,
1132
+ true,
1133
+ include_data_writes,
1134
+ Some(path_resolvers),
1135
+ Some(generate_directory_id),
1136
+ )
1137
+ }
1138
+
1139
+ fn lix_file_existing_update_stage_from_batch(
1140
+ batch: &RecordBatch,
1141
+ assignment_values: &UpdateAssignmentValues,
1142
+ version_binding: Option<&str>,
1143
+ include_descriptor_writes: bool,
1144
+ include_data_writes: bool,
1145
+ path_resolvers: Option<&mut BTreeMap<String, DirectoryPathResolver>>,
1146
+ ) -> Result<LixFileStagedBatch> {
1147
+ let mut staged = LixFileStagedBatch::default();
1148
+ let mut path_resolvers = path_resolvers;
1149
+
1150
+ for row_index in 0..batch.num_rows() {
1151
+ let id = required_string_value(batch, row_index, "id")?;
1152
+ let hidden = update_optional_bool_value(batch, assignment_values, row_index, "hidden")?
1153
+ .unwrap_or(false);
1154
+ let context =
1155
+ file_row_context_from_update(batch, assignment_values, row_index, version_binding)?;
1156
+
1157
+ if include_descriptor_writes {
1158
+ let directory_id =
1159
+ update_optional_string_value(batch, assignment_values, row_index, "directory_id")?;
1160
+ let name = update_required_string_value(batch, assignment_values, row_index, "name")?;
1161
+ if let Some(path_resolvers) = path_resolvers.as_deref_mut() {
1162
+ let resolver = path_resolvers
1163
+ .entry(file_path_resolver_key(&context))
1164
+ .or_insert_with(DirectoryPathResolver::default);
1165
+ resolver
1166
+ .reserve_file(directory_id.clone(), name.clone(), id.clone())
1167
+ .map_err(lix_error_to_datafusion_error)?;
1168
+ }
1169
+ staged
1170
+ .state_rows
1171
+ .push(file_descriptor_row(FileDescriptorRowInput {
1172
+ id: id.clone(),
1173
+ directory_id,
1174
+ name,
1175
+ hidden,
1176
+ context: context.clone(),
1177
+ }));
1178
+ }
1179
+
1180
+ if include_data_writes {
1181
+ let data = update_required_binary_value(batch, assignment_values, row_index, "data")?;
1182
+ stage_lix_file_data_write(&mut staged, id, data, context, None)?;
1183
+ }
1184
+
1185
+ staged.count = staged
1186
+ .count
1187
+ .checked_add(1)
1188
+ .ok_or_else(|| DataFusionError::Execution("lix_file row count overflow".into()))?;
1189
+ }
1190
+
1191
+ Ok(staged)
1192
+ }
1193
+
1194
/// Which kinds of `lix_file` columns an `UPDATE` assigns.
#[derive(Debug, Clone, Copy)]
struct LixFileUpdateColumns {
    /// The `path` column is assigned.
    path: bool,
    /// The `data` column is assigned.
    data: bool,
    /// At least one column other than `path` and `data` is assigned.
    descriptor: bool,
}
1200
+
1201
+ impl LixFileUpdateColumns {
1202
+ fn from_assignments(assignments: &[(String, Arc<dyn PhysicalExpr>)]) -> Self {
1203
+ let path = assignments
1204
+ .iter()
1205
+ .any(|(column_name, _)| column_name == "path");
1206
+ let data = assignments
1207
+ .iter()
1208
+ .any(|(column_name, _)| column_name == "data");
1209
+ let descriptor = assignments
1210
+ .iter()
1211
+ .any(|(column_name, _)| column_name != "path" && column_name != "data");
1212
+ Self {
1213
+ path,
1214
+ data,
1215
+ descriptor,
1216
+ }
1217
+ }
1218
+ }
1219
+
1220
+ fn lix_file_update_stage_from_batch(
1221
+ batch: &RecordBatch,
1222
+ assignment_values: &UpdateAssignmentValues,
1223
+ version_binding: Option<&str>,
1224
+ update_columns: LixFileUpdateColumns,
1225
+ path_resolvers: Option<&mut BTreeMap<String, DirectoryPathResolver>>,
1226
+ generate_directory_id: &mut dyn FnMut() -> String,
1227
+ ) -> Result<LixFileStagedBatch> {
1228
+ if update_columns.path || update_columns.descriptor {
1229
+ let Some(path_resolvers) = path_resolvers else {
1230
+ return Err(DataFusionError::Execution(
1231
+ "UPDATE lix_file requires filesystem path resolver".to_string(),
1232
+ ));
1233
+ };
1234
+ return if update_columns.path {
1235
+ lix_file_path_update_stage_from_batch(
1236
+ batch,
1237
+ assignment_values,
1238
+ version_binding,
1239
+ update_columns,
1240
+ path_resolvers,
1241
+ generate_directory_id,
1242
+ )
1243
+ } else {
1244
+ lix_file_existing_update_stage_from_batch(
1245
+ batch,
1246
+ assignment_values,
1247
+ version_binding,
1248
+ update_columns.descriptor,
1249
+ update_columns.data,
1250
+ Some(path_resolvers),
1251
+ )
1252
+ };
1253
+ }
1254
+
1255
+ lix_file_existing_update_stage_from_batch(
1256
+ batch,
1257
+ assignment_values,
1258
+ version_binding,
1259
+ update_columns.descriptor,
1260
+ update_columns.data,
1261
+ None,
1262
+ )
1263
+ }
1264
+
1265
+ fn lix_file_path_update_stage_from_batch(
1266
+ batch: &RecordBatch,
1267
+ assignment_values: &UpdateAssignmentValues,
1268
+ version_binding: Option<&str>,
1269
+ update_columns: LixFileUpdateColumns,
1270
+ path_resolvers: &mut BTreeMap<String, DirectoryPathResolver>,
1271
+ generate_directory_id: &mut dyn FnMut() -> String,
1272
+ ) -> Result<LixFileStagedBatch> {
1273
+ let mut staged = LixFileStagedBatch::default();
1274
+
1275
+ for row_index in 0..batch.num_rows() {
1276
+ let id = required_string_value(batch, row_index, "id")?;
1277
+ let path = update_required_string_value(batch, assignment_values, row_index, "path")?;
1278
+ let hidden = update_optional_bool_value(batch, assignment_values, row_index, "hidden")?
1279
+ .unwrap_or(false);
1280
+ let context =
1281
+ file_row_context_from_update(batch, assignment_values, row_index, version_binding)?;
1282
+ let assigned_data = if update_columns.data {
1283
+ Some(update_required_binary_value(
1284
+ batch,
1285
+ assignment_values,
1286
+ row_index,
1287
+ "data",
1288
+ )?)
1289
+ } else {
1290
+ None
1291
+ };
1292
+
1293
+ let resolver = path_resolvers
1294
+ .entry(file_path_resolver_key(&context))
1295
+ .or_insert_with(DirectoryPathResolver::default);
1296
+ let plan = plan_file_path_update(
1297
+ resolver,
1298
+ id.clone(),
1299
+ path,
1300
+ hidden,
1301
+ None,
1302
+ context.clone(),
1303
+ generate_directory_id,
1304
+ )
1305
+ .map_err(lix_error_to_datafusion_error)?;
1306
+ staged.extend_filesystem_plan(plan);
1307
+
1308
+ if let Some(data) = assigned_data {
1309
+ stage_lix_file_data_write(&mut staged, id, data, context, None)?;
1310
+ }
1311
+ }
1312
+
1313
+ Ok(staged)
1314
+ }
1315
+
1316
/// Test helper: stages `batch` with the given options but without path
/// resolvers or an id generator.
#[cfg(test)]
fn lix_file_stage_from_batch_with_options(
    batch: &RecordBatch,
    version_binding: Option<&str>,
    surface_name: &str,
    reject_read_only_fields: bool,
    include_descriptor_writes: bool,
    include_data_writes: bool,
) -> Result<LixFileStagedBatch> {
    let no_path_resolvers = None;
    let no_id_generator = None;
    lix_file_stage_from_batch_with_options_and_path_resolvers(
        batch,
        version_binding,
        surface_name,
        reject_read_only_fields,
        include_descriptor_writes,
        include_data_writes,
        no_path_resolvers,
        no_id_generator,
    )
}
1336
+
1337
+ fn lix_file_stage_from_batch_with_options_and_path_resolvers(
1338
+ batch: &RecordBatch,
1339
+ version_binding: Option<&str>,
1340
+ surface_name: &str,
1341
+ reject_read_only_fields: bool,
1342
+ include_descriptor_writes: bool,
1343
+ include_data_writes: bool,
1344
+ mut path_resolvers: Option<&mut BTreeMap<String, DirectoryPathResolver>>,
1345
+ mut generate_directory_id: Option<&mut dyn FnMut() -> String>,
1346
+ ) -> Result<LixFileStagedBatch> {
1347
+ let mut staged = LixFileStagedBatch::default();
1348
+
1349
+ for row_index in 0..batch.num_rows() {
1350
+ if reject_read_only_fields {
1351
+ reject_read_only_lix_file_insert_field(batch, row_index, "lixcol_entity_id")?;
1352
+ reject_read_only_lix_file_insert_field(batch, row_index, "lixcol_schema_key")?;
1353
+ reject_read_only_lix_file_insert_field(batch, row_index, "lixcol_change_id")?;
1354
+ reject_read_only_lix_file_insert_field(batch, row_index, "lixcol_created_at")?;
1355
+ reject_read_only_lix_file_insert_field(batch, row_index, "lixcol_updated_at")?;
1356
+ reject_read_only_lix_file_insert_field(batch, row_index, "lixcol_commit_id")?;
1357
+ }
1358
+
1359
+ let path = optional_string_value(batch, row_index, "path")?;
1360
+ let id = optional_string_value(batch, row_index, "id")?;
1361
+ let hidden = optional_bool_value(batch, row_index, "hidden")?;
1362
+ let context = file_row_context_from_batch(batch, row_index, version_binding)?;
1363
+ let data = if include_data_writes {
1364
+ insert_optional_binary_value(batch, row_index, "data")?
1365
+ } else {
1366
+ None
1367
+ };
1368
+
1369
+ if let Some(path) = path {
1370
+ reject_read_only_lix_file_insert_field(batch, row_index, "directory_id")?;
1371
+ reject_read_only_lix_file_insert_field(batch, row_index, "name")?;
1372
+
1373
+ let Some(path_resolvers) = path_resolvers.as_deref_mut() else {
1374
+ return Err(DataFusionError::Execution(
1375
+ "INSERT into lix_file with path requires directory path resolver".to_string(),
1376
+ ));
1377
+ };
1378
+ let resolver = path_resolvers
1379
+ .entry(file_path_resolver_key(&context))
1380
+ .or_insert_with(DirectoryPathResolver::default);
1381
+ let Some(generate_directory_id) = generate_directory_id.as_deref_mut() else {
1382
+ return Err(DataFusionError::Execution(
1383
+ "INSERT into lix_file with path requires directory id generator".to_string(),
1384
+ ));
1385
+ };
1386
+ let file_id = id.unwrap_or_else(|| generate_directory_id());
1387
+ let mut plan = super::filesystem_planner::plan_file_path_write(
1388
+ resolver,
1389
+ FilePathWriteInput {
1390
+ id: Some(file_id.clone()),
1391
+ path,
1392
+ data,
1393
+ hidden,
1394
+ context,
1395
+ },
1396
+ generate_directory_id,
1397
+ )
1398
+ .map_err(lix_error_to_datafusion_error)?;
1399
+ attach_lix_file_insert_origin(&mut plan.rows, surface_name, &file_id);
1400
+ staged.extend_filesystem_plan(plan);
1401
+ continue;
1402
+ }
1403
+
1404
+ let directory_id = optional_string_value(batch, row_index, "directory_id")?;
1405
+ let name = required_string_value(batch, row_index, "name")?;
1406
+
1407
+ let id = if data.is_some() {
1408
+ match id {
1409
+ Some(id) => Some(id),
1410
+ None => {
1411
+ let Some(generate_id) = generate_directory_id.as_deref_mut() else {
1412
+ return Err(DataFusionError::Execution(
1413
+ "INSERT into lix_file with data requires id generator".to_string(),
1414
+ ));
1415
+ };
1416
+ Some(generate_id())
1417
+ }
1418
+ }
1419
+ } else {
1420
+ id
1421
+ };
1422
+
1423
+ if include_descriptor_writes {
1424
+ if let Some(path_resolvers) = path_resolvers.as_deref_mut() {
1425
+ if let Some(file_id) = id.as_ref() {
1426
+ let resolver = path_resolvers
1427
+ .entry(file_path_resolver_key(&context))
1428
+ .or_insert_with(DirectoryPathResolver::default);
1429
+ resolver
1430
+ .reserve_file(directory_id.clone(), name.clone(), file_id.clone())
1431
+ .map_err(lix_error_to_datafusion_error)?;
1432
+ }
1433
+ }
1434
+ let mut row = file_descriptor_write_row(FileDescriptorWriteIntent {
1435
+ id: id.clone(),
1436
+ directory_id: directory_id.clone(),
1437
+ name: name.clone(),
1438
+ hidden,
1439
+ context: context.clone(),
1440
+ });
1441
+ if let Some(file_id) = id.as_ref() {
1442
+ row.origin = Some(lix_file_insert_origin(surface_name, file_id));
1443
+ }
1444
+ staged.state_rows.push(row);
1445
+ }
1446
+
1447
+ if let (Some(id), Some(data)) = (id, data) {
1448
+ let origin = Some(lix_file_insert_origin(surface_name, &id));
1449
+ stage_lix_file_data_write(&mut staged, id, data, context, origin)?;
1450
+ }
1451
+ staged.count = staged
1452
+ .count
1453
+ .checked_add(1)
1454
+ .ok_or_else(|| DataFusionError::Execution("lix_file row count overflow".into()))?;
1455
+ }
1456
+
1457
+ Ok(staged)
1458
+ }
1459
+
1460
+ fn stage_lix_file_data_write(
1461
+ staged: &mut LixFileStagedBatch,
1462
+ file_id: String,
1463
+ data: Vec<u8>,
1464
+ context: FilesystemRowContext,
1465
+ origin: Option<StageRowOrigin>,
1466
+ ) -> Result<()> {
1467
+ let mut row = blob_ref_row(BlobRefRowInput {
1468
+ file_id: file_id.clone(),
1469
+ data: data.clone(),
1470
+ context: FilesystemRowContext {
1471
+ file_id: None,
1472
+ metadata: None,
1473
+ ..context.clone()
1474
+ },
1475
+ })
1476
+ .map_err(lix_error_to_datafusion_error)?;
1477
+ row.origin = origin;
1478
+ staged.state_rows.push(row);
1479
+ staged.file_data_writes.push(StageFileData {
1480
+ file_id,
1481
+ version_id: context.version_id,
1482
+ untracked: context.untracked,
1483
+ data,
1484
+ });
1485
+ Ok(())
1486
+ }
1487
+
1488
+ fn attach_lix_file_insert_origin(rows: &mut [StageRow], surface_name: &str, file_id: &str) {
1489
+ let origin = lix_file_insert_origin(surface_name, file_id);
1490
+ for row in rows {
1491
+ if row.schema_key == FILE_DESCRIPTOR_SCHEMA_KEY || row.schema_key == BLOB_REF_SCHEMA_KEY {
1492
+ row.origin = Some(origin.clone());
1493
+ }
1494
+ }
1495
+ }
1496
+
1497
+ fn lix_file_insert_origin(surface_name: &str, file_id: &str) -> StageRowOrigin {
1498
+ StageRowOrigin {
1499
+ surface: surface_name.to_string(),
1500
+ operation: StageWriteOperation::Insert,
1501
+ primary_key: Some(LogicalPrimaryKey {
1502
+ columns: vec!["id".to_string()],
1503
+ values: vec![file_id.to_string()],
1504
+ }),
1505
+ }
1506
+ }
1507
+
1508
+ fn file_row_context_from_batch(
1509
+ batch: &RecordBatch,
1510
+ row_index: usize,
1511
+ version_binding: Option<&str>,
1512
+ ) -> Result<FilesystemRowContext> {
1513
+ let explicit_version_id = optional_string_value(batch, row_index, "lixcol_version_id")?;
1514
+ let scope = resolve_write_version_scope(
1515
+ optional_bool_value(batch, row_index, "lixcol_global")?,
1516
+ explicit_version_id,
1517
+ version_binding,
1518
+ "INSERT into lix_file_by_version",
1519
+ "lix_file",
1520
+ )?;
1521
+
1522
+ Ok(FilesystemRowContext {
1523
+ version_id: scope.version_id,
1524
+ global: scope.global,
1525
+ untracked: optional_bool_value(batch, row_index, "lixcol_untracked")?.unwrap_or(false),
1526
+ file_id: optional_string_value(batch, row_index, "lixcol_file_id")?,
1527
+ metadata: optional_metadata_value(batch, row_index, "lixcol_metadata", "lix_file")?,
1528
+ })
1529
+ }
1530
+
1531
+ fn file_row_context_from_update(
1532
+ batch: &RecordBatch,
1533
+ assignment_values: &UpdateAssignmentValues,
1534
+ row_index: usize,
1535
+ version_binding: Option<&str>,
1536
+ ) -> Result<FilesystemRowContext> {
1537
+ let explicit_version_id = optional_string_value(batch, row_index, "lixcol_version_id")?;
1538
+ let scope = resolve_write_version_scope(
1539
+ optional_bool_value(batch, row_index, "lixcol_global")?,
1540
+ explicit_version_id,
1541
+ version_binding,
1542
+ "UPDATE into lix_file_by_version",
1543
+ "lix_file",
1544
+ )?;
1545
+
1546
+ Ok(FilesystemRowContext {
1547
+ version_id: scope.version_id,
1548
+ global: scope.global,
1549
+ untracked: optional_bool_value(batch, row_index, "lixcol_untracked")?.unwrap_or(false),
1550
+ file_id: optional_string_value(batch, row_index, "lixcol_file_id")?,
1551
+ metadata: update_optional_metadata_value(
1552
+ batch,
1553
+ assignment_values,
1554
+ row_index,
1555
+ "lixcol_metadata",
1556
+ "lix_file",
1557
+ )?,
1558
+ })
1559
+ }
1560
+
1561
+ fn file_path_resolver_key(context: &FilesystemRowContext) -> String {
1562
+ filesystem_storage_scope_key(
1563
+ &context.version_id,
1564
+ context.global,
1565
+ context.untracked,
1566
+ context.file_id.as_deref(),
1567
+ )
1568
+ }
1569
+
1570
+ async fn file_path_resolvers_from_live_state(
1571
+ live_state: Arc<dyn LiveStateReader>,
1572
+ version_binding: Option<&str>,
1573
+ ) -> std::result::Result<BTreeMap<String, DirectoryPathResolver>, LixError> {
1574
+ let rows = live_state
1575
+ .scan_rows(&LiveStateScanRequest {
1576
+ filter: LiveStateFilter {
1577
+ schema_keys: vec![
1578
+ DIRECTORY_DESCRIPTOR_SCHEMA_KEY.to_string(),
1579
+ FILE_DESCRIPTOR_SCHEMA_KEY.to_string(),
1580
+ ],
1581
+ version_ids: version_binding
1582
+ .map(|version_id| vec![version_id.to_string()])
1583
+ .unwrap_or_default(),
1584
+ ..Default::default()
1585
+ },
1586
+ ..Default::default()
1587
+ })
1588
+ .await?;
1589
+ let mut resolvers = directory_path_resolvers_from_state_rows(rows)?;
1590
+ if let Some(version_id) = version_binding {
1591
+ let key = filesystem_storage_scope_key(version_id, false, false, None);
1592
+ resolvers
1593
+ .entry(key)
1594
+ .or_insert_with(DirectoryPathResolver::default);
1595
+ }
1596
+ Ok(resolvers)
1597
+ }
1598
+
1599
/// Builds the Arrow `RecordBatch` for the `lix_file` provider from live-state
/// rows.
///
/// `rows` may mix file descriptor, blob reference and directory descriptor
/// rows; rows with any other schema key, or without snapshot content, are
/// ignored. One output row is produced per file descriptor, with `path`
/// derived from the directory tree of the same version. Blob bytes are loaded
/// through `blob_reader` only when `schema` contains a `data` column.
///
/// # Errors
/// Returns an error when a snapshot is not valid JSON for its record type,
/// when a file references a directory id with no derived path in its version,
/// when `schema` contains a column this function does not know, or when
/// directory-path derivation, blob loading or batch construction fails.
async fn lix_file_record_batch(
    schema: &SchemaRef,
    blob_reader: &Arc<dyn BlobDataReader>,
    rows: Vec<LiveStateRow>,
) -> Result<RecordBatch, LixError> {
    let projected_columns = schema
        .fields()
        .iter()
        .map(|field| field.name().as_str())
        .collect::<Vec<_>>();
    // Blob contents are only fetched when the projection asks for them.
    let needs_data = projected_columns
        .iter()
        .any(|column_name| *column_name == "data");

    // File and blob records are keyed by (version_id, snapshot id).
    let mut file_rows = BTreeMap::<(String, String), FileDescriptorRecord>::new();
    let mut blob_rows = BTreeMap::<(String, String), BlobRefRecord>::new();
    let mut directory_rows = Vec::<DirectoryDescriptorRecord>::new();

    // Pass 1: decode each live-state row into its typed record.
    for row in rows {
        match row.schema_key.as_str() {
            FILE_DESCRIPTOR_SCHEMA_KEY => {
                // Rows without snapshot content are skipped.
                let Some(snapshot_content) = row.snapshot_content.as_deref() else {
                    continue;
                };
                let snapshot: FileDescriptorSnapshot = serde_json::from_str(snapshot_content)
                    .map_err(|error| {
                        LixError::new(
                            "LIX_ERROR_UNKNOWN",
                            format!("invalid lix_file_descriptor snapshot JSON: {error}"),
                        )
                    })?;
                file_rows.insert(
                    (row.version_id.clone(), snapshot.id.clone()),
                    FileDescriptorRecord {
                        id: snapshot.id,
                        directory_id: snapshot.directory_id,
                        name: snapshot.name,
                        hidden: snapshot.hidden,
                        // The whole live row is kept to populate the lixcol_* columns.
                        live: row,
                    },
                );
            }
            BLOB_REF_SCHEMA_KEY => {
                let Some(snapshot_content) = row.snapshot_content.as_deref() else {
                    continue;
                };
                let snapshot: BlobRefSnapshot =
                    serde_json::from_str(snapshot_content).map_err(|error| {
                        LixError::new(
                            "LIX_ERROR_UNKNOWN",
                            format!("invalid lix_binary_blob_ref snapshot JSON: {error}"),
                        )
                    })?;
                blob_rows.insert(
                    (row.version_id.clone(), snapshot.id.clone()),
                    BlobRefRecord {
                        blob_hash: snapshot.blob_hash,
                    },
                );
            }
            DIRECTORY_DESCRIPTOR_SCHEMA_KEY => {
                let Some(snapshot_content) = row.snapshot_content.as_deref() else {
                    continue;
                };
                let snapshot: DirectoryDescriptorSnapshot = serde_json::from_str(snapshot_content)
                    .map_err(|error| {
                        LixError::new(
                            "LIX_ERROR_UNKNOWN",
                            format!("invalid lix_directory_descriptor snapshot JSON: {error}"),
                        )
                    })?;
                directory_rows.push(DirectoryDescriptorRecord {
                    id: snapshot.id,
                    parent_id: snapshot.parent_id,
                    name: snapshot.name,
                    version_id: row.version_id,
                });
            }
            // Any other schema key is not part of the lix_file view.
            _ => {}
        }
    }

    // Pass 2: resolve directory paths, then fill one value vector per column.
    let directory_paths = derive_directory_paths(&directory_rows)?;
    let mut ids = Vec::new();
    let mut paths = Vec::new();
    let mut directory_ids = Vec::new();
    let mut names = Vec::new();
    let mut hiddens = Vec::new();
    let mut data_values = Vec::new();
    let mut entity_ids = Vec::new();
    let mut schema_keys = Vec::new();
    let mut file_ids = Vec::new();
    let mut schema_versions = Vec::new();
    let mut globals = Vec::new();
    let mut change_ids = Vec::new();
    let mut created_ats = Vec::new();
    let mut updated_ats = Vec::new();
    let mut commit_ids = Vec::new();
    let mut untracked_values = Vec::new();
    let mut metadata_values = Vec::new();
    let mut version_ids = Vec::new();

    for ((version_id, _), file) in file_rows {
        let directory_path = match file.directory_id.as_ref() {
            Some(directory_id) => {
                let key = (version_id.clone(), directory_id.clone());
                // A file pointing at a directory with no derived path is reported
                // as a foreign-key violation.
                let Some(path) = directory_paths.get(&key).cloned() else {
                    return Err(LixError::new(
                        LixError::CODE_FOREIGN_KEY,
                        format!(
                            "lix_file_descriptor '{}' references missing directory_id '{}' in version '{}'",
                            file.id, directory_id, version_id
                        ),
                    ));
                };
                Some(path)
            }
            None => None,
        };
        // The file name is appended directly to the directory path; files
        // without a directory live at the root.
        let path = match directory_path {
            Some(directory_path) => format!("{directory_path}{}", file.name),
            None => format!("/{}", file.name),
        };
        // Blobs are loaded one file at a time, and only when `data` is projected.
        let data = if needs_data {
            match blob_rows.get(&(version_id.clone(), file.id.clone())) {
                Some(blob_ref) => load_single_blob_bytes(blob_reader, &blob_ref.blob_hash).await?,
                None => None,
            }
        } else {
            None
        };

        ids.push(Some(file.id));
        paths.push(Some(path));
        directory_ids.push(file.directory_id);
        names.push(Some(file.name));
        hiddens.push(Some(file.hidden));
        data_values.push(data);
        entity_ids.push(Some(file.live.entity_id.as_string()?));
        schema_keys.push(Some(file.live.schema_key));
        file_ids.push(file.live.file_id);
        schema_versions.push(file.live.schema_version);
        globals.push(Some(file.live.global));
        change_ids.push(file.live.change_id);
        created_ats.push(file.live.created_at);
        updated_ats.push(file.live.updated_at);
        commit_ids.push(file.live.commit_id);
        untracked_values.push(Some(file.live.untracked));
        metadata_values.push(file.live.metadata.as_ref().map(serialize_row_metadata));
        version_ids.push(Some(version_id));
    }

    // Pass 3: materialise the Arrow arrays in the order the schema requests.
    let mut columns = Vec::<ArrayRef>::with_capacity(schema.fields().len());
    for field in schema.fields() {
        let array: ArrayRef = match field.name().as_str() {
            "id" => Arc::new(StringArray::from(ids.clone())),
            "path" => Arc::new(StringArray::from(paths.clone())),
            "directory_id" => Arc::new(StringArray::from(directory_ids.clone())),
            "name" => Arc::new(StringArray::from(names.clone())),
            "hidden" => Arc::new(BooleanArray::from(hiddens.clone())),
            "data" => Arc::new(BinaryArray::from(
                data_values
                    .iter()
                    .map(|value| value.as_deref())
                    .collect::<Vec<_>>(),
            )),
            "lixcol_entity_id" => Arc::new(StringArray::from(entity_ids.clone())),
            "lixcol_schema_key" => Arc::new(StringArray::from(schema_keys.clone())),
            "lixcol_file_id" => Arc::new(StringArray::from(file_ids.clone())),
            "lixcol_schema_version" => Arc::new(StringArray::from(schema_versions.clone())),
            "lixcol_global" => Arc::new(BooleanArray::from(globals.clone())),
            "lixcol_change_id" => Arc::new(StringArray::from(change_ids.clone())),
            "lixcol_created_at" => Arc::new(StringArray::from(created_ats.clone())),
            "lixcol_updated_at" => Arc::new(StringArray::from(updated_ats.clone())),
            "lixcol_commit_id" => Arc::new(StringArray::from(commit_ids.clone())),
            "lixcol_untracked" => Arc::new(BooleanArray::from(untracked_values.clone())),
            "lixcol_metadata" => Arc::new(StringArray::from(metadata_values.clone())),
            "lixcol_version_id" => Arc::new(StringArray::from(version_ids.clone())),
            other => {
                return Err(LixError::new(
                    "LIX_ERROR_UNKNOWN",
                    format!("sql2 lix_file provider does not support projected column '{other}'"),
                ))
            }
        };
        columns.push(array);
    }

    // The row count is passed explicitly so a schema with zero columns still
    // yields a batch with the right number of rows.
    let options = RecordBatchOptions::new().with_row_count(Some(ids.len()));
    RecordBatch::try_new_with_options(Arc::clone(schema), columns, &options).map_err(|error| {
        LixError::new(
            "LIX_ERROR_UNKNOWN",
            format!("sql2 failed to build lix_file record batch: {error}"),
        )
    })
}
1795
+
1796
+ async fn load_single_blob_bytes(
1797
+ blob_reader: &Arc<dyn BlobDataReader>,
1798
+ blob_hash: &str,
1799
+ ) -> Result<Option<Vec<u8>>, LixError> {
1800
+ let hash = BlobHash::from_hex(blob_hash)?;
1801
+ Ok(blob_reader
1802
+ .load_bytes_many(&[hash])
1803
+ .await?
1804
+ .into_vec()
1805
+ .into_iter()
1806
+ .next()
1807
+ .flatten())
1808
+ }
1809
+
1810
+ fn derive_directory_paths(
1811
+ rows: &[DirectoryDescriptorRecord],
1812
+ ) -> Result<BTreeMap<(String, String), String>, LixError> {
1813
+ let mut by_version = BTreeMap::<String, BTreeMap<String, &DirectoryDescriptorRecord>>::new();
1814
+ for row in rows {
1815
+ by_version
1816
+ .entry(row.version_id.clone())
1817
+ .or_default()
1818
+ .insert(row.id.clone(), row);
1819
+ }
1820
+
1821
+ let mut paths = BTreeMap::<(String, String), String>::new();
1822
+ for (version_id, records) in by_version {
1823
+ for directory_id in records.keys() {
1824
+ derive_directory_path_for(
1825
+ &version_id,
1826
+ directory_id,
1827
+ &records,
1828
+ &mut paths,
1829
+ &mut BTreeSet::new(),
1830
+ )?;
1831
+ }
1832
+ }
1833
+ Ok(paths)
1834
+ }
1835
+
1836
+ fn derive_directory_path_for(
1837
+ version_id: &str,
1838
+ directory_id: &str,
1839
+ records: &BTreeMap<String, &DirectoryDescriptorRecord>,
1840
+ paths: &mut BTreeMap<(String, String), String>,
1841
+ visiting: &mut BTreeSet<String>,
1842
+ ) -> Result<Option<String>, LixError> {
1843
+ if let Some(path) = paths.get(&(version_id.to_string(), directory_id.to_string())) {
1844
+ return Ok(Some(path.clone()));
1845
+ }
1846
+ if !visiting.insert(directory_id.to_string()) {
1847
+ return Err(directory_parent_cycle_error(version_id, directory_id));
1848
+ }
1849
+ let Some(row) = records.get(directory_id) else {
1850
+ visiting.remove(directory_id);
1851
+ return Ok(None);
1852
+ };
1853
+ let path = match row.parent_id.as_deref() {
1854
+ Some(parent_id) => {
1855
+ let Some(parent_path) =
1856
+ derive_directory_path_for(version_id, parent_id, records, paths, visiting)?
1857
+ else {
1858
+ visiting.remove(directory_id);
1859
+ return Ok(None);
1860
+ };
1861
+ format!("{parent_path}{}/", row.name)
1862
+ }
1863
+ None => format!("/{}/", row.name),
1864
+ };
1865
+ visiting.remove(directory_id);
1866
+ paths.insert(
1867
+ (version_id.to_string(), directory_id.to_string()),
1868
+ path.clone(),
1869
+ );
1870
+ Ok(Some(path))
1871
+ }
1872
+
1873
+ fn directory_parent_cycle_error(version_id: &str, directory_id: &str) -> LixError {
1874
+ LixError::new(
1875
+ LixError::CODE_CONSTRAINT_VIOLATION,
1876
+ format!(
1877
+ "lix_directory_descriptor parent_id cycle in version '{version_id}' while resolving directory '{directory_id}'"
1878
+ ),
1879
+ )
1880
+ }
1881
+
1882
+ fn projected_schema(base_schema: &SchemaRef, projection: Option<&Vec<usize>>) -> Result<SchemaRef> {
1883
+ let fields = match projection {
1884
+ Some(indices) => indices
1885
+ .iter()
1886
+ .map(|index| base_schema.field(*index).as_ref().clone())
1887
+ .collect::<Vec<_>>(),
1888
+ None => base_schema
1889
+ .fields()
1890
+ .iter()
1891
+ .map(|field| field.as_ref().clone())
1892
+ .collect::<Vec<_>>(),
1893
+ };
1894
+ Ok(Arc::new(Schema::new(fields)))
1895
+ }
1896
+
1897
+ fn lix_file_scan_request(
1898
+ version_binding: Option<&str>,
1899
+ limit: Option<usize>,
1900
+ ) -> LiveStateScanRequest {
1901
+ LiveStateScanRequest {
1902
+ filter: LiveStateFilter {
1903
+ schema_keys: vec![
1904
+ FILE_DESCRIPTOR_SCHEMA_KEY.to_string(),
1905
+ BLOB_REF_SCHEMA_KEY.to_string(),
1906
+ DIRECTORY_DESCRIPTOR_SCHEMA_KEY.to_string(),
1907
+ ],
1908
+ version_ids: version_binding
1909
+ .map(|version_id| vec![version_id.to_string()])
1910
+ .unwrap_or_default(),
1911
+ ..LiveStateFilter::default()
1912
+ },
1913
+ projection: LiveStateProjection::default(),
1914
+ limit,
1915
+ }
1916
+ }
1917
+
1918
+ fn validate_lix_file_update_assignments(
1919
+ schema: &SchemaRef,
1920
+ assignments: &[(String, Expr)],
1921
+ ) -> Result<()> {
1922
+ for (column_name, expr) in assignments {
1923
+ schema.field_with_name(column_name).map_err(|_| {
1924
+ DataFusionError::Plan(format!(
1925
+ "UPDATE lix_file failed: column '{column_name}' does not exist"
1926
+ ))
1927
+ })?;
1928
+ if !matches!(
1929
+ column_name.as_str(),
1930
+ "path" | "directory_id" | "name" | "hidden" | "data" | "lixcol_metadata"
1931
+ ) {
1932
+ return Err(DataFusionError::Execution(format!(
1933
+ "UPDATE lix_file cannot stage read-only column '{column_name}'"
1934
+ )));
1935
+ }
1936
+ if column_name == "data" {
1937
+ reject_non_binary_lix_file_data_assignment(expr)?;
1938
+ }
1939
+ }
1940
+ Ok(())
1941
+ }
1942
+
1943
+ fn reject_non_binary_lix_file_data_assignment(expr: &Expr) -> Result<()> {
1944
+ match expr {
1945
+ Expr::Literal(value, _) => {
1946
+ if !scalar_is_binary_or_null(value) {
1947
+ return Err(non_binary_lix_file_data_assignment_error());
1948
+ }
1949
+ }
1950
+ Expr::Cast(cast) if is_binary_type(&cast.data_type) => {
1951
+ if !logical_expr_is_binary_or_null(&cast.expr) {
1952
+ return Err(non_binary_lix_file_data_assignment_error());
1953
+ }
1954
+ }
1955
+ _ => {}
1956
+ }
1957
+
1958
+ Ok(())
1959
+ }
1960
+
1961
/// Builds the error returned when `UPDATE lix_file` assigns a non-binary
/// value to the `data` column; the hint tells the user to supply a binary
/// literal or parameter.
fn non_binary_lix_file_data_assignment_error() -> DataFusionError {
    lix_file_data_type_error(
        "UPDATE lix_file",
        "data",
        "use X'...' or a binary parameter for file contents",
    )
}
1968
+
1969
+ fn filter_lix_file_batch(
1970
+ batch: RecordBatch,
1971
+ filters: &[Arc<dyn PhysicalExpr>],
1972
+ ) -> Result<RecordBatch> {
1973
+ let Some(mask) = evaluate_lix_file_filters(&batch, filters)? else {
1974
+ return Ok(batch);
1975
+ };
1976
+ Ok(filter_record_batch(&batch, &mask)?)
1977
+ }
1978
+
1979
+ fn evaluate_lix_file_filters(
1980
+ batch: &RecordBatch,
1981
+ filters: &[Arc<dyn PhysicalExpr>],
1982
+ ) -> Result<Option<BooleanArray>> {
1983
+ if filters.is_empty() {
1984
+ return Ok(None);
1985
+ }
1986
+
1987
+ let mut combined_mask: Option<BooleanArray> = None;
1988
+ for filter in filters {
1989
+ let result = filter.evaluate(batch)?;
1990
+ let array = result.into_array(batch.num_rows())?;
1991
+ let bool_array = array
1992
+ .as_any()
1993
+ .downcast_ref::<BooleanArray>()
1994
+ .ok_or_else(|| {
1995
+ DataFusionError::Execution("lix_file filter was not boolean".to_string())
1996
+ })?;
1997
+ let normalized = bool_array
1998
+ .iter()
1999
+ .map(|value| Some(value == Some(true)))
2000
+ .collect::<BooleanArray>();
2001
+ combined_mask = Some(match combined_mask {
2002
+ Some(existing) => and(&existing, &normalized)?,
2003
+ None => normalized,
2004
+ });
2005
+ }
2006
+ Ok(combined_mask)
2007
+ }
2008
+
2009
+ fn dml_count_schema() -> SchemaRef {
2010
+ Arc::new(Schema::new(vec![Field::new(
2011
+ "count",
2012
+ DataType::UInt64,
2013
+ false,
2014
+ )]))
2015
+ }
2016
+
2017
+ fn dml_count_batch(schema: SchemaRef, count: u64) -> Result<RecordBatch> {
2018
+ RecordBatch::try_new(
2019
+ schema,
2020
+ vec![Arc::new(UInt64Array::from(vec![count])) as ArrayRef],
2021
+ )
2022
+ .map_err(DataFusionError::from)
2023
+ }
2024
+
2025
+ fn record_batch_has_non_null_column(batch: &RecordBatch, column_name: &str) -> Result<bool> {
2026
+ for row_index in 0..batch.num_rows() {
2027
+ if optional_scalar_value(batch, row_index, column_name)?
2028
+ .is_some_and(|value| !value.is_null())
2029
+ {
2030
+ return Ok(true);
2031
+ }
2032
+ }
2033
+ Ok(false)
2034
+ }
2035
+
2036
+ fn reject_read_only_lix_file_insert_field(
2037
+ batch: &RecordBatch,
2038
+ row_index: usize,
2039
+ column_name: &str,
2040
+ ) -> Result<()> {
2041
+ if optional_scalar_value(batch, row_index, column_name)?.is_some_and(|value| !value.is_null()) {
2042
+ return Err(DataFusionError::Execution(format!(
2043
+ "INSERT into lix_file cannot stage read-only column '{column_name}'"
2044
+ )));
2045
+ }
2046
+ Ok(())
2047
+ }
2048
+
2049
+ fn required_string_value(
2050
+ batch: &RecordBatch,
2051
+ row_index: usize,
2052
+ column_name: &str,
2053
+ ) -> Result<String> {
2054
+ optional_string_value(batch, row_index, column_name)?.ok_or_else(|| {
2055
+ DataFusionError::Execution(format!(
2056
+ "INSERT into lix_file requires non-null text column '{column_name}'"
2057
+ ))
2058
+ })
2059
+ }
2060
+
2061
+ fn update_required_string_value(
2062
+ batch: &RecordBatch,
2063
+ assignment_values: &UpdateAssignmentValues,
2064
+ row_index: usize,
2065
+ column_name: &str,
2066
+ ) -> Result<String> {
2067
+ update_optional_string_value(batch, assignment_values, row_index, column_name)?.ok_or_else(
2068
+ || {
2069
+ DataFusionError::Execution(format!(
2070
+ "UPDATE lix_file requires non-null text column '{column_name}'"
2071
+ ))
2072
+ },
2073
+ )
2074
+ }
2075
+
2076
+ fn update_optional_string_value(
2077
+ batch: &RecordBatch,
2078
+ assignment_values: &UpdateAssignmentValues,
2079
+ row_index: usize,
2080
+ column_name: &str,
2081
+ ) -> Result<Option<String>> {
2082
+ match assignment_values.assigned_or_existing_cell(batch, row_index, column_name)? {
2083
+ InsertCell::Omitted | InsertCell::Provided(SqlCell::Null) => Ok(None),
2084
+ InsertCell::Provided(SqlCell::Value(
2085
+ ScalarValue::Utf8(Some(value))
2086
+ | ScalarValue::Utf8View(Some(value))
2087
+ | ScalarValue::LargeUtf8(Some(value)),
2088
+ )) => Ok(Some(value)),
2089
+ InsertCell::Provided(SqlCell::Value(other)) => Err(DataFusionError::Execution(format!(
2090
+ "UPDATE lix_file expected text-compatible column '{column_name}', got {other:?}"
2091
+ ))),
2092
+ }
2093
+ }
2094
+
2095
+ fn update_optional_metadata_value(
2096
+ batch: &RecordBatch,
2097
+ assignment_values: &UpdateAssignmentValues,
2098
+ row_index: usize,
2099
+ column_name: &str,
2100
+ context: &str,
2101
+ ) -> Result<Option<RowMetadata>> {
2102
+ update_optional_string_value(batch, assignment_values, row_index, column_name)?
2103
+ .map(|value| {
2104
+ parse_row_metadata(&value, context).map_err(super::error::lix_error_to_datafusion_error)
2105
+ })
2106
+ .transpose()
2107
+ }
2108
+
2109
+ fn update_optional_bool_value(
2110
+ batch: &RecordBatch,
2111
+ assignment_values: &UpdateAssignmentValues,
2112
+ row_index: usize,
2113
+ column_name: &str,
2114
+ ) -> Result<Option<bool>> {
2115
+ match assignment_values.assigned_or_existing_cell(batch, row_index, column_name)? {
2116
+ InsertCell::Omitted | InsertCell::Provided(SqlCell::Null) => Ok(None),
2117
+ InsertCell::Provided(SqlCell::Value(ScalarValue::Boolean(Some(value)))) => Ok(Some(value)),
2118
+ InsertCell::Provided(SqlCell::Value(other)) => Err(DataFusionError::Execution(format!(
2119
+ "UPDATE lix_file expected boolean column '{column_name}', got {other:?}"
2120
+ ))),
2121
+ }
2122
+ }
2123
+
2124
+ fn update_required_binary_value(
2125
+ _batch: &RecordBatch,
2126
+ assignment_values: &UpdateAssignmentValues,
2127
+ row_index: usize,
2128
+ column_name: &str,
2129
+ ) -> Result<Vec<u8>> {
2130
+ match assignment_values.assigned_cell(row_index, column_name)? {
2131
+ UpdateCell::Unassigned | UpdateCell::Assigned(SqlCell::Null) => {
2132
+ Err(lix_file_data_type_error(
2133
+ "UPDATE lix_file",
2134
+ column_name,
2135
+ "use X'' for an empty file or omit data to leave contents unchanged",
2136
+ ))
2137
+ }
2138
+ UpdateCell::Assigned(SqlCell::Value(ScalarValue::Binary(Some(value))))
2139
+ | UpdateCell::Assigned(SqlCell::Value(ScalarValue::LargeBinary(Some(value)))) => Ok(value),
2140
+ UpdateCell::Assigned(SqlCell::Value(ScalarValue::FixedSizeBinary(_, Some(value)))) => {
2141
+ Ok(value)
2142
+ }
2143
+ UpdateCell::Assigned(SqlCell::Value(other)) => Err(lix_file_data_type_error_with_value(
2144
+ "UPDATE lix_file",
2145
+ column_name,
2146
+ &other,
2147
+ "use X'...' or a binary parameter for file contents",
2148
+ )),
2149
+ }
2150
+ }
2151
+
2152
+ fn optional_string_value(
2153
+ batch: &RecordBatch,
2154
+ row_index: usize,
2155
+ column_name: &str,
2156
+ ) -> Result<Option<String>> {
2157
+ match optional_scalar_value(batch, row_index, column_name)? {
2158
+ None
2159
+ | Some(ScalarValue::Null)
2160
+ | Some(ScalarValue::Utf8(None))
2161
+ | Some(ScalarValue::Utf8View(None))
2162
+ | Some(ScalarValue::LargeUtf8(None)) => Ok(None),
2163
+ Some(ScalarValue::Utf8(Some(value)))
2164
+ | Some(ScalarValue::Utf8View(Some(value)))
2165
+ | Some(ScalarValue::LargeUtf8(Some(value))) => Ok(Some(value)),
2166
+ Some(other) => Err(DataFusionError::Execution(format!(
2167
+ "INSERT into lix_file expected text-compatible column '{column_name}', got {other:?}"
2168
+ ))),
2169
+ }
2170
+ }
2171
+
2172
+ fn optional_metadata_value(
2173
+ batch: &RecordBatch,
2174
+ row_index: usize,
2175
+ column_name: &str,
2176
+ context: &str,
2177
+ ) -> Result<Option<RowMetadata>> {
2178
+ optional_string_value(batch, row_index, column_name)?
2179
+ .map(|value| {
2180
+ parse_row_metadata(&value, context).map_err(super::error::lix_error_to_datafusion_error)
2181
+ })
2182
+ .transpose()
2183
+ }
2184
+
2185
+ fn optional_bool_value(
2186
+ batch: &RecordBatch,
2187
+ row_index: usize,
2188
+ column_name: &str,
2189
+ ) -> Result<Option<bool>> {
2190
+ match optional_scalar_value(batch, row_index, column_name)? {
2191
+ None | Some(ScalarValue::Null) | Some(ScalarValue::Boolean(None)) => Ok(None),
2192
+ Some(ScalarValue::Boolean(Some(value))) => Ok(Some(value)),
2193
+ Some(other) => Err(DataFusionError::Execution(format!(
2194
+ "INSERT into lix_file expected boolean column '{column_name}', got {other:?}"
2195
+ ))),
2196
+ }
2197
+ }
2198
+
2199
+ fn insert_optional_binary_value(
2200
+ batch: &RecordBatch,
2201
+ row_index: usize,
2202
+ column_name: &str,
2203
+ ) -> Result<Option<Vec<u8>>> {
2204
+ match optional_scalar_value(batch, row_index, column_name)? {
2205
+ None => Ok(None),
2206
+ Some(ScalarValue::Null)
2207
+ | Some(ScalarValue::Binary(None))
2208
+ | Some(ScalarValue::LargeBinary(None))
2209
+ | Some(ScalarValue::FixedSizeBinary(_, None)) => Err(lix_file_data_type_error(
2210
+ "INSERT into lix_file",
2211
+ column_name,
2212
+ "use X'' for an empty file or omit data to create a descriptor without contents",
2213
+ )),
2214
+ Some(ScalarValue::Binary(Some(value))) | Some(ScalarValue::LargeBinary(Some(value))) => {
2215
+ Ok(Some(value))
2216
+ }
2217
+ Some(ScalarValue::FixedSizeBinary(_, Some(value))) => Ok(Some(value)),
2218
+ Some(other) => Err(lix_file_data_type_error_with_value(
2219
+ "INSERT into lix_file",
2220
+ column_name,
2221
+ &other,
2222
+ "use X'...' or a binary parameter for file contents",
2223
+ )),
2224
+ }
2225
+ }
2226
+
2227
+ fn optional_scalar_value(
2228
+ batch: &RecordBatch,
2229
+ row_index: usize,
2230
+ column_name: &str,
2231
+ ) -> Result<Option<ScalarValue>> {
2232
+ let schema = batch.schema();
2233
+ let column_index = match schema.index_of(column_name) {
2234
+ Ok(column_index) => column_index,
2235
+ Err(_) => return Ok(None),
2236
+ };
2237
+ if row_index >= batch.num_rows() {
2238
+ return Err(DataFusionError::Execution(format!(
2239
+ "row index {row_index} out of bounds for lix_file batch with {} rows",
2240
+ batch.num_rows()
2241
+ )));
2242
+ }
2243
+ ScalarValue::try_from_array(batch.column(column_index).as_ref(), row_index)
2244
+ .map(Some)
2245
+ .map_err(|error| {
2246
+ DataFusionError::Execution(format!(
2247
+ "failed to decode lix_file column '{column_name}' at row {row_index}: {error}"
2248
+ ))
2249
+ })
2250
+ }
2251
+
2252
+ fn lix_file_schema() -> SchemaRef {
2253
+ Arc::new(Schema::new(vec![
2254
+ Field::new("id", DataType::Utf8, true),
2255
+ Field::new("path", DataType::Utf8, false),
2256
+ Field::new("directory_id", DataType::Utf8, true),
2257
+ Field::new("name", DataType::Utf8, false),
2258
+ Field::new("hidden", DataType::Boolean, true),
2259
+ Field::new("data", DataType::Binary, true),
2260
+ Field::new("lixcol_entity_id", DataType::Utf8, false),
2261
+ Field::new("lixcol_schema_key", DataType::Utf8, false),
2262
+ Field::new("lixcol_file_id", DataType::Utf8, true),
2263
+ Field::new("lixcol_schema_version", DataType::Utf8, false),
2264
+ Field::new("lixcol_global", DataType::Boolean, true),
2265
+ Field::new("lixcol_change_id", DataType::Utf8, true),
2266
+ Field::new("lixcol_created_at", DataType::Utf8, true),
2267
+ Field::new("lixcol_updated_at", DataType::Utf8, true),
2268
+ Field::new("lixcol_commit_id", DataType::Utf8, true),
2269
+ Field::new("lixcol_untracked", DataType::Boolean, true),
2270
+ json_field("lixcol_metadata", true),
2271
+ ]))
2272
+ }
2273
+
2274
+ fn lix_file_by_version_schema() -> SchemaRef {
2275
+ let mut fields = lix_file_schema()
2276
+ .fields()
2277
+ .iter()
2278
+ .map(|field| field.as_ref().clone())
2279
+ .collect::<Vec<_>>();
2280
+ fields.push(Field::new("lixcol_version_id", DataType::Utf8, false));
2281
+ Arc::new(Schema::new(fields))
2282
+ }
2283
+
2284
/// Module-local shorthand that forwards to
/// `super::error::datafusion_error_to_lix_error`.
fn datafusion_error_to_lix_error(error: DataFusionError) -> LixError {
    super::error::datafusion_error_to_lix_error(error)
}
2287
+
2288
/// Module-local shorthand that forwards to
/// `super::error::lix_error_to_datafusion_error`.
fn lix_error_to_datafusion_error(error: LixError) -> DataFusionError {
    super::error::lix_error_to_datafusion_error(error)
}
2291
+
2292
+ #[cfg(test)]
2293
+ mod tests {
2294
+ use std::collections::{BTreeMap, BTreeSet};
2295
+ use std::sync::Arc;
2296
+
2297
+ use async_trait::async_trait;
2298
+ use datafusion::arrow::array::{ArrayRef, BinaryArray, BooleanArray, StringArray};
2299
+ use datafusion::arrow::datatypes::{DataType, Field, Schema};
2300
+ use datafusion::arrow::record_batch::RecordBatch;
2301
+ use datafusion::execution::TaskContext;
2302
+ use datafusion::logical_expr::lit;
2303
+ use serde_json::{json, Value as JsonValue};
2304
+
2305
+ use crate::binary_cas::BlobDataReader;
2306
+ use crate::functions::{
2307
+ FunctionProvider, FunctionProviderHandle, SharedFunctionProvider, SystemFunctionProvider,
2308
+ };
2309
+ use crate::live_state::LiveStateRow;
2310
+ use crate::live_state::{LiveStateReader, LiveStateRowRequest, LiveStateScanRequest};
2311
+ use crate::sql2::dml::InsertSink;
2312
+ use crate::sql2::{SqlWriteContext, SqlWriteExecutionContext};
2313
+ use crate::transaction::types::{StageWrite, StageWriteMode, StageWriteOutcome};
2314
+ use crate::LixError;
2315
+
2316
+ use super::{
2317
+ derive_directory_path_for, lix_file_delete_stage_from_batch,
2318
+ lix_file_insert_stage_from_batch, lix_file_insert_stage_from_batch_with_path_resolvers,
2319
+ lix_file_write_rows_from_batch, DirectoryDescriptorRecord, LixFileInsertSink,
2320
+ VersionBinding,
2321
+ };
2322
+
2323
/// Returns a closure that hands out `ids` one at a time, in order.
///
/// # Panics
/// The closure panics with "test id should exist" once `ids` is exhausted.
fn test_id_generator(ids: &'static [&'static str]) -> impl FnMut() -> String {
    let mut remaining = ids.iter().copied();
    move || match remaining.next() {
        Some(id) => id.to_owned(),
        None => panic!("test id should exist"),
    }
}
2327
+
2328
+ fn test_functions() -> FunctionProviderHandle {
2329
+ SharedFunctionProvider::new(
2330
+ Box::new(SystemFunctionProvider) as Box<dyn FunctionProvider + Send>
2331
+ )
2332
+ }
2333
+
2334
+ fn lix_file_update_stage_from_batch_for_test(
2335
+ batch: &RecordBatch,
2336
+ version_binding: Option<&str>,
2337
+ update_columns: super::LixFileUpdateColumns,
2338
+ path_resolvers: Option<&mut BTreeMap<String, super::DirectoryPathResolver>>,
2339
+ generate_directory_id: &mut dyn FnMut() -> String,
2340
+ ) -> datafusion::common::Result<super::LixFileStagedBatch> {
2341
+ let mut columns = Vec::new();
2342
+ if update_columns.path {
2343
+ columns.extend(["path", "hidden"]);
2344
+ }
2345
+ if update_columns.data {
2346
+ columns.push("data");
2347
+ }
2348
+ if update_columns.descriptor {
2349
+ columns.extend(["directory_id", "name", "hidden"]);
2350
+ }
2351
+ let assignment_values = super::UpdateAssignmentValues::from_batch_columns(batch, &columns);
2352
+ super::lix_file_update_stage_from_batch(
2353
+ batch,
2354
+ &assignment_values,
2355
+ version_binding,
2356
+ update_columns,
2357
+ path_resolvers,
2358
+ generate_directory_id,
2359
+ )
2360
+ }
2361
+
2362
/// Test double for the SQL write execution context.
#[derive(Default)]
struct CapturingWriteContext {
    // Rows returned by every `scan_live_state` call.
    rows: Vec<LiveStateRow>,
    // Every write passed to `stage_write`, in call order.
    writes: Vec<StageWrite>,
}
2367
+
2368
// Blob reads in tests never find data: every requested hash is reported missing.
#[async_trait]
impl BlobDataReader for CapturingWriteContext {
    async fn load_bytes_many(
        &self,
        hashes: &[crate::binary_cas::BlobHash],
    ) -> Result<crate::binary_cas::BlobBytesBatch, LixError> {
        Ok(crate::binary_cas::BlobBytesBatch::missing(hashes.len()))
    }
}
2377
+
2378
+ #[async_trait]
2379
+ impl SqlWriteExecutionContext for CapturingWriteContext {
2380
+ fn active_version_id(&self) -> &str {
2381
+ "version-b"
2382
+ }
2383
+
2384
+ fn functions(&self) -> FunctionProviderHandle {
2385
+ test_functions()
2386
+ }
2387
+
2388
+ fn list_visible_schemas(&self) -> Result<Vec<JsonValue>, LixError> {
2389
+ Ok(Vec::new())
2390
+ }
2391
+
2392
+ async fn load_bytes_many(
2393
+ &mut self,
2394
+ hashes: &[crate::binary_cas::BlobHash],
2395
+ ) -> Result<crate::binary_cas::BlobBytesBatch, LixError> {
2396
+ BlobDataReader::load_bytes_many(self, hashes).await
2397
+ }
2398
+
2399
+ async fn scan_live_state(
2400
+ &mut self,
2401
+ _request: &LiveStateScanRequest,
2402
+ ) -> Result<Vec<LiveStateRow>, LixError> {
2403
+ Ok(self.rows.clone())
2404
+ }
2405
+
2406
+ async fn load_version_head(
2407
+ &mut self,
2408
+ version_id: &str,
2409
+ ) -> Result<Option<String>, LixError> {
2410
+ if version_id == "ghost-version" {
2411
+ return Ok(None);
2412
+ }
2413
+ Ok(Some(format!("commit-{version_id}")))
2414
+ }
2415
+
2416
+ async fn stage_write(&mut self, write: StageWrite) -> Result<StageWriteOutcome, LixError> {
2417
+ self.writes.push(write);
2418
+ Ok(StageWriteOutcome { count: 0 })
2419
+ }
2420
+ }
2421
+
2422
/// Test double for a live-state reader backed by a fixed row list.
#[derive(Default)]
struct RowsLiveStateReader {
    // Rows returned by every `scan_rows` call.
    rows: Vec<LiveStateRow>,
}
2426
+
2427
// Canned reader: scans ignore the request and return all stored rows;
// single-row loads never find anything.
#[async_trait]
impl LiveStateReader for RowsLiveStateReader {
    async fn scan_rows(
        &self,
        _request: &LiveStateScanRequest,
    ) -> Result<Vec<LiveStateRow>, LixError> {
        Ok(self.rows.clone())
    }

    async fn load_row(
        &self,
        _request: &LiveStateRowRequest,
    ) -> Result<Option<LiveStateRow>, LixError> {
        Ok(None)
    }
}
2443
+
2444
+ fn live_directory_row(
2445
+ entity_id: &str,
2446
+ version_id: &str,
2447
+ snapshot_content: &str,
2448
+ ) -> LiveStateRow {
2449
+ LiveStateRow {
2450
+ entity_id: crate::entity_identity::EntityIdentity::from_string(entity_id)
2451
+ .expect("entity id should decode"),
2452
+ schema_key: super::DIRECTORY_DESCRIPTOR_SCHEMA_KEY.to_string(),
2453
+ file_id: None,
2454
+ snapshot_content: Some(snapshot_content.to_string()),
2455
+ metadata: None,
2456
+ schema_version: "1".to_string(),
2457
+ version_id: version_id.to_string(),
2458
+ change_id: Some(format!("change-{entity_id}")),
2459
+ commit_id: Some(format!("commit-{entity_id}")),
2460
+ global: false,
2461
+ untracked: false,
2462
+ created_at: "2026-04-23T00:00:00Z".to_string(),
2463
+ updated_at: "2026-04-23T01:00:00Z".to_string(),
2464
+ }
2465
+ }
2466
+
2467
+ fn live_file_row(entity_id: &str, version_id: &str, snapshot_content: &str) -> LiveStateRow {
2468
+ LiveStateRow {
2469
+ entity_id: crate::entity_identity::EntityIdentity::from_string(entity_id)
2470
+ .expect("entity id should decode"),
2471
+ schema_key: super::FILE_DESCRIPTOR_SCHEMA_KEY.to_string(),
2472
+ file_id: None,
2473
+ snapshot_content: Some(snapshot_content.to_string()),
2474
+ metadata: None,
2475
+ schema_version: "1".to_string(),
2476
+ version_id: version_id.to_string(),
2477
+ change_id: Some(format!("change-{entity_id}")),
2478
+ commit_id: Some(format!("commit-{entity_id}")),
2479
+ global: false,
2480
+ untracked: false,
2481
+ created_at: "2026-04-23T00:00:00Z".to_string(),
2482
+ updated_at: "2026-04-23T01:00:00Z".to_string(),
2483
+ }
2484
+ }
2485
+
2486
+ fn string_column(values: Vec<Option<&str>>) -> ArrayRef {
2487
+ Arc::new(StringArray::from(values)) as ArrayRef
2488
+ }
2489
+
2490
+ fn file_insert_batch(include_version: bool, global: bool) -> RecordBatch {
2491
+ let mut fields = vec![
2492
+ Field::new("id", DataType::Utf8, false),
2493
+ Field::new("directory_id", DataType::Utf8, true),
2494
+ Field::new("name", DataType::Utf8, false),
2495
+ Field::new("hidden", DataType::Boolean, false),
2496
+ Field::new("lixcol_global", DataType::Boolean, false),
2497
+ Field::new("lixcol_metadata", DataType::Utf8, true),
2498
+ ];
2499
+ let mut columns = vec![
2500
+ string_column(vec![Some("file-readme")]),
2501
+ string_column(vec![Some("dir-docs")]),
2502
+ string_column(vec![Some("readme.md")]),
2503
+ Arc::new(BooleanArray::from(vec![false])) as ArrayRef,
2504
+ Arc::new(BooleanArray::from(vec![global])) as ArrayRef,
2505
+ string_column(vec![Some("{\"source\":\"file\"}")]),
2506
+ ];
2507
+ if include_version {
2508
+ fields.push(Field::new("lixcol_version_id", DataType::Utf8, false));
2509
+ columns.push(string_column(vec![Some("version-b")]));
2510
+ }
2511
+ RecordBatch::try_new(Arc::new(Schema::new(fields)), columns).expect("file insert batch")
2512
+ }
2513
+
2514
+ fn data_insert_batch() -> RecordBatch {
2515
+ RecordBatch::try_new(
2516
+ Arc::new(Schema::new(vec![
2517
+ Field::new("id", DataType::Utf8, false),
2518
+ Field::new("directory_id", DataType::Utf8, true),
2519
+ Field::new("name", DataType::Utf8, false),
2520
+ Field::new("hidden", DataType::Boolean, false),
2521
+ Field::new("data", DataType::Binary, true),
2522
+ Field::new("lixcol_version_id", DataType::Utf8, false),
2523
+ ])),
2524
+ vec![
2525
+ string_column(vec![Some("file-readme")]),
2526
+ string_column(vec![Some("dir-docs")]),
2527
+ string_column(vec![Some("readme.md")]),
2528
+ Arc::new(BooleanArray::from(vec![false])) as ArrayRef,
2529
+ Arc::new(BinaryArray::from_vec(vec![b"hello"])) as ArrayRef,
2530
+ string_column(vec![Some("version-b")]),
2531
+ ],
2532
+ )
2533
+ .expect("file data batch")
2534
+ }
2535
+
2536
+ fn path_data_insert_batch() -> RecordBatch {
2537
+ RecordBatch::try_new(
2538
+ Arc::new(Schema::new(vec![
2539
+ Field::new("id", DataType::Utf8, false),
2540
+ Field::new("path", DataType::Utf8, false),
2541
+ Field::new("hidden", DataType::Boolean, false),
2542
+ Field::new("data", DataType::Binary, true),
2543
+ Field::new("lixcol_version_id", DataType::Utf8, false),
2544
+ ])),
2545
+ vec![
2546
+ string_column(vec![Some("file-readme")]),
2547
+ string_column(vec![Some("/docs/guides/readme.md")]),
2548
+ Arc::new(BooleanArray::from(vec![false])) as ArrayRef,
2549
+ Arc::new(BinaryArray::from_vec(vec![b"hello"])) as ArrayRef,
2550
+ string_column(vec![Some("version-b")]),
2551
+ ],
2552
+ )
2553
+ .expect("file path data batch")
2554
+ }
2555
+
2556
+ fn path_update_batch() -> RecordBatch {
2557
+ RecordBatch::try_new(
2558
+ Arc::new(Schema::new(vec![
2559
+ Field::new("id", DataType::Utf8, false),
2560
+ Field::new("path", DataType::Utf8, false),
2561
+ Field::new("hidden", DataType::Boolean, false),
2562
+ Field::new("data", DataType::Binary, true),
2563
+ Field::new("lixcol_version_id", DataType::Utf8, false),
2564
+ ])),
2565
+ vec![
2566
+ string_column(vec![Some("file-readme")]),
2567
+ string_column(vec![Some("/docs/renamed.md")]),
2568
+ Arc::new(BooleanArray::from(vec![false])) as ArrayRef,
2569
+ Arc::new(BinaryArray::from_vec(vec![b"hello"])) as ArrayRef,
2570
+ string_column(vec![Some("version-b")]),
2571
+ ],
2572
+ )
2573
+ .expect("file path update batch")
2574
+ }
2575
+
2576
+ fn file_delete_batch() -> RecordBatch {
2577
+ RecordBatch::try_new(
2578
+ Arc::new(Schema::new(vec![
2579
+ Field::new("id", DataType::Utf8, false),
2580
+ Field::new("lixcol_version_id", DataType::Utf8, false),
2581
+ ])),
2582
+ vec![
2583
+ string_column(vec![Some("file-readme")]),
2584
+ string_column(vec![Some("version-b")]),
2585
+ ],
2586
+ )
2587
+ .expect("file delete batch")
2588
+ }
2589
+
2590
/// A child directory whose parent is a root directory should derive the nested
/// path `/docs/guides/`.
#[test]
fn derives_nested_directory_paths() {
    let docs = DirectoryDescriptorRecord {
        id: String::from("dir-docs"),
        parent_id: None,
        name: String::from("docs"),
        version_id: String::from("version-a"),
    };
    let guides = DirectoryDescriptorRecord {
        id: String::from("dir-guides"),
        parent_id: Some(String::from("dir-docs")),
        name: String::from("guides"),
        version_id: String::from("version-a"),
    };
    let records = BTreeMap::from([(docs.id.clone(), &docs), (guides.id.clone(), &guides)]);
    let mut paths = BTreeMap::new();
    let mut id_set = BTreeSet::new();

    let derived =
        derive_directory_path_for("version-a", "dir-guides", &records, &mut paths, &mut id_set)
            .expect("path derivation should succeed");

    assert_eq!(derived, Some(String::from("/docs/guides/")));
}
2621
+
2622
+ #[tokio::test]
2623
+ async fn file_projection_rejects_unresolved_non_root_directory_id() {
2624
+ let blob_reader = Arc::new(CapturingWriteContext::default()) as Arc<dyn BlobDataReader>;
2625
+ let error = super::lix_file_record_batch(
2626
+ &super::lix_file_schema(),
2627
+ &blob_reader,
2628
+ vec![live_file_row(
2629
+ "file-readme",
2630
+ "version-b",
2631
+ "{\"id\":\"file-readme\",\"directory_id\":\"missing-dir\",\"name\":\"readme.md\",\"hidden\":false}",
2632
+ )],
2633
+ )
2634
+ .await
2635
+ .expect_err("unresolved non-root directory_id should not project as root path");
2636
+
2637
+ assert_eq!(error.code, LixError::CODE_FOREIGN_KEY);
2638
+ assert!(error.message.contains("missing-dir"));
2639
+ }
2640
+
2641
/// Decoding an insert batch with an explicit version column yields exactly one
/// file-descriptor write row whose identity, version, metadata and snapshot
/// mirror the batch contents.
#[test]
fn decodes_file_insert_into_lix_state_write_row() {
    let batch = file_insert_batch(true, false);

    let rows = lix_file_write_rows_from_batch(&batch, None).expect("decode file insert");

    assert_eq!(rows.len(), 1);
    let row = &rows[0];
    let expected_identity = crate::entity_identity::EntityIdentity::single("file-readme");
    assert_eq!(row.entity_id.as_ref(), Some(&expected_identity));
    assert_eq!(row.schema_key, "lix_file_descriptor");
    assert_eq!(row.version_id, "version-b");
    assert_eq!(row.schema_version.as_str(), "1");
    assert_eq!(row.metadata.as_ref(), Some(&json!({"source": "file"})));

    let raw_snapshot = row.snapshot_content.as_deref().unwrap();
    let snapshot: JsonValue =
        serde_json::from_str(raw_snapshot).expect("descriptor snapshot JSON");
    assert_eq!(snapshot["id"], "file-readme");
    assert_eq!(snapshot["directory_id"], "dir-docs");
    assert_eq!(snapshot["name"], "readme.md");
    assert_eq!(snapshot["hidden"], false);
}
2666
+
2667
/// When the batch has no version column, the version id passed as the second
/// argument is the one recorded on the decoded row.
#[test]
fn active_file_insert_defaults_version_id() {
    let batch = file_insert_batch(false, false);
    let default_version = Some("version-a");

    let rows = lix_file_write_rows_from_batch(&batch, default_version)
        .expect("decode file insert");

    assert_eq!(rows.len(), 1);
    let row = &rows[0];
    assert_eq!(row.version_id, "version-a");
}
2677
+
2678
/// With neither a version column nor a default version, decoding a non-global
/// insert must fail with an error mentioning `lixcol_version_id`.
#[test]
fn by_version_file_insert_requires_version_id_for_non_global_rows() {
    let batch = file_insert_batch(false, false);

    let outcome = lix_file_write_rows_from_batch(&batch, None);
    let error = outcome.expect_err("version id is required");

    let rendered = error.to_string();
    assert!(
        rendered.contains("requires lixcol_version_id"),
        "unexpected error: {error}"
    );
}
2690
+
2691
/// A row flagged `lixcol_global=true` that also names the non-global version
/// `version-b` must be rejected.
#[test]
fn file_insert_rejects_global_with_non_global_version_id() {
    let batch = file_insert_batch(true, true);

    let outcome = lix_file_write_rows_from_batch(&batch, None);
    let error = outcome.expect_err("global file write should reject conflicting version id");

    let rendered = error.to_string();
    assert!(
        rendered.contains("cannot set lixcol_global=true with non-global lixcol_version_id"),
        "unexpected error: {error}"
    );
}
2703
+
2704
/// Assigning a literal to the `path` column passes update-assignment validation.
#[test]
fn file_update_accepts_path_assignment() {
    let schema = super::lix_file_schema();
    let assignments = [("path".to_string(), lit("/docs/renamed.md"))];

    let outcome = super::validate_lix_file_update_assignments(&schema, &assignments);

    outcome.expect("path should be writable for update");
}
2712
+
2713
/// A path-only update against a resolver that already knows `/docs/` stages a
/// single file descriptor whose directory and name come from the new path.
#[test]
fn file_path_update_stages_descriptor_from_new_path() {
    let scope_key = super::filesystem_storage_scope_key("version-b", false, false, None);
    let docs_resolver = super::DirectoryPathResolver::from_existing([(
        "/docs/".to_string(),
        "dir-docs".to_string(),
    )])
    .expect("directory resolver should seed");
    let mut resolvers = BTreeMap::from([(scope_key, docs_resolver)]);

    let batch = path_update_batch();
    let assigned = super::LixFileUpdateColumns {
        path: true,
        data: false,
        descriptor: false,
    };
    let mut id_generator = test_id_generator(&["should-not-be-used"]);
    let staged = lix_file_update_stage_from_batch_for_test(
        &batch,
        None,
        assigned,
        Some(&mut resolvers),
        &mut id_generator,
    )
    .expect("decode file path update");

    assert_eq!(staged.count, 1);
    assert_eq!(staged.file_data_writes.len(), 0);
    assert_eq!(staged.state_rows.len(), 1);

    let descriptor = staged
        .state_rows
        .iter()
        .find(|candidate| candidate.schema_key == "lix_file_descriptor")
        .expect("file descriptor row should be staged");
    let raw_snapshot = descriptor.snapshot_content.as_deref().unwrap();
    let snapshot: JsonValue =
        serde_json::from_str(raw_snapshot).expect("descriptor snapshot JSON");
    assert_eq!(snapshot["id"], "file-readme");
    assert_eq!(snapshot["directory_id"], "dir-docs");
    assert_eq!(snapshot["name"], "renamed.md");
    assert_eq!(snapshot["hidden"], false);
}
2754
+
2755
/// Even though the update batch carries a `data` column, a path-only update
/// must stage neither a file-data write nor a blob-ref row.
#[test]
fn file_path_update_preserves_existing_data_unless_data_is_assigned() {
    let scope_key = super::filesystem_storage_scope_key("version-b", false, false, None);
    let docs_resolver = super::DirectoryPathResolver::from_existing([(
        "/docs/".to_string(),
        "dir-docs".to_string(),
    )])
    .expect("directory resolver should seed");
    let mut resolvers = BTreeMap::from([(scope_key, docs_resolver)]);

    let batch = path_update_batch();
    let assigned = super::LixFileUpdateColumns {
        path: true,
        data: false,
        descriptor: false,
    };
    let mut id_generator = test_id_generator(&["should-not-be-used"]);
    let staged = lix_file_update_stage_from_batch_for_test(
        &batch,
        None,
        assigned,
        Some(&mut resolvers),
        &mut id_generator,
    )
    .expect("decode file path update");

    assert!(
        staged.file_data_writes.is_empty(),
        "path-only update should not rewrite file data"
    );
    let stages_blob_ref = staged
        .state_rows
        .iter()
        .any(|candidate| candidate.schema_key == "lix_binary_blob_ref");
    assert!(
        !stages_blob_ref,
        "path-only update should not rewrite the blob ref"
    );
}
2792
+
2793
+ #[tokio::test]
2794
+ async fn file_path_update_seeds_resolver_from_visible_directory_state() {
2795
+ let mut resolvers = super::file_path_resolvers_from_live_state(
2796
+ Arc::new(RowsLiveStateReader {
2797
+ rows: vec![live_directory_row(
2798
+ "dir-docs",
2799
+ "version-b",
2800
+ "{\"id\":\"dir-docs\",\"parent_id\":null,\"name\":\"docs\"}",
2801
+ )],
2802
+ }) as Arc<dyn LiveStateReader>,
2803
+ Some("version-b"),
2804
+ )
2805
+ .await
2806
+ .expect("directory state should seed path resolver");
2807
+
2808
+ let staged = lix_file_update_stage_from_batch_for_test(
2809
+ &path_update_batch(),
2810
+ None,
2811
+ super::LixFileUpdateColumns {
2812
+ path: true,
2813
+ data: false,
2814
+ descriptor: false,
2815
+ },
2816
+ Some(&mut resolvers),
2817
+ &mut test_id_generator(&["should-not-be-used"]),
2818
+ )
2819
+ .expect("decode file path update");
2820
+
2821
+ assert_eq!(staged.count, 1);
2822
+ assert_eq!(staged.state_rows.len(), 1);
2823
+ assert!(staged
2824
+ .state_rows
2825
+ .iter()
2826
+ .all(|row| row.schema_key != "lix_directory_descriptor"));
2827
+
2828
+ let snapshot: JsonValue =
2829
+ serde_json::from_str(staged.state_rows[0].snapshot_content.as_deref().unwrap())
2830
+ .expect("descriptor snapshot JSON");
2831
+ assert_eq!(snapshot["directory_id"], "dir-docs");
2832
+ assert_eq!(snapshot["name"], "renamed.md");
2833
+ }
2834
+
2835
+ #[tokio::test]
2836
+ async fn file_path_update_stages_only_missing_parent_directories() {
2837
+ let mut resolvers = super::file_path_resolvers_from_live_state(
2838
+ Arc::new(RowsLiveStateReader::default()) as Arc<dyn LiveStateReader>,
2839
+ Some("version-b"),
2840
+ )
2841
+ .await
2842
+ .expect("empty directory state should seed path resolver");
2843
+
2844
+ let staged = lix_file_update_stage_from_batch_for_test(
2845
+ &path_update_batch(),
2846
+ None,
2847
+ super::LixFileUpdateColumns {
2848
+ path: true,
2849
+ data: false,
2850
+ descriptor: false,
2851
+ },
2852
+ Some(&mut resolvers),
2853
+ &mut test_id_generator(&["dir-generated-docs"]),
2854
+ )
2855
+ .expect("decode file path update");
2856
+
2857
+ assert_eq!(staged.count, 1);
2858
+ assert_eq!(staged.state_rows.len(), 2);
2859
+ assert_eq!(
2860
+ staged
2861
+ .state_rows
2862
+ .iter()
2863
+ .filter(|row| row.schema_key == "lix_directory_descriptor")
2864
+ .count(),
2865
+ 1
2866
+ );
2867
+
2868
+ let directory = staged
2869
+ .state_rows
2870
+ .iter()
2871
+ .find(|row| row.schema_key == "lix_directory_descriptor")
2872
+ .expect("missing /docs/ directory should be staged");
2873
+ assert_eq!(
2874
+ directory.entity_id.as_ref(),
2875
+ Some(&crate::entity_identity::EntityIdentity::single(
2876
+ "dir-generated-docs"
2877
+ ))
2878
+ );
2879
+
2880
+ let descriptor = staged
2881
+ .state_rows
2882
+ .iter()
2883
+ .find(|row| row.schema_key == "lix_file_descriptor")
2884
+ .expect("file descriptor should be staged");
2885
+ let snapshot: JsonValue =
2886
+ serde_json::from_str(descriptor.snapshot_content.as_deref().unwrap())
2887
+ .expect("descriptor snapshot JSON");
2888
+ assert_eq!(snapshot["directory_id"], "dir-generated-docs");
2889
+ }
2890
+
2891
/// When both `path` and `data` are assigned, the update stages the payload
/// write together with a file descriptor row and a blob-ref row.
#[test]
fn file_path_update_with_data_assignment_stages_blob_ref_and_payload() {
    let scope_key = super::filesystem_storage_scope_key("version-b", false, false, None);
    let docs_resolver = super::DirectoryPathResolver::from_existing([(
        "/docs/".to_string(),
        "dir-docs".to_string(),
    )])
    .expect("directory resolver should seed");
    let mut resolvers = BTreeMap::from([(scope_key, docs_resolver)]);

    let batch = path_update_batch();
    let assigned = super::LixFileUpdateColumns {
        path: true,
        data: true,
        descriptor: false,
    };
    let mut id_generator = test_id_generator(&["should-not-be-used"]);
    let staged = lix_file_update_stage_from_batch_for_test(
        &batch,
        None,
        assigned,
        Some(&mut resolvers),
        &mut id_generator,
    )
    .expect("decode file path and data update");

    assert_eq!(staged.count, 1);
    assert_eq!(staged.file_data_writes.len(), 1);
    let payload = &staged.file_data_writes[0];
    assert_eq!(payload.file_id, "file-readme");
    assert_eq!(payload.data, b"hello");

    let has_descriptor = staged
        .state_rows
        .iter()
        .any(|candidate| candidate.schema_key == "lix_file_descriptor");
    assert!(has_descriptor);
    let has_blob_ref = staged
        .state_rows
        .iter()
        .any(|candidate| candidate.schema_key == "lix_binary_blob_ref");
    assert!(has_blob_ref);
}
2929
+
2930
/// A data-only update run without any path resolvers still succeeds on a batch
/// that contains a `path` column, and stages only the blob-ref row plus payload.
#[test]
fn file_data_update_without_path_ignores_materialized_path_column() {
    let batch = path_update_batch();
    let assigned = super::LixFileUpdateColumns {
        path: false,
        data: true,
        descriptor: false,
    };
    let mut id_generator = test_id_generator(&["should-not-be-used"]);

    let staged = lix_file_update_stage_from_batch_for_test(
        &batch,
        None,
        assigned,
        None,
        &mut id_generator,
    )
    .expect("decode file data update");

    assert_eq!(staged.count, 1);
    assert_eq!(staged.file_data_writes.len(), 1);
    let payload = &staged.file_data_writes[0];
    assert_eq!(payload.file_id, "file-readme");
    assert_eq!(staged.state_rows.len(), 1);
    let only_row = &staged.state_rows[0];
    assert_eq!(only_row.schema_key, "lix_binary_blob_ref");
}
2951
+
2952
/// Inserting a row with non-null `data` stages a descriptor row, a blob-ref row
/// tied to the file id, and one payload write for `version-b`.
#[test]
fn file_insert_stages_non_null_data() {
    let batch = data_insert_batch();

    let staged = lix_file_insert_stage_from_batch(&batch, None).expect("decode file data");

    assert_eq!(staged.count, 1);
    assert_eq!(staged.state_rows.len(), 2);
    let has_descriptor = staged
        .state_rows
        .iter()
        .any(|candidate| candidate.schema_key == "lix_file_descriptor");
    assert!(has_descriptor);

    let blob_ref_row = staged
        .state_rows
        .iter()
        .find(|candidate| candidate.schema_key == "lix_binary_blob_ref")
        .expect("data insert should stage blob ref row");
    let file_identity = crate::entity_identity::EntityIdentity::single("file-readme");
    assert_eq!(blob_ref_row.entity_id.as_ref(), Some(&file_identity));
    assert_eq!(blob_ref_row.file_id.as_deref(), Some("file-readme"));

    assert_eq!(staged.file_data_writes.len(), 1);
    let payload = &staged.file_data_writes[0];
    assert_eq!(payload.file_id, "file-readme");
    assert_eq!(payload.version_id, "version-b");
    assert_eq!(payload.data, b"hello");
}
2981
+
2982
/// When the id set handed to the delete stager contains the file, both the
/// descriptor row and the blob-ref row are staged with no snapshot content.
#[test]
fn file_delete_with_blob_ref_stages_descriptor_and_blob_ref_tombstones() {
    let batch = file_delete_batch();
    // presumably the ids of files that currently have a blob ref — confirm against the stager
    let file_ids = BTreeSet::from(["file-readme".to_string()]);

    let staged =
        lix_file_delete_stage_from_batch(&batch, None, &file_ids).expect("decode file delete");

    assert_eq!(staged.count, 1);
    assert_eq!(staged.state_rows.len(), 2);
    let file_identity = crate::entity_identity::EntityIdentity::single("file-readme");

    let descriptor = staged
        .state_rows
        .iter()
        .find(|candidate| candidate.schema_key == "lix_file_descriptor")
        .expect("file descriptor tombstone should be staged");
    assert_eq!(descriptor.entity_id.as_ref(), Some(&file_identity));
    assert_eq!(descriptor.file_id, None);
    assert_eq!(descriptor.snapshot_content, None);

    let blob_ref = staged
        .state_rows
        .iter()
        .find(|candidate| candidate.schema_key == "lix_binary_blob_ref")
        .expect("blob ref tombstone should be staged");
    assert_eq!(blob_ref.entity_id.as_ref(), Some(&file_identity));
    assert_eq!(blob_ref.file_id.as_deref(), Some("file-readme"));
    assert_eq!(blob_ref.snapshot_content, None);
}
3022
+
3023
/// With an empty id set, deleting the file stages only the descriptor row, and
/// that row has no snapshot content.
#[test]
fn file_delete_without_blob_ref_stages_only_descriptor_tombstone() {
    let batch = file_delete_batch();
    let file_ids = BTreeSet::new();

    let staged =
        lix_file_delete_stage_from_batch(&batch, None, &file_ids).expect("decode file delete");

    assert_eq!(staged.count, 1);
    assert_eq!(staged.state_rows.len(), 1);
    let tombstone = &staged.state_rows[0];
    assert_eq!(tombstone.schema_key, "lix_file_descriptor");
    let file_identity = crate::entity_identity::EntityIdentity::single("file-readme");
    assert_eq!(tombstone.entity_id.as_ref(), Some(&file_identity));
    assert_eq!(tombstone.snapshot_content, None);
}
3040
+
3041
/// A path-based insert whose parent directories are already known to the
/// resolver stages only the descriptor and blob-ref rows, and the descriptor
/// points at the existing `dir-guides`.
#[test]
fn file_path_insert_reuses_existing_parent_directory() {
    let scope_key = super::filesystem_storage_scope_key("version-b", false, false, None);
    let known_directories = super::DirectoryPathResolver::from_existing([
        ("/docs/".to_string(), "dir-docs".to_string()),
        ("/docs/guides/".to_string(), "dir-guides".to_string()),
    ])
    .expect("directory resolver should seed");
    let mut resolvers = BTreeMap::from([(scope_key, known_directories)]);

    let batch = path_data_insert_batch();
    let mut id_generator = test_id_generator(&["should-not-be-used"]);
    let staged = lix_file_insert_stage_from_batch_with_path_resolvers(
        &batch,
        None,
        "lix_file",
        &mut resolvers,
        &mut id_generator,
        true,
    )
    .expect("decode file path data");

    assert_eq!(staged.count, 1);
    assert_eq!(staged.file_data_writes.len(), 1);
    let payload = &staged.file_data_writes[0];
    assert_eq!(payload.file_id, "file-readme");
    assert_eq!(staged.state_rows.len(), 2);

    let descriptor = staged
        .state_rows
        .iter()
        .find(|candidate| candidate.schema_key == "lix_file_descriptor")
        .expect("file descriptor row should be staged");
    let raw_snapshot = descriptor.snapshot_content.as_deref().unwrap();
    let snapshot: JsonValue =
        serde_json::from_str(raw_snapshot).expect("descriptor snapshot JSON");
    assert_eq!(snapshot["id"], "file-readme");
    assert_eq!(snapshot["directory_id"], "dir-guides");
    assert_eq!(snapshot["name"], "readme.md");
}
3079
+
3080
/// Starting from an empty resolver map, a path-based insert stages four rows,
/// two of them directory descriptors, and links the file to the second
/// generated directory id.
#[test]
fn file_path_insert_stages_missing_parent_directories_once() {
    let mut resolvers = BTreeMap::new();
    let batch = path_data_insert_batch();
    let mut id_generator = test_id_generator(&["dir-generated-docs", "dir-generated-guides"]);

    let staged = lix_file_insert_stage_from_batch_with_path_resolvers(
        &batch,
        None,
        "lix_file",
        &mut resolvers,
        &mut id_generator,
        true,
    )
    .expect("decode file path data");

    assert_eq!(staged.count, 1);
    assert_eq!(staged.state_rows.len(), 4);
    let directory_count = staged
        .state_rows
        .iter()
        .filter(|candidate| candidate.schema_key == "lix_directory_descriptor")
        .count();
    assert_eq!(directory_count, 2);

    let descriptor = staged
        .state_rows
        .iter()
        .find(|candidate| candidate.schema_key == "lix_file_descriptor")
        .expect("file descriptor row should be staged");
    let raw_snapshot = descriptor.snapshot_content.as_deref().unwrap();
    let snapshot: JsonValue =
        serde_json::from_str(raw_snapshot).expect("descriptor snapshot JSON");
    assert_eq!(snapshot["directory_id"], "dir-generated-guides");
}
3113
+
3114
+ #[tokio::test]
3115
+ async fn file_insert_sink_stages_decoded_lix_state_rows() {
3116
+ let batch = file_insert_batch(true, false);
3117
+ let mut write_context = CapturingWriteContext::default();
3118
+ let write_ctx = SqlWriteContext::new(&mut write_context);
3119
+ let sink = LixFileInsertSink::new(
3120
+ batch.schema(),
3121
+ write_ctx,
3122
+ test_functions(),
3123
+ VersionBinding::explicit(),
3124
+ false,
3125
+ );
3126
+
3127
+ let count = sink
3128
+ .write_batches(vec![batch], &Arc::new(TaskContext::default()))
3129
+ .await
3130
+ .expect("file insert sink should stage");
3131
+
3132
+ assert_eq!(count, 1);
3133
+ let writes = &write_context.writes;
3134
+ assert_eq!(writes.len(), 1);
3135
+ match &writes[0] {
3136
+ StageWrite::Rows { mode, rows } => {
3137
+ assert_eq!(*mode, StageWriteMode::Insert);
3138
+ assert_eq!(rows.len(), 1);
3139
+ assert_eq!(
3140
+ rows[0].entity_id.as_ref(),
3141
+ Some(&crate::entity_identity::EntityIdentity::single(
3142
+ "file-readme"
3143
+ ))
3144
+ );
3145
+ assert_eq!(rows[0].schema_key, "lix_file_descriptor");
3146
+ }
3147
+ other => panic!("expected insert staged write, got {other:?}"),
3148
+ }
3149
+ }
3150
+
3151
+ #[tokio::test]
3152
+ async fn file_insert_sink_stages_file_data_writes() {
3153
+ let batch = data_insert_batch();
3154
+ let mut write_context = CapturingWriteContext::default();
3155
+ let write_ctx = SqlWriteContext::new(&mut write_context);
3156
+ let sink = LixFileInsertSink::new(
3157
+ batch.schema(),
3158
+ write_ctx,
3159
+ test_functions(),
3160
+ VersionBinding::explicit(),
3161
+ true,
3162
+ );
3163
+
3164
+ let count = sink
3165
+ .write_batches(vec![batch], &Arc::new(TaskContext::default()))
3166
+ .await
3167
+ .expect("file insert sink should stage data");
3168
+
3169
+ assert_eq!(count, 1);
3170
+ let writes = &write_context.writes;
3171
+ assert_eq!(writes.len(), 1);
3172
+ match &writes[0] {
3173
+ StageWrite::RowsWithFileData {
3174
+ mode,
3175
+ rows,
3176
+ file_data,
3177
+ count,
3178
+ ..
3179
+ } => {
3180
+ assert_eq!(*mode, StageWriteMode::Insert);
3181
+ assert_eq!(*count, 1);
3182
+ assert_eq!(rows.len(), 2);
3183
+ assert!(rows
3184
+ .iter()
3185
+ .any(|row| row.schema_key == "lix_file_descriptor"));
3186
+ assert!(rows
3187
+ .iter()
3188
+ .any(|row| row.schema_key == "lix_binary_blob_ref"));
3189
+ assert_eq!(file_data.len(), 1);
3190
+ assert_eq!(file_data[0].file_id, "file-readme");
3191
+ assert_eq!(file_data[0].data, b"hello");
3192
+ }
3193
+ other => panic!("expected insert with file data staged write, got {other:?}"),
3194
+ }
3195
+ }
3196
+
3197
+ #[tokio::test]
3198
+ async fn file_insert_sink_seeds_path_resolver_from_live_state() {
3199
+ let batch = path_data_insert_batch();
3200
+ let mut write_context = CapturingWriteContext {
3201
+ rows: vec![
3202
+ live_directory_row(
3203
+ "dir-docs",
3204
+ "version-b",
3205
+ "{\"id\":\"dir-docs\",\"parent_id\":null,\"name\":\"docs\"}",
3206
+ ),
3207
+ live_directory_row(
3208
+ "dir-guides",
3209
+ "version-b",
3210
+ "{\"id\":\"dir-guides\",\"parent_id\":\"dir-docs\",\"name\":\"guides\"}",
3211
+ ),
3212
+ ],
3213
+ writes: Vec::new(),
3214
+ };
3215
+ let write_ctx = SqlWriteContext::new(&mut write_context);
3216
+ let sink = LixFileInsertSink::new(
3217
+ batch.schema(),
3218
+ write_ctx,
3219
+ test_functions(),
3220
+ VersionBinding::explicit(),
3221
+ true,
3222
+ );
3223
+
3224
+ let count = sink
3225
+ .write_batches(vec![batch], &Arc::new(TaskContext::default()))
3226
+ .await
3227
+ .expect("file insert sink should stage path data");
3228
+
3229
+ assert_eq!(count, 1);
3230
+ let writes = &write_context.writes;
3231
+ assert_eq!(writes.len(), 1);
3232
+ match &writes[0] {
3233
+ StageWrite::RowsWithFileData {
3234
+ rows,
3235
+ file_data,
3236
+ count,
3237
+ ..
3238
+ } => {
3239
+ assert_eq!(*count, 1);
3240
+ assert_eq!(file_data.len(), 1);
3241
+ assert_eq!(file_data[0].file_id, "file-readme");
3242
+ let descriptor = rows
3243
+ .iter()
3244
+ .find(|row| row.schema_key == "lix_file_descriptor")
3245
+ .expect("file descriptor row should be staged");
3246
+ let snapshot: JsonValue =
3247
+ serde_json::from_str(descriptor.snapshot_content.as_deref().unwrap())
3248
+ .expect("descriptor snapshot JSON");
3249
+ assert_eq!(snapshot["directory_id"], "dir-guides");
3250
+ }
3251
+ other => panic!("expected insert with file data staged write, got {other:?}"),
3252
+ }
3253
+ }
3254
+ }