deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
deltacat/utils/filesystem.py CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
3
3
  import re
4
4
  from typing import Optional, Tuple, Union, List
5
5
  from datetime import timedelta
6
+ from enum import Enum
6
7
 
7
8
  import sys
8
9
  import urllib
@@ -18,11 +19,54 @@ from pyarrow.fs import (
18
19
  FSSpecHandler,
19
20
  PyFileSystem,
20
21
  GcsFileSystem,
22
+ LocalFileSystem,
23
+ S3FileSystem,
24
+ AzureFileSystem,
25
+ HadoopFileSystem,
21
26
  )
22
27
 
23
28
  _LOCAL_SCHEME = "local"
24
29
 
25
30
 
31
+ class FilesystemType(str, Enum):
32
+ LOCAL = "local"
33
+ S3 = "s3"
34
+ GCS = "gcs"
35
+ AZURE = "azure"
36
+ HADOOP = "hadoop"
37
+ UNKNOWN = "unknown"
38
+
39
+ @classmethod
40
+ def from_filesystem(cls, filesystem: FileSystem) -> FilesystemType:
41
+ if isinstance(filesystem, LocalFileSystem):
42
+ return cls.LOCAL
43
+ elif isinstance(filesystem, S3FileSystem):
44
+ return cls.S3
45
+ elif isinstance(filesystem, GcsFileSystem):
46
+ return cls.GCS
47
+ elif isinstance(filesystem, AzureFileSystem):
48
+ return cls.AZURE
49
+ elif isinstance(filesystem, HadoopFileSystem):
50
+ return cls.HADOOP
51
+ else:
52
+ return cls.UNKNOWN
53
+
54
+ @classmethod
55
+ def to_filesystem(cls, filesystem_type: FilesystemType) -> FileSystem:
56
+ if filesystem_type == cls.LOCAL:
57
+ return LocalFileSystem()
58
+ elif filesystem_type == cls.S3:
59
+ return S3FileSystem()
60
+ elif filesystem_type == cls.GCS:
61
+ return GcsFileSystem()
62
+ elif filesystem_type == cls.AZURE:
63
+ return AzureFileSystem()
64
+ elif filesystem_type == cls.HADOOP:
65
+ return HadoopFileSystem()
66
+ else:
67
+ raise ValueError(f"Unsupported filesystem type: {filesystem_type}")
68
+
69
+
26
70
  def resolve_paths_and_filesystem(
27
71
  paths: Union[str, List[str]],
28
72
  filesystem: FileSystem = None,
@@ -221,6 +265,62 @@ def get_file_info(
221
265
  return file_info
222
266
 
223
267
 
268
+ def write_file(
269
+ path: str,
270
+ data: Union[str, bytes],
271
+ filesystem: Optional[FileSystem] = None,
272
+ ) -> None:
273
+ """
274
+ Write data to a file using any filesystem.
275
+
276
+ Args:
277
+ path: The file path to write to.
278
+ data: The data to write (string or bytes).
279
+ filesystem: The filesystem implementation to use. If None, will be inferred from the path.
280
+ """
281
+ resolved_path, resolved_filesystem = resolve_path_and_filesystem(
282
+ path=path,
283
+ filesystem=filesystem,
284
+ )
285
+
286
+ # Convert string to bytes if necessary
287
+ if isinstance(data, str):
288
+ data = data.encode("utf-8")
289
+
290
+ with resolved_filesystem.open_output_stream(resolved_path) as f:
291
+ f.write(data)
292
+
293
+
294
+ def read_file(
295
+ path: str,
296
+ filesystem: Optional[FileSystem] = None,
297
+ fail_if_not_found: bool = True,
298
+ ) -> Optional[bytes]:
299
+ """
300
+ Read data from a file using any filesystem.
301
+
302
+ Args:
303
+ path: The file path to read from.
304
+ filesystem: The filesystem implementation to use. If None, will be inferred from the path.
305
+ fail_if_not_found: Whether to raise an error if the file is not found.
306
+
307
+ Returns:
308
+ The file data as bytes, or None if file not found and fail_if_not_found is False.
309
+ """
310
+ try:
311
+ resolved_path, resolved_filesystem = resolve_path_and_filesystem(
312
+ path=path,
313
+ filesystem=filesystem,
314
+ )
315
+
316
+ with resolved_filesystem.open_input_stream(resolved_path) as f:
317
+ return f.read()
318
+ except FileNotFoundError:
319
+ if fail_if_not_found:
320
+ raise
321
+ return None
322
+
323
+
224
324
  def _handle_read_os_error(
225
325
  error: OSError,
226
326
  paths: Union[str, List[str]],
deltacat/utils/metafile_locator.py CHANGED
@@ -1,6 +1,7 @@
1
1
  import posixpath
2
2
  import pyarrow.fs
3
3
 
4
+ from deltacat.constants import REV_DIR_NAME
4
5
  from deltacat.storage.model.partition import PartitionLocator
5
6
  from deltacat.utils.filesystem import resolve_path_and_filesystem
6
7
 
@@ -28,7 +29,7 @@ def _find_first_child_with_rev(
28
29
  )
29
30
  for child in children:
30
31
  if child.type == pyarrow.fs.FileType.Directory:
31
- rev_path = posixpath.join(child.path, "rev")
32
+ rev_path = posixpath.join(child.path, REV_DIR_NAME)
32
33
  if filesystem.get_file_info(rev_path).type == pyarrow.fs.FileType.Directory:
33
34
  return child.base_name
34
35
  raise ValueError(f"No directory with 'rev/' found under {parent_path}")
deltacat/utils/numpy.py CHANGED
@@ -1,14 +1,21 @@
1
- from typing import List, Optional, Callable, Union
1
+ from typing import List, Optional, Callable, Union, Dict, Any
2
2
 
3
+ import pandas as pd
3
4
  import numpy as np
4
- import pyarrow as pa
5
5
  from fsspec import AbstractFileSystem
6
+ import pyarrow.fs as pafs
7
+ import logging
6
8
 
7
9
  from ray.data.datasource import FilenameProvider
8
- from deltacat.types.media import ContentType
10
+ from deltacat.types.media import ContentType, ContentEncoding
9
11
  from deltacat.utils import pandas as pd_utils
10
- from deltacat.utils import pyarrow as pa_utils
12
+
11
13
  from deltacat.utils.common import ReadKwargsProvider
14
+ from deltacat import logs
15
+ from deltacat.utils.performance import timed_invocation
16
+ from deltacat.types.partial_download import PartialFileDownloadParams
17
+
18
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
12
19
 
13
20
 
14
21
  def slice_ndarray(np_array: np.ndarray, max_len: Optional[int]) -> List[np.ndarray]:
@@ -22,26 +29,61 @@ def slice_ndarray(np_array: np.ndarray, max_len: Optional[int]) -> List[np.ndarr
22
29
  return [np_array[i : i + max_len] for i in range(0, len(np_array), max_len)]
23
30
 
24
31
 
25
- def s3_file_to_ndarray(
26
- s3_url: str,
32
+ def file_to_ndarray(
33
+ path: str,
27
34
  content_type: str,
28
- content_encoding: str,
35
+ content_encoding: str = ContentEncoding.IDENTITY.value,
36
+ filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
29
37
  column_names: Optional[List[str]] = None,
30
38
  include_columns: Optional[List[str]] = None,
31
39
  pd_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
32
- **s3_client_kwargs
40
+ partial_file_download_params: Optional[PartialFileDownloadParams] = None,
41
+ fs_open_kwargs: Dict[str, Any] = {},
42
+ **kwargs,
33
43
  ) -> np.ndarray:
34
- # TODO: Compare perf to s3 -> pyarrow -> pandas [Series/DataFrame] -> numpy
35
- dataframe = pd_utils.s3_file_to_dataframe(
36
- s3_url,
37
- content_type,
38
- content_encoding,
39
- column_names,
40
- include_columns,
41
- pd_read_func_kwargs_provider,
42
- **s3_client_kwargs
44
+ """
45
+ Read a file into a NumPy ndarray using any filesystem.
46
+
47
+ This function delegates to the pandas file_to_dataframe function and converts
48
+ the resulting DataFrame to a NumPy ndarray.
49
+
50
+ Args:
51
+ path: The file path to read
52
+ content_type: The content type of the file (e.g., ContentType.CSV.value)
53
+ content_encoding: The content encoding (default: IDENTITY)
54
+ filesystem: The filesystem to use (if None, will be inferred from path)
55
+ column_names: Optional column names to assign
56
+ include_columns: Optional columns to include in the result
57
+ pd_read_func_kwargs_provider: Optional kwargs provider for customization
58
+ fs_open_kwargs: Optional kwargs for filesystem open operations
59
+ **kwargs: Additional kwargs passed to the reader function
60
+
61
+ Returns:
62
+ np.ndarray: The loaded data as a NumPy ndarray
63
+ """
64
+ logger.debug(
65
+ f"Reading {path} to NumPy ndarray. Content type: {content_type}. "
66
+ f"Encoding: {content_encoding}"
43
67
  )
44
- return dataframe.to_numpy()
68
+
69
+ dataframe, latency = timed_invocation(
70
+ pd_utils.file_to_dataframe,
71
+ path=path,
72
+ content_type=content_type,
73
+ content_encoding=content_encoding,
74
+ filesystem=filesystem,
75
+ column_names=column_names,
76
+ include_columns=include_columns,
77
+ pd_read_func_kwargs_provider=pd_read_func_kwargs_provider,
78
+ partial_file_download_params=partial_file_download_params,
79
+ fs_open_kwargs=fs_open_kwargs,
80
+ **kwargs,
81
+ )
82
+
83
+ ndarray, conversion_latency = timed_invocation(dataframe.to_numpy)
84
+ total_latency = latency + conversion_latency
85
+ logger.debug(f"Time to read {path} into NumPy ndarray: {total_latency}s")
86
+ return ndarray
45
87
 
46
88
 
47
89
  def ndarray_size(np_array: np.ndarray) -> int:
@@ -51,22 +93,72 @@ def ndarray_size(np_array: np.ndarray) -> int:
51
93
  def ndarray_to_file(
52
94
  np_array: np.ndarray,
53
95
  path: str,
54
- file_system: AbstractFileSystem,
96
+ filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]],
55
97
  block_path_provider: Union[FilenameProvider, Callable],
56
98
  content_type: str = ContentType.PARQUET.value,
57
- **kwargs
99
+ **kwargs,
58
100
  ) -> None:
59
101
  """
60
102
  Writes the given Numpy ndarray to a file.
61
103
  """
104
+ import pyarrow as pa
62
105
 
63
- # PyArrow only supports 1D ndarrays, so convert to list of 1D arrays
64
- np_arrays = [array for array in np_array]
65
- pa_utils.table_to_file(
66
- pa.table({"data": np_arrays}),
106
+ # Extract schema from kwargs if available
107
+ schema = kwargs.pop("schema", None)
108
+
109
+ # Convert to pandas DataFrame with proper column names if schema is available
110
+ if schema and isinstance(schema, pa.Schema):
111
+ if np_array.ndim == 1:
112
+ # 1D array: single column
113
+ column_names = [schema.names[0]] if schema.names else ["0"]
114
+ df = pd.DataFrame({column_names[0]: np_array})
115
+ elif np_array.ndim == 2:
116
+ # 2D array: multiple columns
117
+ column_names = (
118
+ schema.names
119
+ if len(schema.names) == np_array.shape[1]
120
+ else [f"{i}" for i in range(np_array.shape[1])]
121
+ )
122
+ df = pd.DataFrame(np_array, columns=column_names)
123
+ else:
124
+ raise ValueError(
125
+ f"NumPy arrays with {np_array.ndim} dimensions are not supported"
126
+ )
127
+ else:
128
+ # Fallback to generic column names
129
+ df = pd.DataFrame(np_array)
130
+
131
+ pd_utils.dataframe_to_file(
132
+ df,
67
133
  path,
68
- file_system,
134
+ filesystem,
69
135
  block_path_provider,
70
136
  content_type,
71
- **kwargs
137
+ **kwargs,
72
138
  )
139
+
140
+
141
+ def concat_ndarrays(arrays: List[np.ndarray]) -> Optional[np.ndarray]:
142
+ """
143
+ Concatenate a list of NumPy ndarrays into a single ndarray.
144
+
145
+ Args:
146
+ arrays: List of NumPy ndarrays to concatenate
147
+
148
+ Returns:
149
+ Concatenated NumPy ndarray, or None if input is empty
150
+ """
151
+ if arrays is None or not len(arrays):
152
+ return None
153
+ if len(arrays) == 1:
154
+ return next(iter(arrays))
155
+ return np.concatenate(arrays, axis=0)
156
+
157
+
158
+ def append_column_to_ndarray(
159
+ np_array: np.ndarray,
160
+ column_name: str,
161
+ column_value: Any,
162
+ ) -> np.ndarray:
163
+ # Add a new column with value repeating for each row of np_array
164
+ return np.concatenate((np_array, np.full((len(np_array), 1), column_value)), axis=1)