kumoai 2.14.0.dev202601011731__cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (122)
  1. kumoai/__init__.py +300 -0
  2. kumoai/_logging.py +29 -0
  3. kumoai/_singleton.py +25 -0
  4. kumoai/_version.py +1 -0
  5. kumoai/artifact_export/__init__.py +9 -0
  6. kumoai/artifact_export/config.py +209 -0
  7. kumoai/artifact_export/job.py +108 -0
  8. kumoai/client/__init__.py +5 -0
  9. kumoai/client/client.py +223 -0
  10. kumoai/client/connector.py +110 -0
  11. kumoai/client/endpoints.py +150 -0
  12. kumoai/client/graph.py +120 -0
  13. kumoai/client/jobs.py +471 -0
  14. kumoai/client/online.py +78 -0
  15. kumoai/client/pquery.py +207 -0
  16. kumoai/client/rfm.py +112 -0
  17. kumoai/client/source_table.py +53 -0
  18. kumoai/client/table.py +101 -0
  19. kumoai/client/utils.py +130 -0
  20. kumoai/codegen/__init__.py +19 -0
  21. kumoai/codegen/cli.py +100 -0
  22. kumoai/codegen/context.py +16 -0
  23. kumoai/codegen/edits.py +473 -0
  24. kumoai/codegen/exceptions.py +10 -0
  25. kumoai/codegen/generate.py +222 -0
  26. kumoai/codegen/handlers/__init__.py +4 -0
  27. kumoai/codegen/handlers/connector.py +118 -0
  28. kumoai/codegen/handlers/graph.py +71 -0
  29. kumoai/codegen/handlers/pquery.py +62 -0
  30. kumoai/codegen/handlers/table.py +109 -0
  31. kumoai/codegen/handlers/utils.py +42 -0
  32. kumoai/codegen/identity.py +114 -0
  33. kumoai/codegen/loader.py +93 -0
  34. kumoai/codegen/naming.py +94 -0
  35. kumoai/codegen/registry.py +121 -0
  36. kumoai/connector/__init__.py +31 -0
  37. kumoai/connector/base.py +153 -0
  38. kumoai/connector/bigquery_connector.py +200 -0
  39. kumoai/connector/databricks_connector.py +213 -0
  40. kumoai/connector/file_upload_connector.py +189 -0
  41. kumoai/connector/glue_connector.py +150 -0
  42. kumoai/connector/s3_connector.py +278 -0
  43. kumoai/connector/snowflake_connector.py +252 -0
  44. kumoai/connector/source_table.py +471 -0
  45. kumoai/connector/utils.py +1796 -0
  46. kumoai/databricks.py +14 -0
  47. kumoai/encoder/__init__.py +4 -0
  48. kumoai/exceptions.py +26 -0
  49. kumoai/experimental/__init__.py +0 -0
  50. kumoai/experimental/rfm/__init__.py +210 -0
  51. kumoai/experimental/rfm/authenticate.py +432 -0
  52. kumoai/experimental/rfm/backend/__init__.py +0 -0
  53. kumoai/experimental/rfm/backend/local/__init__.py +42 -0
  54. kumoai/experimental/rfm/backend/local/graph_store.py +297 -0
  55. kumoai/experimental/rfm/backend/local/sampler.py +312 -0
  56. kumoai/experimental/rfm/backend/local/table.py +113 -0
  57. kumoai/experimental/rfm/backend/snow/__init__.py +37 -0
  58. kumoai/experimental/rfm/backend/snow/sampler.py +297 -0
  59. kumoai/experimental/rfm/backend/snow/table.py +242 -0
  60. kumoai/experimental/rfm/backend/sqlite/__init__.py +32 -0
  61. kumoai/experimental/rfm/backend/sqlite/sampler.py +398 -0
  62. kumoai/experimental/rfm/backend/sqlite/table.py +184 -0
  63. kumoai/experimental/rfm/base/__init__.py +30 -0
  64. kumoai/experimental/rfm/base/column.py +152 -0
  65. kumoai/experimental/rfm/base/expression.py +44 -0
  66. kumoai/experimental/rfm/base/sampler.py +761 -0
  67. kumoai/experimental/rfm/base/source.py +19 -0
  68. kumoai/experimental/rfm/base/sql_sampler.py +143 -0
  69. kumoai/experimental/rfm/base/table.py +736 -0
  70. kumoai/experimental/rfm/graph.py +1237 -0
  71. kumoai/experimental/rfm/infer/__init__.py +19 -0
  72. kumoai/experimental/rfm/infer/categorical.py +40 -0
  73. kumoai/experimental/rfm/infer/dtype.py +82 -0
  74. kumoai/experimental/rfm/infer/id.py +46 -0
  75. kumoai/experimental/rfm/infer/multicategorical.py +48 -0
  76. kumoai/experimental/rfm/infer/pkey.py +128 -0
  77. kumoai/experimental/rfm/infer/stype.py +35 -0
  78. kumoai/experimental/rfm/infer/time_col.py +61 -0
  79. kumoai/experimental/rfm/infer/timestamp.py +41 -0
  80. kumoai/experimental/rfm/pquery/__init__.py +7 -0
  81. kumoai/experimental/rfm/pquery/executor.py +102 -0
  82. kumoai/experimental/rfm/pquery/pandas_executor.py +530 -0
  83. kumoai/experimental/rfm/relbench.py +76 -0
  84. kumoai/experimental/rfm/rfm.py +1184 -0
  85. kumoai/experimental/rfm/sagemaker.py +138 -0
  86. kumoai/experimental/rfm/task_table.py +231 -0
  87. kumoai/formatting.py +30 -0
  88. kumoai/futures.py +99 -0
  89. kumoai/graph/__init__.py +12 -0
  90. kumoai/graph/column.py +106 -0
  91. kumoai/graph/graph.py +948 -0
  92. kumoai/graph/table.py +838 -0
  93. kumoai/jobs.py +80 -0
  94. kumoai/kumolib.cpython-310-x86_64-linux-gnu.so +0 -0
  95. kumoai/mixin.py +28 -0
  96. kumoai/pquery/__init__.py +25 -0
  97. kumoai/pquery/prediction_table.py +287 -0
  98. kumoai/pquery/predictive_query.py +641 -0
  99. kumoai/pquery/training_table.py +424 -0
  100. kumoai/spcs.py +121 -0
  101. kumoai/testing/__init__.py +8 -0
  102. kumoai/testing/decorators.py +57 -0
  103. kumoai/testing/snow.py +50 -0
  104. kumoai/trainer/__init__.py +42 -0
  105. kumoai/trainer/baseline_trainer.py +93 -0
  106. kumoai/trainer/config.py +2 -0
  107. kumoai/trainer/distilled_trainer.py +175 -0
  108. kumoai/trainer/job.py +1192 -0
  109. kumoai/trainer/online_serving.py +258 -0
  110. kumoai/trainer/trainer.py +475 -0
  111. kumoai/trainer/util.py +103 -0
  112. kumoai/utils/__init__.py +11 -0
  113. kumoai/utils/datasets.py +83 -0
  114. kumoai/utils/display.py +51 -0
  115. kumoai/utils/forecasting.py +209 -0
  116. kumoai/utils/progress_logger.py +343 -0
  117. kumoai/utils/sql.py +3 -0
  118. kumoai-2.14.0.dev202601011731.dist-info/METADATA +71 -0
  119. kumoai-2.14.0.dev202601011731.dist-info/RECORD +122 -0
  120. kumoai-2.14.0.dev202601011731.dist-info/WHEEL +6 -0
  121. kumoai-2.14.0.dev202601011731.dist-info/licenses/LICENSE +9 -0
  122. kumoai-2.14.0.dev202601011731.dist-info/top_level.txt +1 -0
kumoai/connector/utils.py
@@ -0,0 +1,1796 @@
1
+ import asyncio
2
+ import csv
3
+ import gc
4
+ import io
5
+ import math
6
+ import os
7
+ import re
8
+ import tempfile
9
+ import threading
10
+ import time
11
+ from concurrent.futures import ThreadPoolExecutor, as_completed
12
+ from dataclasses import dataclass
13
+ from logging import getLogger
14
+ from typing import (
15
+ Any,
16
+ AsyncIterator,
17
+ Callable,
18
+ Deque,
19
+ Dict,
20
+ Generator,
21
+ Iterator,
22
+ List,
23
+ Optional,
24
+ Tuple,
25
+ Union,
26
+ )
27
+ from urllib.parse import urlparse
28
+
29
+ import aiohttp
30
+ import pandas as pd
31
+ import pyarrow as pa
32
+ import pyarrow.parquet as pq
33
+ from kumoapi.data_source import (
34
+ CompleteFileUploadRequest,
35
+ DeleteUploadedFileRequest,
36
+ PartUploadMetadata,
37
+ StartFileUploadRequest,
38
+ StartFileUploadResponse,
39
+ )
40
+ from tqdm import tqdm
41
+
42
+ from kumoai import global_state
43
+ # still used for server-side completion retries
44
+ from kumoai.exceptions import HTTPException
45
+ from kumoai.futures import _KUMO_EVENT_LOOP
46
+
47
+ # -------------------
48
+ # Constants & Globals
49
+ # -------------------
50
+ logger = getLogger(__name__)
51
+
52
+ CHUNK_SIZE = 100 * 10**6 # 100 MB (legacy local single-file chunk)
53
+ READ_CHUNK_BYTES = 8 * 1024**2 # 8 MiB remote read buffer
54
+ UPLOAD_CHUNK_BYTES = 8 * 1024**2 # 8 MiB streamed PUT sub-chunks
55
+ MAX_PARTITION_SIZE = 1000 * 1024**2 # 1GB
56
+ MIN_PARTITION_SIZE = 100 * 1024**2 # 100MB
57
+
58
+ CONNECTOR_ID_MAP = {
59
+ "csv": "csv_upload_connector",
60
+ "parquet": "parquet_upload_connector",
61
+ }
62
+
63
+ _TQDM_LOCK = threading.Lock()
64
+
65
+
66
+ # ---------------
67
+ # Small utilities
68
+ # ---------------
69
+ def _fmt_bytes(n: int) -> str:
70
+ value = float(n)
71
+ units = ["B", "KiB", "MiB", "GiB", "TiB", "PiB"]
72
+ for unit in units:
73
+ if value < 1024:
74
+ return f"{value:.1f} {unit}"
75
+ value /= 1024
76
+ return f"{value:.1f} EiB"
77
+
78
+
79
+ def _fmt_secs(s: float) -> str:
80
+ if s < 1:
81
+ return f"{s*1000:.0f} ms"
82
+ return f"{s:.2f} s"
83
+
84
+
85
+ def _fmt_rate(nbytes: int, secs: float) -> str:
86
+ if secs <= 0:
87
+ return "-"
88
+ return f"{(nbytes / secs) / 1024**2:.1f} MB/s"
89
+
90
+
91
+ def _short_path(p: str, maxlen: int = 60) -> str:
92
+ if len(p) <= maxlen:
93
+ return p
94
+ try:
95
+ parsed = urlparse(p)
96
+ head = f"{parsed.scheme}://"
97
+ tail = p[-40:]
98
+ return f"{head}…{tail}"
99
+ except Exception:
100
+ return f"…{p[-maxlen:]}"
101
+
102
+
103
+ def _safe_bar_update(bar: tqdm, inc: int) -> None:
104
+ with _TQDM_LOCK:
105
+ try:
106
+ bar.update(inc)
107
+ except Exception:
108
+ pass
109
+
110
+
111
+ def _log_file_timing(label: str, path: str, size: int, tread: float,
112
+ tval: float, tupl: float) -> None:
113
+ logger.debug("[%s] %s (%s) | read=%s @ %s | validate=%s | upload=%s @ %s",
114
+ label, path, _fmt_bytes(size), _fmt_secs(tread),
115
+ _fmt_rate(size, max(tread, 1e-6)), _fmt_secs(tval),
116
+ _fmt_secs(tupl), _fmt_rate(size, max(tupl, 1e-6)))
117
+
118
+
119
+ # -----------------------
120
+ # Async upload primitives
121
+ # -----------------------
122
+ def _iter_memview_stream(
123
+ mv: memoryview,
124
+ subchunk_bytes: int,
125
+ progress_cb: Optional[Callable[[int], None]] = None,
126
+ ) -> Iterator[memoryview]:
127
+ """Yield memoryview slices (zero-copy) for streaming PUT."""
128
+ pos = 0
129
+ n = mv.nbytes
130
+ while pos < n:
131
+ nxt = min(n, pos + subchunk_bytes)
132
+ chunk = mv[pos:nxt] # zero-copy slice
133
+ pos = nxt
134
+ if progress_cb:
135
+ try:
136
+ progress_cb(len(chunk))
137
+ except Exception:
138
+ pass
139
+ yield chunk
140
+
141
+
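As a small illustration of the zero-copy slicing above (the 3-byte sub-chunk size is chosen only for demonstration):

    mv = memoryview(b"abcdefgh")
    chunks = list(_iter_memview_stream(mv, subchunk_bytes=3))
    assert [bytes(c) for c in chunks] == [b"abc", b"def", b"gh"]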
142
+ async def _put_with_retry_streamed(
143
+ session: aiohttp.ClientSession,
144
+ url: str,
145
+ mv: memoryview,
146
+ part_no: int,
147
+ subchunk_bytes: int = UPLOAD_CHUNK_BYTES,
148
+ progress_cb: Optional[Callable[[int], None]] = None,
149
+ retries: int = 3,
150
+ ) -> Tuple[int, str]:
151
+ """Stream a memoryview to a presigned URL using an *async* generator so
152
+ aiohttp does not try to wrap it as multipart/form-data. We also set
153
+ Content-Length explicitly so S3/GCS expects a fixed-size payload (avoids
154
+ chunked transfer encoding).
155
+ """
156
+
157
+ # Build a fresh async generator per attempt (can't reuse after failure).
158
+ def _make_async_gen() -> Callable[[], Any]:
159
+ async def _agen() -> AsyncIterator[memoryview]:
160
+ # Yield zero-copy memoryview slices; aiohttp can send memoryview
161
+ # directly.
162
+ for chunk in _iter_memview_stream(mv, subchunk_bytes, progress_cb):
163
+ yield chunk
164
+ # cooperative yield; keeps event loop snappy without extra
165
+ # copies
166
+ await asyncio.sleep(0)
167
+
168
+ return _agen
169
+
170
+ headers = {
171
+ "Content-Type": "application/octet-stream",
172
+ "Content-Length": str(mv.nbytes),
173
+ }
174
+
175
+ attempt = 0
176
+ while True:
177
+ try:
178
+ async with session.put(url, data=_make_async_gen()(),
179
+ headers=headers) as res:
180
+ # Read/consume response to free the connection
181
+ _ = await res.read()
182
+ if res.status != 200:
183
+ raise RuntimeError(
184
+ f"PUT failed {res.status}: {res.reason}")
185
+ etag = res.headers.get("ETag") or res.headers.get("Etag") or ""
186
+ return (part_no + 1, etag)
187
+ except Exception:
188
+ attempt += 1
189
+ if attempt > retries:
190
+ raise
191
+ # backoff before retrying; generator will be recreated next loop
192
+ await asyncio.sleep(0.5 * attempt)
193
+
194
+
195
+ async def multi_put_bounded(
196
+ urls: List[str],
197
+ data_iter: Generator[Union[bytes, memoryview], None, None],
198
+ tqdm_bar_position: int = 0, # kept for compatibility (unused)
199
+ concurrency: int = 4,
200
+ upload_progress_cb: Optional[Callable[[int], None]] = None,
201
+ upload_subchunk_bytes: int = UPLOAD_CHUNK_BYTES,
202
+ ) -> List[PartUploadMetadata]:
203
+ """Multipart uploader with bounded concurrency and byte-accurate progress.
204
+ No extra progress bar here; caller drives a single byte counter via
205
+ upload_progress_cb.
206
+ """
207
+ sem = asyncio.Semaphore(concurrency)
208
+ results: List[Union[Tuple[int, str], None]] = [None] * len(urls)
209
+
210
+ async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(
211
+ ssl=False)) as session:
212
+
213
+ async def worker(idx: int, url: str, chunk: Union[bytes,
214
+ memoryview]) -> None:
215
+ async with sem:
216
+ mv = chunk if isinstance(chunk,
217
+ memoryview) else memoryview(chunk)
218
+ res = await _put_with_retry_streamed(
219
+ session=session,
220
+ url=url,
221
+ mv=mv,
222
+ part_no=idx,
223
+ subchunk_bytes=upload_subchunk_bytes,
224
+ progress_cb=upload_progress_cb,
225
+ )
226
+ results[idx] = res
227
+
228
+ tasks: List[asyncio.Task] = []
229
+ for idx, url in enumerate(urls):
230
+ try:
231
+ chunk = next(data_iter)
232
+ except StopIteration:
233
+ break
234
+ tasks.append(asyncio.create_task(worker(idx, url, chunk)))
235
+
236
+ try:
237
+ await asyncio.gather(*tasks)
238
+ except Exception:
239
+ for t in tasks:
240
+ if not t.done():
241
+ t.cancel()
242
+ await asyncio.gather(*tasks, return_exceptions=True)
243
+ raise
244
+
245
+ out: List[PartUploadMetadata] = []
246
+ for r in results:
247
+ if r is None:
248
+ continue
249
+ out.append(PartUploadMetadata(r[0], r[1]))
250
+ return out
251
+
252
+
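A minimal sketch of how the coroutine above can be driven from synchronous code. The presigned `urls` are hypothetical (a real caller obtains them from `_start_table_upload`), and `asyncio.run` is used here for brevity, whereas the module itself schedules the coroutine on `_KUMO_EVENT_LOOP`:

    def _example_multipart_upload(
            urls: List[str], path: str) -> List[PartUploadMetadata]:
        # Hypothetical helper: streams `path` in CHUNK_SIZE pieces, one
        # presigned URL per part, with at most four parts in flight.
        with open(path, "rb") as f:
            return asyncio.run(
                multi_put_bounded(
                    urls=urls,
                    data_iter=stream_read(f, CHUNK_SIZE),
                    concurrency=min(4, len(urls)),
                ))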
253
+ def stream_read(
254
+ f: io.BufferedReader,
255
+ chunk_size: int,
256
+ ) -> Generator[bytes, None, None]:
257
+ r"""Streams ``chunk_size`` contiguous bytes from buffered reader ``f`` each
258
+ time the generator is yielded from.
259
+ """
260
+ while True:
261
+ byte_buf = f.read(chunk_size)
262
+ if len(byte_buf) == 0:
263
+ break
264
+ yield byte_buf
265
+
266
+
267
+ def _validate_url_ext(url: str, file_type: Union[str, None]) -> str:
268
+ """Validate that `url` ends with .csv or .parquet. If `file_type` is
269
+ given ("csv" or "parquet"), ensure it matches. Returns the detected type
270
+ ("csv" or "parquet"), else raises ValueError.
271
+ """
272
+ u = url.lower()
273
+ detected = "csv" if u.endswith(".csv") else "parquet" if u.endswith(
274
+ ".parquet") else None
275
+ if detected is None:
276
+ raise ValueError(f"File path '{url}' must end with .csv or .parquet")
277
+
278
+ if file_type is None:
279
+ return detected
280
+
281
+ ft = file_type.lower()
282
+ if ft not in ("csv", "parquet"):
283
+ raise ValueError("file_type must be 'csv', 'parquet', or None")
284
+
285
+ if ft != detected:
286
+ raise ValueError(f"File path '{url}' must end with .{ft}")
287
+ return detected
288
+
289
+
290
+ def upload_table(
291
+ name: str,
292
+ path: str,
293
+ auto_partition: bool = True,
294
+ partition_size_mb: int = 250,
295
+ parallelism: Optional[int] = None,
296
+ file_type: Optional[str] = None,
297
+ ) -> None:
298
+ """Upload a CSV/Parquet table to Kumo from a local file or a remote path
299
+ (s3://, gs://, abfs://, abfss://, az://).
300
+
301
+ - Local file: uploaded as-is. If >1 GiB and `auto_partition=True`, splits
302
+ into ~`partition_size_mb` MiB parts.
303
+ - Remote file: uploaded via multipart. Files >1 GiB are rejected
304
+ (re-shard to ~200 MiB and upload as a directory).
305
+ - Remote directory: auto-detects format (or use `file_type`), validates
306
+ each shard, and uploads in parallel with a memory-safe budget.
307
+
308
+ Args:
309
+ name: Destination table name in Kumo.
310
+ path: Local path or remote URL to a .csv/.parquet file or directory.
311
+ auto_partition: Local-only; partition files >1 GiB.
312
+ partition_size_mb: Local partition target size (100–1000 MiB).
313
+ parallelism: Directory uploads concurrency override.
314
+ file_type: Force "csv" or "parquet" for directories; None = auto-detect
315
+
316
+ Raises:
317
+ ValueError: Bad/mixed types, zero rows, >1 GiB remote file,
318
+ schema/header mismatch, or invalid column names.
319
+ ImportError: Missing filesystem dependency (s3fs/gcsfs/adlfs).
320
+ RuntimeError: Remote stat/list/read or multipart completion failures.
321
+
322
+ Notes:
323
+ CSV headers are sanitized (non-alphanumerics → underscore, de-duplicated).
324
+ Parquet columns must already be valid.
325
+ """
326
+ # Decide local vs remote by scheme
327
+ scheme = urlparse(path).scheme
328
+ if scheme in ("s3", "gs", "abfs", "abfss", "az"):
329
+ return _upload_table_remote(
330
+ name=name,
331
+ path=path,
332
+ auto_partition=auto_partition,
333
+ partition_size_mb=partition_size_mb,
334
+ parallelism=parallelism,
335
+ file_type=file_type,
336
+ )
337
+ # Local path
338
+ _validate_url_ext(path, file_type)
339
+ file_size = os.path.getsize(path)
340
+
341
+ if file_size < MAX_PARTITION_SIZE:
342
+ return _upload_single_file(name, path)
343
+
344
+ if not auto_partition:
345
+ raise ValueError(
346
+ f"File {path} is {file_size / (1024**3):.2f}GB, which exceeds "
347
+ f"the 1GB limit. Enable auto_partition=True to automatically "
348
+ f"partition large files.")
349
+
350
+ partition_size = partition_size_mb * 1024**2
351
+ if (partition_size > MAX_PARTITION_SIZE
352
+ or partition_size < MIN_PARTITION_SIZE):
353
+ raise ValueError(
354
+ f"Partition size {partition_size_mb}MB must be between "
355
+ f"{MIN_PARTITION_SIZE / 1024**2}MB and "
356
+ f"{MAX_PARTITION_SIZE / 1024**2}MB.")
357
+
358
+ logger.info("File %s is large with size %s, partitioning for upload...",
359
+ path, file_size)
360
+ if path.endswith('.parquet'):
361
+ _upload_partitioned_parquet(name, path, partition_size)
362
+ else:
363
+ _upload_partitioned_csv(name, path, partition_size)
364
+
365
+
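A usage sketch of the public entry point above; the table names, local path, and S3 bucket are illustrative only:

    import kumoai
    from kumoai.connector import upload_table

    # Local file under 1 GiB: uploaded directly (column names sanitized
    # if needed).
    upload_table(name="users", path="/data/users.parquet")

    # Remote directory of ~200 MiB shards: format is auto-detected, each
    # shard is validated, and uploads run in parallel under a memory budget.
    upload_table(name="transactions", path="s3://my-bucket/transactions/",
                 parallelism=4)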
366
+ def _handle_duplicate_names(names: List[str]) -> List[str]:
367
+ unique_names: List[str] = []
368
+ unique_counts: dict[str, int] = {}
369
+ for name in names:
370
+ if name not in unique_names:
371
+ unique_counts[name] = 0
372
+ unique_names.append(name)
373
+ else:
374
+ unique_counts[name] += 1
375
+ new_name = f"{name}_{unique_counts[name]}"
376
+ while new_name in names or new_name in unique_names:
377
+ unique_counts[name] += 1
378
+ new_name = f"{name}_{unique_counts[name]}"
379
+ unique_names.append(new_name)
380
+ return unique_names
381
+
382
+
383
+ def _sanitize_columns(names: List[str]) -> Tuple[List[str], bool]:
384
+ """Normalize column names in a CSV or Parquet file.
385
+
386
+ Rules:
387
+ - Replace any non-alphanumeric character with "_"
388
+ - Strip leading/trailing underscores
389
+ - Ensure uniqueness by appending suffixes: _1, _2, ...
390
+ - Auto-name empty columns as auto_named_<n>
391
+
392
+ Returns:
393
+ (new_column_names, changed)
394
+ """
395
+ _SAN_RE = re.compile(r"[^0-9A-Za-z,\t]")
396
+ # 1) Replace non-alphanumeric sequences with underscore
397
+ new = [_SAN_RE.sub("_", n).strip("_") for n in names]
398
+
399
+ # 2) Auto-name any empty column names to match UI behavior
400
+ unnamed_counter = 0
401
+ for i, n in enumerate(new):
402
+ if not n:
403
+ new[i] = f"auto_named_{unnamed_counter}"
404
+ unnamed_counter += 1
405
+
406
+ # 3) Ensure uniqueness (append suffixes where needed)
407
+ new = _handle_duplicate_names(new)
408
+ return new, new != names
409
+
410
+
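For example (made-up column names; the outputs follow from the rules above):

    cols, changed = _sanitize_columns(["user id", "user id", "", "price($)"])
    # cols == ["user_id", "user_id_1", "auto_named_0", "price"]; changed is True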
411
+ def sanitize_file(src_path: str) -> Tuple[str, bool]:
412
+ """Normalize column names in a CSV or Parquet file.
413
+
414
+ Rules:
415
+ - Replace any non-alphanumeric character with "_"
416
+ - Strip leading/trailing underscores
417
+ - Ensure uniqueness by appending suffixes: _1, _2, ...
418
+
419
+ Returns (path, changed):
420
+ - (src_path, False) if no changes were needed
421
+ - (temp_path, True) if a sanitized temp file was written (caller must
422
+ delete)
423
+ """
424
+ if src_path.endswith('.parquet'):
425
+ pf = pq.ParquetFile(src_path)
426
+ new_names, changed = _sanitize_columns(pf.schema.names)
427
+ if not changed:
428
+ return src_path, False
429
+ temp_file = tempfile.NamedTemporaryFile(suffix='.parquet',
430
+ delete=False)
431
+ original_schema = pf.schema.to_arrow_schema()
432
+ fields = [
433
+ field.with_name(new_name)
434
+ for field, new_name in zip(original_schema, new_names)
435
+ ]
436
+ sanitized_schema = pa.schema(fields)
437
+ writer = pq.ParquetWriter(temp_file.name, sanitized_schema)
438
+ for i in range(pf.num_row_groups):
439
+ tbl = pf.read_row_group(i).rename_columns(new_names)
440
+ writer.write_table(tbl)
441
+ writer.close()
442
+ return temp_file.name, True
443
+ elif src_path.endswith('.csv'):
444
+ cols = pd.read_csv(src_path, nrows=0).columns.tolist()
445
+ new_cols, changed = _sanitize_columns(cols)
446
+ if not changed:
447
+ return src_path, False
448
+ tmp = tempfile.NamedTemporaryFile(suffix='.csv', delete=False)
449
+ tmp_path = tmp.name
450
+ tmp.close()
451
+ reader = pd.read_csv(src_path, chunksize=1_000_000)
452
+ with open(tmp_path, 'w', encoding='utf-8', newline='') as out:
453
+ out.write(','.join(new_cols) + '\n')
454
+ for chunk in reader:
455
+ chunk.columns = new_cols
456
+ chunk.to_csv(out, header=False, index=False)
457
+ return tmp_path, True
458
+ else:
459
+ raise ValueError(
460
+ f"File {src_path} must be either a CSV or Parquet file.")
461
+
462
+
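A sketch of the caller contract described above (the local path is hypothetical):

    path, is_temp = sanitize_file("/data/raw_users.csv")
    try:
        ...  # read or upload the sanitized file at `path`
    finally:
        if is_temp:
            os.unlink(path)  # the temp copy is owned by the caller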
463
+ def _upload_single_file(
464
+ name: str,
465
+ path: str,
466
+ tqdm_bar_position: int = 0,
467
+ ) -> None:
468
+ if not (path.endswith(".parquet") or path.endswith(".csv")):
469
+ raise ValueError(f"Path {path} must be either a CSV or Parquet file. "
470
+ "Partitioned data is not currently supported.")
471
+
472
+ file_type = 'parquet' if path.endswith('parquet') else 'csv'
473
+ path, temp_file_created = sanitize_file(path)
474
+ sz = os.path.getsize(path)
475
+ if tqdm_bar_position == 0:
476
+ logger.info("Uploading table %s (path: %s), size=%s bytes", name, path,
477
+ sz)
478
+
479
+ upload_res = _start_table_upload(table_name=name, file_type=file_type,
480
+ file_size_bytes=sz)
481
+
482
+ urls = list(upload_res.presigned_part_urls.values())
483
+ loop = _KUMO_EVENT_LOOP
484
+ part_metadata_list_fut = asyncio.run_coroutine_threadsafe(
485
+ multi_put_bounded(
486
+ urls=urls,
487
+ data_iter=stream_read(open(path, 'rb'), CHUNK_SIZE),
488
+ tqdm_bar_position=tqdm_bar_position,
489
+ concurrency=min(4, len(urls)),
490
+ upload_progress_cb=None,
491
+ upload_subchunk_bytes=UPLOAD_CHUNK_BYTES,
492
+ ),
493
+ loop,
494
+ )
495
+ part_metadata_list = part_metadata_list_fut.result()
496
+
497
+ if tqdm_bar_position == 0:
498
+ logger.info("Upload complete. Validating table %s.", name)
499
+ for i in range(5):
500
+ try:
501
+ _complete_table_upload(
502
+ table_name=name,
503
+ file_type=file_type,
504
+ upload_path=upload_res.temp_upload_path,
505
+ upload_id=upload_res.upload_id,
506
+ parts_metadata=part_metadata_list,
507
+ )
508
+ except HTTPException as e:
509
+ # TODO(manan): this can happen when DELETE above has
510
+ # not propagated. So we retry with delay here. We
511
+ # assume DELETE is processed reasonably quickly:
512
+ if e.status_code == 500 and i < 4:
513
+ time.sleep(2**(i - 1))
514
+ continue
515
+ else:
516
+ raise e
517
+ else:
518
+ break
519
+
520
+ if tqdm_bar_position == 0:
521
+ logger.info("Completed uploading table %s to Kumo.", name)
522
+ if temp_file_created:
523
+ os.unlink(path)
524
+
525
+
526
+ def _upload_partitioned_parquet(name: str, path: str,
527
+ partition_size: int) -> None:
528
+ r"""Upload a large parquet file by partitioning it into smaller chunks."""
529
+ logger.info("File %s is large, partitioning for upload...", path)
530
+ pf = pq.ParquetFile(path)
531
+ new_columns, _ = _sanitize_columns(pf.schema.names)
532
+
533
+ partitions: List[Tuple[int, List[int]]] = []
534
+ part_idx = 0
535
+ current_size = 0
536
+ current_row_groups: list[int] = []
537
+
538
+ for rg_idx in range(pf.num_row_groups):
539
+ rg_size = pf.metadata.row_group(rg_idx).total_byte_size
540
+ if rg_size > MAX_PARTITION_SIZE:
541
+ raise ValueError(
542
+ f"Row group {rg_idx} is larger than the maximum partition size"
543
+ f"{MAX_PARTITION_SIZE} bytes")
544
+ if current_size + rg_size > partition_size and current_row_groups:
545
+ partitions.append((part_idx, current_row_groups.copy()))
546
+ part_idx += 1
547
+ current_row_groups = []
548
+ current_size = 0
549
+ current_row_groups.append(rg_idx)
550
+ current_size += rg_size
551
+ if current_row_groups:
552
+ partitions.append((part_idx, current_row_groups))
553
+
554
+ logger.info("Splitting %s into %d partitions", path, len(partitions))
555
+
556
+ def writer(path: str, row_groups: List[int]) -> None:
557
+ original_schema = pf.schema.to_arrow_schema()
558
+ fields = [
559
+ field.with_name(new_name)
560
+ for field, new_name in zip(original_schema, new_columns)
561
+ ]
562
+ sanitized_schema = pa.schema(fields)
563
+ pq_writer = pq.ParquetWriter(path, sanitized_schema)
564
+ for rg_idx in row_groups:
565
+ tbl = pf.read_row_group(rg_idx).rename_columns(new_columns)
566
+ pq_writer.write_table(tbl)
567
+ pq_writer.close()
568
+
569
+ _upload_all_partitions(partitions, name, ".parquet", writer)
570
+ logger.info("Upload complete. Validated table %s.", name)
571
+
572
+
573
+ def _upload_partitioned_csv(name: str, path: str, partition_size: int) -> None:
574
+ r"""Upload a large CSV file by partitioning it into smaller chunks."""
575
+ partitions: List[Tuple[int, List[str]]] = []
576
+ part_idx = 0
577
+ columns = pd.read_csv(path, nrows=0).columns.tolist()
578
+ new_columns, _ = _sanitize_columns(columns)
579
+ with open(path, 'r', encoding='utf-8') as f:
580
+ _ = f.readline()
581
+ header = ','.join(new_columns) + '\n'
582
+ header_size = len(header.encode('utf-8'))
583
+ current_lines = [header]
584
+ current_size = header_size
585
+ for line in f:
586
+ line_size = len(line.encode('utf-8'))
587
+ if (current_size + line_size > partition_size
588
+ and len(current_lines) > 1):
589
+ partitions.append((part_idx, current_lines.copy()))
590
+ part_idx += 1
591
+ current_lines = [header]
592
+ current_size = header_size
593
+ current_lines.append(line)
594
+ current_size += line_size
595
+ if len(current_lines) > 1:
596
+ partitions.append((part_idx, current_lines))
597
+
598
+ logger.info("Splitting %s into %d partitions", path, len(partitions))
599
+
600
+ def writer(path: str, lines: List[str]) -> None:
601
+ with open(path, "w", encoding="utf-8") as f:
602
+ f.writelines(lines)
603
+
604
+ _upload_all_partitions(partitions, name, ".csv", writer)
605
+ logger.info("Upload complete. Validated table %s.", name)
606
+
607
+
608
+ def _upload_all_partitions(
609
+ partitions: List[Tuple[int, Any]],
610
+ name: str,
611
+ file_suffix: str,
612
+ writer: Callable[[str, Any], None],
613
+ ) -> None:
614
+ with tqdm(partitions, desc=f"Uploading {name}", position=0) as pbar:
615
+ for part_idx, partition_data in pbar:
616
+ partition_desc = f"Part {part_idx+1}/{len(partitions)}"
617
+ pbar.set_postfix_str(partition_desc)
618
+ _create_and_upload_partition(
619
+ name=name,
620
+ part_idx=part_idx,
621
+ file_suffix=file_suffix,
622
+ partition_writer=writer,
623
+ partition_data=partition_data,
624
+ tqdm_bar_position=1,
625
+ )
626
+
627
+
628
+ def _create_and_upload_partition(
629
+ name: str,
630
+ part_idx: int,
631
+ file_suffix: str,
632
+ partition_writer: Callable[[str, Any], None],
633
+ partition_data: Any,
634
+ tqdm_bar_position: int = 0,
635
+ ) -> None:
636
+ r"""Create a partition file, write to it, upload it, and delete the
637
+ local copy.
638
+ """
639
+ partition_name = (f"{name}{file_suffix}/"
640
+ f"part_{part_idx+1:04d}{file_suffix}")
641
+ with tempfile.NamedTemporaryFile(suffix=file_suffix,
642
+ delete=False) as temp_file:
643
+ partition_path = temp_file.name
644
+
645
+ try:
646
+ partition_writer(partition_path, partition_data)
647
+ _upload_single_file(partition_name, partition_path,
648
+ tqdm_bar_position=tqdm_bar_position)
649
+ finally:
650
+ try:
651
+ os.unlink(partition_path)
652
+ except OSError:
653
+ pass
654
+
655
+
656
+ def delete_uploaded_table(name: str, file_type: str) -> None:
657
+ r"""Synchronously deletes a previously uploaded table from the Kumo data
658
+ plane.
659
+
660
+ .. code-block:: python
661
+
662
+ import kumoai
663
+ from kumoai.connector import delete_uploaded_table
664
+
665
+ # Assume we have uploaded a `.parquet` table named `users`,
666
+ # and we want to delete this table from Kumo:
667
+ delete_uploaded_table(name="users", file_type="parquet")
668
+
669
+ # Assume we have uploaded a `.csv` table named `orders`,
670
+ # and we want to delete this table from Kumo:
671
+ delete_uploaded_table(name="orders", file_type="csv")
672
+
673
+ Args:
674
+ name: The name of the table to be deleted. This table must have
675
+ previously been uploaded with a call to
676
+ :meth:`~kumoai.connector.upload_table`.
677
+ file_type: The file type of the table to be deleted; this can either
678
+ be :obj:`"parquet"` or :obj:`"csv"`
679
+ """
680
+ assert file_type in {'parquet', 'csv'}
681
+ req = DeleteUploadedFileRequest(
682
+ source_table_name=name,
683
+ connector_id=CONNECTOR_ID_MAP[file_type],
684
+ )
685
+ global_state.client.connector_api.delete_file_upload(req)
686
+ logger.info("Successfully deleted table %s from Kumo.", name)
687
+
688
+
689
+ def replace_table(name: str, path: str, file_type: str) -> None:
690
+ r"""Replaces an existing uploaded table on the Kumo data plane with a new
691
+ table.
692
+
693
+ .. code-block:: python
694
+
695
+ import kumoai
696
+ from kumoai.connector import replace_table
697
+
698
+ # Replace an existing `.csv` table named `users`
699
+ # with a new version located at `/data/new_users.csv`:
700
+ replace_table(
701
+ name="users",
702
+ path="/data/new_users.csv",
703
+ file_type="csv",
704
+ )
705
+
706
+ Args:
707
+ name: The name of the table to be replaced. This table must have
708
+ previously been uploaded with a call to
709
+ :meth:`~kumoai.connector.upload_table`.
710
+ path: The full path of the new table to be uploaded, on the
711
+ local machine.
712
+ file_type: The file type of the table to be replaced; this
713
+ can either be :obj:`"parquet"` or :obj:`"csv"`.
714
+
715
+ Raises:
716
+ ValueError: If the specified path does not point to a valid
717
+ `.csv` or `.parquet` file.
718
+ """
719
+ if not (path.endswith(".parquet") or path.endswith(".csv")):
720
+ raise ValueError(f"Path {path} must be either a CSV or Parquet file. "
721
+ "Partitioned data is not currently supported.")
722
+ try:
723
+ logger.info("Deleting previously uploaded table %s of type %s.", name,
724
+ file_type)
725
+ delete_uploaded_table(name=name, file_type=file_type)
726
+ except Exception:
727
+ pass
728
+ logger.info("Uploading table %s.", name)
729
+ upload_table(name=name, path=path)
730
+ logger.info("Successfully replaced table %s with the new table.", name)
731
+
732
+
733
+ def _start_table_upload(
734
+ table_name: str,
735
+ file_type: str,
736
+ file_size_bytes: float,
737
+ ) -> StartFileUploadResponse:
738
+ assert file_type in CONNECTOR_ID_MAP.keys()
739
+ req = StartFileUploadRequest(
740
+ source_table_name=table_name,
741
+ connector_id=CONNECTOR_ID_MAP[file_type],
742
+ num_parts=max(1, math.ceil(file_size_bytes / CHUNK_SIZE)),
743
+ )
744
+ return global_state.client.connector_api.start_file_upload(req)
745
+
746
+
747
+ def _start_table_upload_with_parts(
748
+ table_name: str,
749
+ file_type: str,
750
+ file_size_bytes: int,
751
+ num_parts: int,
752
+ ) -> StartFileUploadResponse:
753
+ assert file_type in CONNECTOR_ID_MAP.keys()
754
+ req = StartFileUploadRequest(
755
+ source_table_name=table_name,
756
+ connector_id=CONNECTOR_ID_MAP[file_type],
757
+ num_parts=max(1, int(num_parts)),
758
+ )
759
+ return global_state.client.connector_api.start_file_upload(req)
760
+
761
+
762
+ def _complete_table_upload(
763
+ table_name: str,
764
+ file_type: str,
765
+ upload_path: str,
766
+ upload_id: str,
767
+ parts_metadata: List[PartUploadMetadata],
768
+ ) -> None:
769
+ assert file_type in CONNECTOR_ID_MAP.keys()
770
+ req = CompleteFileUploadRequest(
771
+ source_table_name=table_name,
772
+ connector_id=CONNECTOR_ID_MAP[file_type],
773
+ temp_upload_path=str(upload_path),
774
+ upload_id=str(upload_id),
775
+ parts_metadata=parts_metadata,
776
+ # Server-side validation is disabled because client-side (SDK)
777
+ # validation is now comprehensive and eliminates the need for
778
+ # additional server-side validation.
779
+ validate_data=False,
780
+ )
781
+ return global_state.client.connector_api.complete_file_upload(req)
782
+
783
+
784
+ # -----------------------
785
+ # Remote I/O (fsspec)
786
+ # -----------------------
787
+
788
+ # Define data type for filesystem that does not depend on fsspec
789
+ Filesystem = Any
790
+
791
+
792
+ def _make_filesystem(scheme: str) -> Filesystem:
793
+ if scheme == "s3":
794
+ try:
795
+ import fsspec # noqa: F401
796
+ import s3fs # noqa: F401
797
+ except Exception:
798
+ raise ImportError(
799
+ "S3 paths require 's3fs'. Install: pip install s3fs")
800
+ fs = fsspec.filesystem("s3")
801
+ elif scheme == "gs":
802
+ try:
803
+ import fsspec # noqa: F401
804
+ import gcsfs # noqa: F401
805
+ except Exception:
806
+ raise ImportError(
807
+ "GCS paths require 'gcsfs'. Install: pip install gcsfs")
808
+ fs = fsspec.filesystem("gcs")
809
+ elif scheme in ("abfs", "abfss", "az"):
810
+ try:
811
+ import adlfs # noqa: F401
812
+ import fsspec # noqa: F401
813
+ except Exception:
814
+ raise ImportError(
815
+ "Azure paths require 'adlfs'. Install: pip install adlfs")
816
+ fs = fsspec.filesystem(scheme)
817
+ else:
818
+ raise ValueError(f"Unsupported remote scheme: {scheme}")
819
+ return fs
820
+
821
+
822
+ def _get_fs_and_path(url: str) -> Tuple[Filesystem, str]:
823
+ parsed = urlparse(url)
824
+ scheme = parsed.scheme
825
+ fs = _make_filesystem(scheme)
826
+ return fs, url
827
+
828
+
829
+ def _remote_info(fs: Filesystem, path: str) -> dict:
830
+ try:
831
+ info = fs.info(path)
832
+ if info.get("type") in ("file", "directory"):
833
+ return info
834
+ # s3fs for directories can return {'Key':..., 'Size':...}; normalize
835
+ if info.get("Size") is not None and info.get("Key"):
836
+ return {
837
+ "type": "file",
838
+ "size": info.get("Size"),
839
+ "name": info.get("Key")
840
+ }
841
+ return info
842
+ except Exception as e:
843
+ raise RuntimeError(f"Failed to stat remote path {path}: {e}")
844
+
845
+
846
+ def _remote_dir_manifest(fs: Filesystem, path: str) -> dict:
847
+ # Return lists of parquet and csv entries with size
848
+ try:
849
+ listing = fs.ls(path, detail=True)
850
+ except Exception as e:
851
+ raise RuntimeError(f"Failed to list remote directory {path}: {e}")
852
+
853
+ parquet_files: List[dict] = []
854
+ csv_files: List[dict] = []
855
+ for ent in listing:
856
+ if isinstance(ent, dict):
857
+ p = ent.get("name") or ent.get("Key") or ent.get("path")
858
+ s = ent.get("size") or ent.get("Size") or 0
859
+ t = ent.get("type") or ent.get("StorageClass") or ""
860
+ if t == "directory":
861
+ continue
862
+ else:
863
+ p = ent
864
+ try:
865
+ s = fs.info(p).get("size", 0)
866
+ except Exception:
867
+ s = 0
868
+ if not isinstance(p, str):
869
+ continue
870
+ ext = os.path.splitext(p.lower())[1]
871
+ if ext == ".parquet":
872
+ parquet_files.append({"path": p, "size": int(s or 0)})
873
+ elif ext == ".csv":
874
+ csv_files.append({"path": p, "size": int(s or 0)})
875
+
876
+ return {"parquet": parquet_files, "csv": csv_files}
877
+
878
+
879
+ def _read_remote_file_with_progress(
880
+ fs: Filesystem,
881
+ path: str,
882
+ expected_size: Optional[int],
883
+ update_bytes: Optional[Callable[[int], Optional[bool]]] = None,
884
+ capture_first_line: bool = False,
885
+ ) -> Tuple[io.BytesIO, memoryview, Optional[bytes]]:
886
+ """Stream into a single BytesIO (one allocation) and return a zero-copy
887
+ memoryview.
888
+ """
889
+ buf = io.BytesIO()
890
+
891
+ header_line: Optional[bytes] = None
892
+ if capture_first_line:
893
+ header_acc = bytearray()
894
+ seen_nl = False
895
+ else:
896
+ header_acc = bytearray()
897
+ seen_nl = True
898
+
899
+ with fs.open(path, "rb") as fobj:
900
+ while True:
901
+ chunk = fobj.read(READ_CHUNK_BYTES)
902
+ if not chunk:
903
+ break
904
+ if capture_first_line and not seen_nl:
905
+ nl_idx = chunk.find(b"\n")
906
+ if nl_idx != -1:
907
+ header_acc += chunk[:nl_idx]
908
+ # small copy only for header
909
+ header_line = bytes(header_acc)
910
+ seen_nl = True
911
+ else:
912
+ header_acc += chunk
913
+ buf.write(chunk)
914
+ if update_bytes:
915
+ try:
916
+ update_bytes(len(chunk))
917
+ except Exception:
918
+ pass
919
+
920
+ if capture_first_line and not seen_nl:
921
+ header_line = bytes(header_acc)
922
+
923
+ mv = buf.getbuffer() # zero-copy view of BytesIO internal buffer
924
+ return buf, mv, header_line
925
+
926
+
927
+ # -----------------------
928
+ # Memory budget & helpers
929
+ # -----------------------
930
+ def _compute_mem_budget_bytes(files: List[dict]) -> int:
931
+ # 50% of system RAM
932
+ try:
933
+ import psutil
934
+ total = psutil.virtual_memory().total
935
+ except Exception:
936
+ total = 8 * 1024**3 # assume 8 GiB
937
+ budget = int(total * 0.50)
938
+ return max(budget, 512 * 1024**2) # at least 512 MiB
939
+
940
+
941
+ class MemoryBudget:
942
+ """A byte-level semaphore to prevent OOM when reading many shards."""
943
+ def __init__(self, budget_bytes: int) -> None:
944
+ self.budget = budget_bytes
945
+ self.avail = budget_bytes
946
+ self.cv = threading.Condition()
947
+
948
+ def acquire(self, need: int) -> None:
949
+ with self.cv:
950
+ while self.avail < need:
951
+ self.cv.wait(timeout=0.25)
952
+ self.avail -= need
953
+
954
+ def release(self, freed: int) -> None:
955
+ with self.cv:
956
+ self.avail += freed
957
+ if self.avail > self.budget:
958
+ self.avail = self.budget
959
+ self.cv.notify_all()
960
+
961
+
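A sketch of how the budget is intended to bracket a shard's in-memory lifetime (the sizes are illustrative):

    budget = MemoryBudget(2 * 1024**3)    # allow ~2 GiB of in-flight bytes
    need = (200 + 128) * 1024**2          # shard size plus fixed overhead
    budget.acquire(need)                  # blocks until enough bytes are free
    try:
        ...                               # read, validate, and upload the shard
    finally:
        budget.release(need)              # wake any waiting workers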
962
+ def _determine_parallelism(files: List[dict], requested: Optional[int]) -> int:
963
+ if requested is not None and requested > 0:
964
+ return min(requested, len(files))
965
+ env_par = os.getenv("KUMO_UPLOAD_PARALLELISM")
966
+ if env_par:
967
+ try:
968
+ val = int(env_par)
969
+ if val > 0:
970
+ return min(val, len(files))
971
+ except Exception:
972
+ pass
973
+
974
+ budget_bytes = _compute_mem_budget_bytes(files)
975
+ # 128 MiB overhead by default
976
+ try:
977
+ overhead_bytes = max(0, int(os.getenv("KUMO_UPLOAD_OVERHEAD_MB",
978
+ "128"))) * 1024**2
979
+ except Exception:
980
+ overhead_bytes = 128 * 1024**2
981
+
982
+ needs = []
983
+ for f in files:
984
+ size = int(f.get("size") or 0)
985
+ if size <= 0:
986
+ continue
987
+ needs.append(size + overhead_bytes)
988
+ if not needs:
989
+ return 1
990
+ needs.sort()
991
+ median_need = needs[len(needs) // 2]
992
+ par = max(1, budget_bytes // max(1, median_need))
993
+ return min(int(par), len(files))
994
+
995
+
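To make the heuristic concrete, a worked example with assumed numbers (8 GiB of system RAM, forty 200 MiB shards, the default 128 MiB per-file overhead):

    budget = int(8 * 1024**3 * 0.50)               # 4 GiB memory budget
    median_need = (200 + 128) * 1024**2            # per-file bytes in flight
    parallelism = min(budget // median_need, 40)   # -> 12 concurrent files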
996
+ def _iter_mv_chunks(mv: memoryview,
997
+ part_size: int) -> Generator[memoryview, None, None]:
998
+ pos = 0
999
+ n = mv.nbytes
1000
+ while pos < n:
1001
+ nxt = min(n, pos + part_size)
1002
+ yield mv[pos:nxt] # zero-copy slice
1003
+ pos = nxt
1004
+
1005
+
1006
+ # -----------------------
1007
+ # Parquet helpers
1008
+ # -----------------------
1009
+ def _parquet_schema_from_bytes(data_mv: memoryview) -> pa.Schema:
1010
+ reader = pa.BufferReader(pa.py_buffer(data_mv))
1011
+ pf = pq.ParquetFile(reader)
1012
+
1013
+ # zero-row guard via metadata (no data scan)
1014
+ if getattr(pf.metadata, "num_rows", None) == 0:
1015
+ raise ValueError("Parquet file contains zero rows.")
1016
+
1017
+ return pf.schema_arrow
1018
+
1019
+
1020
+ def _parquet_num_rows_from_bytes(data_mv: memoryview) -> int:
1021
+ buf = pa.py_buffer(data_mv)
1022
+ reader = pa.BufferReader(buf)
1023
+ pf = pq.ParquetFile(reader)
1024
+ md = pf.metadata
1025
+ if md is None:
1026
+ total = 0
1027
+ for rg in range(pf.num_row_groups):
1028
+ total += pf.metadata.row_group(rg).num_rows
1029
+ return total
1030
+ return md.num_rows
1031
+
1032
+
1033
+ def validate_parquet_schema(schema: pa.Schema, source_name: str) -> None:
1034
+ """Validate a PyArrow schema for Kumo compatibility (source_name
1035
+ required).
1036
+
1037
+ Disallowed:
1038
+ - All large_* types: large_string, large_binary, large_list<*>
1039
+ - Any time-of-day types (time32/64<*>); ONLY epoch-based timestamps are
1040
+ allowed
1041
+ - Any duration types (e.g., pa.duration('ns'))
1042
+ - list<string> and list<bool>
1043
+ - Unsigned integers (uint8/16/32/64)
1044
+ - Null-typed columns
1045
+
1046
+ Allowed:
1047
+ - boolean, signed integer, floating, (regular) string, date, timestamp
1048
+ (epoch-based), (regular) binary
1049
+ - decimal up to configured precision (env KUMO_DECIMAL_MAX_PRECISION,
1050
+ default 18)
1051
+ - list of {signed integer, float}
1052
+ - dictionary<int, string>
1053
+
1054
+ Raises:
1055
+ ValueError listing offending columns (including source_name).
1056
+ """
1057
+ try:
1058
+ max_dec_prec = int(os.getenv("KUMO_DECIMAL_MAX_PRECISION", "18"))
1059
+ except Exception:
1060
+ max_dec_prec = 18
1061
+
1062
+ where = f" in {source_name}"
1063
+ errors: list[str] = []
1064
+
1065
+ for col, dt in zip(schema.names, schema.types):
1066
+ # 1) Hard-disallow all large_* types
1067
+ if pa.types.is_large_string(dt):
1068
+ errors.append(
1069
+ f" - column '{col}'{where} has unsupported type large_string")
1070
+ continue
1071
+ if pa.types.is_large_binary(dt):
1072
+ errors.append(
1073
+ f" - column '{col}'{where} has unsupported type large_binary")
1074
+ continue
1075
+ if pa.types.is_large_list(dt):
1076
+ errors.append(
1077
+ f" - column '{col}'{where} has unsupported type {dt} "
1078
+ f"(large_list not supported)")
1079
+ continue
1080
+
1081
+ # 2) Disallow time-of-day and duration
1082
+ if pa.types.is_time(dt):
1083
+ errors.append(
1084
+ f" - column '{col}'{where} has unsupported time-of-day type "
1085
+ f"'{dt}' (only epoch-based timestamps are supported)")
1086
+ continue
1087
+ if pa.types.is_duration(dt):
1088
+ errors.append(
1089
+ f" - column '{col}'{where} has unsupported duration "
1090
+ f"type '{dt}'")
1091
+ continue
1092
+
1093
+ # 3) Disallow unsigned integers and null columns
1094
+ if pa.types.is_unsigned_integer(dt):
1095
+ errors.append(
1096
+ f" - column '{col}'{where} has unsupported unsigned integer "
1097
+ "type '{dt}'")
1098
+ continue
1099
+ if pa.types.is_null(dt):
1100
+ errors.append(
1101
+ f" - column '{col}'{where} has unsupported null type '{dt}'")
1102
+ continue
1103
+
1104
+ supported = (
1105
+ pa.types.is_boolean(dt)
1106
+ # signed ints only
1107
+ or (pa.types.is_integer(dt)
1108
+ and not pa.types.is_unsigned_integer(dt)) or
1109
+ pa.types.is_floating(dt) or
1110
+ pa.types.is_string(dt) # regular string only
1111
+ or pa.types.is_date(dt) or
1112
+ pa.types.is_timestamp(dt) # epoch-based timestamps
1113
+ or pa.types.is_binary(dt) # regular binary only
1114
+ )
1115
+
1116
+ # 4) Decimals with precision limit
1117
+ if not supported and pa.types.is_decimal(dt):
1118
+ try:
1119
+ prec = int(getattr(dt, "precision", 0) or 0)
1120
+ except Exception:
1121
+ prec = 0
1122
+ if 0 < prec <= max_dec_prec:
1123
+ supported = True
1124
+ else:
1125
+ errors.append(
1126
+ f" - column '{col}'{where} has unsupported decimal "
1127
+ f"precision {prec} (max {max_dec_prec}): type '{dt}'")
1128
+ continue
1129
+
1130
+ # 5) Lists: only list of {signed int, float}; explicitly deny
1131
+ # list<string> and list<bool>
1132
+ if not supported and pa.types.is_list(dt):
1133
+ elem = dt.value_type
1134
+ if pa.types.is_string(elem):
1135
+ errors.append(
1136
+ f" - column '{col}'{where} is {dt} (list<string> not "
1137
+ f"supported)")
1138
+ continue
1139
+ if pa.types.is_boolean(elem):
1140
+ errors.append(f" - column '{col}'{where} is {dt} (list<bool> "
1141
+ f"not supported)")
1142
+ continue
1143
+ if pa.types.is_integer(
1144
+ elem) and not pa.types.is_unsigned_integer(elem):
1145
+ supported = True
1146
+ elif pa.types.is_floating(elem):
1147
+ supported = True
1148
+ else:
1149
+ errors.append(
1150
+ f" - column '{col}'{where} is {dt} (only list of signed "
1151
+ f"int/float supported)")
1152
+ continue
1153
+
1154
+ # 6) Dictionary<int, string> only
1155
+ if not supported and pa.types.is_dictionary(dt):
1156
+ if (pa.types.is_integer(dt.index_type)
1157
+ and not pa.types.is_unsigned_integer(dt.index_type)
1158
+ and pa.types.is_string(dt.value_type)):
1159
+ supported = True
1160
+
1161
+ if not supported:
1162
+ errors.append(
1163
+ f" - column '{col}'{where} has unsupported type '{dt}'")
1164
+
1165
+ if errors:
1166
+ raise ValueError(
1167
+ "Unsupported Parquet Data Types detected:\n\n" +
1168
+ "\n".join(errors) + "\n\nAllowed types: boolean, signed integer, "
1169
+ "float, (regular) string, date, "
1170
+ "timestamp (epoch-based), (regular) binary, "
1171
+ "decimal (<= configured precision), "
1172
+ "list of {signed int, float}, dictionary<int,string>.\n"
1173
+ "Disallowed examples: large_string, large_binary, "
1174
+ "large_list<*>, time32/64<*>, "
1175
+ "duration('unit'), list<string>, list<bool>, "
1176
+ "unsigned integers, null columns, "
1177
+ "structs, maps, and other nested types.")
1178
+
1179
+
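A small sketch of the validator on hand-built schemas (the column names are arbitrary):

    ok = pa.schema([("user_id", pa.int64()),
                    ("ts", pa.timestamp("us")),
                    ("score", pa.float32())])
    validate_parquet_schema(ok, "users.parquet")      # passes silently

    bad = pa.schema([("tags", pa.list_(pa.string()))])
    try:
        validate_parquet_schema(bad, "users.parquet")
    except ValueError as err:
        print(err)                                    # list<string> rejected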
1180
+ # -----------------------
1181
+ # CSV helpers
1182
+ # -----------------------
1183
+ def _detect_and_validate_csv(head_bytes: bytes) -> str:
1184
+ r"""Detect a CSV delimiter from a small head sample and verify it.
1185
+
1186
+ - Uses csv.Sniffer (preferred delimiters: | , ; \t) with fallback to ','.
1187
+ - Reads a handful of complete, quote-aware records (handles newlines inside
1188
+ quotes).
1189
+ - Re-serializes those rows and validates with pandas (small nrows) to catch
1190
+ malformed inputs.
1191
+ - Raises ValueError on empty input or if parsing fails with the chosen
1192
+ delimiter.
1193
+ """
1194
+ if not head_bytes:
1195
+ raise ValueError("Could not auto-detect a delimiter: file is empty.")
1196
+
1197
+ text = head_bytes.decode("utf-8", errors="ignore").replace("\r\n",
1198
+ "\n").replace(
1199
+ "\r", "\n")
1200
+
1201
+ # 1) Detect delimiter (simple preference list; no denylist)
1202
+ try:
1203
+ delimiter = csv.Sniffer().sniff(text, delimiters="|,;\t").delimiter
1204
+ except Exception:
1205
+ logger.warning("No separator found in sample; defaulting to ','.")
1206
+ delimiter = ','
1207
+
1208
+ # 2) Pull a few complete records with csv.reader (quote-aware,
1209
+ # handles embedded newlines)
1210
+ rows = []
1211
+ try:
1212
+ rdr = csv.reader(io.StringIO(text), delimiter=delimiter, quotechar='"',
1213
+ doublequote=True)
1214
+ for _ in range(50): # small, bounded sample
1215
+ try:
1216
+ rows.append(next(rdr))
1217
+ except StopIteration:
1218
+ break
1219
+ except Exception as e:
1220
+ raise ValueError(
1221
+ f"Could not auto-detect a valid delimiter. Tried '{delimiter}', "
1222
+ f"csv parse failed: {repr(e)}")
1223
+
1224
+ if not rows:
1225
+ raise ValueError(
1226
+ "Could not auto-detect a valid delimiter: no complete records "
1227
+ "found.")
1228
+
1229
+ # 3) Re-serialize snippet and validate minimally with pandas
1230
+ out = io.StringIO()
1231
+ w = csv.writer(out, delimiter=delimiter, lineterminator="\n",
1232
+ quotechar='"', doublequote=True)
1233
+ for r in rows:
1234
+ w.writerow(r)
1235
+
1236
+ try:
1237
+ pd.read_csv(
1238
+ io.StringIO(out.getvalue()),
1239
+ sep=delimiter,
1240
+ index_col=False,
1241
+ on_bad_lines='error',
1242
+ nrows=50,
1243
+ engine="python", # more tolerant for quoted/newline combos
1244
+ skip_blank_lines=False,
1245
+ )
1246
+ except Exception as e:
1247
+ raise ValueError(
1248
+ f"Could not auto-detect a valid delimiter. Tried '{delimiter}', "
1249
+ f"pandas parse failed: {repr(e)}")
1250
+
1251
+ return delimiter
1252
+
1253
+
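For instance, with a tiny in-memory sample (real callers pass roughly the first 50 KB of the file):

    sample = b"id|name|amount\n1|alice|3.5\n2|bob|7.25\n"
    delimiter = _detect_and_validate_csv(sample)      # -> "|"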
1254
+ def _csv_has_data_rows(data_mv: memoryview) -> bool:
1255
+ """Return True if any non-newline, non-carriage-return byte exists after
1256
+ the first newline. Uses zero-copy iteration over the memoryview to avoid
1257
+ duplicating buffers.
1258
+ """
1259
+ mv = data_mv
1260
+ if mv.format != 'B':
1261
+ try:
1262
+ mv = mv.cast('B') # zero-copy view of bytes
1263
+ except TypeError:
1264
+ # fallback: create a contiguous view via slicing (still zero-copy)
1265
+ mv = mv[:]
1266
+
1267
+ saw_newline = False
1268
+ # Iterate in a single pass; break as soon as we see a data-ish byte
1269
+ for b in mv:
1270
+ if not saw_newline:
1271
+ if b == 10: # '\n'
1272
+ saw_newline = True
1273
+ continue
1274
+ # after header newline: any byte that isn't CR or LF counts as data
1275
+ if b not in (10, 13):
1276
+ return True
1277
+ return False
1278
+
1279
+
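Two quick checks of the helper above:

    assert _csv_has_data_rows(memoryview(b"a,b\n1,2\n")) is True
    assert _csv_has_data_rows(memoryview(b"a,b\n\n")) is False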
1280
+ def _maybe_rewrite_csv_header_buffer(
1281
+ data_mv: memoryview,
1282
+ header_line: bytes,
1283
+ delimiter: str,
1284
+ ) -> tuple[Optional[io.BytesIO], memoryview, bytes, list[str], dict[str, str],
1285
+ bool]:
1286
+ """Rewrite ONLY the header if needed. Uses a new BytesIO but frees the old
1287
+ buffer immediately after swap.
1288
+ """
1289
+ try:
1290
+ header_str = header_line.decode("utf-8").rstrip("\r\n")
1291
+ except UnicodeDecodeError:
1292
+ raise ValueError("CSV header is not valid UTF-8.")
1293
+
1294
+ orig_cols = [c.strip() for c in header_str.split(delimiter)]
1295
+ new_cols, changed = _sanitize_columns(orig_cols)
1296
+ if not changed:
1297
+ return None, data_mv, header_line, orig_cols, {}, False
1298
+
1299
+ rename_map = {o: n for o, n in zip(orig_cols, new_cols) if o != n}
1300
+
1301
+ nl_idx = len(header_line)
1302
+ if nl_idx >= data_mv.nbytes:
1303
+ raise ValueError("Malformed CSV: newline not found in header.")
1304
+
1305
+ new_header_bytes = delimiter.join(new_cols).encode("utf-8")
1306
+ new_buf = io.BytesIO()
1307
+ new_buf.write(new_header_bytes)
1308
+ new_buf.write(b"\n")
1309
+ # Write the remainder via a zero-copy memoryview slice; BytesIO will copy
1310
+ # into its own buffer, but we free the original immediately after returning
1311
+ # to avoid double residency.
1312
+ new_buf.write(data_mv[nl_idx + 1:])
1313
+ new_mv = new_buf.getbuffer()
1314
+ return new_buf, new_mv, new_header_bytes, new_cols, rename_map, True
1315
+
1316
+
1317
+ # -----------------------
1318
+ # Remote upload (refactor)
1319
+ # -----------------------
1320
+ @dataclass
1321
+ class _RemoteSettings:
1322
+ part_size: int
1323
+ part_conc: int
1324
+ overhead_bytes: int
1325
+ parallelism_override: Optional[int]
1326
+
1327
+
1328
+ def _make_remote_settings(parallelism: Optional[int]) -> _RemoteSettings:
1329
+ part_mb = int(os.getenv("KUMO_REMOTE_PART_MB", "64"))
1330
+ part_size = max(8, part_mb) * 1024**2
1331
+ part_conc = int(os.getenv("KUMO_REMOTE_PART_CONCURRENCY", "4"))
1332
+ try:
1333
+ overhead_bytes = max(0, int(os.getenv("KUMO_UPLOAD_OVERHEAD_MB",
1334
+ "128"))) * 1024**2
1335
+ except Exception:
1336
+ overhead_bytes = 128 * 1024**2
1337
+ return _RemoteSettings(
1338
+ part_size=part_size,
1339
+ part_conc=part_conc,
1340
+ overhead_bytes=overhead_bytes,
1341
+ parallelism_override=parallelism,
1342
+ )
1343
+
1344
+
1345
+ def _remote_upload_file(name: str, fs: Filesystem, url: str, info: dict,
1346
+ st: _RemoteSettings, file_type: Optional[str]) -> None:
1347
+ detected_ftype = _validate_url_ext(url, file_type)
1348
+
1349
+ size = int(info.get("size") or 0)
1350
+ if size == 0:
1351
+ raise ValueError(f"Remote file {url} is empty (0 bytes).")
1352
+ if size > MAX_PARTITION_SIZE:
1353
+ raise ValueError(
1354
+ "Remote single-file uploads larger than 1GB are not supported. "
1355
+ "Please re-partition the source into ~200MB chunks and upload the "
1356
+ "whole directory instead.")
1357
+
1358
+ # Read with progress
1359
+ with tqdm(total=size, desc=f"Reading {_short_path(url)}", unit="B",
1360
+ unit_scale=True, unit_divisor=1024, position=0, leave=False,
1361
+ smoothing=0.1) as read_bar:
1362
+ tr0 = time.perf_counter()
1363
+ buf, data_mv, header_line = _read_remote_file_with_progress(
1364
+ fs, url, expected_size=size, update_bytes=read_bar.update,
1365
+ capture_first_line=(detected_ftype == "csv"))
1366
+ tread = time.perf_counter() - tr0
1367
+
1368
+ # Validate/sanitize
1369
+ tv0 = time.perf_counter()
1370
+ renamed_cols_msg = None
1371
+ if detected_ftype == "parquet":
1372
+ schema = _parquet_schema_from_bytes(data_mv)
1373
+ _validate_columns_or_raise(list(schema.names))
1374
+ validate_parquet_schema(schema, url)
1375
+ nrows = _parquet_num_rows_from_bytes(data_mv)
1376
+ if nrows <= 0:
1377
+ raise ValueError("Parquet file has zero rows.")
1378
+ file_type = "parquet"
1379
+ else:
1380
+ head_len = min(50000, data_mv.nbytes)
1381
+ # small bounded copy only for sniffing
1382
+ head = bytes(data_mv[:head_len])
1383
+ delimiter = _detect_and_validate_csv(head)
1384
+ if header_line is None:
1385
+ # Shouldn't happen (we captured it during read), but keep a bounded
1386
+ # fallback (64 KiB)
1387
+ prefix_len = min(64 * 1024, data_mv.nbytes)
1388
+ prefix = data_mv[:prefix_len]
1389
+ # build header_line from prefix without large copies
1390
+ acc = bytearray()
1391
+ for b in (prefix.cast('B') if prefix.format != 'B' else prefix):
1392
+ if b == 10: # '\n'
1393
+ break
1394
+ acc.append(b)
1395
+ header_line = bytes(acc)
1396
+ new_buf, new_mv, new_header, cols, rename_map, changed = (
1397
+ _maybe_rewrite_csv_header_buffer(data_mv, header_line, delimiter))
1398
+ if changed:
1399
+ try:
1400
+ buf.close()
1401
+ except Exception:
1402
+ pass
1403
+ if changed:
1404
+ buf = new_buf # type: ignore[assignment]
1405
+ data_mv = new_mv
1406
+ header_line = new_header
1407
+ if rename_map:
1408
+ pairs = ", ".join(f"{k}->{v}" for k, v in rename_map.items())
1409
+ renamed_cols_msg = f"CSV header sanitized (renamed): {pairs}"
1410
+ if not _csv_has_data_rows(data_mv):
1411
+ raise ValueError(
1412
+ "CSV file has zero data rows (only header present).")
1413
+ file_type = "csv"
1414
+ tval = time.perf_counter() - tv0
1415
+
1416
+ # Multipart upload
1417
+ size_bytes = data_mv.nbytes
1418
+ num_parts = max(1, math.ceil(size_bytes / st.part_size))
1419
+ upload_res = _start_table_upload_with_parts(table_name=name,
1420
+ file_type=file_type,
1421
+ file_size_bytes=size_bytes,
1422
+ num_parts=num_parts)
1423
+ try:
1424
+ urls = [
1425
+ u for k, u in sorted(upload_res.presigned_part_urls.items(),
1426
+ key=lambda kv: int(kv[0]))
1427
+ ]
1428
+ except Exception:
1429
+ urls = list(upload_res.presigned_part_urls.values())
1430
+
1431
+ loop = _KUMO_EVENT_LOOP
1432
+ with tqdm(total=size_bytes, desc="Uploading", unit="B", unit_scale=True,
1433
+ unit_divisor=1024, position=2, leave=False,
1434
+ smoothing=0.1) as upload_bar:
1435
+ part_metadata_list_fut = asyncio.run_coroutine_threadsafe(
1436
+ multi_put_bounded(
1437
+ urls=urls,
1438
+ data_iter=_iter_mv_chunks(data_mv, st.part_size),
1439
+ tqdm_bar_position=3,
1440
+ concurrency=max(1, min(st.part_conc, len(urls))),
1441
+ upload_progress_cb=lambda n: _safe_bar_update(upload_bar, n),
1442
+ upload_subchunk_bytes=UPLOAD_CHUNK_BYTES,
1443
+ ),
1444
+ loop,
1445
+ )
1446
+ part_metadata_list = part_metadata_list_fut.result()
1447
+ upload_bar.set_postfix_str(f"Done — {_short_path(url)}")
1448
+ upload_bar.refresh()
1449
+
1450
+ # Complete
1451
+ tu0 = time.perf_counter()
1452
+ for i in range(5):
1453
+ try:
1454
+ _complete_table_upload(
1455
+ table_name=name,
1456
+ file_type=file_type,
1457
+ upload_path=upload_res.temp_upload_path,
1458
+ upload_id=upload_res.upload_id,
1459
+ parts_metadata=part_metadata_list,
1460
+ )
1461
+ except HTTPException as e:
1462
+ if e.status_code == 500 and i < 4:
1463
+ time.sleep(2**(i - 1))
1464
+ continue
1465
+ else:
1466
+ raise
1467
+ else:
1468
+ break
1469
+ tupl = time.perf_counter() - tu0
1470
+
1471
+ _log_file_timing("single-file(multipart)", url, size_bytes, tread, tval,
1472
+ tupl)
1473
+ if renamed_cols_msg:
1474
+ logger.info(renamed_cols_msg)
1475
+
1476
+ try:
1477
+ if buf:
1478
+ buf.close()
1479
+ except Exception:
1480
+ pass
1481
+ del buf, data_mv, header_line
1482
+ gc.collect()
1483
+
1484
+ logger.info("Upload complete. Validated table %s.", name)
1485
+
1486
+
1487
+ def _remote_upload_directory(
+ name: str,
+ fs: Filesystem,
+ url: str,
+ info: dict,
+ st: _RemoteSettings,
+ file_type: Optional[str] = None, # "csv", "parquet", or None
+ ) -> None:
+ manifest = _remote_dir_manifest(fs, url)
+ parquet_files = sorted(manifest["parquet"], key=lambda x: x["path"])
+ csv_files = sorted(manifest["csv"], key=lambda x: x["path"])
+
+ # Normalize expected type
+ if file_type not in (None, "csv", "parquet"):
+ raise ValueError("file_type must be 'csv', 'parquet', or None.")
+
+ # Resolve files + detected type
+ if file_type is None:
+ if not parquet_files and not csv_files:
+ raise ValueError("Directory contains no .parquet or .csv files.")
+ if parquet_files and csv_files:
+ raise ValueError(
+ "Mixed CSV and Parquet files detected; keep only one format.")
+ files = parquet_files if parquet_files else csv_files
+ detected_type = "parquet" if parquet_files else "csv"
+ elif file_type == "parquet":
+ if not parquet_files:
+ raise ValueError(
+ "Directory contains no .parquet files (file_type='parquet').")
+ if csv_files:
+ raise ValueError(
+ "Directory also contains CSV files; remove them or set"
1519
+ "file_type=None.")
+ files, detected_type = parquet_files, "parquet"
+ else: # file_type == "csv"
+ if not csv_files:
+ raise ValueError(
+ "Directory contains no .csv files (file_type='csv').")
+ if parquet_files:
+ raise ValueError(
+ "Directory also contains Parquet files; remove them or "
+ "set file_type=None.")
+ files, detected_type = csv_files, "csv"
+
+ total_bytes = sum(int(f.get("size") or 0) for f in files)
+
+ too_large = [
+ f["path"] for f in files if (f.get("size") or 0) > MAX_PARTITION_SIZE
+ ]
+ zero_bytes = [f["path"] for f in files if (f.get("size") or 0) == 0]
+ if zero_bytes:
+ raise ValueError(
+ f"Found zero-byte {detected_type.upper()} files: {zero_bytes[:3]}"
+ f"{'...' if len(zero_bytes)>3 else ''}")
+ if too_large:
+ raise ValueError(
+ f"The following files exceed 1GB and must be re-partitioned "
+ f"(~200MB each): "
+ f"{too_large[:3]}{'...' if len(too_large)>3 else ''}")
+
+ par = _determine_parallelism(files, requested=st.parallelism_override)
+ par = max(1, min(par, len(files)))
+ budget_bytes = _compute_mem_budget_bytes(files)
+ mem_budget = MemoryBudget(budget_bytes)
+
+ from collections import deque
+ with (tqdm(total=len(files),
+ desc=f"Files ({len(files)}) [{detected_type}] | par={par}",
+ position=0) as file_bar,
+ tqdm(total=total_bytes, desc="Total bytes (read)", unit="B",
+ unit_scale=True, unit_divisor=1024, position=1, smoothing=0.1)
+ as bytes_bar,
+ tqdm(total=total_bytes, desc="Total bytes (uploaded)", unit="B",
+ unit_scale=True, unit_divisor=1024, position=2, smoothing=0.1)
+ as uploaded_bar):
+
+ status_lock = threading.Lock()
+ recent_paths: Deque[str] = deque(maxlen=5)
+ completed_files = {"n": 0}
+ file_bar.set_postfix_str(f"Uploaded 0/{len(files)}")
+ file_bar.refresh()
+
+ rename_aggregate_lock = threading.Lock()
+ rename_aggregate: dict[str, str] = {}
+
+ def _merge_status_update(path: str) -> None:
+ with status_lock:
+ completed_files["n"] += 1
+ recent_paths.append(path)
+ tail = ' | '.join(_short_path(p) for p in list(recent_paths))
+ msg = f"Uploaded {completed_files['n']}/{len(files)}"
+ if tail:
+ msg += f" — {tail}"
+ with _TQDM_LOCK:
+ file_bar.set_postfix_str(msg)
+ file_bar.refresh()
+
+ ref_schema_fields: Dict[str, Any] = {"value": None}
+ ref_cols: Dict[str, Any] = {"value": None}
+
+ def _worker(idx: int, fmeta: dict) -> None:
+ fpath = fmeta["path"]
+ fsize = int(fmeta.get("size") or 0)
+ need_bytes = (2 * fsize +
+ st.overhead_bytes) if detected_type == "csv" else (
+ fsize + st.overhead_bytes)
+ mem_budget.acquire(need_bytes)
+ try:
+ tr0 = time.perf_counter()
+ buf, data_mv, header_line = _read_remote_file_with_progress(
+ fs,
+ fpath,
+ expected_size=fsize if fsize > 0 else None,
+ update_bytes=lambda n: _safe_bar_update(bytes_bar, n),
+ capture_first_line=(detected_type == "csv"),
+ )
+ tread = time.perf_counter() - tr0
+
+ tv0 = time.perf_counter()
+ if detected_type == "parquet":
+ schema = _parquet_schema_from_bytes(data_mv)
+ names = list(schema.names)
+ _validate_columns_or_raise(names)
+ validate_parquet_schema(schema, fpath)
+ nrows = _parquet_num_rows_from_bytes(data_mv)
+ if nrows <= 0:
+ raise ValueError(
+ f"Parquet file has zero rows: {fpath}")
+ fields = [(fld.name, fld.type) for fld in schema]
+ if ref_schema_fields["value"] is None:
+ ref_schema_fields["value"] = fields
+ elif fields != ref_schema_fields["value"]:
+ ref_names = [n for n, _ in ref_schema_fields["value"]]
+ raise ValueError(
+ "Parquet schema mismatch across files. "
+ f"First file columns: {ref_names}; mismatched "
+ f"file: {fpath}")
+ part_name = f"{name}.parquet/part_{idx:04d}.parquet"
+
+ else:
+ head_len = min(50000, data_mv.nbytes)
+ # bounded small copy for sniffing
+ head = bytes(data_mv[:head_len])
+ delimiter = _detect_and_validate_csv(head)
+ if header_line is None:
+ # Bounded fallback (64 KiB) to extract header without
+ # copying whole file
+ prefix_len = min(64 * 1024, data_mv.nbytes)
+ prefix = data_mv[:prefix_len]
+ acc = bytearray()
+ for b in (prefix.cast('B')
+ if prefix.format != 'B' else prefix):
+ if b == 10: # '\n'
+ break
+ acc.append(b)
+ header_line = bytes(acc)
+
+ new_buf, new_mv, new_header, cols, rename_map, changed = (
+ _maybe_rewrite_csv_header_buffer(
+ data_mv, header_line, delimiter))
+ if changed:
+ try:
+ buf.close()
+ except Exception:
+ pass
+ buf = new_buf # type: ignore[assignment]
+ data_mv = new_mv
+ header_line = new_header
+ if rename_map:
+ with rename_aggregate_lock:
+ rename_aggregate.update(rename_map)
+
+ if ref_cols["value"] is None:
+ ref_cols["value"] = cols
+ elif cols != ref_cols["value"]:
+ raise ValueError(
+ "CSV header mismatch across files. "
+ f"Expected: {ref_cols['value']}; mismatched file: "
+ f"{fpath} has: {cols}")
+ if not _csv_has_data_rows(data_mv):
+ raise ValueError(
+ f"CSV file has zero data rows: {fpath}")
+ part_name = f"{name}.csv/part_{idx:04d}.csv"
+ tval = time.perf_counter() - tv0
+
+ size_bytes = data_mv.nbytes
+ num_parts = max(1, math.ceil(size_bytes / st.part_size))
+ upload_res = _start_table_upload_with_parts(
+ table_name=part_name,
+ file_type=detected_type,
+ file_size_bytes=size_bytes,
+ num_parts=num_parts,
+ )
+ try:
+ urls = [
+ u for k, u in sorted(
+ upload_res.presigned_part_urls.items(),
+ key=lambda kv: int(kv[0]))
+ ]
+ except Exception:
+ urls = list(upload_res.presigned_part_urls.values())
+
+ loop_inner = _KUMO_EVENT_LOOP
+ part_metadata_list_fut = asyncio.run_coroutine_threadsafe(
+ multi_put_bounded(
+ urls=urls,
+ data_iter=_iter_mv_chunks(data_mv, st.part_size),
+ tqdm_bar_position=3,
+ concurrency=max(1, min(st.part_conc, len(urls))),
+ upload_progress_cb=lambda n: _safe_bar_update(
+ uploaded_bar, n),
+ upload_subchunk_bytes=UPLOAD_CHUNK_BYTES,
+ ),
+ loop_inner,
+ )
+ part_metadata_list = part_metadata_list_fut.result()
+
+ for i in range(5):
+ try:
+ _complete_table_upload(
+ table_name=part_name,
+ file_type=detected_type,
+ upload_path=upload_res.temp_upload_path,
+ upload_id=upload_res.upload_id,
+ parts_metadata=part_metadata_list,
+ )
+ except HTTPException as e:
+ if e.status_code == 500 and i < 4:
+ time.sleep(2**(i - 1))
+ continue
+ else:
+ raise
+ else:
+ break
+
+ try:
+ if buf:
+ buf.close()
+ except Exception:
+ pass
+ del buf, data_mv, header_line
+ gc.collect()
+
+ _safe_bar_update(file_bar, 1)
+ _merge_status_update(fpath)
+ _log_file_timing("dir-file(multipart)", fpath, fsize, tread,
+ tval, 0.0)
+
+ finally:
+ mem_budget.release(need_bytes)
+
+ indexed = list(enumerate(files, start=1))
+ first_ex = None
+ with ThreadPoolExecutor(max_workers=par) as ex:
+ futures = {
+ ex.submit(_worker, idx, fmeta): (idx, fmeta["path"])
+ for idx, fmeta in indexed
+ }
+ for fut in as_completed(futures):
+ try:
+ fut.result()
+ except Exception as e:
+ first_ex = e
+ for f2 in futures:
+ f2.cancel()
+ break
+ if first_ex:
+ raise first_ex
+
+ # after bars close, log any header renames once
+ if detected_type == "csv" and rename_aggregate:
+ pairs = ", ".join(f"{k}->{v}" for k, v in rename_aggregate.items())
+ logger.info("CSV header sanitized (renamed): %s", pairs)
+
+ logger.info("Upload complete. Validated table %s.", name)
+
+
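Each _worker call brackets its read/validate/upload work with mem_budget.acquire(need_bytes) and mem_budget.release(need_bytes), so concurrent workers never hold more bytes in memory than the computed budget. The MemoryBudget class itself is defined elsewhere in the package; a minimal sketch of the gating pattern assumed here, built on a condition variable (illustrative only, not the package's actual implementation), would be:

    import threading

    class MemoryBudgetSketch:
        """Block acquire() until the requested bytes fit under the budget."""

        def __init__(self, budget_bytes: int) -> None:
            self._budget = budget_bytes
            self._in_use = 0
            self._cond = threading.Condition()

        def acquire(self, n: int) -> None:
            # Clamp oversized requests so a single large file cannot
            # deadlock waiting for space that will never become available.
            n = min(n, self._budget)
            with self._cond:
                while self._in_use + n > self._budget:
                    self._cond.wait()
                self._in_use += n

        def release(self, n: int) -> None:
            n = min(n, self._budget)
            with self._cond:
                self._in_use -= n
                self._cond.notify_all()

The CSV branch reserves roughly twice the file size because a header rewrite may temporarily hold both the original and the rewritten buffer.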
+ def _upload_table_remote(
+ name: str,
+ path: str,
+ auto_partition: bool = True,
+ partition_size_mb: int = 250,
+ parallelism: Optional[int] = None,
+ file_type: Optional[str] = None,
+ ) -> None:
+ """Dispatch remote upload to file or directory paths."""
+ fs, url = _get_fs_and_path(path)
+ info = _remote_info(fs, url)
+ st = _make_remote_settings(parallelism)
+
+ if info.get("type") == "file":
+ return _remote_upload_file(name, fs, url, info, st, file_type)
+ if info.get("type") == "directory":
+ return _remote_upload_directory(name, fs, url, info, st, file_type)
+ raise ValueError(f"Unsupported remote object type for {path}: {info}")
+
+
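For context, this dispatcher is what upstream upload helpers reach once a remote path has been resolved. A hypothetical invocation (the bucket, prefix, and table name are placeholders) might look like:

    # Upload all Parquet parts under the prefix as table "transactions".
    # Parallelism is auto-tuned when not passed explicitly.
    _upload_table_remote(
        name="transactions",
        path="s3://example-bucket/exports/transactions/",
        file_type="parquet",
    )

Passing file_type=None instead lets the directory handler detect the format, provided the prefix contains only one of CSV or Parquet.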
+ # -----------------------
+ # Column name validator
+ # -----------------------
+ def _validate_columns_or_raise(names: List[str]) -> None:
+ # Ensure the sanitized form equals the original to enforce our header
+ # rules for Parquet (Parquet files are never rewritten); CSV headers are
+ # already sanitized proactively upstream.
+ new, changed = _sanitize_columns(names)
+ if changed:
+ diffs = [f"{o}->{n}" for o, n in zip(names, new) if o != n]
+ raise ValueError(
+ "Column names contain invalid characters or duplicates. "
+ "Please rename the following columns:\n " + ", ".join(diffs))