kumoai 2.9.0.dev202509081831__cp312-cp312-win_amd64.whl → 2.12.0.dev202511111731__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. kumoai/__init__.py +4 -2
  2. kumoai/_version.py +1 -1
  3. kumoai/client/client.py +10 -5
  4. kumoai/client/endpoints.py +1 -0
  5. kumoai/client/rfm.py +37 -8
  6. kumoai/connector/file_upload_connector.py +71 -102
  7. kumoai/connector/utils.py +1367 -236
  8. kumoai/experimental/rfm/__init__.py +5 -3
  9. kumoai/experimental/rfm/authenticate.py +8 -5
  10. kumoai/experimental/rfm/infer/timestamp.py +7 -4
  11. kumoai/experimental/rfm/local_graph.py +90 -80
  12. kumoai/experimental/rfm/local_graph_sampler.py +16 -8
  13. kumoai/experimental/rfm/local_graph_store.py +22 -6
  14. kumoai/experimental/rfm/local_pquery_driver.py +336 -42
  15. kumoai/experimental/rfm/local_table.py +100 -22
  16. kumoai/experimental/rfm/pquery/__init__.py +4 -4
  17. kumoai/experimental/rfm/pquery/{backend.py → executor.py} +24 -58
  18. kumoai/experimental/rfm/pquery/{pandas_backend.py → pandas_executor.py} +278 -222
  19. kumoai/experimental/rfm/rfm.py +514 -117
  20. kumoai/jobs.py +1 -0
  21. kumoai/kumolib.cp312-win_amd64.pyd +0 -0
  22. kumoai/trainer/trainer.py +19 -10
  23. kumoai/utils/progress_logger.py +68 -0
  24. {kumoai-2.9.0.dev202509081831.dist-info → kumoai-2.12.0.dev202511111731.dist-info}/METADATA +4 -5
  25. {kumoai-2.9.0.dev202509081831.dist-info → kumoai-2.12.0.dev202511111731.dist-info}/RECORD +28 -28
  26. {kumoai-2.9.0.dev202509081831.dist-info → kumoai-2.12.0.dev202511111731.dist-info}/WHEEL +0 -0
  27. {kumoai-2.9.0.dev202509081831.dist-info → kumoai-2.12.0.dev202511111731.dist-info}/licenses/LICENSE +0 -0
  28. {kumoai-2.9.0.dev202509081831.dist-info → kumoai-2.12.0.dev202511111731.dist-info}/top_level.txt +0 -0
kumoai/connector/utils.py CHANGED
@@ -1,13 +1,30 @@
1
1
  import asyncio
2
+ import csv
3
+ import gc
2
4
  import io
3
5
  import math
4
6
  import os
5
7
  import re
6
8
  import tempfile
9
+ import threading
7
10
  import time
8
- import warnings
11
+ from concurrent.futures import ThreadPoolExecutor, as_completed
12
+ from dataclasses import dataclass
9
13
  from logging import getLogger
10
- from typing import Any, Callable, Generator, List, Tuple
14
+ from typing import (
15
+ Any,
16
+ AsyncIterator,
17
+ Callable,
18
+ Deque,
19
+ Dict,
20
+ Generator,
21
+ Iterator,
22
+ List,
23
+ Optional,
24
+ Tuple,
25
+ Union,
26
+ )
27
+ from urllib.parse import urlparse
11
28
 
12
29
  import aiohttp
13
30
  import pandas as pd
@@ -21,177 +38,322 @@ from kumoapi.data_source import (
21
38
  StartFileUploadResponse,
22
39
  )
23
40
  from tqdm import tqdm
24
- from tqdm.asyncio import tqdm_asyncio
25
41
 
26
42
  from kumoai import global_state
43
+ # still used for server-side completion retries
27
44
  from kumoai.exceptions import HTTPException
28
45
  from kumoai.futures import _KUMO_EVENT_LOOP
29
46
 
30
- CHUNK_SIZE = 100 * 10**6 # 100 MB
47
+ # -------------------
48
+ # Constants & Globals
49
+ # -------------------
50
+ logger = getLogger(__name__)
51
+
52
+ CHUNK_SIZE = 100 * 10**6 # 100 MB (legacy local single-file chunk)
53
+ READ_CHUNK_BYTES = 8 * 1024**2 # 8 MiB remote read buffer
54
+ UPLOAD_CHUNK_BYTES = 8 * 1024**2 # 8 MiB streamed PUT sub-chunks
31
55
  MAX_PARTITION_SIZE = 1000 * 1024**2 # 1GB
32
56
  MIN_PARTITION_SIZE = 100 * 1024**2 # 100MB
33
57
 
34
- logger = getLogger(__name__)
35
-
36
58
  CONNECTOR_ID_MAP = {
37
59
  "csv": "csv_upload_connector",
38
60
  "parquet": "parquet_upload_connector",
39
61
  }
40
62
 
63
+ _TQDM_LOCK = threading.Lock()
64
+
65
+
66
+ # ---------------
67
+ # Small utilities
68
+ # ---------------
69
+ def _fmt_bytes(n: int) -> str:
70
+ value = float(n)
71
+ units = ["B", "KiB", "MiB", "GiB", "TiB", "PiB"]
72
+ for unit in units:
73
+ if value < 1024:
74
+ return f"{value:.1f} {unit}"
75
+ value /= 1024
76
+ return f"{value:.1f} EiB"
77
+
78
+
79
+ def _fmt_secs(s: float) -> str:
80
+ if s < 1:
81
+ return f"{s*1000:.0f} ms"
82
+ return f"{s:.2f} s"
83
+
84
+
85
+ def _fmt_rate(nbytes: int, secs: float) -> str:
86
+ if secs <= 0:
87
+ return "-"
88
+ return f"{(nbytes / secs) / 1024**2:.1f} MB/s"
89
+
90
+
91
+ def _short_path(p: str, maxlen: int = 60) -> str:
92
+ if len(p) <= maxlen:
93
+ return p
94
+ try:
95
+ parsed = urlparse(p)
96
+ head = f"{parsed.scheme}://"
97
+ tail = p[-40:]
98
+ return f"{head}…{tail}"
99
+ except Exception:
100
+ return f"…{p[-maxlen:]}"
101
+
41
102
 
42
- async def put(
103
+ def _safe_bar_update(bar: tqdm, inc: int) -> None:
104
+ with _TQDM_LOCK:
105
+ try:
106
+ bar.update(inc)
107
+ except Exception:
108
+ pass
109
+
110
+
111
+ def _log_file_timing(label: str, path: str, size: int, tread: float,
112
+ tval: float, tupl: float) -> None:
113
+ logger.debug("[%s] %s (%s) | read=%s @ %s | validate=%s | upload=%s @ %s",
114
+ label, path, _fmt_bytes(size), _fmt_secs(tread),
115
+ _fmt_rate(size, max(tread, 1e-6)), _fmt_secs(tval),
116
+ _fmt_secs(tupl), _fmt_rate(size, max(tupl, 1e-6)))
117
+
118
+
119
+ # -----------------------
120
+ # Async upload primitives
121
+ # -----------------------
122
+ def _iter_memview_stream(
123
+ mv: memoryview,
124
+ subchunk_bytes: int,
125
+ progress_cb: Optional[Callable[[int], None]] = None,
126
+ ) -> Iterator[memoryview]:
127
+ """Yield memoryview slices (zero-copy) for streaming PUT."""
128
+ pos = 0
129
+ n = mv.nbytes
130
+ while pos < n:
131
+ nxt = min(n, pos + subchunk_bytes)
132
+ chunk = mv[pos:nxt] # zero-copy slice
133
+ pos = nxt
134
+ if progress_cb:
135
+ try:
136
+ progress_cb(len(chunk))
137
+ except Exception:
138
+ pass
139
+ yield chunk
140
+
141
+
142
+ async def _put_with_retry_streamed(
43
143
  session: aiohttp.ClientSession,
44
144
  url: str,
45
- data: bytes,
145
+ mv: memoryview,
46
146
  part_no: int,
147
+ subchunk_bytes: int = UPLOAD_CHUNK_BYTES,
148
+ progress_cb: Optional[Callable[[int], None]] = None,
149
+ retries: int = 3,
47
150
  ) -> Tuple[int, str]:
48
- r"""Performs an asynchronous PUT request to upload data to a presigned S3
49
- URL, and returns a tuple corresponding to the uploaded part number and
50
- the Etag of the header.
51
-
52
- Args:
53
- session: the ``aiohttp`` client session to use for the request
54
- url: the S3 presigned URL to PUT ``data`` to
55
- data: the data (``bytes``) that should be PUT to ``url``
56
- part_no: the part number of the data to be PUT
151
+ """Stream a memoryview to a presigned URL using an *async* generator so
152
+ aiohttp does not try to wrap it as multipart/form-data. We also set
153
+ Content-Length explicitly so S3/GCS expects a fixed-size payload (avoids
154
+ chunked TE).
57
155
  """
58
- # TODO(manan): add retry...
59
- async with session.put(url, data=data) as res:
60
- logger.debug("PUT part_no=%s bytes=%s", part_no, len(data))
61
- _ = await res.text()
62
- if res.status != 200:
63
- raise RuntimeError(
64
- f"PUT URL={url} failed: with status {res.status}: "
65
- f"{res}")
66
- headers = res.headers
67
- return (part_no + 1, headers['Etag'])
68
-
69
-
70
- async def multi_put(
71
- loop: asyncio.AbstractEventLoop,
156
+
157
+ # Build a fresh async generator per attempt (can't reuse after failure).
158
+ def _make_async_gen() -> Callable[[], Any]:
159
+ async def _agen() -> AsyncIterator[memoryview]:
160
+ # Yield zero-copy memoryview slices; aiohttp can send memoryview
161
+ # directly.
162
+ for chunk in _iter_memview_stream(mv, subchunk_bytes, progress_cb):
163
+ yield chunk
164
+ # cooperative yield; keeps event loop snappy without extra
165
+ # copies
166
+ await asyncio.sleep(0)
167
+
168
+ return _agen
169
+
170
+ headers = {
171
+ "Content-Type": "application/octet-stream",
172
+ "Content-Length": str(mv.nbytes),
173
+ }
174
+
175
+ attempt = 0
176
+ while True:
177
+ try:
178
+ async with session.put(url, data=_make_async_gen()(),
179
+ headers=headers) as res:
180
+ # Read/consume response to free the connection
181
+ _ = await res.read()
182
+ if res.status != 200:
183
+ raise RuntimeError(
184
+ f"PUT failed {res.status}: {res.reason}")
185
+ etag = res.headers.get("ETag") or res.headers.get("Etag") or ""
186
+ return (part_no + 1, etag)
187
+ except Exception:
188
+ attempt += 1
189
+ if attempt > retries:
190
+ raise
191
+ # backoff before retrying; generator will be recreated next loop
192
+ await asyncio.sleep(0.5 * attempt)
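The retry loop above rebuilds its async generator on every attempt because an exhausted generator cannot be replayed. A minimal standalone sketch (not part of the diff) of the behaviour that comment relies on:

    import asyncio

    async def demo() -> None:
        async def agen():
            yield b"part-1"
            yield b"part-2"

        gen = agen()
        assert [chunk async for chunk in gen] == [b"part-1", b"part-2"]
        # A second pass over the same, now-exhausted generator yields nothing,
        # which is why a fresh generator object is built for every retry.
        assert [chunk async for chunk in gen] == []

    asyncio.run(demo())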
193
+
194
+
195
+ async def multi_put_bounded(
72
196
  urls: List[str],
73
- data: Generator[bytes, None, None],
74
- tqdm_bar_position: int = 0,
197
+ data_iter: Generator[Union[bytes, memoryview], None, None],
198
+ tqdm_bar_position: int = 0, # kept for compatibility (unused)
199
+ concurrency: int = 4,
200
+ upload_progress_cb: Optional[Callable[[int], None]] = None,
201
+ upload_subchunk_bytes: int = UPLOAD_CHUNK_BYTES,
75
202
  ) -> List[PartUploadMetadata]:
76
- r"""Performs multiple asynchronous PUT requests of the data yielded
77
- from the ``data`` generator to the specified URLs. If the data
78
- generator is exhausted early, only a subset of URLs are used. If
79
- the data generator is not exhausted by the URLs, uploaded data may
80
- be corrupted!
203
+ """Multipart uploader with bounded concurrency and byte-accurate progress.
204
+ No extra progress bar here; caller drives a single byte counter via
205
+ upload_progress_cb.
81
206
  """
82
- # TODO(manan): retry
83
- # TODO(manan): properly stream chunks
84
- async with aiohttp.ClientSession(
85
- loop=loop,
86
- connector=aiohttp.TCPConnector(verify_ssl=False),
87
- headers={'Content-Type': 'binary'},
88
- ) as session:
89
- results = await tqdm_asyncio.gather(
90
- *[
91
- put(session, url, data, i)
92
- for i, (url, data) in enumerate(zip(urls, data))
93
- ], desc="Uploading chunks", position=tqdm_bar_position,
94
- leave=False)
95
- for r in results:
96
- if isinstance(r, BaseException):
97
- raise r
98
- return [PartUploadMetadata(v[0], v[1]) for v in results]
207
+ sem = asyncio.Semaphore(concurrency)
208
+ results: List[Union[Tuple[int, str], None]] = [None] * len(urls)
209
+
210
+ async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(
211
+ ssl=False)) as session:
212
+
213
+ async def worker(idx: int, url: str, chunk: Union[bytes,
214
+ memoryview]) -> None:
215
+ async with sem:
216
+ mv = chunk if isinstance(chunk,
217
+ memoryview) else memoryview(chunk)
218
+ res = await _put_with_retry_streamed(
219
+ session=session,
220
+ url=url,
221
+ mv=mv,
222
+ part_no=idx,
223
+ subchunk_bytes=upload_subchunk_bytes,
224
+ progress_cb=upload_progress_cb,
225
+ )
226
+ results[idx] = res
227
+
228
+ tasks: List[asyncio.Task] = []
229
+ for idx, url in enumerate(urls):
230
+ try:
231
+ chunk = next(data_iter)
232
+ except StopIteration:
233
+ break
234
+ tasks.append(asyncio.create_task(worker(idx, url, chunk)))
235
+
236
+ try:
237
+ await asyncio.gather(*tasks)
238
+ except Exception:
239
+ for t in tasks:
240
+ if not t.done():
241
+ t.cancel()
242
+ await asyncio.gather(*tasks, return_exceptions=True)
243
+ raise
244
+
245
+ out: List[PartUploadMetadata] = []
246
+ for r in results:
247
+ if r is None:
248
+ continue
249
+ out.append(PartUploadMetadata(r[0], r[1]))
250
+ return out
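Both `_iter_memview_stream` and the worker above lean on the fact that wrapping and slicing a `memoryview` never copies the payload; a quick standalone check of that property (sizes are arbitrary):

    data = bytearray(16 * 1024**2)      # 16 MiB backing buffer
    mv = memoryview(data)
    part = mv[:8 * 1024**2]             # slicing returns a view, not a copy
    assert part.obj is data             # both reference the same buffer
    assert part.nbytes == 8 * 1024**2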
99
251
 
100
252
 
101
253
  def stream_read(
102
254
  f: io.BufferedReader,
103
255
  chunk_size: int,
104
256
  ) -> Generator[bytes, None, None]:
105
- r"""Streams ``chunk_size`` contiguous bytes from buffered reader
106
- ``f`` each time the generator is yielded from.
257
+ r"""Streams ``chunk_size`` contiguous bytes from buffered reader ``f`` each
258
+ time the generator is yielded from.
107
259
  """
108
260
  while True:
109
261
  byte_buf = f.read(chunk_size)
110
262
  if len(byte_buf) == 0:
111
- # StopIteration:
112
263
  break
113
264
  yield byte_buf
114
265
 
115
266
 
267
+ def _validate_url_ext(url: str, file_type: Union[str, None]) -> str:
268
+ """Validate that `url` ends with .csv or .parquet. If `file_type` is
269
+ given ("csv" or "parquet"), ensure it matches. Returns the detected type
270
+ ("csv" or "parquet"), else raises ValueError.
271
+ """
272
+ u = url.lower()
273
+ detected = "csv" if u.endswith(".csv") else "parquet" if u.endswith(
274
+ ".parquet") else None
275
+ if detected is None:
276
+ raise ValueError(f"File path '{url}' must end with .csv or .parquet")
277
+
278
+ if file_type is None:
279
+ return detected
280
+
281
+ ft = file_type.lower()
282
+ if ft not in ("csv", "parquet"):
283
+ raise ValueError("file_type must be 'csv', 'parquet', or None")
284
+
285
+ if ft != detected:
286
+ raise ValueError(f"File path '{url}' must end with .{ft}")
287
+ return detected
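A quick sketch of this helper's contract as written above (illustrative paths; the import refers to the module shown in this diff):

    from kumoai.connector.utils import _validate_url_ext  # internal helper

    # With file_type=None the type is inferred from the extension:
    assert _validate_url_ext("s3://bucket/events.parquet", None) == "parquet"
    # A forced type must match the extension:
    assert _validate_url_ext("/data/users.csv", "csv") == "csv"
    # Mismatches and unknown extensions raise ValueError, e.g.:
    #   _validate_url_ext("/data/users.csv", "parquet")
    #   _validate_url_ext("/data/users.txt", None)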
288
+
289
+
116
290
  def upload_table(
117
291
  name: str,
118
292
  path: str,
119
293
  auto_partition: bool = True,
120
294
  partition_size_mb: int = 250,
295
+ parallelism: Optional[int] = None,
296
+ file_type: Optional[str] = None,
121
297
  ) -> None:
122
- r"""Synchronously uploads a table located on your local machine to the
123
- Kumo data plane. Tables uploaded in this way can be accessed with a
124
- :class:`~kumoai.connector.FileUploadConnector`.
125
-
126
- For files larger than 1GB, the table will be automatically partitioned
127
- into smaller chunks and uploaded with common prefix that allows
128
- FileUploadConnector to union them when reading.
129
-
130
- .. warning::
131
- Uploaded tables must be single files, either in parquet or CSV
132
- format. Partitioned tables are not currently supported.
133
-
134
- .. code-block:: python
298
+ """Upload a CSV/Parquet table to Kumo from a local file or a remote path
299
+ (s3://, gs://, abfs://, abfss://, az://).
135
300
 
136
- import kumoai
137
- from kumoai.connector import upload_table
138
-
139
- # Upload a small table
140
- upload_table(name="users", path="/data/users.parquet")
141
-
142
- # Upload a large parquet table (will be automatically partitioned)
143
- upload_table(name="transactions",
144
- path="/data/large_transactions.parquet")
145
-
146
- # Upload a large CSV table (will be automatically partitioned)
147
- upload_table(name="sales", path="/data/large_sales.csv")
148
-
149
- # Disable auto-partitioning (will raise error for large files)
150
- upload_table(name="users", path="/data/users.parquet",
151
- auto_partition=False)
301
+ - Local file: uploaded as-is. If >1 GiB and `auto_partition=True`, splits
302
+ into ~`partition_size_mb` MiB parts.
303
+ - Remote file: uploaded via multipart. Files >1 GiB are rejected
304
+ (re-shard to ~200 MiB and upload as a directory).
305
+ - Remote directory: auto-detects format (or use `file_type`), validates
306
+ each shard, and uploads in parallel with a memory-safe budget.
152
307
 
153
308
  Args:
154
- name: The name of the table to be uploaded. The uploaded table can
155
- be accessed from the :class:`~kumoai.connector.FileUploadConnector`
156
- with this name.
157
- path: The full path of the table to be uploaded, on the local
158
- machine.
159
- auto_partition: Whether to automatically partition large files (>1GB).
160
- If False and file is >1GB, raises ValueError. Supports both
161
- Parquet and CSV files.
162
- partition_size_mb: The size of each partition in MB. Only used if
163
- auto_partition is True.
164
- """
165
- warnings.warn(
166
- "upload_table is deprecated; use "
167
- "FileUploadConnector.upload instead.", DeprecationWarning,
168
- stacklevel=2)
169
-
170
- # Validate file type
171
- if not (path.endswith(".parquet") or path.endswith(".csv")):
172
- raise ValueError(f"Path {path} must be either a CSV or Parquet "
173
- f"file. Partitioned data is not currently "
174
- f"supported.")
309
+ name: Destination table name in Kumo.
310
+ path: Local path or remote URL to a .csv/.parquet file or directory.
311
+ auto_partition: Local-only; partition files >1 GiB.
312
+ partition_size_mb: Local partition target size (100–1000 MiB).
313
+ parallelism: Directory uploads concurrency override.
314
+ file_type: Force "csv" or "parquet" for directories; None = auto-detect
175
315
 
316
+ Raises:
317
+ ValueError: Bad/mixed types, zero rows, >1 GiB remote file,
318
+ schema/header mismatch, or invalid column names.
319
+ ImportError: Missing filesystem dependency (s3fs/gcsfs/adlfs).
320
+ RuntimeError: Remote stat/list/read or multipart completion failures.
321
+
322
+ Notes:
323
+ CSV headers are sanitized (invalid characters replaced with underscores,
324
+ duplicate names suffixed). Parquet columns must already be valid.
325
+ """
326
+ # Decide local vs remote by scheme
327
+ scheme = urlparse(path).scheme
328
+ if scheme in ("s3", "gs", "abfs", "abfss", "az"):
329
+ return _upload_table_remote(
330
+ name=name,
331
+ path=path,
332
+ auto_partition=auto_partition,
333
+ partition_size_mb=partition_size_mb,
334
+ parallelism=parallelism,
335
+ file_type=file_type,
336
+ )
337
+ # Local path
338
+ _validate_url_ext(path, file_type)
176
339
  file_size = os.path.getsize(path)
177
340
 
178
- # Route based on file size
179
341
  if file_size < MAX_PARTITION_SIZE:
180
342
  return _upload_single_file(name, path)
181
343
 
182
344
  if not auto_partition:
183
- raise ValueError(f"File {path} is {file_size / (1024**3):.2f}GB, "
184
- f"which exceeds the 1GB limit. Enable "
185
- f"auto_partition=True to automatically partition "
186
- f"large files.")
345
+ raise ValueError(
346
+ f"File {path} is {file_size / (1024**3):.2f}GB, which exceeds "
347
+ f"the 1GB limit. Enable auto_partition=True to automatically "
348
+ f"partition large files.")
187
349
 
188
- # Partition and upload large files
189
350
  partition_size = partition_size_mb * 1024**2
190
351
  if (partition_size > MAX_PARTITION_SIZE
191
352
  or partition_size < MIN_PARTITION_SIZE):
192
- raise ValueError(f"Partition size {partition_size_mb}MB must be "
193
- f"between {MIN_PARTITION_SIZE / 1024**2}MB and "
194
- f"{MAX_PARTITION_SIZE / 1024**2}MB.")
353
+ raise ValueError(
354
+ f"Partition size {partition_size_mb}MB must be between "
355
+ f"{MIN_PARTITION_SIZE / 1024**2}MB and "
356
+ f"{MAX_PARTITION_SIZE / 1024**2}MB.")
195
357
 
196
358
  logger.info("File %s is large with size %s, partitioning for upload...",
197
359
  path, file_size)
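The rewritten docstring drops the old `.. code-block:: python` examples; a minimal usage sketch of the new signature, based only on the parameters documented above (bucket and file paths are placeholders):

    import kumoai  # SDK must be initialised as usual
    from kumoai.connector import upload_table

    # Local file: uploaded as-is; files larger than 1 GiB are partitioned
    # automatically into ~partition_size_mb chunks.
    upload_table(name="users", path="/data/users.parquet")

    # Remote directory of CSV shards: force the format and upload up to four
    # shards concurrently.
    upload_table(name="transactions",
                 path="s3://my-bucket/transactions/",
                 file_type="csv", parallelism=4)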
@@ -202,25 +364,18 @@ def upload_table(
202
364
 
203
365
 
204
366
  def _handle_duplicate_names(names: List[str]) -> List[str]:
205
-
206
- unique_names = []
207
- unique_names_with_counts = {}
208
-
367
+ unique_names: List[str] = []
368
+ unique_counts: dict[str, int] = {}
209
369
  for name in names:
210
370
  if name not in unique_names:
211
- # The first instance of a column name will retain its name
212
- # without change.
213
- unique_names_with_counts[name] = 0
371
+ unique_counts[name] = 0
214
372
  unique_names.append(name)
215
373
  else:
216
- # Subsequent instances of a duplicated column name will have
217
- # numerals added to disambiguate.
218
- unique_names_with_counts[name] += 1
219
- new_name = f"{name}_{unique_names_with_counts[name]}"
374
+ unique_counts[name] += 1
375
+ new_name = f"{name}_{unique_counts[name]}"
220
376
  while new_name in names or new_name in unique_names:
221
- unique_names_with_counts[name] += 1
222
- new_name = f"{name}_{unique_names_with_counts[name]}"
223
-
377
+ unique_counts[name] += 1
378
+ new_name = f"{name}_{unique_counts[name]}"
224
379
  unique_names.append(new_name)
225
380
  return unique_names
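A short illustration of the de-duplication contract implemented above (the import points at the module in this diff; inputs are made up):

    from kumoai.connector.utils import _handle_duplicate_names  # internal helper

    # The first occurrence keeps its name; later duplicates get _1, _2, ...
    assert _handle_duplicate_names(["id", "name", "name", "name"]) == [
        "id", "name", "name_1", "name_2"
    ]
    # Suffixes skip over names that already exist in the input:
    assert _handle_duplicate_names(["a", "a", "a_1"]) == ["a", "a_2", "a_1"]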
226
381
 
@@ -233,20 +388,17 @@ def _sanitize_columns(names: List[str]) -> Tuple[List[str], bool]:
233
388
 
234
389
 
235
390
  def sanitize_file(src_path: str) -> Tuple[str, bool]:
236
- """Sanitizes the columns of a CSV or Parquet file by replacing invalid
237
- characters with underscores.
238
- Returns a tuple of the new path and a boolean indicating if the file was
239
- changed. If the file was not changed, the original path is returned.
240
- If the file was changed, a temporary file is created and returned.
241
- The temporary file should be deleted by the caller.
391
+ """Normalize column names in a CSV or Parquet file.
242
392
 
243
- Args:
244
- src_path: The path to the CSV or Parquet file to sanitize.
393
+ Rules:
394
+ - Replace any non-alphanumeric character with "_"
395
+ - Strip leading/trailing underscores
396
+ - Ensure uniqueness by appending suffixes: _1, _2, ...
245
397
 
246
- Returns:
247
- A tuple of the new path and a boolean indicating if the file was
248
- changed. If the file was not changed, the original path is returned.
249
- If the file was changed, a temporary file is created and returned.
398
+ Returns (path, changed):
399
+ - (src_path, False) if no changes were needed
400
+ - (temp_path, True) if a sanitized temp file was written (caller must
401
+ delete)
250
402
  """
251
403
  if src_path.endswith('.parquet'):
252
404
  pf = pq.ParquetFile(src_path)
@@ -255,7 +407,6 @@ def sanitize_file(src_path: str) -> Tuple[str, bool]:
255
407
  return src_path, False
256
408
  temp_file = tempfile.NamedTemporaryFile(suffix='.parquet',
257
409
  delete=False)
258
- # Create schema with sanitized column names
259
410
  original_schema = pf.schema.to_arrow_schema()
260
411
  fields = [
261
412
  field.with_name(new_name)
@@ -273,18 +424,15 @@ def sanitize_file(src_path: str) -> Tuple[str, bool]:
273
424
  new_cols, changed = _sanitize_columns(cols)
274
425
  if not changed:
275
426
  return src_path, False
276
-
277
427
  tmp = tempfile.NamedTemporaryFile(suffix='.csv', delete=False)
278
428
  tmp_path = tmp.name
279
429
  tmp.close()
280
-
281
430
  reader = pd.read_csv(src_path, chunksize=1_000_000)
282
431
  with open(tmp_path, 'w', encoding='utf-8', newline='') as out:
283
- out.write(','.join(new_cols) + '\n') # header once
432
+ out.write(','.join(new_cols) + '\n')
284
433
  for chunk in reader:
285
434
  chunk.columns = new_cols
286
435
  chunk.to_csv(out, header=False, index=False)
287
-
288
436
  return tmp_path, True
289
437
  else:
290
438
  raise ValueError(
@@ -296,14 +444,10 @@ def _upload_single_file(
296
444
  path: str,
297
445
  tqdm_bar_position: int = 0,
298
446
  ) -> None:
299
- r"""Upload a single file (original upload_table logic)."""
300
- # Validate:
301
447
  if not (path.endswith(".parquet") or path.endswith(".csv")):
302
- raise ValueError(f"Path {path} must be either a CSV or Parquet "
303
- f"file. Partitioned data is not currently "
304
- f"supported.")
448
+ raise ValueError(f"Path {path} must be either a CSV or Parquet file. "
449
+ "Partitioned data is not currently supported.")
305
450
 
306
- # Prepare upload (number of parts based on total size):
307
451
  file_type = 'parquet' if path.endswith('parquet') else 'csv'
308
452
  path, temp_file_created = sanitize_file(path)
309
453
  sz = os.path.getsize(path)
@@ -311,23 +455,24 @@ def _upload_single_file(
311
455
  logger.info("Uploading table %s (path: %s), size=%s bytes", name, path,
312
456
  sz)
313
457
 
314
- upload_res = _start_table_upload(
315
- table_name=name,
316
- file_type=file_type,
317
- file_size_bytes=sz,
318
- )
458
+ upload_res = _start_table_upload(table_name=name, file_type=file_type,
459
+ file_size_bytes=sz)
319
460
 
320
- # Chunk and upload:
321
461
  urls = list(upload_res.presigned_part_urls.values())
322
462
  loop = _KUMO_EVENT_LOOP
323
463
  part_metadata_list_fut = asyncio.run_coroutine_threadsafe(
324
- multi_put(loop, urls=urls, data=stream_read(
325
- open(path, 'rb'),
326
- CHUNK_SIZE,
327
- ), tqdm_bar_position=tqdm_bar_position), loop)
464
+ multi_put_bounded(
465
+ urls=urls,
466
+ data_iter=stream_read(open(path, 'rb'), CHUNK_SIZE),
467
+ tqdm_bar_position=tqdm_bar_position,
468
+ concurrency=min(4, len(urls)),
469
+ upload_progress_cb=None,
470
+ upload_subchunk_bytes=UPLOAD_CHUNK_BYTES,
471
+ ),
472
+ loop,
473
+ )
328
474
  part_metadata_list = part_metadata_list_fut.result()
329
475
 
330
- # Complete:
331
476
  if tqdm_bar_position == 0:
332
477
  logger.info("Upload complete. Validating table %s.", name)
333
478
  for i in range(5):
@@ -340,10 +485,10 @@ def _upload_single_file(
340
485
  parts_metadata=part_metadata_list,
341
486
  )
342
487
  except HTTPException as e:
488
+ # TODO(manan): this can happen when DELETE above has
489
+ # not propagated. So we retry with delay here. We
490
+ # assume DELETE is processed reasonably quickly:
343
491
  if e.status_code == 500 and i < 4:
344
- # TODO(manan): this can happen when DELETE above has
345
- # not propagated. So we retry with delay here. We
346
- # assume DELETE is processed reasonably quickly:
347
492
  time.sleep(2**(i - 1))
348
493
  continue
349
494
  else:
@@ -357,46 +502,37 @@ def _upload_single_file(
357
502
  os.unlink(path)
358
503
 
359
504
 
360
- def _upload_partitioned_parquet(
361
- name: str,
362
- path: str,
363
- partition_size: int,
364
- ) -> None:
505
+ def _upload_partitioned_parquet(name: str, path: str,
506
+ partition_size: int) -> None:
365
507
  r"""Upload a large parquet file by partitioning it into smaller chunks."""
366
508
  logger.info("File %s is large, partitioning for upload...", path)
367
-
368
509
  pf = pq.ParquetFile(path)
369
510
  new_columns, _ = _sanitize_columns(pf.schema.names)
370
- # Calculate partitions
371
- partitions = []
511
+
512
+ partitions: List[Tuple[int, List[int]]] = []
372
513
  part_idx = 0
373
514
  current_size = 0
374
515
  current_row_groups: list[int] = []
375
516
 
376
517
  for rg_idx in range(pf.num_row_groups):
377
518
  rg_size = pf.metadata.row_group(rg_idx).total_byte_size
378
-
379
519
  if rg_size > MAX_PARTITION_SIZE:
380
- raise ValueError(f"Row group {rg_idx} is larger than the "
381
- f"maximum partition size {MAX_PARTITION_SIZE} "
382
- f"bytes")
383
-
520
+ raise ValueError(
521
+ f"Row group {rg_idx} is larger than the maximum partition size"
522
+ f"{MAX_PARTITION_SIZE} bytes")
384
523
  if current_size + rg_size > partition_size and current_row_groups:
385
524
  partitions.append((part_idx, current_row_groups.copy()))
386
525
  part_idx += 1
387
526
  current_row_groups = []
388
527
  current_size = 0
389
-
390
528
  current_row_groups.append(rg_idx)
391
529
  current_size += rg_size
392
-
393
530
  if current_row_groups:
394
531
  partitions.append((part_idx, current_row_groups))
395
532
 
396
533
  logger.info("Splitting %s into %d partitions", path, len(partitions))
397
534
 
398
535
  def writer(path: str, row_groups: List[int]) -> None:
399
- # Create schema with sanitized column names
400
536
  original_schema = pf.schema.to_arrow_schema()
401
537
  fields = [
402
538
  field.with_name(new_name)
@@ -410,45 +546,32 @@ def _upload_partitioned_parquet(
410
546
  pq_writer.close()
411
547
 
412
548
  _upload_all_partitions(partitions, name, ".parquet", writer)
413
- # validation done by _upload_single_file on each partition
414
549
  logger.info("Upload complete. Validated table %s.", name)
415
550
 
416
551
 
417
- def _upload_partitioned_csv(
418
- name: str,
419
- path: str,
420
- partition_size: int,
421
- ) -> None:
552
+ def _upload_partitioned_csv(name: str, path: str, partition_size: int) -> None:
422
553
  r"""Upload a large CSV file by partitioning it into smaller chunks."""
423
- # calculate partitions
424
- partitions = []
554
+ partitions: List[Tuple[int, List[str]]] = []
425
555
  part_idx = 0
426
556
  columns = pd.read_csv(path, nrows=0).columns.tolist()
427
557
  new_columns, _ = _sanitize_columns(columns)
428
558
  with open(path, 'r', encoding='utf-8') as f:
429
- # preserve header per partition
430
- _ = f.readline() # skip header
559
+ _ = f.readline()
431
560
  header = ','.join(new_columns) + '\n'
432
561
  header_size = len(header.encode('utf-8'))
433
-
434
562
  current_lines = [header]
435
-
436
563
  current_size = header_size
437
-
438
564
  for line in f:
439
565
  line_size = len(line.encode('utf-8'))
440
-
441
566
  if (current_size + line_size > partition_size
442
567
  and len(current_lines) > 1):
443
568
  partitions.append((part_idx, current_lines.copy()))
444
569
  part_idx += 1
445
- current_lines = [header] # Start new partition with header
570
+ current_lines = [header]
446
571
  current_size = header_size
447
-
448
572
  current_lines.append(line)
449
573
  current_size += line_size
450
-
451
- if len(current_lines) > 1: # More than just header
574
+ if len(current_lines) > 1:
452
575
  partitions.append((part_idx, current_lines))
453
576
 
454
577
  logger.info("Splitting %s into %d partitions", path, len(partitions))
@@ -458,7 +581,6 @@ def _upload_partitioned_csv(
458
581
  f.writelines(lines)
459
582
 
460
583
  _upload_all_partitions(partitions, name, ".csv", writer)
461
- # validation done by _upload_single_file on each partition
462
584
  logger.info("Upload complete. Validated table %s.", name)
463
585
 
464
586
 
@@ -472,7 +594,6 @@ def _upload_all_partitions(
472
594
  for part_idx, partition_data in pbar:
473
595
  partition_desc = f"Part {part_idx+1}/{len(partitions)}"
474
596
  pbar.set_postfix_str(partition_desc)
475
-
476
597
  _create_and_upload_partition(
477
598
  name=name,
478
599
  part_idx=part_idx,
@@ -496,33 +617,23 @@ def _create_and_upload_partition(
496
617
  """
497
618
  partition_name = (f"{name}{file_suffix}/"
498
619
  f"part_{part_idx+1:04d}{file_suffix}")
499
-
500
620
  with tempfile.NamedTemporaryFile(suffix=file_suffix,
501
621
  delete=False) as temp_file:
502
622
  partition_path = temp_file.name
503
623
 
504
624
  try:
505
625
  partition_writer(partition_path, partition_data)
506
-
507
- # Upload partition immediately with a nested progress bar
508
626
  _upload_single_file(partition_name, partition_path,
509
627
  tqdm_bar_position=tqdm_bar_position)
510
-
511
628
  finally:
512
- # clean up the temporary file, even if the upload fails
513
629
  try:
514
630
  os.unlink(partition_path)
515
631
  except OSError:
516
- pass # File might already be deleted or not exist
632
+ pass
517
633
 
518
634
 
519
- def delete_uploaded_table(
520
- name: str,
521
- file_type: str,
522
- ) -> None:
523
- r"""Now deprecated in favor of
524
- :func:`kumoai.connector.file_upload_connector.FileUploadConnector.delete`.
525
- Synchronously deletes a previously uploaded table from the Kumo data
635
+ def delete_uploaded_table(name: str, file_type: str) -> None:
636
+ r"""Synchronously deletes a previously uploaded table from the Kumo data
526
637
  plane.
527
638
 
528
639
  .. code-block:: python
@@ -545,10 +656,6 @@ def delete_uploaded_table(
545
656
  file_type: The file type of the table to be deleted; this can either
546
657
  be :obj:`"parquet"` or :obj:`"csv"`
547
658
  """
548
- warnings.warn(
549
- "delete_uploaded_table is deprecated; use "
550
- "FileUploadConnector.delete instead.", DeprecationWarning,
551
- stacklevel=2)
552
659
  assert file_type in {'parquet', 'csv'}
553
660
  req = DeleteUploadedFileRequest(
554
661
  source_table_name=name,
@@ -558,11 +665,7 @@ def delete_uploaded_table(
558
665
  logger.info("Successfully deleted table %s from Kumo.", name)
559
666
 
560
667
 
561
- def replace_table(
562
- name: str,
563
- path: str,
564
- file_type: str,
565
- ) -> None:
668
+ def replace_table(name: str, path: str, file_type: str) -> None:
566
669
  r"""Replaces an existing uploaded table on the Kumo data plane with a new
567
670
  table.
568
671
 
@@ -592,20 +695,15 @@ def replace_table(
592
695
  ValueError: If the specified path does not point to a valid
593
696
  `.csv` or `.parquet` file.
594
697
  """
595
- # Validate:
596
698
  if not (path.endswith(".parquet") or path.endswith(".csv")):
597
- raise ValueError(f"Path {path} must be either a CSV or Parquet "
598
- f"file. Partitioned data is not currently "
599
- f"supported.")
600
-
699
+ raise ValueError(f"Path {path} must be either a CSV or Parquet file. "
700
+ "Partitioned data is not currently supported.")
601
701
  try:
602
702
  logger.info("Deleting previously uploaded table %s of type %s.", name,
603
703
  file_type)
604
704
  delete_uploaded_table(name=name, file_type=file_type)
605
705
  except Exception:
606
- # TODO(manan): fix this...
607
706
  pass
608
-
609
707
  logger.info("Uploading table %s.", name)
610
708
  upload_table(name=name, path=path)
611
709
  logger.info("Successfully replaced table %s with the new table.", name)
@@ -625,6 +723,21 @@ def _start_table_upload(
625
723
  return global_state.client.connector_api.start_file_upload(req)
626
724
 
627
725
 
726
+ def _start_table_upload_with_parts(
727
+ table_name: str,
728
+ file_type: str,
729
+ file_size_bytes: int,
730
+ num_parts: int,
731
+ ) -> StartFileUploadResponse:
732
+ assert file_type in CONNECTOR_ID_MAP.keys()
733
+ req = StartFileUploadRequest(
734
+ source_table_name=table_name,
735
+ connector_id=CONNECTOR_ID_MAP[file_type],
736
+ num_parts=max(1, int(num_parts)),
737
+ )
738
+ return global_state.client.connector_api.start_file_upload(req)
739
+
740
+
628
741
  def _complete_table_upload(
629
742
  table_name: str,
630
743
  file_type: str,
@@ -633,12 +746,1030 @@ def _complete_table_upload(
633
746
  parts_metadata: List[PartUploadMetadata],
634
747
  ) -> None:
635
748
  assert file_type in CONNECTOR_ID_MAP.keys()
636
-
637
749
  req = CompleteFileUploadRequest(
638
750
  source_table_name=table_name,
639
751
  connector_id=CONNECTOR_ID_MAP[file_type],
640
- temp_upload_path=upload_path,
641
- upload_id=upload_id,
752
+ temp_upload_path=str(upload_path),
753
+ upload_id=str(upload_id),
642
754
  parts_metadata=parts_metadata,
755
+ # Server-side validation is disabled because client-side (SDK)
756
+ # validation is now comprehensive and eliminates the need for
757
+ # additional server-side validation.
758
+ validate_data=False,
643
759
  )
644
760
  return global_state.client.connector_api.complete_file_upload(req)
761
+
762
+
763
+ # -----------------------
764
+ # Remote I/O (fsspec)
765
+ # -----------------------
766
+
767
+ # Type alias for a filesystem object; avoids a hard dependency on fsspec
768
+ Filesystem = Any
769
+
770
+
771
+ def _make_filesystem(scheme: str) -> Filesystem:
772
+ if scheme == "s3":
773
+ try:
774
+ import fsspec # noqa: F401
775
+ import s3fs # noqa: F401
776
+ except Exception:
777
+ raise ImportError(
778
+ "S3 paths require 's3fs'. Install: pip install s3fs")
779
+ fs = fsspec.filesystem("s3")
780
+ elif scheme == "gs":
781
+ try:
782
+ import fsspec # noqa: F401
783
+ import gcsfs # noqa: F401
784
+ except Exception:
785
+ raise ImportError(
786
+ "GCS paths require 'gcsfs'. Install: pip install gcsfs")
787
+ fs = fsspec.filesystem("gcs")
788
+ elif scheme in ("abfs", "abfss", "az"):
789
+ try:
790
+ import adlfs # noqa: F401
791
+ import fsspec # noqa: F401
792
+ except Exception:
793
+ raise ImportError(
794
+ "Azure paths require 'adlfs'. Install: pip install adlfs")
795
+ fs = fsspec.filesystem(scheme)
796
+ else:
797
+ raise ValueError(f"Unsupported remote scheme: {scheme}")
798
+ return fs
799
+
800
+
801
+ def _get_fs_and_path(url: str) -> Tuple[Filesystem, str]:
802
+ parsed = urlparse(url)
803
+ scheme = parsed.scheme
804
+ fs = _make_filesystem(scheme)
805
+ return fs, url
806
+
807
+
808
+ def _remote_info(fs: Filesystem, path: str) -> dict:
809
+ try:
810
+ info = fs.info(path)
811
+ if info.get("type") in ("file", "directory"):
812
+ return info
813
+ # s3fs for directories can return {'Key':..., 'Size':...}; normalize
814
+ if info.get("Size") is not None and info.get("Key"):
815
+ return {
816
+ "type": "file",
817
+ "size": info.get("Size"),
818
+ "name": info.get("Key")
819
+ }
820
+ return info
821
+ except Exception as e:
822
+ raise RuntimeError(f"Failed to stat remote path {path}: {e}")
823
+
824
+
825
+ def _remote_dir_manifest(fs: Filesystem, path: str) -> dict:
826
+ # Return lists of parquet and csv entries with size
827
+ try:
828
+ listing = fs.ls(path, detail=True)
829
+ except Exception as e:
830
+ raise RuntimeError(f"Failed to list remote directory {path}: {e}")
831
+
832
+ parquet_files: List[dict] = []
833
+ csv_files: List[dict] = []
834
+ for ent in listing:
835
+ if isinstance(ent, dict):
836
+ p = ent.get("name") or ent.get("Key") or ent.get("path")
837
+ s = ent.get("size") or ent.get("Size") or 0
838
+ t = ent.get("type") or ent.get("StorageClass") or ""
839
+ if t == "directory":
840
+ continue
841
+ else:
842
+ p = ent
843
+ try:
844
+ s = fs.info(p).get("size", 0)
845
+ except Exception:
846
+ s = 0
847
+ if not isinstance(p, str):
848
+ continue
849
+ ext = os.path.splitext(p.lower())[1]
850
+ if ext == ".parquet":
851
+ parquet_files.append({"path": p, "size": int(s or 0)})
852
+ elif ext == ".csv":
853
+ csv_files.append({"path": p, "size": int(s or 0)})
854
+
855
+ return {"parquet": parquet_files, "csv": csv_files}
856
+
857
+
858
+ def _read_remote_file_with_progress(
859
+ fs: Filesystem,
860
+ path: str,
861
+ expected_size: Optional[int],
862
+ update_bytes: Optional[Callable[[int], Optional[bool]]] = None,
863
+ capture_first_line: bool = False,
864
+ ) -> Tuple[io.BytesIO, memoryview, Optional[bytes]]:
865
+ """Stream into a single BytesIO (one allocation) and return a zero-copy
866
+ memoryview.
867
+ """
868
+ buf = io.BytesIO()
869
+
870
+ header_line: Optional[bytes] = None
871
+ if capture_first_line:
872
+ header_acc = bytearray()
873
+ seen_nl = False
874
+ else:
875
+ header_acc = bytearray()
876
+ seen_nl = True
877
+
878
+ with fs.open(path, "rb") as fobj:
879
+ while True:
880
+ chunk = fobj.read(READ_CHUNK_BYTES)
881
+ if not chunk:
882
+ break
883
+ if capture_first_line and not seen_nl:
884
+ nl_idx = chunk.find(b"\n")
885
+ if nl_idx != -1:
886
+ header_acc += chunk[:nl_idx]
887
+ # small copy only for header
888
+ header_line = bytes(header_acc)
889
+ seen_nl = True
890
+ else:
891
+ header_acc += chunk
892
+ buf.write(chunk)
893
+ if update_bytes:
894
+ try:
895
+ update_bytes(len(chunk))
896
+ except Exception:
897
+ pass
898
+
899
+ if capture_first_line and not seen_nl:
900
+ header_line = bytes(header_acc)
901
+
902
+ mv = buf.getbuffer() # zero-copy view of BytesIO internal buffer
903
+ return buf, mv, header_line
904
+
905
+
906
+ # -----------------------
907
+ # Memory budget & helpers
908
+ # -----------------------
909
+ def _compute_mem_budget_bytes(files: List[dict]) -> int:
910
+ # 50% of system RAM
911
+ try:
912
+ import psutil
913
+ total = psutil.virtual_memory().total
914
+ except Exception:
915
+ total = 8 * 1024**3 # assume 8 GiB
916
+ budget = int(total * 0.50)
917
+ return max(budget, 512 * 1024**2) # at least 512 MiB
918
+
919
+
920
+ class MemoryBudget:
921
+ """A byte-level semaphore to prevent OOM when reading many shards."""
922
+ def __init__(self, budget_bytes: int) -> None:
923
+ self.budget = budget_bytes
924
+ self.avail = budget_bytes
925
+ self.cv = threading.Condition()
926
+
927
+ def acquire(self, need: int) -> None:
928
+ with self.cv:
929
+ while self.avail < need:
930
+ self.cv.wait(timeout=0.25)
931
+ self.avail -= need
932
+
933
+ def release(self, freed: int) -> None:
934
+ with self.cv:
935
+ self.avail += freed
936
+ if self.avail > self.budget:
937
+ self.avail = self.budget
938
+ self.cv.notify_all()
939
+
940
+
941
+ def _determine_parallelism(files: List[dict], requested: Optional[int]) -> int:
942
+ if requested is not None and requested > 0:
943
+ return min(requested, len(files))
944
+ env_par = os.getenv("KUMO_UPLOAD_PARALLELISM")
945
+ if env_par:
946
+ try:
947
+ val = int(env_par)
948
+ if val > 0:
949
+ return min(val, len(files))
950
+ except Exception:
951
+ pass
952
+
953
+ budget_bytes = _compute_mem_budget_bytes(files)
954
+ # 128 MiB overhead by default
955
+ try:
956
+ overhead_bytes = max(0, int(os.getenv("KUMO_UPLOAD_OVERHEAD_MB",
957
+ "128"))) * 1024**2
958
+ except Exception:
959
+ overhead_bytes = 128 * 1024**2
960
+
961
+ needs = []
962
+ for f in files:
963
+ size = int(f.get("size") or 0)
964
+ if size <= 0:
965
+ continue
966
+ needs.append(size + overhead_bytes)
967
+ if not needs:
968
+ return 1
969
+ needs.sort()
970
+ median_need = needs[len(needs) // 2]
971
+ par = max(1, budget_bytes // max(1, median_need))
972
+ return min(int(par), len(files))
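A worked example of the heuristic above, assuming the 8 GiB RAM fallback (psutil unavailable) and the default 128 MiB per-file overhead:

    budget = 4 * 1024**3                   # 50% of the assumed 8 GiB of RAM
    median_need = (250 + 128) * 1024**2    # ~250 MiB shards + 128 MiB overhead
    assert budget // median_need == 10     # up to 10 shards in flight,
                                           # further capped at len(files)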
973
+
974
+
975
+ def _iter_mv_chunks(mv: memoryview,
976
+ part_size: int) -> Generator[memoryview, None, None]:
977
+ pos = 0
978
+ n = mv.nbytes
979
+ while pos < n:
980
+ nxt = min(n, pos + part_size)
981
+ yield mv[pos:nxt] # zero-copy slice
982
+ pos = nxt
983
+
984
+
985
+ # -----------------------
986
+ # Parquet helpers
987
+ # -----------------------
988
+ def _parquet_schema_from_bytes(data_mv: memoryview) -> pa.Schema:
989
+ reader = pa.BufferReader(pa.py_buffer(data_mv))
990
+ pf = pq.ParquetFile(reader)
991
+
992
+ # zero-row guard via metadata (no data scan)
993
+ if getattr(pf.metadata, "num_rows", None) == 0:
994
+ raise ValueError("Parquet file contains zero rows.")
995
+
996
+ return pf.schema_arrow
997
+
998
+
999
+ def _parquet_num_rows_from_bytes(data_mv: memoryview) -> int:
1000
+ buf = pa.py_buffer(data_mv)
1001
+ reader = pa.BufferReader(buf)
1002
+ pf = pq.ParquetFile(reader)
1003
+ md = pf.metadata
1004
+ if md is None:
1005
+ total = 0
1006
+ for rg in range(pf.num_row_groups):
1007
+ total += pf.metadata.row_group(rg).num_rows
1008
+ return total
1009
+ return md.num_rows
1010
+
1011
+
1012
+ def validate_parquet_schema(schema: pa.Schema, source_name: str) -> None:
1013
+ """Validate a PyArrow schema for Kumo compatibility (source_name
1014
+ required).
1015
+
1016
+ Disallowed:
1017
+ - All large_* types: large_string, large_binary, large_list<*>
1018
+ - Any time-of-day types (time32/64<*>); ONLY epoch-based timestamps are
1019
+ allowed
1020
+ - Any duration types (e.g., pa.duration('ns'))
1021
+ - list<string> and list<bool>
1022
+ - Unsigned integers (uint8/16/32/64)
1023
+ - Null-typed columns
1024
+
1025
+ Allowed:
1026
+ - boolean, signed integer, floating, (regular) string, date, timestamp
1027
+ (epoch-based), (regular) binary
1028
+ - decimal up to configured precision (env KUMO_DECIMAL_MAX_PRECISION,
1029
+ default 18)
1030
+ - list of {signed integer, float}
1031
+ - dictionary<int, string>
1032
+
1033
+ Raises:
1034
+ ValueError listing offending columns (including source_name).
1035
+ """
1036
+ try:
1037
+ max_dec_prec = int(os.getenv("KUMO_DECIMAL_MAX_PRECISION", "18"))
1038
+ except Exception:
1039
+ max_dec_prec = 18
1040
+
1041
+ where = f" in {source_name}"
1042
+ errors: list[str] = []
1043
+
1044
+ for col, dt in zip(schema.names, schema.types):
1045
+ # 1) Hard-disallow all large_* types
1046
+ if pa.types.is_large_string(dt):
1047
+ errors.append(
1048
+ f" - column '{col}'{where} has unsupported type large_string")
1049
+ continue
1050
+ if pa.types.is_large_binary(dt):
1051
+ errors.append(
1052
+ f" - column '{col}'{where} has unsupported type large_binary")
1053
+ continue
1054
+ if pa.types.is_large_list(dt):
1055
+ errors.append(
1056
+ f" - column '{col}'{where} has unsupported type {dt} "
1057
+ f"(large_list not supported)")
1058
+ continue
1059
+
1060
+ # 2) Disallow time-of-day and duration
1061
+ if pa.types.is_time(dt):
1062
+ errors.append(
1063
+ f" - column '{col}'{where} has unsupported time-of-day type "
1064
+ f"'{dt}' (only epoch-based timestamps are supported)")
1065
+ continue
1066
+ if pa.types.is_duration(dt):
1067
+ errors.append(
1068
+ f" - column '{col}'{where} has unsupported duration "
1069
+ f"type '{dt}'")
1070
+ continue
1071
+
1072
+ # 3) Disallow unsigned integers and null columns
1073
+ if pa.types.is_unsigned_integer(dt):
1074
+ errors.append(
1075
+ f" - column '{col}'{where} has unsupported unsigned integer "
1076
+ "type '{dt}'")
1077
+ continue
1078
+ if pa.types.is_null(dt):
1079
+ errors.append(
1080
+ f" - column '{col}'{where} has unsupported null type '{dt}'")
1081
+ continue
1082
+
1083
+ supported = (
1084
+ pa.types.is_boolean(dt)
1085
+ # signed ints only
1086
+ or (pa.types.is_integer(dt)
1087
+ and not pa.types.is_unsigned_integer(dt)) or
1088
+ pa.types.is_floating(dt) or
1089
+ pa.types.is_string(dt) # regular string only
1090
+ or pa.types.is_date(dt) or
1091
+ pa.types.is_timestamp(dt) # epoch-based timestamps
1092
+ or pa.types.is_binary(dt) # regular binary only
1093
+ )
1094
+
1095
+ # 4) Decimals with precision limit
1096
+ if not supported and pa.types.is_decimal(dt):
1097
+ try:
1098
+ prec = int(getattr(dt, "precision", 0) or 0)
1099
+ except Exception:
1100
+ prec = 0
1101
+ if 0 < prec <= max_dec_prec:
1102
+ supported = True
1103
+ else:
1104
+ errors.append(
1105
+ f" - column '{col}'{where} has unsupported decimal "
1106
+ f"precision {prec} (max {max_dec_prec}): type '{dt}'")
1107
+ continue
1108
+
1109
+ # 5) Lists: only list of {signed int, float}; explicitly deny
1110
+ # list<string> and list<bool>
1111
+ if not supported and pa.types.is_list(dt):
1112
+ elem = dt.value_type
1113
+ if pa.types.is_string(elem):
1114
+ errors.append(
1115
+ f" - column '{col}'{where} is {dt} (list<string> not "
1116
+ f"supported)")
1117
+ continue
1118
+ if pa.types.is_boolean(elem):
1119
+ errors.append(f" - column '{col}'{where} is {dt} (list<bool> "
1120
+ f"not supported)")
1121
+ continue
1122
+ if pa.types.is_integer(
1123
+ elem) and not pa.types.is_unsigned_integer(elem):
1124
+ supported = True
1125
+ elif pa.types.is_floating(elem):
1126
+ supported = True
1127
+ else:
1128
+ errors.append(
1129
+ f" - column '{col}'{where} is {dt} (only list of signed "
1130
+ f"int/float supported)")
1131
+ continue
1132
+
1133
+ # 6) Dictionary<int, string> only
1134
+ if not supported and pa.types.is_dictionary(dt):
1135
+ if (pa.types.is_integer(dt.index_type)
1136
+ and not pa.types.is_unsigned_integer(dt.index_type)
1137
+ and pa.types.is_string(dt.value_type)):
1138
+ supported = True
1139
+
1140
+ if not supported:
1141
+ errors.append(
1142
+ f" - column '{col}'{where} has unsupported type '{dt}'")
1143
+
1144
+ if errors:
1145
+ raise ValueError(
1146
+ "Unsupported Parquet Data Types detected:\n\n" +
1147
+ "\n".join(errors) + "\n\nAllowed types: boolean, signed integer, "
1148
+ "float, (regular) string, date, "
1149
+ "timestamp (epoch-based), (regular) binary, "
1150
+ "decimal (<= configured precision), "
1151
+ "list of {signed int, float}, dictionary<int,string>.\n"
1152
+ "Disallowed examples: large_string, large_binary, "
1153
+ "large_list<*>, time32/64<*>, "
1154
+ "duration('unit'), list<string>, list<bool>, "
1155
+ "unsigned integers, null columns, "
1156
+ "structs, maps, and other nested types.")
1157
+
1158
+
1159
+ # -----------------------
1160
+ # CSV helpers
1161
+ # -----------------------
1162
+ def _detect_and_validate_csv(head_bytes: bytes) -> str:
1163
+ r"""Detect a CSV delimiter from a small head sample and verify it.
1164
+
1165
+ - Uses csv.Sniffer (preferred delimiters: | , ; \t) with fallback to ','.
1166
+ - Reads a handful of complete, quote-aware records (handles newlines inside
1167
+ quotes).
1168
+ - Re-serializes those rows and validates with pandas (small nrows) to catch
1169
+ malformed inputs.
1170
+ - Raises ValueError on empty input or if parsing fails with the chosen
1171
+ delimiter.
1172
+ """
1173
+ if not head_bytes:
1174
+ raise ValueError("Could not auto-detect a delimiter: file is empty.")
1175
+
1176
+ text = head_bytes.decode("utf-8", errors="ignore").replace("\r\n",
1177
+ "\n").replace(
1178
+ "\r", "\n")
1179
+
1180
+ # 1) Detect delimiter (simple preference list; no denylist)
1181
+ try:
1182
+ delimiter = csv.Sniffer().sniff(text, delimiters="|,;\t").delimiter
1183
+ except Exception:
1184
+ logger.warning("No separator found in sample; defaulting to ','.")
1185
+ delimiter = ','
1186
+
1187
+ # 2) Pull a few complete records with csv.reader (quote-aware,
1188
+ # handles embedded newlines)
1189
+ rows = []
1190
+ try:
1191
+ rdr = csv.reader(io.StringIO(text), delimiter=delimiter, quotechar='"',
1192
+ doublequote=True)
1193
+ for _ in range(50): # small, bounded sample
1194
+ try:
1195
+ rows.append(next(rdr))
1196
+ except StopIteration:
1197
+ break
1198
+ except Exception as e:
1199
+ raise ValueError(
1200
+ f"Could not auto-detect a valid delimiter. Tried '{delimiter}', "
1201
+ f"csv parse failed: {repr(e)}")
1202
+
1203
+ if not rows:
1204
+ raise ValueError(
1205
+ "Could not auto-detect a valid delimiter: no complete records "
1206
+ "found.")
1207
+
1208
+ # 3) Re-serialize snippet and validate minimally with pandas
1209
+ out = io.StringIO()
1210
+ w = csv.writer(out, delimiter=delimiter, lineterminator="\n",
1211
+ quotechar='"', doublequote=True)
1212
+ for r in rows:
1213
+ w.writerow(r)
1214
+
1215
+ try:
1216
+ pd.read_csv(
1217
+ io.StringIO(out.getvalue()),
1218
+ sep=delimiter,
1219
+ index_col=False,
1220
+ on_bad_lines='error',
1221
+ nrows=50,
1222
+ engine="python", # more tolerant for quoted/newline combos
1223
+ skip_blank_lines=False,
1224
+ )
1225
+ except Exception as e:
1226
+ raise ValueError(
1227
+ f"Could not auto-detect a valid delimiter. Tried '{delimiter}', "
1228
+ f"pandas parse failed: {repr(e)}")
1229
+
1230
+ return delimiter
1231
+
1232
+
1233
+ def _csv_has_data_rows(data_mv: memoryview) -> bool:
1234
+ """Return True if any non-newline, non-carriage-return byte exists after
1235
+ the first newline. Uses zero-copy iteration over the memoryview to avoid
1236
+ duplicating buffers.
1237
+ """
1238
+ mv = data_mv
1239
+ if mv.format != 'B':
1240
+ try:
1241
+ mv = mv.cast('B') # zero-copy view of bytes
1242
+ except TypeError:
1243
+ # fallback: create a contiguous view via slicing (still zero-copy)
1244
+ mv = mv[:]
1245
+
1246
+ saw_newline = False
1247
+ # Iterate in a single pass; break as soon as we see a data-ish byte
1248
+ for b in mv:
1249
+ if not saw_newline:
1250
+ if b == 10: # '\n'
1251
+ saw_newline = True
1252
+ continue
1253
+ # after header newline: any byte that isn't CR or LF counts as data
1254
+ if b not in (10, 13):
1255
+ return True
1256
+ return False
1257
+
1258
+
1259
+ def _maybe_rewrite_csv_header_buffer(
1260
+ data_mv: memoryview,
1261
+ header_line: bytes,
1262
+ delimiter: str,
1263
+ ) -> tuple[Optional[io.BytesIO], memoryview, bytes, list[str], dict[str, str],
1264
+ bool]:
1265
+ """Rewrite ONLY the header if needed. Uses a new BytesIO but frees the old
1266
+ buffer immediately after swap.
1267
+ """
1268
+ try:
1269
+ header_str = header_line.decode("utf-8").rstrip("\r\n")
1270
+ except UnicodeDecodeError:
1271
+ raise ValueError("CSV header is not valid UTF-8.")
1272
+
1273
+ orig_cols = [c.strip() for c in header_str.split(delimiter)]
1274
+ new_cols, changed = _sanitize_columns(orig_cols)
1275
+ if not changed:
1276
+ return None, data_mv, header_line, orig_cols, {}, False
1277
+
1278
+ rename_map = {o: n for o, n in zip(orig_cols, new_cols) if o != n}
1279
+
1280
+ nl_idx = len(header_line)
1281
+ if nl_idx >= data_mv.nbytes:
1282
+ raise ValueError("Malformed CSV: newline not found in header.")
1283
+
1284
+ new_header_bytes = delimiter.join(new_cols).encode("utf-8")
1285
+ new_buf = io.BytesIO()
1286
+ new_buf.write(new_header_bytes)
1287
+ new_buf.write(b"\n")
1288
+ # Write the remainder via a zero-copy memoryview slice; BytesIO will copy
1289
+ # into its own buffer, but we free the original immediately after returning
1290
+ # to avoid double residency.
1291
+ new_buf.write(data_mv[nl_idx + 1:])
1292
+ new_mv = new_buf.getbuffer()
1293
+ return new_buf, new_mv, new_header_bytes, new_cols, rename_map, True
1294
+
1295
+
1296
+ # -----------------------
1297
+ # Remote upload (refactor)
1298
+ # -----------------------
1299
+ @dataclass
1300
+ class _RemoteSettings:
1301
+ part_size: int
1302
+ part_conc: int
1303
+ overhead_bytes: int
1304
+ parallelism_override: Optional[int]
1305
+
1306
+
1307
+ def _make_remote_settings(parallelism: Optional[int]) -> _RemoteSettings:
1308
+ part_mb = int(os.getenv("KUMO_REMOTE_PART_MB", "64"))
1309
+ part_size = max(8, part_mb) * 1024**2
1310
+ part_conc = int(os.getenv("KUMO_REMOTE_PART_CONCURRENCY", "4"))
1311
+ try:
1312
+ overhead_bytes = max(0, int(os.getenv("KUMO_UPLOAD_OVERHEAD_MB",
1313
+ "128"))) * 1024**2
1314
+ except Exception:
1315
+ overhead_bytes = 128 * 1024**2
1316
+ return _RemoteSettings(
1317
+ part_size=part_size,
1318
+ part_conc=part_conc,
1319
+ overhead_bytes=overhead_bytes,
1320
+ parallelism_override=parallelism,
1321
+ )
1322
+
1323
+
1324
+ def _remote_upload_file(name: str, fs: Filesystem, url: str, info: dict,
1325
+ st: _RemoteSettings, file_type: Optional[str]) -> None:
1326
+ detected_ftype = _validate_url_ext(url, file_type)
1327
+
1328
+ size = int(info.get("size") or 0)
1329
+ if size == 0:
1330
+ raise ValueError(f"Remote file {url} is empty (0 bytes).")
1331
+ if size > MAX_PARTITION_SIZE:
1332
+ raise ValueError(
1333
+ "Remote single-file uploads larger than 1GB are not supported. "
1334
+ "Please re-partition the source into ~200MB chunks and upload the "
1335
+ "whole directory instead.")
1336
+
1337
+ # Read with progress
1338
+ with tqdm(total=size, desc=f"Reading {_short_path(url)}", unit="B",
1339
+ unit_scale=True, unit_divisor=1024, position=0, leave=False,
1340
+ smoothing=0.1) as read_bar:
1341
+ tr0 = time.perf_counter()
1342
+ buf, data_mv, header_line = _read_remote_file_with_progress(
1343
+         fs, url, expected_size=size, update_bytes=read_bar.update,
+         capture_first_line=(detected_ftype == "csv"))
+     tread = time.perf_counter() - tr0
+
+     # Validate/sanitize
+     tv0 = time.perf_counter()
+     renamed_cols_msg = None
+     if detected_ftype == "parquet":
+         schema = _parquet_schema_from_bytes(data_mv)
+         _validate_columns_or_raise(list(schema.names))
+         validate_parquet_schema(schema, url)
+         nrows = _parquet_num_rows_from_bytes(data_mv)
+         if nrows <= 0:
+             raise ValueError("Parquet file has zero rows.")
+         file_type = "parquet"
+     else:
+         head_len = min(50000, data_mv.nbytes)
+         # Small bounded copy only for sniffing the CSV dialect.
+         head = bytes(data_mv[:head_len])
+         delimiter = _detect_and_validate_csv(head)
+         if header_line is None:
+             # Shouldn't happen (we capture the header during the read), but
+             # keep a bounded fallback (64 KiB).
+             prefix_len = min(64 * 1024, data_mv.nbytes)
+             prefix = data_mv[:prefix_len]
+             # Build header_line from the prefix without large copies.
+             acc = bytearray()
+             for b in (prefix.cast('B') if prefix.format != 'B' else prefix):
+                 if b == 10:  # '\n'
+                     break
+                 acc.append(b)
+             header_line = bytes(acc)
+         new_buf, new_mv, new_header, cols, rename_map, changed = (
+             _maybe_rewrite_csv_header_buffer(data_mv, header_line, delimiter))
+         if changed:
+             try:
+                 buf.close()
+             except Exception:
+                 pass
+             buf = new_buf  # type: ignore[assignment]
+             data_mv = new_mv
+             header_line = new_header
+         if rename_map:
+             pairs = ", ".join(f"{k}->{v}" for k, v in rename_map.items())
+             renamed_cols_msg = f"CSV header sanitized (renamed): {pairs}"
+         if not _csv_has_data_rows(data_mv):
+             raise ValueError(
+                 "CSV file has zero data rows (only header present).")
+         file_type = "csv"
+     tval = time.perf_counter() - tv0
+
+     # Multipart upload
+     size_bytes = data_mv.nbytes
+     num_parts = max(1, math.ceil(size_bytes / st.part_size))
+     upload_res = _start_table_upload_with_parts(table_name=name,
+                                                 file_type=file_type,
+                                                 file_size_bytes=size_bytes,
+                                                 num_parts=num_parts)
+     try:
+         urls = [
+             u for k, u in sorted(upload_res.presigned_part_urls.items(),
+                                  key=lambda kv: int(kv[0]))
+         ]
+     except Exception:
+         urls = list(upload_res.presigned_part_urls.values())
+
+     loop = _KUMO_EVENT_LOOP
+     with tqdm(total=size_bytes, desc="Uploading", unit="B", unit_scale=True,
+               unit_divisor=1024, position=2, leave=False,
+               smoothing=0.1) as upload_bar:
+         part_metadata_list_fut = asyncio.run_coroutine_threadsafe(
+             multi_put_bounded(
+                 urls=urls,
+                 data_iter=_iter_mv_chunks(data_mv, st.part_size),
+                 tqdm_bar_position=3,
+                 concurrency=max(1, min(st.part_conc, len(urls))),
+                 upload_progress_cb=lambda n: _safe_bar_update(upload_bar, n),
+                 upload_subchunk_bytes=UPLOAD_CHUNK_BYTES,
+             ),
+             loop,
+         )
+         part_metadata_list = part_metadata_list_fut.result()
+         upload_bar.set_postfix_str(f"Done — {_short_path(url)}")
+         upload_bar.refresh()
+
+     # Complete (retry transient server errors with exponential backoff)
+     tu0 = time.perf_counter()
+     for i in range(5):
+         try:
+             _complete_table_upload(
+                 table_name=name,
+                 file_type=file_type,
+                 upload_path=upload_res.temp_upload_path,
+                 upload_id=upload_res.upload_id,
+                 parts_metadata=part_metadata_list,
+             )
+         except HTTPException as e:
+             if e.status_code == 500 and i < 4:
+                 time.sleep(2**(i - 1))
+                 continue
+             else:
+                 raise
+         else:
+             break
+     tupl = time.perf_counter() - tu0
+
+     _log_file_timing("single-file(multipart)", url, size_bytes, tread, tval,
+                      tupl)
+     if renamed_cols_msg:
+         logger.info(renamed_cols_msg)
+
+     try:
+         if buf:
+             buf.close()
+     except Exception:
+         pass
+     del buf, data_mv, header_line
+     gc.collect()
+
+     logger.info("Upload complete. Validated table %s.", name)
+
+
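The completion step above retries transient HTTP 500 responses inline, sleeping `2**(i - 1)` seconds between attempts (0.5 s, 1 s, 2 s, 4 s). Below is a minimal, self-contained sketch of the same backoff pattern in isolation; the `retry_on_server_error` helper, the `RuntimeError` stand-in for a retryable server error, and the attempt count are illustrative assumptions, not part of this module.

```python
# Illustrative sketch of the retry-with-backoff pattern used by the
# completion loop above; the callable and exception type are assumptions.
import time
from typing import Callable, TypeVar

T = TypeVar("T")


def retry_on_server_error(fn: Callable[[], T], attempts: int = 5) -> T:
    """Call fn, retrying on a retryable server-side error."""
    for i in range(attempts):
        try:
            return fn()
        except RuntimeError:  # stand-in for an HTTP 500-style failure
            if i == attempts - 1:
                raise
            time.sleep(2**(i - 1))  # 0.5, 1, 2, 4 seconds between attempts
    raise AssertionError("unreachable")
```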
+ def _remote_upload_directory(
+     name: str,
+     fs: Filesystem,
+     url: str,
+     info: dict,
+     st: _RemoteSettings,
+     file_type: Optional[str] = None,  # "csv", "parquet", or None
+ ) -> None:
+     manifest = _remote_dir_manifest(fs, url)
+     parquet_files = sorted(manifest["parquet"], key=lambda x: x["path"])
+     csv_files = sorted(manifest["csv"], key=lambda x: x["path"])
+
+     # Normalize expected type
+     if file_type not in (None, "csv", "parquet"):
+         raise ValueError("file_type must be 'csv', 'parquet', or None.")
+
+     # Resolve files + detected type
+     if file_type is None:
+         if not parquet_files and not csv_files:
+             raise ValueError("Directory contains no .parquet or .csv files.")
+         if parquet_files and csv_files:
+             raise ValueError(
+                 "Mixed CSV and Parquet files detected; keep only one format.")
+         files = parquet_files if parquet_files else csv_files
+         detected_type = "parquet" if parquet_files else "csv"
+     elif file_type == "parquet":
+         if not parquet_files:
+             raise ValueError(
+                 "Directory contains no .parquet files (file_type='parquet').")
+         if csv_files:
+             raise ValueError(
+                 "Directory also contains CSV files; remove them or "
+                 "set file_type=None.")
+         files, detected_type = parquet_files, "parquet"
+     else:  # file_type == "csv"
+         if not csv_files:
+             raise ValueError(
+                 "Directory contains no .csv files (file_type='csv').")
+         if parquet_files:
+             raise ValueError(
+                 "Directory also contains Parquet files; remove them or "
+                 "set file_type=None.")
+         files, detected_type = csv_files, "csv"
+
+     total_bytes = sum(int(f.get("size") or 0) for f in files)
+
+     too_large = [
+         f["path"] for f in files if (f.get("size") or 0) > MAX_PARTITION_SIZE
+     ]
+     zero_bytes = [f["path"] for f in files if (f.get("size") or 0) == 0]
+     if zero_bytes:
+         raise ValueError(
+             f"Found zero-byte {detected_type.upper()} files: {zero_bytes[:3]}"
+             f"{'...' if len(zero_bytes) > 3 else ''}")
+     if too_large:
+         raise ValueError(
+             f"The following files exceed 1GB and must be re-partitioned "
+             f"(~200MB each): "
+             f"{too_large[:3]}{'...' if len(too_large) > 3 else ''}")
+
+     par = _determine_parallelism(files, requested=st.parallelism_override)
+     par = max(1, min(par, len(files)))
+     budget_bytes = _compute_mem_budget_bytes(files)
+     mem_budget = MemoryBudget(budget_bytes)
+
+     from collections import deque
+     with (tqdm(total=len(files),
+                desc=f"Files ({len(files)}) [{detected_type}] | par={par}",
+                position=0) as file_bar,
+           tqdm(total=total_bytes, desc="Total bytes (read)", unit="B",
+                unit_scale=True, unit_divisor=1024, position=1, smoothing=0.1)
+           as bytes_bar,
+           tqdm(total=total_bytes, desc="Total bytes (uploaded)", unit="B",
+                unit_scale=True, unit_divisor=1024, position=2, smoothing=0.1)
+           as uploaded_bar):
+
+         status_lock = threading.Lock()
+         recent_paths: Deque[str] = deque(maxlen=5)
+         completed_files = {"n": 0}
+         file_bar.set_postfix_str(f"Uploaded 0/{len(files)}")
+         file_bar.refresh()
+
+         rename_aggregate_lock = threading.Lock()
+         rename_aggregate: dict[str, str] = {}
+
+         def _merge_status_update(path: str) -> None:
+             with status_lock:
+                 completed_files["n"] += 1
+                 recent_paths.append(path)
+                 tail = ' | '.join(_short_path(p) for p in list(recent_paths))
+                 msg = f"Uploaded {completed_files['n']}/{len(files)}"
+                 if tail:
+                     msg += f" — {tail}"
+                 with _TQDM_LOCK:
+                     file_bar.set_postfix_str(msg)
+                     file_bar.refresh()
+
+         ref_schema_fields: Dict[str, Any] = {"value": None}
+         ref_cols: Dict[str, Any] = {"value": None}
+
+         def _worker(idx: int, fmeta: dict) -> None:
+             fpath = fmeta["path"]
+             fsize = int(fmeta.get("size") or 0)
+             need_bytes = (2 * fsize +
+                           st.overhead_bytes) if detected_type == "csv" else (
+                               fsize + st.overhead_bytes)
+             mem_budget.acquire(need_bytes)
+             try:
+                 tr0 = time.perf_counter()
+                 buf, data_mv, header_line = _read_remote_file_with_progress(
+                     fs,
+                     fpath,
+                     expected_size=fsize if fsize > 0 else None,
+                     update_bytes=lambda n: _safe_bar_update(bytes_bar, n),
+                     capture_first_line=(detected_type == "csv"),
+                 )
+                 tread = time.perf_counter() - tr0
+
+                 tv0 = time.perf_counter()
+                 if detected_type == "parquet":
+                     schema = _parquet_schema_from_bytes(data_mv)
+                     names = list(schema.names)
+                     _validate_columns_or_raise(names)
+                     validate_parquet_schema(schema, fpath)
+                     nrows = _parquet_num_rows_from_bytes(data_mv)
+                     if nrows <= 0:
+                         raise ValueError(
+                             f"Parquet file has zero rows: {fpath}")
+                     fields = [(fld.name, fld.type) for fld in schema]
+                     if ref_schema_fields["value"] is None:
+                         ref_schema_fields["value"] = fields
+                     elif fields != ref_schema_fields["value"]:
+                         ref_names = [n for n, _ in ref_schema_fields["value"]]
+                         raise ValueError(
+                             "Parquet schema mismatch across files. "
+                             f"First file columns: {ref_names}; mismatched "
+                             f"file: {fpath}")
+                     part_name = f"{name}.parquet/part_{idx:04d}.parquet"
+
+                 else:
+                     head_len = min(50000, data_mv.nbytes)
+                     # Bounded small copy for sniffing the CSV dialect.
+                     head = bytes(data_mv[:head_len])
+                     delimiter = _detect_and_validate_csv(head)
+                     if header_line is None:
+                         # Bounded fallback (64 KiB) to extract the header
+                         # without copying the whole file.
+                         prefix_len = min(64 * 1024, data_mv.nbytes)
+                         prefix = data_mv[:prefix_len]
+                         acc = bytearray()
+                         for b in (prefix.cast('B')
+                                   if prefix.format != 'B' else prefix):
+                             if b == 10:  # '\n'
+                                 break
+                             acc.append(b)
+                         header_line = bytes(acc)
+
+                     new_buf, new_mv, new_header, cols, rename_map, changed = (
+                         _maybe_rewrite_csv_header_buffer(
+                             data_mv, header_line, delimiter))
+                     if changed:
+                         try:
+                             buf.close()
+                         except Exception:
+                             pass
+                         buf = new_buf  # type: ignore[assignment]
+                         data_mv = new_mv
+                         header_line = new_header
+                     if rename_map:
+                         with rename_aggregate_lock:
+                             rename_aggregate.update(rename_map)
+
+                     if ref_cols["value"] is None:
+                         ref_cols["value"] = cols
+                     elif cols != ref_cols["value"]:
+                         raise ValueError(
+                             "CSV header mismatch across files. "
+                             f"Expected: {ref_cols['value']}; mismatched file: "
+                             f"{fpath} has: {cols}")
+                     if not _csv_has_data_rows(data_mv):
+                         raise ValueError(
+                             f"CSV file has zero data rows: {fpath}")
+                     part_name = f"{name}.csv/part_{idx:04d}.csv"
+                 tval = time.perf_counter() - tv0
+
+                 size_bytes = data_mv.nbytes
+                 num_parts = max(1, math.ceil(size_bytes / st.part_size))
+                 upload_res = _start_table_upload_with_parts(
+                     table_name=part_name,
+                     file_type=detected_type,
+                     file_size_bytes=size_bytes,
+                     num_parts=num_parts,
+                 )
+                 try:
+                     urls = [
+                         u for k, u in sorted(
+                             upload_res.presigned_part_urls.items(),
+                             key=lambda kv: int(kv[0]))
+                     ]
+                 except Exception:
+                     urls = list(upload_res.presigned_part_urls.values())
+
+                 loop_inner = _KUMO_EVENT_LOOP
+                 part_metadata_list_fut = asyncio.run_coroutine_threadsafe(
+                     multi_put_bounded(
+                         urls=urls,
+                         data_iter=_iter_mv_chunks(data_mv, st.part_size),
+                         tqdm_bar_position=3,
+                         concurrency=max(1, min(st.part_conc, len(urls))),
+                         upload_progress_cb=lambda n: _safe_bar_update(
+                             uploaded_bar, n),
+                         upload_subchunk_bytes=UPLOAD_CHUNK_BYTES,
+                     ),
+                     loop_inner,
+                 )
+                 part_metadata_list = part_metadata_list_fut.result()
+
+                 for i in range(5):
+                     try:
+                         _complete_table_upload(
+                             table_name=part_name,
+                             file_type=detected_type,
+                             upload_path=upload_res.temp_upload_path,
+                             upload_id=upload_res.upload_id,
+                             parts_metadata=part_metadata_list,
+                         )
+                     except HTTPException as e:
+                         if e.status_code == 500 and i < 4:
+                             time.sleep(2**(i - 1))
+                             continue
+                         else:
+                             raise
+                     else:
+                         break
+
+                 try:
+                     if buf:
+                         buf.close()
+                 except Exception:
+                     pass
+                 del buf, data_mv, header_line
+                 gc.collect()
+
+                 _safe_bar_update(file_bar, 1)
+                 _merge_status_update(fpath)
+                 _log_file_timing("dir-file(multipart)", fpath, fsize, tread,
+                                  tval, 0.0)
+
+             finally:
+                 mem_budget.release(need_bytes)
+
+         indexed = list(enumerate(files, start=1))
+         first_ex = None
+         with ThreadPoolExecutor(max_workers=par) as ex:
+             futures = {
+                 ex.submit(_worker, idx, fmeta): (idx, fmeta["path"])
+                 for idx, fmeta in indexed
+             }
+             for fut in as_completed(futures):
+                 try:
+                     fut.result()
+                 except Exception as e:
+                     first_ex = e
+                     for f2 in futures:
+                         f2.cancel()
+                     break
+         if first_ex:
+             raise first_ex
+
+     # After the bars close, log any header renames once.
+     if detected_type == "csv" and rename_aggregate:
+         pairs = ", ".join(f"{k}->{v}" for k, v in rename_aggregate.items())
+         logger.info("CSV header sanitized (renamed): %s", pairs)
+
+     logger.info("Upload complete. Validated table %s.", name)
+
+
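Each directory worker reserves memory before reading (roughly twice the file size for CSV, since the header may be rewritten into a second buffer) and releases the reservation in a `finally` block. `MemoryBudget` itself is defined elsewhere in this module and is not shown in this diff; a minimal sketch of such a byte-budget gate, assuming blocking acquire/release semantics built on `threading.Condition`, might look like this:

```python
# Sketch of a byte-budget gate in the spirit of the MemoryBudget used above.
# The real class's semantics are not shown in this diff; this is an
# assumption-based illustration only.
import threading


class ByteBudget:
    def __init__(self, limit_bytes: int) -> None:
        self._limit = limit_bytes
        self._in_use = 0
        self._cond = threading.Condition()

    def acquire(self, nbytes: int) -> None:
        # Block until the request fits; admit oversized requests when nothing
        # else is in flight so a single huge file cannot deadlock the pool.
        with self._cond:
            while self._in_use > 0 and self._in_use + nbytes > self._limit:
                self._cond.wait()
            self._in_use += nbytes

    def release(self, nbytes: int) -> None:
        with self._cond:
            self._in_use = max(0, self._in_use - nbytes)
            self._cond.notify_all()
```

Pairing every `acquire` with a `release` in a `finally` block, as the worker does, keeps the budget accurate even when validation or upload raises.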
+ def _upload_table_remote(
+     name: str,
+     path: str,
+     auto_partition: bool = True,
+     partition_size_mb: int = 250,
+     parallelism: Optional[int] = None,
+     file_type: Optional[str] = None,
+ ) -> None:
+     """Dispatch remote upload to file or directory paths."""
+     fs, url = _get_fs_and_path(path)
+     info = _remote_info(fs, url)
+     st = _make_remote_settings(parallelism)
+
+     if info.get("type") == "file":
+         return _remote_upload_file(name, fs, url, info, st, file_type)
+     if info.get("type") == "directory":
+         return _remote_upload_directory(name, fs, url, info, st, file_type)
+     raise ValueError(f"Unsupported remote object type for {path}: {info}")
+
+
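Both upload paths size the multipart transfer as `num_parts = max(1, ceil(size_bytes / st.part_size))` and request one presigned URL per part. A quick worked example with hypothetical numbers follows; the actual part size comes from `_make_remote_settings` and is not shown in this diff.

```python
# Hypothetical numbers only: a 1.3 GB object and an assumed 100 MiB part size.
import math

size_bytes = 1_300_000_000
part_size = 100 * 1024 * 1024  # assumed; the real value comes from settings
num_parts = max(1, math.ceil(size_bytes / part_size))
print(num_parts)  # 13 -> thirteen presigned part URLs are requested
```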
+ # -----------------------
+ # Column name validator
+ # -----------------------
+ def _validate_columns_or_raise(names: List[str]) -> None:
+     # Ensure the sanitized form equals the original so the header rules hold
+     # for Parquet (which we never rewrite); CSV headers are already sanitized
+     # proactively upstream.
+     new, changed = _sanitize_columns(names)
+     if changed:
+         diffs = [f"{o}->{n}" for o, n in zip(names, new) if o != n]
+         raise ValueError(
+             "Column names contain invalid characters or duplicates. "
+             "Please rename the following columns:\n " + ", ".join(diffs))