kumoai 2.14.0.dev202601011731__cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of kumoai might be problematic.
- kumoai/__init__.py +300 -0
- kumoai/_logging.py +29 -0
- kumoai/_singleton.py +25 -0
- kumoai/_version.py +1 -0
- kumoai/artifact_export/__init__.py +9 -0
- kumoai/artifact_export/config.py +209 -0
- kumoai/artifact_export/job.py +108 -0
- kumoai/client/__init__.py +5 -0
- kumoai/client/client.py +223 -0
- kumoai/client/connector.py +110 -0
- kumoai/client/endpoints.py +150 -0
- kumoai/client/graph.py +120 -0
- kumoai/client/jobs.py +471 -0
- kumoai/client/online.py +78 -0
- kumoai/client/pquery.py +207 -0
- kumoai/client/rfm.py +112 -0
- kumoai/client/source_table.py +53 -0
- kumoai/client/table.py +101 -0
- kumoai/client/utils.py +130 -0
- kumoai/codegen/__init__.py +19 -0
- kumoai/codegen/cli.py +100 -0
- kumoai/codegen/context.py +16 -0
- kumoai/codegen/edits.py +473 -0
- kumoai/codegen/exceptions.py +10 -0
- kumoai/codegen/generate.py +222 -0
- kumoai/codegen/handlers/__init__.py +4 -0
- kumoai/codegen/handlers/connector.py +118 -0
- kumoai/codegen/handlers/graph.py +71 -0
- kumoai/codegen/handlers/pquery.py +62 -0
- kumoai/codegen/handlers/table.py +109 -0
- kumoai/codegen/handlers/utils.py +42 -0
- kumoai/codegen/identity.py +114 -0
- kumoai/codegen/loader.py +93 -0
- kumoai/codegen/naming.py +94 -0
- kumoai/codegen/registry.py +121 -0
- kumoai/connector/__init__.py +31 -0
- kumoai/connector/base.py +153 -0
- kumoai/connector/bigquery_connector.py +200 -0
- kumoai/connector/databricks_connector.py +213 -0
- kumoai/connector/file_upload_connector.py +189 -0
- kumoai/connector/glue_connector.py +150 -0
- kumoai/connector/s3_connector.py +278 -0
- kumoai/connector/snowflake_connector.py +252 -0
- kumoai/connector/source_table.py +471 -0
- kumoai/connector/utils.py +1796 -0
- kumoai/databricks.py +14 -0
- kumoai/encoder/__init__.py +4 -0
- kumoai/exceptions.py +26 -0
- kumoai/experimental/__init__.py +0 -0
- kumoai/experimental/rfm/__init__.py +210 -0
- kumoai/experimental/rfm/authenticate.py +432 -0
- kumoai/experimental/rfm/backend/__init__.py +0 -0
- kumoai/experimental/rfm/backend/local/__init__.py +42 -0
- kumoai/experimental/rfm/backend/local/graph_store.py +297 -0
- kumoai/experimental/rfm/backend/local/sampler.py +312 -0
- kumoai/experimental/rfm/backend/local/table.py +113 -0
- kumoai/experimental/rfm/backend/snow/__init__.py +37 -0
- kumoai/experimental/rfm/backend/snow/sampler.py +297 -0
- kumoai/experimental/rfm/backend/snow/table.py +242 -0
- kumoai/experimental/rfm/backend/sqlite/__init__.py +32 -0
- kumoai/experimental/rfm/backend/sqlite/sampler.py +398 -0
- kumoai/experimental/rfm/backend/sqlite/table.py +184 -0
- kumoai/experimental/rfm/base/__init__.py +30 -0
- kumoai/experimental/rfm/base/column.py +152 -0
- kumoai/experimental/rfm/base/expression.py +44 -0
- kumoai/experimental/rfm/base/sampler.py +761 -0
- kumoai/experimental/rfm/base/source.py +19 -0
- kumoai/experimental/rfm/base/sql_sampler.py +143 -0
- kumoai/experimental/rfm/base/table.py +736 -0
- kumoai/experimental/rfm/graph.py +1237 -0
- kumoai/experimental/rfm/infer/__init__.py +19 -0
- kumoai/experimental/rfm/infer/categorical.py +40 -0
- kumoai/experimental/rfm/infer/dtype.py +82 -0
- kumoai/experimental/rfm/infer/id.py +46 -0
- kumoai/experimental/rfm/infer/multicategorical.py +48 -0
- kumoai/experimental/rfm/infer/pkey.py +128 -0
- kumoai/experimental/rfm/infer/stype.py +35 -0
- kumoai/experimental/rfm/infer/time_col.py +61 -0
- kumoai/experimental/rfm/infer/timestamp.py +41 -0
- kumoai/experimental/rfm/pquery/__init__.py +7 -0
- kumoai/experimental/rfm/pquery/executor.py +102 -0
- kumoai/experimental/rfm/pquery/pandas_executor.py +530 -0
- kumoai/experimental/rfm/relbench.py +76 -0
- kumoai/experimental/rfm/rfm.py +1184 -0
- kumoai/experimental/rfm/sagemaker.py +138 -0
- kumoai/experimental/rfm/task_table.py +231 -0
- kumoai/formatting.py +30 -0
- kumoai/futures.py +99 -0
- kumoai/graph/__init__.py +12 -0
- kumoai/graph/column.py +106 -0
- kumoai/graph/graph.py +948 -0
- kumoai/graph/table.py +838 -0
- kumoai/jobs.py +80 -0
- kumoai/kumolib.cpython-310-x86_64-linux-gnu.so +0 -0
- kumoai/mixin.py +28 -0
- kumoai/pquery/__init__.py +25 -0
- kumoai/pquery/prediction_table.py +287 -0
- kumoai/pquery/predictive_query.py +641 -0
- kumoai/pquery/training_table.py +424 -0
- kumoai/spcs.py +121 -0
- kumoai/testing/__init__.py +8 -0
- kumoai/testing/decorators.py +57 -0
- kumoai/testing/snow.py +50 -0
- kumoai/trainer/__init__.py +42 -0
- kumoai/trainer/baseline_trainer.py +93 -0
- kumoai/trainer/config.py +2 -0
- kumoai/trainer/distilled_trainer.py +175 -0
- kumoai/trainer/job.py +1192 -0
- kumoai/trainer/online_serving.py +258 -0
- kumoai/trainer/trainer.py +475 -0
- kumoai/trainer/util.py +103 -0
- kumoai/utils/__init__.py +11 -0
- kumoai/utils/datasets.py +83 -0
- kumoai/utils/display.py +51 -0
- kumoai/utils/forecasting.py +209 -0
- kumoai/utils/progress_logger.py +343 -0
- kumoai/utils/sql.py +3 -0
- kumoai-2.14.0.dev202601011731.dist-info/METADATA +71 -0
- kumoai-2.14.0.dev202601011731.dist-info/RECORD +122 -0
- kumoai-2.14.0.dev202601011731.dist-info/WHEEL +6 -0
- kumoai-2.14.0.dev202601011731.dist-info/licenses/LICENSE +9 -0
- kumoai-2.14.0.dev202601011731.dist-info/top_level.txt +1 -0
kumoai/connector/utils.py
@@ -0,0 +1,1796 @@

import asyncio
import csv
import gc
import io
import math
import os
import re
import tempfile
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from logging import getLogger
from typing import (
    Any,
    AsyncIterator,
    Callable,
    Deque,
    Dict,
    Generator,
    Iterator,
    List,
    Optional,
    Tuple,
    Union,
)
from urllib.parse import urlparse

import aiohttp
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from kumoapi.data_source import (
    CompleteFileUploadRequest,
    DeleteUploadedFileRequest,
    PartUploadMetadata,
    StartFileUploadRequest,
    StartFileUploadResponse,
)
from tqdm import tqdm

from kumoai import global_state
# still used for server-side completion retries
from kumoai.exceptions import HTTPException
from kumoai.futures import _KUMO_EVENT_LOOP

# -------------------
# Constants & Globals
# -------------------
logger = getLogger(__name__)

CHUNK_SIZE = 100 * 10**6  # 100 MB (legacy local single-file chunk)
READ_CHUNK_BYTES = 8 * 1024**2  # 8 MiB remote read buffer
UPLOAD_CHUNK_BYTES = 8 * 1024**2  # 8 MiB streamed PUT sub-chunks
MAX_PARTITION_SIZE = 1000 * 1024**2  # 1GB
MIN_PARTITION_SIZE = 100 * 1024**2  # 100MB

CONNECTOR_ID_MAP = {
    "csv": "csv_upload_connector",
    "parquet": "parquet_upload_connector",
}

_TQDM_LOCK = threading.Lock()


# ---------------
# Small utilities
# ---------------
def _fmt_bytes(n: int) -> str:
    value = float(n)
    units = ["B", "KiB", "MiB", "GiB", "TiB", "PiB"]
    for unit in units:
        if value < 1024:
            return f"{value:.1f} {unit}"
        value /= 1024
    return f"{value:.1f} EiB"


def _fmt_secs(s: float) -> str:
    if s < 1:
        return f"{s*1000:.0f} ms"
    return f"{s:.2f} s"


def _fmt_rate(nbytes: int, secs: float) -> str:
    if secs <= 0:
        return "-"
    return f"{(nbytes / secs) / 1024**2:.1f} MB/s"


def _short_path(p: str, maxlen: int = 60) -> str:
    if len(p) <= maxlen:
        return p
    try:
        parsed = urlparse(p)
        head = f"{parsed.scheme}://"
        tail = p[-40:]
        return f"{head}…{tail}"
    except Exception:
        return f"…{p[-maxlen:]}"


def _safe_bar_update(bar: tqdm, inc: int) -> None:
    with _TQDM_LOCK:
        try:
            bar.update(inc)
        except Exception:
            pass


def _log_file_timing(label: str, path: str, size: int, tread: float,
                     tval: float, tupl: float) -> None:
    logger.debug("[%s] %s (%s) | read=%s @ %s | validate=%s | upload=%s @ %s",
                 label, path, _fmt_bytes(size), _fmt_secs(tread),
                 _fmt_rate(size, max(tread, 1e-6)), _fmt_secs(tval),
                 _fmt_secs(tupl), _fmt_rate(size, max(tupl, 1e-6)))


# -----------------------
# Async upload primitives
# -----------------------
def _iter_memview_stream(
    mv: memoryview,
    subchunk_bytes: int,
    progress_cb: Optional[Callable[[int], None]] = None,
) -> Iterator[memoryview]:
    """Yield memoryview slices (zero-copy) for streaming PUT."""
    pos = 0
    n = mv.nbytes
    while pos < n:
        nxt = min(n, pos + subchunk_bytes)
        chunk = mv[pos:nxt]  # zero-copy slice
        pos = nxt
        if progress_cb:
            try:
                progress_cb(len(chunk))
            except Exception:
                pass
        yield chunk


async def _put_with_retry_streamed(
    session: aiohttp.ClientSession,
    url: str,
    mv: memoryview,
    part_no: int,
    subchunk_bytes: int = UPLOAD_CHUNK_BYTES,
    progress_cb: Optional[Callable[[int], None]] = None,
    retries: int = 3,
) -> Tuple[int, str]:
    """Stream a memoryview to a presigned URL using an *async* generator so
    aiohttp does not try to wrap it as multipart/form-data. We also set
    Content-Length explicitly so S3/GCS expects a fixed-size payload (avoids
    chunked TE).
    """

    # Build a fresh async generator per attempt (can't reuse after failure).
    def _make_async_gen() -> Callable[[], Any]:
        async def _agen() -> AsyncIterator[memoryview]:
            # Yield zero-copy memoryview slices; aiohttp can send memoryview
            # directly.
            for chunk in _iter_memview_stream(mv, subchunk_bytes, progress_cb):
                yield chunk
                # cooperative yield; keeps event loop snappy without extra
                # copies
                await asyncio.sleep(0)

        return _agen

    headers = {
        "Content-Type": "application/octet-stream",
        "Content-Length": str(mv.nbytes),
    }

    attempt = 0
    while True:
        try:
            async with session.put(url, data=_make_async_gen()(),
                                   headers=headers) as res:
                # Read/consume response to free the connection
                _ = await res.read()
                if res.status != 200:
                    raise RuntimeError(
                        f"PUT failed {res.status}: {res.reason}")
                etag = res.headers.get("ETag") or res.headers.get("Etag") or ""
                return (part_no + 1, etag)
        except Exception:
            attempt += 1
            if attempt > retries:
                raise
            # backoff before retrying; generator will be recreated next loop
            await asyncio.sleep(0.5 * attempt)
async def multi_put_bounded(
    urls: List[str],
    data_iter: Generator[Union[bytes, memoryview], None, None],
    tqdm_bar_position: int = 0,  # kept for compatibility (unused)
    concurrency: int = 4,
    upload_progress_cb: Optional[Callable[[int], None]] = None,
    upload_subchunk_bytes: int = UPLOAD_CHUNK_BYTES,
) -> List[PartUploadMetadata]:
    """Multipart uploader with bounded concurrency and byte-accurate progress.
    No extra progress bar here; caller drives a single byte counter via
    upload_progress_cb.
    """
    sem = asyncio.Semaphore(concurrency)
    results: List[Union[Tuple[int, str], None]] = [None] * len(urls)

    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(
            ssl=False)) as session:

        async def worker(idx: int, url: str,
                         chunk: Union[bytes, memoryview]) -> None:
            async with sem:
                mv = chunk if isinstance(chunk,
                                         memoryview) else memoryview(chunk)
                res = await _put_with_retry_streamed(
                    session=session,
                    url=url,
                    mv=mv,
                    part_no=idx,
                    subchunk_bytes=upload_subchunk_bytes,
                    progress_cb=upload_progress_cb,
                )
                results[idx] = res

        tasks: List[asyncio.Task] = []
        for idx, url in enumerate(urls):
            try:
                chunk = next(data_iter)
            except StopIteration:
                break
            tasks.append(asyncio.create_task(worker(idx, url, chunk)))

        try:
            await asyncio.gather(*tasks)
        except Exception:
            for t in tasks:
                if not t.done():
                    t.cancel()
            await asyncio.gather(*tasks, return_exceptions=True)
            raise

    out: List[PartUploadMetadata] = []
    for r in results:
        if r is None:
            continue
        out.append(PartUploadMetadata(r[0], r[1]))
    return out
def stream_read(
    f: io.BufferedReader,
    chunk_size: int,
) -> Generator[bytes, None, None]:
    r"""Streams ``chunk_size`` contiguous bytes from buffered reader ``f`` each
    time the generator is yielded from.
    """
    while True:
        byte_buf = f.read(chunk_size)
        if len(byte_buf) == 0:
            break
        yield byte_buf


def _validate_url_ext(url: str, file_type: Union[str, None]) -> str:
    """Validate that `url` ends with .csv or .parquet. If `file_type` is
    given ("csv" or "parquet"), ensure it matches. Returns the detected type
    ("csv" or "parquet"), else raises ValueError.
    """
    u = url.lower()
    detected = "csv" if u.endswith(".csv") else "parquet" if u.endswith(
        ".parquet") else None
    if detected is None:
        raise ValueError(f"File path '{url}' must end with .csv or .parquet")

    if file_type is None:
        return detected

    ft = file_type.lower()
    if ft not in ("csv", "parquet"):
        raise ValueError("file_type must be 'csv', 'parquet', or None")

    if ft != detected:
        raise ValueError(f"File path '{url}' must end with .{ft}")
    return detected
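# Editor's note: an illustrative sketch (not part of the packaged
# kumoai/connector/utils.py) of how _validate_url_ext() behaves; the paths
# below are hypothetical.
#
#     _validate_url_ext("s3://bucket/users.parquet", None)    # -> "parquet"
#     _validate_url_ext("/data/orders.csv", "csv")            # -> "csv"
#     _validate_url_ext("/data/orders.csv", "parquet")        # ValueError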
def upload_table(
    name: str,
    path: str,
    auto_partition: bool = True,
    partition_size_mb: int = 250,
    parallelism: Optional[int] = None,
    file_type: Optional[str] = None,
) -> None:
    """Upload a CSV/Parquet table to Kumo from a local file or a remote path
    (s3://, gs://, abfs://, abfss://, az://).

    - Local file: uploaded as-is. If >1 GiB and `auto_partition=True`, splits
      into ~`partition_size_mb` MiB parts.
    - Remote file: uploaded via multipart. Files >1 GiB are rejected
      (re-shard to ~200 MiB and upload as a directory).
    - Remote directory: auto-detects format (or use `file_type`), validates
      each shard, and uploads in parallel with a memory-safe budget.

    Args:
        name: Destination table name in Kumo.
        path: Local path or remote URL to a .csv/.parquet file or directory.
        auto_partition: Local-only; partition files >1 GiB.
        partition_size_mb: Local partition target size (100–1000 MiB).
        parallelism: Directory uploads concurrency override.
        file_type: Force "csv" or "parquet" for directories; None = auto-detect

    Raises:
        ValueError: Bad/mixed types, zero rows, >1 GiB remote file,
            schema/header mismatch, or invalid column names.
        ImportError: Missing filesystem dependency (s3fs/gcsfs/adlfs).
        RuntimeError: Remote stat/list/read or multipart completion failures.

    Notes:
        CSV headers are sanitized (chars → underscore, de-duped). Parquet
        columns must already be valid.
    """
    # Decide local vs remote by scheme
    scheme = urlparse(path).scheme
    if scheme in ("s3", "gs", "abfs", "abfss", "az"):
        return _upload_table_remote(
            name=name,
            path=path,
            auto_partition=auto_partition,
            partition_size_mb=partition_size_mb,
            parallelism=parallelism,
            file_type=file_type,
        )
    # Local path
    _validate_url_ext(path, file_type)
    file_size = os.path.getsize(path)

    if file_size < MAX_PARTITION_SIZE:
        return _upload_single_file(name, path)

    if not auto_partition:
        raise ValueError(
            f"File {path} is {file_size / (1024**3):.2f}GB, which exceeds "
            f"the 1GB limit. Enable auto_partition=True to automatically "
            f"partition large files.")

    partition_size = partition_size_mb * 1024**2
    if (partition_size > MAX_PARTITION_SIZE
            or partition_size < MIN_PARTITION_SIZE):
        raise ValueError(
            f"Partition size {partition_size_mb}MB must be between "
            f"{MIN_PARTITION_SIZE / 1024**2}MB and "
            f"{MAX_PARTITION_SIZE / 1024**2}MB.")

    logger.info("File %s is large with size %s, partitioning for upload...",
                path, file_size)
    if path.endswith('.parquet'):
        _upload_partitioned_parquet(name, path, partition_size)
    else:
        _upload_partitioned_csv(name, path, partition_size)
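# Editor's note: an illustrative usage sketch (not part of the packaged
# kumoai/connector/utils.py). Table names and paths are hypothetical, and the
# SDK must already be initialized before uploading.
#
#     upload_table(name="users", path="/data/users.parquet")
#     upload_table(name="transactions", path="s3://my-bucket/transactions/",
#                  file_type="parquet", parallelism=8)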
def _handle_duplicate_names(names: List[str]) -> List[str]:
    unique_names: List[str] = []
    unique_counts: dict[str, int] = {}
    for name in names:
        if name not in unique_names:
            unique_counts[name] = 0
            unique_names.append(name)
        else:
            unique_counts[name] += 1
            new_name = f"{name}_{unique_counts[name]}"
            while new_name in names or new_name in unique_names:
                unique_counts[name] += 1
                new_name = f"{name}_{unique_counts[name]}"
            unique_names.append(new_name)
    return unique_names


def _sanitize_columns(names: List[str]) -> Tuple[List[str], bool]:
    """Normalize column names in a CSV or Parquet file.

    Rules:
    - Replace any non-alphanumeric character with "_"
    - Strip leading/trailing underscores
    - Ensure uniqueness by appending suffixes: _1, _2, ...
    - Auto-name empty columns as auto_named_<n>

    Returns:
        (new_column_names, changed)
    """
    _SAN_RE = re.compile(r"[^0-9A-Za-z,\t]")
    # 1) Replace non-alphanumeric sequences with underscore
    new = [_SAN_RE.sub("_", n).strip("_") for n in names]

    # 2) Auto-name any empty column names to match UI behavior
    unnamed_counter = 0
    for i, n in enumerate(new):
        if not n:
            new[i] = f"auto_named_{unnamed_counter}"
            unnamed_counter += 1

    # 3) Ensure uniqueness (append suffixes where needed)
    new = _handle_duplicate_names(new)
    return new, new != names
def sanitize_file(src_path: str) -> Tuple[str, bool]:
    """Normalize column names in a CSV or Parquet file.

    Rules:
    - Replace any non-alphanumeric character with "_"
    - Strip leading/trailing underscores
    - Ensure uniqueness by appending suffixes: _1, _2, ...

    Returns (path, changed):
    - (src_path, False) if no changes were needed
    - (temp_path, True) if a sanitized temp file was written (caller must
      delete)
    """
    if src_path.endswith('.parquet'):
        pf = pq.ParquetFile(src_path)
        new_names, changed = _sanitize_columns(pf.schema.names)
        if not changed:
            return src_path, False
        temp_file = tempfile.NamedTemporaryFile(suffix='.parquet',
                                                delete=False)
        original_schema = pf.schema.to_arrow_schema()
        fields = [
            field.with_name(new_name)
            for field, new_name in zip(original_schema, new_names)
        ]
        sanitized_schema = pa.schema(fields)
        writer = pq.ParquetWriter(temp_file.name, sanitized_schema)
        for i in range(pf.num_row_groups):
            tbl = pf.read_row_group(i).rename_columns(new_names)
            writer.write_table(tbl)
        writer.close()
        return temp_file.name, True
    elif src_path.endswith('.csv'):
        cols = pd.read_csv(src_path, nrows=0).columns.tolist()
        new_cols, changed = _sanitize_columns(cols)
        if not changed:
            return src_path, False
        tmp = tempfile.NamedTemporaryFile(suffix='.csv', delete=False)
        tmp_path = tmp.name
        tmp.close()
        reader = pd.read_csv(src_path, chunksize=1_000_000)
        with open(tmp_path, 'w', encoding='utf-8', newline='') as out:
            out.write(','.join(new_cols) + '\n')
            for chunk in reader:
                chunk.columns = new_cols
                chunk.to_csv(out, header=False, index=False)
        return tmp_path, True
    else:
        raise ValueError(
            f"File {src_path} must be either a CSV or Parquet file.")
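# Editor's note: an illustrative sketch (not part of the packaged
# kumoai/connector/utils.py) showing the column sanitization rules on a
# hypothetical header.
def _editor_example_sanitize_columns() -> None:
    cols, changed = _sanitize_columns(["user id", "user id", "", "amount($)"])
    # Spaces and symbols become underscores, duplicates get numeric suffixes,
    # and empty names are auto-named.
    assert cols == ["user_id", "user_id_1", "auto_named_0", "amount"]
    assert changed is True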
def _upload_single_file(
    name: str,
    path: str,
    tqdm_bar_position: int = 0,
) -> None:
    if not (path.endswith(".parquet") or path.endswith(".csv")):
        raise ValueError(f"Path {path} must be either a CSV or Parquet file. "
                         "Partitioned data is not currently supported.")

    file_type = 'parquet' if path.endswith('parquet') else 'csv'
    path, temp_file_created = sanitize_file(path)
    sz = os.path.getsize(path)
    if tqdm_bar_position == 0:
        logger.info("Uploading table %s (path: %s), size=%s bytes", name, path,
                    sz)

    upload_res = _start_table_upload(table_name=name, file_type=file_type,
                                     file_size_bytes=sz)

    urls = list(upload_res.presigned_part_urls.values())
    loop = _KUMO_EVENT_LOOP
    part_metadata_list_fut = asyncio.run_coroutine_threadsafe(
        multi_put_bounded(
            urls=urls,
            data_iter=stream_read(open(path, 'rb'), CHUNK_SIZE),
            tqdm_bar_position=tqdm_bar_position,
            concurrency=min(4, len(urls)),
            upload_progress_cb=None,
            upload_subchunk_bytes=UPLOAD_CHUNK_BYTES,
        ),
        loop,
    )
    part_metadata_list = part_metadata_list_fut.result()

    if tqdm_bar_position == 0:
        logger.info("Upload complete. Validating table %s.", name)
    for i in range(5):
        try:
            _complete_table_upload(
                table_name=name,
                file_type=file_type,
                upload_path=upload_res.temp_upload_path,
                upload_id=upload_res.upload_id,
                parts_metadata=part_metadata_list,
            )
        except HTTPException as e:
            # TODO(manan): this can happen when DELETE above has
            # not propagated. So we retry with delay here. We
            # assume DELETE is processed reasonably quickly:
            if e.status_code == 500 and i < 4:
                time.sleep(2**(i - 1))
                continue
            else:
                raise e
        else:
            break

    if tqdm_bar_position == 0:
        logger.info("Completed uploading table %s to Kumo.", name)
    if temp_file_created:
        os.unlink(path)
def _upload_partitioned_parquet(name: str, path: str,
                                partition_size: int) -> None:
    r"""Upload a large parquet file by partitioning it into smaller chunks."""
    logger.info("File %s is large, partitioning for upload...", path)
    pf = pq.ParquetFile(path)
    new_columns, _ = _sanitize_columns(pf.schema.names)

    partitions: List[Tuple[int, List[int]]] = []
    part_idx = 0
    current_size = 0
    current_row_groups: list[int] = []

    for rg_idx in range(pf.num_row_groups):
        rg_size = pf.metadata.row_group(rg_idx).total_byte_size
        if rg_size > MAX_PARTITION_SIZE:
            raise ValueError(
                f"Row group {rg_idx} is larger than the maximum partition "
                f"size {MAX_PARTITION_SIZE} bytes")
        if current_size + rg_size > partition_size and current_row_groups:
            partitions.append((part_idx, current_row_groups.copy()))
            part_idx += 1
            current_row_groups = []
            current_size = 0
        current_row_groups.append(rg_idx)
        current_size += rg_size
    if current_row_groups:
        partitions.append((part_idx, current_row_groups))

    logger.info("Splitting %s into %d partitions", path, len(partitions))

    def writer(path: str, row_groups: List[int]) -> None:
        original_schema = pf.schema.to_arrow_schema()
        fields = [
            field.with_name(new_name)
            for field, new_name in zip(original_schema, new_columns)
        ]
        sanitized_schema = pa.schema(fields)
        pq_writer = pq.ParquetWriter(path, sanitized_schema)
        for rg_idx in row_groups:
            tbl = pf.read_row_group(rg_idx).rename_columns(new_columns)
            pq_writer.write_table(tbl)
        pq_writer.close()

    _upload_all_partitions(partitions, name, ".parquet", writer)
    logger.info("Upload complete. Validated table %s.", name)
def _upload_partitioned_csv(name: str, path: str, partition_size: int) -> None:
    r"""Upload a large CSV file by partitioning it into smaller chunks."""
    partitions: List[Tuple[int, List[str]]] = []
    part_idx = 0
    columns = pd.read_csv(path, nrows=0).columns.tolist()
    new_columns, _ = _sanitize_columns(columns)
    with open(path, 'r', encoding='utf-8') as f:
        _ = f.readline()
        header = ','.join(new_columns) + '\n'
        header_size = len(header.encode('utf-8'))
        current_lines = [header]
        current_size = header_size
        for line in f:
            line_size = len(line.encode('utf-8'))
            if (current_size + line_size > partition_size
                    and len(current_lines) > 1):
                partitions.append((part_idx, current_lines.copy()))
                part_idx += 1
                current_lines = [header]
                current_size = header_size
            current_lines.append(line)
            current_size += line_size
        if len(current_lines) > 1:
            partitions.append((part_idx, current_lines))

    logger.info("Splitting %s into %d partitions", path, len(partitions))

    def writer(path: str, lines: List[str]) -> None:
        with open(path, "w", encoding="utf-8") as f:
            f.writelines(lines)

    _upload_all_partitions(partitions, name, ".csv", writer)
    logger.info("Upload complete. Validated table %s.", name)
def _upload_all_partitions(
    partitions: List[Tuple[int, Any]],
    name: str,
    file_suffix: str,
    writer: Callable[[str, Any], None],
) -> None:
    with tqdm(partitions, desc=f"Uploading {name}", position=0) as pbar:
        for part_idx, partition_data in pbar:
            partition_desc = f"Part {part_idx+1}/{len(partitions)}"
            pbar.set_postfix_str(partition_desc)
            _create_and_upload_partition(
                name=name,
                part_idx=part_idx,
                file_suffix=file_suffix,
                partition_writer=writer,
                partition_data=partition_data,
                tqdm_bar_position=1,
            )
def _create_and_upload_partition(
    name: str,
    part_idx: int,
    file_suffix: str,
    partition_writer: Callable[[str, Any], None],
    partition_data: Any,
    tqdm_bar_position: int = 0,
) -> None:
    r"""Create a partition file, write to it, upload it, and delete the
    local copy.
    """
    partition_name = (f"{name}{file_suffix}/"
                      f"part_{part_idx+1:04d}{file_suffix}")
    with tempfile.NamedTemporaryFile(suffix=file_suffix,
                                     delete=False) as temp_file:
        partition_path = temp_file.name

    try:
        partition_writer(partition_path, partition_data)
        _upload_single_file(partition_name, partition_path,
                            tqdm_bar_position=tqdm_bar_position)
    finally:
        try:
            os.unlink(partition_path)
        except OSError:
            pass
def delete_uploaded_table(name: str, file_type: str) -> None:
    r"""Synchronously deletes a previously uploaded table from the Kumo data
    plane.

    .. code-block:: python

        import kumoai
        from kumoai.connector import delete_uploaded_table

        # Assume we have uploaded a `.parquet` table named `users`,
        # and we want to delete this table from Kumo:
        delete_uploaded_table(name="users", file_type="parquet")

        # Assume we have uploaded a `.csv` table named `orders`,
        # and we want to delete this table from Kumo:
        delete_uploaded_table(name="orders", file_type="csv")

    Args:
        name: The name of the table to be deleted. This table must have
            previously been uploaded with a call to
            :meth:`~kumoai.connector.upload_table`.
        file_type: The file type of the table to be deleted; this can either
            be :obj:`"parquet"` or :obj:`"csv"`
    """
    assert file_type in {'parquet', 'csv'}
    req = DeleteUploadedFileRequest(
        source_table_name=name,
        connector_id=CONNECTOR_ID_MAP[file_type],
    )
    global_state.client.connector_api.delete_file_upload(req)
    logger.info("Successfully deleted table %s from Kumo.", name)
def replace_table(name: str, path: str, file_type: str) -> None:
    r"""Replaces an existing uploaded table on the Kumo data plane with a new
    table.

    .. code-block:: python

        import kumoai
        from kumoai.connector import replace_table

        # Replace an existing `.csv` table named `users`
        # with a new version located at `/data/new_users.csv`:
        replace_table(
            name="users",
            path="/data/new_users.csv",
            file_type="csv",
        )

    Args:
        name: The name of the table to be replaced. This table must have
            previously been uploaded with a call to
            :meth:`~kumoai.connector.upload_table`.
        path: The full path of the new table to be uploaded, on the
            local machine.
        file_type: The file type of the table to be replaced; this
            can either be :obj:`"parquet"` or :obj:`"csv"`.

    Raises:
        ValueError: If the specified path does not point to a valid
            `.csv` or `.parquet` file.
    """
    if not (path.endswith(".parquet") or path.endswith(".csv")):
        raise ValueError(f"Path {path} must be either a CSV or Parquet file. "
                         "Partitioned data is not currently supported.")
    try:
        logger.info("Deleting previously uploaded table %s of type %s.", name,
                    file_type)
        delete_uploaded_table(name=name, file_type=file_type)
    except Exception:
        pass
    logger.info("Uploading table %s.", name)
    upload_table(name=name, path=path)
    logger.info("Successfully replaced table %s with the new table.", name)
def _start_table_upload(
    table_name: str,
    file_type: str,
    file_size_bytes: float,
) -> StartFileUploadResponse:
    assert file_type in CONNECTOR_ID_MAP.keys()
    req = StartFileUploadRequest(
        source_table_name=table_name,
        connector_id=CONNECTOR_ID_MAP[file_type],
        num_parts=max(1, math.ceil(file_size_bytes / CHUNK_SIZE)),
    )
    return global_state.client.connector_api.start_file_upload(req)


def _start_table_upload_with_parts(
    table_name: str,
    file_type: str,
    file_size_bytes: int,
    num_parts: int,
) -> StartFileUploadResponse:
    assert file_type in CONNECTOR_ID_MAP.keys()
    req = StartFileUploadRequest(
        source_table_name=table_name,
        connector_id=CONNECTOR_ID_MAP[file_type],
        num_parts=max(1, int(num_parts)),
    )
    return global_state.client.connector_api.start_file_upload(req)


def _complete_table_upload(
    table_name: str,
    file_type: str,
    upload_path: str,
    upload_id: str,
    parts_metadata: List[PartUploadMetadata],
) -> None:
    assert file_type in CONNECTOR_ID_MAP.keys()
    req = CompleteFileUploadRequest(
        source_table_name=table_name,
        connector_id=CONNECTOR_ID_MAP[file_type],
        temp_upload_path=str(upload_path),
        upload_id=str(upload_id),
        parts_metadata=parts_metadata,
        # Server-side validation is disabled because client-side (SDK)
        # validation is now comprehensive and eliminates the need for
        # additional server-side validation.
        validate_data=False,
    )
    return global_state.client.connector_api.complete_file_upload(req)


# -----------------------
# Remote I/O (fsspec)
# -----------------------

# Define data type for filesystem that does not depend on fsspec
Filesystem = Any
def _make_filesystem(scheme: str) -> Filesystem:
    if scheme == "s3":
        try:
            import fsspec  # noqa: F401
            import s3fs  # noqa: F401
        except Exception:
            raise ImportError(
                "S3 paths require 's3fs'. Install: pip install s3fs")
        fs = fsspec.filesystem("s3")
    elif scheme == "gs":
        try:
            import fsspec  # noqa: F401
            import gcsfs  # noqa: F401
        except Exception:
            raise ImportError(
                "GCS paths require 'gcsfs'. Install: pip install gcsfs")
        fs = fsspec.filesystem("gcs")
    elif scheme in ("abfs", "abfss", "az"):
        try:
            import adlfs  # noqa: F401
            import fsspec  # noqa: F401
        except Exception:
            raise ImportError(
                "Azure paths require 'adlfs'. Install: pip install adlfs")
        fs = fsspec.filesystem(scheme)
    else:
        raise ValueError(f"Unsupported remote scheme: {scheme}")
    return fs


def _get_fs_and_path(url: str) -> Tuple[Filesystem, str]:
    parsed = urlparse(url)
    scheme = parsed.scheme
    fs = _make_filesystem(scheme)
    return fs, url
def _remote_info(fs: Filesystem, path: str) -> dict:
    try:
        info = fs.info(path)
        if info.get("type") in ("file", "directory"):
            return info
        # s3fs for directories can return {'Key':..., 'Size':...}; normalize
        if info.get("Size") is not None and info.get("Key"):
            return {
                "type": "file",
                "size": info.get("Size"),
                "name": info.get("Key")
            }
        return info
    except Exception as e:
        raise RuntimeError(f"Failed to stat remote path {path}: {e}")


def _remote_dir_manifest(fs: Filesystem, path: str) -> dict:
    # Return lists of parquet and csv entries with size
    try:
        listing = fs.ls(path, detail=True)
    except Exception as e:
        raise RuntimeError(f"Failed to list remote directory {path}: {e}")

    parquet_files: List[dict] = []
    csv_files: List[dict] = []
    for ent in listing:
        if isinstance(ent, dict):
            p = ent.get("name") or ent.get("Key") or ent.get("path")
            s = ent.get("size") or ent.get("Size") or 0
            t = ent.get("type") or ent.get("StorageClass") or ""
            if t == "directory":
                continue
        else:
            p = ent
            try:
                s = fs.info(p).get("size", 0)
            except Exception:
                s = 0
        if not isinstance(p, str):
            continue
        ext = os.path.splitext(p.lower())[1]
        if ext == ".parquet":
            parquet_files.append({"path": p, "size": int(s or 0)})
        elif ext == ".csv":
            csv_files.append({"path": p, "size": int(s or 0)})

    return {"parquet": parquet_files, "csv": csv_files}
def _read_remote_file_with_progress(
    fs: Filesystem,
    path: str,
    expected_size: Optional[int],
    update_bytes: Optional[Callable[[int], Optional[bool]]] = None,
    capture_first_line: bool = False,
) -> Tuple[io.BytesIO, memoryview, Optional[bytes]]:
    """Stream into a single BytesIO (one allocation) and return a zero-copy
    memoryview.
    """
    buf = io.BytesIO()

    header_line: Optional[bytes] = None
    if capture_first_line:
        header_acc = bytearray()
        seen_nl = False
    else:
        header_acc = bytearray()
        seen_nl = True

    with fs.open(path, "rb") as fobj:
        while True:
            chunk = fobj.read(READ_CHUNK_BYTES)
            if not chunk:
                break
            if capture_first_line and not seen_nl:
                nl_idx = chunk.find(b"\n")
                if nl_idx != -1:
                    header_acc += chunk[:nl_idx]
                    # small copy only for header
                    header_line = bytes(header_acc)
                    seen_nl = True
                else:
                    header_acc += chunk
            buf.write(chunk)
            if update_bytes:
                try:
                    update_bytes(len(chunk))
                except Exception:
                    pass

    if capture_first_line and not seen_nl:
        header_line = bytes(header_acc)

    mv = buf.getbuffer()  # zero-copy view of BytesIO internal buffer
    return buf, mv, header_line


# -----------------------
# Memory budget & helpers
# -----------------------
def _compute_mem_budget_bytes(files: List[dict]) -> int:
    # 50% of system RAM
    try:
        import psutil
        total = psutil.virtual_memory().total
    except Exception:
        total = 8 * 1024**3  # assume 8 GiB
    budget = int(total * 0.50)
    return max(budget, 512 * 1024**2)  # at least 512 MiB


class MemoryBudget:
    """A byte-level semaphore to prevent OOM when reading many shards."""
    def __init__(self, budget_bytes: int) -> None:
        self.budget = budget_bytes
        self.avail = budget_bytes
        self.cv = threading.Condition()

    def acquire(self, need: int) -> None:
        with self.cv:
            while self.avail < need:
                self.cv.wait(timeout=0.25)
            self.avail -= need

    def release(self, freed: int) -> None:
        with self.cv:
            self.avail += freed
            if self.avail > self.budget:
                self.avail = self.budget
            self.cv.notify_all()
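# Editor's note: an illustrative sketch (not part of the packaged
# kumoai/connector/utils.py) of the intended acquire/release pattern around a
# shard read; `files`, `shard_size`, and `overhead` are hypothetical.
#
#     budget = MemoryBudget(_compute_mem_budget_bytes(files))
#     budget.acquire(shard_size + overhead)   # blocks until bytes are free
#     try:
#         ...  # read the shard into memory, validate, upload
#     finally:
#         budget.release(shard_size + overhead)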
def _determine_parallelism(files: List[dict], requested: Optional[int]) -> int:
    if requested is not None and requested > 0:
        return min(requested, len(files))
    env_par = os.getenv("KUMO_UPLOAD_PARALLELISM")
    if env_par:
        try:
            val = int(env_par)
            if val > 0:
                return min(val, len(files))
        except Exception:
            pass

    budget_bytes = _compute_mem_budget_bytes(files)
    # 128 MiB overhead by default
    try:
        overhead_bytes = max(0, int(os.getenv("KUMO_UPLOAD_OVERHEAD_MB",
                                              "128"))) * 1024**2
    except Exception:
        overhead_bytes = 128 * 1024**2

    needs = []
    for f in files:
        size = int(f.get("size") or 0)
        if size <= 0:
            continue
        needs.append(size + overhead_bytes)
    if not needs:
        return 1
    needs.sort()
    median_need = needs[len(needs) // 2]
    par = max(1, budget_bytes // max(1, median_need))
    return min(int(par), len(files))


def _iter_mv_chunks(mv: memoryview,
                    part_size: int) -> Generator[memoryview, None, None]:
    pos = 0
    n = mv.nbytes
    while pos < n:
        nxt = min(n, pos + part_size)
        yield mv[pos:nxt]  # zero-copy slice
        pos = nxt


# -----------------------
# Parquet helpers
# -----------------------
def _parquet_schema_from_bytes(data_mv: memoryview) -> pa.Schema:
    reader = pa.BufferReader(pa.py_buffer(data_mv))
    pf = pq.ParquetFile(reader)

    # zero-row guard via metadata (no data scan)
    if getattr(pf.metadata, "num_rows", None) == 0:
        raise ValueError("Parquet file contains zero rows.")

    return pf.schema_arrow


def _parquet_num_rows_from_bytes(data_mv: memoryview) -> int:
    buf = pa.py_buffer(data_mv)
    reader = pa.BufferReader(buf)
    pf = pq.ParquetFile(reader)
    md = pf.metadata
    if md is None:
        total = 0
        for rg in range(pf.num_row_groups):
            total += pf.metadata.row_group(rg).num_rows
        return total
    return md.num_rows
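# Editor's note: an illustrative sketch (not part of the packaged
# kumoai/connector/utils.py) of the in-memory Parquet helpers above, using a
# hypothetical three-row table.
def _editor_example_parquet_helpers() -> None:
    sink = io.BytesIO()
    pq.write_table(pa.table({"id": [1, 2, 3]}), sink)
    mv = sink.getbuffer()
    schema = _parquet_schema_from_bytes(mv)  # schema with a single int64 'id'
    assert schema.names == ["id"]
    assert _parquet_num_rows_from_bytes(mv) == 3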
def validate_parquet_schema(schema: pa.Schema, source_name: str) -> None:
    """Validate a PyArrow schema for Kumo compatibility (source_name
    required).

    Disallowed:
    - All large_* types: large_string, large_binary, large_list<*>
    - Any time-of-day types (time32/64<*>); ONLY epoch-based timestamps are
      allowed
    - Any duration types (e.g., pa.duration('ns'))
    - list<string> and list<bool>
    - Unsigned integers (uint8/16/32/64)
    - Null-typed columns

    Allowed:
    - boolean, signed integer, floating, (regular) string, date, timestamp
      (epoch-based), (regular) binary
    - decimal up to configured precision (env KUMO_DECIMAL_MAX_PRECISION,
      default 18)
    - list of {signed integer, float}
    - dictionary<int, string>

    Raises:
        ValueError listing offending columns (including source_name).
    """
    try:
        max_dec_prec = int(os.getenv("KUMO_DECIMAL_MAX_PRECISION", "18"))
    except Exception:
        max_dec_prec = 18

    where = f" in {source_name}"
    errors: list[str] = []

    for col, dt in zip(schema.names, schema.types):
        # 1) Hard-disallow all large_* types
        if pa.types.is_large_string(dt):
            errors.append(
                f" - column '{col}'{where} has unsupported type large_string")
            continue
        if pa.types.is_large_binary(dt):
            errors.append(
                f" - column '{col}'{where} has unsupported type large_binary")
            continue
        if pa.types.is_large_list(dt):
            errors.append(
                f" - column '{col}'{where} has unsupported type {dt} "
                f"(large_list not supported)")
            continue

        # 2) Disallow time-of-day and duration
        if pa.types.is_time(dt):
            errors.append(
                f" - column '{col}'{where} has unsupported time-of-day type "
                f"'{dt}' (only epoch-based timestamps are supported)")
            continue
        if pa.types.is_duration(dt):
            errors.append(
                f" - column '{col}'{where} has unsupported duration "
                f"type '{dt}'")
            continue

        # 3) Disallow unsigned integers and null columns
        if pa.types.is_unsigned_integer(dt):
            errors.append(
                f" - column '{col}'{where} has unsupported unsigned integer "
                f"type '{dt}'")
            continue
        if pa.types.is_null(dt):
            errors.append(
                f" - column '{col}'{where} has unsupported null type '{dt}'")
            continue

        supported = (
            pa.types.is_boolean(dt)
            # signed ints only
            or (pa.types.is_integer(dt)
                and not pa.types.is_unsigned_integer(dt)) or
            pa.types.is_floating(dt) or
            pa.types.is_string(dt)  # regular string only
            or pa.types.is_date(dt) or
            pa.types.is_timestamp(dt)  # epoch-based timestamps
            or pa.types.is_binary(dt)  # regular binary only
        )

        # 4) Decimals with precision limit
        if not supported and pa.types.is_decimal(dt):
            try:
                prec = int(getattr(dt, "precision", 0) or 0)
            except Exception:
                prec = 0
            if 0 < prec <= max_dec_prec:
                supported = True
            else:
                errors.append(
                    f" - column '{col}'{where} has unsupported decimal "
                    f"precision {prec} (max {max_dec_prec}): type '{dt}'")
                continue

        # 5) Lists: only list of {signed int, float}; explicitly deny
        # list<string> and list<bool>
        if not supported and pa.types.is_list(dt):
            elem = dt.value_type
            if pa.types.is_string(elem):
                errors.append(
                    f" - column '{col}'{where} is {dt} (list<string> not "
                    f"supported)")
                continue
            if pa.types.is_boolean(elem):
                errors.append(f" - column '{col}'{where} is {dt} (list<bool> "
                              f"not supported)")
                continue
            if pa.types.is_integer(
                    elem) and not pa.types.is_unsigned_integer(elem):
                supported = True
            elif pa.types.is_floating(elem):
                supported = True
            else:
                errors.append(
                    f" - column '{col}'{where} is {dt} (only list of signed "
                    f"int/float supported)")
                continue

        # 6) Dictionary<int, string> only
        if not supported and pa.types.is_dictionary(dt):
            if (pa.types.is_integer(dt.index_type)
                    and not pa.types.is_unsigned_integer(dt.index_type)
                    and pa.types.is_string(dt.value_type)):
                supported = True

        if not supported:
            errors.append(
                f" - column '{col}'{where} has unsupported type '{dt}'")

    if errors:
        raise ValueError(
            "Unsupported Parquet Data Types detected:\n\n" +
            "\n".join(errors) + "\n\nAllowed types: boolean, signed integer, "
            "float, (regular) string, date, "
            "timestamp (epoch-based), (regular) binary, "
            "decimal (<= configured precision), "
            "list of {signed int, float}, dictionary<int,string>.\n"
            "Disallowed examples: large_string, large_binary, "
            "large_list<*>, time32/64<*>, "
            "duration('unit'), list<string>, list<bool>, "
            "unsigned integers, null columns, "
            "structs, maps, and other nested types.")
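# Editor's note: an illustrative sketch (not part of the packaged
# kumoai/connector/utils.py) exercising the validator above; the schemas are
# hypothetical.
def _editor_example_validate_parquet_schema() -> None:
    ok = pa.schema([("id", pa.int64()), ("ts", pa.timestamp("us"))])
    validate_parquet_schema(ok, "example.parquet")  # passes silently
    bad = pa.schema([("note", pa.large_string())])
    try:
        validate_parquet_schema(bad, "example.parquet")
    except ValueError as e:
        assert "large_string" in str(e)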
# -----------------------
# CSV helpers
# -----------------------
def _detect_and_validate_csv(head_bytes: bytes) -> str:
    r"""Detect a CSV delimiter from a small head sample and verify it.

    - Uses csv.Sniffer (preferred delimiters: | , ; \t) with fallback to ','.
    - Reads a handful of complete, quote-aware records (handles newlines inside
      quotes).
    - Re-serializes those rows and validates with pandas (small nrows) to catch
      malformed inputs.
    - Raises ValueError on empty input or if parsing fails with the chosen
      delimiter.
    """
    if not head_bytes:
        raise ValueError("Could not auto-detect a delimiter: file is empty.")

    text = head_bytes.decode("utf-8", errors="ignore").replace("\r\n",
                                                               "\n").replace(
                                                                   "\r", "\n")

    # 1) Detect delimiter (simple preference list; no denylist)
    try:
        delimiter = csv.Sniffer().sniff(text, delimiters="|,;\t").delimiter
    except Exception:
        logger.warning("No separator found in sample; defaulting to ','.")
        delimiter = ','

    # 2) Pull a few complete records with csv.reader (quote-aware,
    # handles embedded newlines)
    rows = []
    try:
        rdr = csv.reader(io.StringIO(text), delimiter=delimiter, quotechar='"',
                         doublequote=True)
        for _ in range(50):  # small, bounded sample
            try:
                rows.append(next(rdr))
            except StopIteration:
                break
    except Exception as e:
        raise ValueError(
            f"Could not auto-detect a valid delimiter. Tried '{delimiter}', "
            f"csv parse failed: {repr(e)}")

    if not rows:
        raise ValueError(
            "Could not auto-detect a valid delimiter: no complete records "
            "found.")

    # 3) Re-serialize snippet and validate minimally with pandas
    out = io.StringIO()
    w = csv.writer(out, delimiter=delimiter, lineterminator="\n",
                   quotechar='"', doublequote=True)
    for r in rows:
        w.writerow(r)

    try:
        pd.read_csv(
            io.StringIO(out.getvalue()),
            sep=delimiter,
            index_col=False,
            on_bad_lines='error',
            nrows=50,
            engine="python",  # more tolerant for quoted/newline combos
            skip_blank_lines=False,
        )
    except Exception as e:
        raise ValueError(
            f"Could not auto-detect a valid delimiter. Tried '{delimiter}', "
            f"pandas parse failed: {repr(e)}")

    return delimiter
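# Editor's note: an illustrative sketch (not part of the packaged
# kumoai/connector/utils.py) of delimiter detection on a hypothetical
# pipe-delimited sample.
def _editor_example_detect_delimiter() -> None:
    sample = b"id|name|amount\n1|alice|10\n2|bob|20\n"
    assert _detect_and_validate_csv(sample) == "|"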
def _csv_has_data_rows(data_mv: memoryview) -> bool:
    """Return True if any non-newline, non-carriage-return byte exists after
    the first newline. Uses zero-copy iteration over the memoryview to avoid
    duplicating buffers.
    """
    mv = data_mv
    if mv.format != 'B':
        try:
            mv = mv.cast('B')  # zero-copy view of bytes
        except TypeError:
            # fallback: create a contiguous view via slicing (still zero-copy)
            mv = mv[:]

    saw_newline = False
    # Iterate in a single pass; break as soon as we see a data-ish byte
    for b in mv:
        if not saw_newline:
            if b == 10:  # '\n'
                saw_newline = True
            continue
        # after header newline: any byte that isn't CR or LF counts as data
        if b not in (10, 13):
            return True
    return False
def _maybe_rewrite_csv_header_buffer(
    data_mv: memoryview,
    header_line: bytes,
    delimiter: str,
) -> tuple[Optional[io.BytesIO], memoryview, bytes, list[str], dict[str, str],
           bool]:
    """Rewrite ONLY the header if needed. Uses a new BytesIO but frees the old
    buffer immediately after swap.
    """
    try:
        header_str = header_line.decode("utf-8").rstrip("\r\n")
    except UnicodeDecodeError:
        raise ValueError("CSV header is not valid UTF-8.")

    orig_cols = [c.strip() for c in header_str.split(delimiter)]
    new_cols, changed = _sanitize_columns(orig_cols)
    if not changed:
        return None, data_mv, header_line, orig_cols, {}, False

    rename_map = {o: n for o, n in zip(orig_cols, new_cols) if o != n}

    nl_idx = len(header_line)
    if nl_idx >= data_mv.nbytes:
        raise ValueError("Malformed CSV: newline not found in header.")

    new_header_bytes = delimiter.join(new_cols).encode("utf-8")
    new_buf = io.BytesIO()
    new_buf.write(new_header_bytes)
    new_buf.write(b"\n")
    # Write the remainder via a zero-copy memoryview slice; BytesIO will copy
    # into its own buffer, but we free the original immediately after returning
    # to avoid double residency.
    new_buf.write(data_mv[nl_idx + 1:])
    new_mv = new_buf.getbuffer()
    return new_buf, new_mv, new_header_bytes, new_cols, rename_map, True

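A hedged usage sketch follows; whether the header is actually rewritten, and which columns land in the rename map, depends on _sanitize_columns, which is defined earlier in this module and not shown in this excerpt.

# Illustrative only, not part of the module.
raw = io.BytesIO(b"col one,col two\n1,2\n")
mv = raw.getbuffer()
result = _maybe_rewrite_csv_header_buffer(mv, b"col one,col two", ",")
new_buf, new_mv, header, cols, renames, changed = result
# changed is False (and new_buf is None) when _sanitize_columns leaves the
# header untouched; otherwise new_buf/new_mv hold the rewritten file.
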
# -----------------------
# Remote upload (refactor)
# -----------------------
@dataclass
class _RemoteSettings:
    part_size: int
    part_conc: int
    overhead_bytes: int
    parallelism_override: Optional[int]


def _make_remote_settings(parallelism: Optional[int]) -> _RemoteSettings:
    part_mb = int(os.getenv("KUMO_REMOTE_PART_MB", "64"))
    part_size = max(8, part_mb) * 1024**2
    part_conc = int(os.getenv("KUMO_REMOTE_PART_CONCURRENCY", "4"))
    try:
        overhead_bytes = max(0, int(os.getenv("KUMO_UPLOAD_OVERHEAD_MB",
                                              "128"))) * 1024**2
    except Exception:
        overhead_bytes = 128 * 1024**2
    return _RemoteSettings(
        part_size=part_size,
        part_conc=part_conc,
        overhead_bytes=overhead_bytes,
        parallelism_override=parallelism,
    )

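The environment variables read above tune part size, per-file part concurrency, and the per-file memory overhead. A small sketch of the resulting settings object (illustrative only):

os.environ["KUMO_REMOTE_PART_MB"] = "128"        # 128 MiB parts
os.environ["KUMO_REMOTE_PART_CONCURRENCY"] = "8"
st = _make_remote_settings(parallelism=None)
assert st.part_size == 128 * 1024**2
assert st.part_conc == 8
assert st.parallelism_override is None
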
def _remote_upload_file(name: str, fs: Filesystem, url: str, info: dict,
                        st: _RemoteSettings, file_type: Optional[str]) -> None:
    detected_ftype = _validate_url_ext(url, file_type)

    size = int(info.get("size") or 0)
    if size == 0:
        raise ValueError(f"Remote file {url} is empty (0 bytes).")
    if size > MAX_PARTITION_SIZE:
        raise ValueError(
            "Remote single-file uploads larger than 1GB are not supported. "
            "Please re-partition the source into ~200MB chunks and upload the "
            "whole directory instead.")

    # Read with progress
    with tqdm(total=size, desc=f"Reading {_short_path(url)}", unit="B",
              unit_scale=True, unit_divisor=1024, position=0, leave=False,
              smoothing=0.1) as read_bar:
        tr0 = time.perf_counter()
        buf, data_mv, header_line = _read_remote_file_with_progress(
            fs, url, expected_size=size, update_bytes=read_bar.update,
            capture_first_line=(detected_ftype == "csv"))
        tread = time.perf_counter() - tr0

    # Validate/sanitize
    tv0 = time.perf_counter()
    renamed_cols_msg = None
    if detected_ftype == "parquet":
        schema = _parquet_schema_from_bytes(data_mv)
        _validate_columns_or_raise(list(schema.names))
        validate_parquet_schema(schema, url)
        nrows = _parquet_num_rows_from_bytes(data_mv)
        if nrows <= 0:
            raise ValueError("Parquet file has zero rows.")
        file_type = "parquet"
    else:
        head_len = min(50000, data_mv.nbytes)
        # small bounded copy only for sniffing
        head = bytes(data_mv[:head_len])
        delimiter = _detect_and_validate_csv(head)
        if header_line is None:
            # Shouldn't happen (we captured it during read), but keep a bounded
            # fallback (64 KiB)
            prefix_len = min(64 * 1024, data_mv.nbytes)
            prefix = data_mv[:prefix_len]
            # build header_line from prefix without large copies
            acc = bytearray()
            for b in (prefix.cast('B') if prefix.format != 'B' else prefix):
                if b == 10:  # '\n'
                    break
                acc.append(b)
            header_line = bytes(acc)
        new_buf, new_mv, new_header, cols, rename_map, changed = (
            _maybe_rewrite_csv_header_buffer(data_mv, header_line, delimiter))
        if changed:
            try:
                buf.close()
            except Exception:
                pass
            buf = new_buf  # type: ignore[assignment]
            data_mv = new_mv
            header_line = new_header
            if rename_map:
                pairs = ", ".join(f"{k}->{v}" for k, v in rename_map.items())
                renamed_cols_msg = f"CSV header sanitized (renamed): {pairs}"
        if not _csv_has_data_rows(data_mv):
            raise ValueError(
                "CSV file has zero data rows (only header present).")
        file_type = "csv"
    tval = time.perf_counter() - tv0

    # Multipart upload
    size_bytes = data_mv.nbytes
    num_parts = max(1, math.ceil(size_bytes / st.part_size))
    upload_res = _start_table_upload_with_parts(table_name=name,
                                                file_type=file_type,
                                                file_size_bytes=size_bytes,
                                                num_parts=num_parts)
    try:
        urls = [
            u for k, u in sorted(upload_res.presigned_part_urls.items(),
                                 key=lambda kv: int(kv[0]))
        ]
    except Exception:
        urls = list(upload_res.presigned_part_urls.values())

    loop = _KUMO_EVENT_LOOP
    with tqdm(total=size_bytes, desc="Uploading", unit="B", unit_scale=True,
              unit_divisor=1024, position=2, leave=False,
              smoothing=0.1) as upload_bar:
        part_metadata_list_fut = asyncio.run_coroutine_threadsafe(
            multi_put_bounded(
                urls=urls,
                data_iter=_iter_mv_chunks(data_mv, st.part_size),
                tqdm_bar_position=3,
                concurrency=max(1, min(st.part_conc, len(urls))),
                upload_progress_cb=lambda n: _safe_bar_update(upload_bar, n),
                upload_subchunk_bytes=UPLOAD_CHUNK_BYTES,
            ),
            loop,
        )
        part_metadata_list = part_metadata_list_fut.result()
        upload_bar.set_postfix_str(f"Done — {_short_path(url)}")
        upload_bar.refresh()

    # Complete
    tu0 = time.perf_counter()
    for i in range(5):
        try:
            _complete_table_upload(
                table_name=name,
                file_type=file_type,
                upload_path=upload_res.temp_upload_path,
                upload_id=upload_res.upload_id,
                parts_metadata=part_metadata_list,
            )
        except HTTPException as e:
            if e.status_code == 500 and i < 4:
                time.sleep(2**(i - 1))
                continue
            else:
                raise
        else:
            break
    tupl = time.perf_counter() - tu0

    _log_file_timing("single-file(multipart)", url, size_bytes, tread, tval,
                     tupl)
    if renamed_cols_msg:
        logger.info(renamed_cols_msg)

    try:
        if buf:
            buf.close()
    except Exception:
        pass
    del buf, data_mv, header_line
    gc.collect()

    logger.info("Upload complete. Validated table %s.", name)

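The single-file path above splits the in-memory buffer into fixed-size parts before requesting presigned part URLs; the part count is simply the byte size divided by the configured part size, rounded up. A worked example (illustrative only):

import math
# A 300 MiB object with the default 64 MiB part size is uploaded in 5 parts.
assert math.ceil((300 * 1024**2) / (64 * 1024**2)) == 5
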
def _remote_upload_directory(
    name: str,
    fs: Filesystem,
    url: str,
    info: dict,
    st: _RemoteSettings,
    file_type: Optional[str] = None,  # "csv", "parquet", or None
) -> None:
    manifest = _remote_dir_manifest(fs, url)
    parquet_files = sorted(manifest["parquet"], key=lambda x: x["path"])
    csv_files = sorted(manifest["csv"], key=lambda x: x["path"])

    # Normalize expected type
    if file_type not in (None, "csv", "parquet"):
        raise ValueError("file_type must be 'csv', 'parquet', or None.")

    # Resolve files + detected type
    if file_type is None:
        if not parquet_files and not csv_files:
            raise ValueError("Directory contains no .parquet or .csv files.")
        if parquet_files and csv_files:
            raise ValueError(
                "Mixed CSV and Parquet files detected; keep only one format.")
        files = parquet_files if parquet_files else csv_files
        detected_type = "parquet" if parquet_files else "csv"
    elif file_type == "parquet":
        if not parquet_files:
            raise ValueError(
                "Directory contains no .parquet files (file_type='parquet').")
        if csv_files:
            raise ValueError(
                "Directory also contains CSV files; remove them or set "
                "file_type=None.")
        files, detected_type = parquet_files, "parquet"
    else:  # file_type == "csv"
        if not csv_files:
            raise ValueError(
                "Directory contains no .csv files (file_type='csv').")
        if parquet_files:
            raise ValueError(
                "Directory also contains Parquet files; remove them or "
                "set file_type=None.")
        files, detected_type = csv_files, "csv"

    total_bytes = sum(int(f.get("size") or 0) for f in files)

    too_large = [
        f["path"] for f in files if (f.get("size") or 0) > MAX_PARTITION_SIZE
    ]
    zero_bytes = [f["path"] for f in files if (f.get("size") or 0) == 0]
    if zero_bytes:
        raise ValueError(
            f"Found zero-byte {detected_type.upper()} files: {zero_bytes[:3]}"
            f"{'...' if len(zero_bytes) > 3 else ''}")
    if too_large:
        raise ValueError(
            f"The following files exceed 1GB and must be re-partitioned "
            f"(~200MB each): "
            f"{too_large[:3]}{'...' if len(too_large) > 3 else ''}")

    par = _determine_parallelism(files, requested=st.parallelism_override)
    par = max(1, min(par, len(files)))
    budget_bytes = _compute_mem_budget_bytes(files)
    mem_budget = MemoryBudget(budget_bytes)

    from collections import deque
    with (tqdm(total=len(files),
               desc=f"Files ({len(files)}) [{detected_type}] | par={par}",
               position=0) as file_bar,
          tqdm(total=total_bytes, desc="Total bytes (read)", unit="B",
               unit_scale=True, unit_divisor=1024, position=1,
               smoothing=0.1) as bytes_bar,
          tqdm(total=total_bytes, desc="Total bytes (uploaded)", unit="B",
               unit_scale=True, unit_divisor=1024, position=2,
               smoothing=0.1) as uploaded_bar):

        status_lock = threading.Lock()
        recent_paths: Deque[str] = deque(maxlen=5)
        completed_files = {"n": 0}
        file_bar.set_postfix_str(f"Uploaded 0/{len(files)}")
        file_bar.refresh()

        rename_aggregate_lock = threading.Lock()
        rename_aggregate: dict[str, str] = {}

        def _merge_status_update(path: str) -> None:
            with status_lock:
                completed_files["n"] += 1
                recent_paths.append(path)
                tail = ' | '.join(_short_path(p) for p in list(recent_paths))
                msg = f"Uploaded {completed_files['n']}/{len(files)}"
                if tail:
                    msg += f" — {tail}"
                with _TQDM_LOCK:
                    file_bar.set_postfix_str(msg)
                    file_bar.refresh()

        ref_schema_fields: Dict[str, Any] = {"value": None}
        ref_cols: Dict[str, Any] = {"value": None}

        def _worker(idx: int, fmeta: dict) -> None:
            fpath = fmeta["path"]
            fsize = int(fmeta.get("size") or 0)
            need_bytes = (2 * fsize + st.overhead_bytes
                          if detected_type == "csv"
                          else fsize + st.overhead_bytes)
            mem_budget.acquire(need_bytes)
            try:
                tr0 = time.perf_counter()
                buf, data_mv, header_line = _read_remote_file_with_progress(
                    fs,
                    fpath,
                    expected_size=fsize if fsize > 0 else None,
                    update_bytes=lambda n: _safe_bar_update(bytes_bar, n),
                    capture_first_line=(detected_type == "csv"),
                )
                tread = time.perf_counter() - tr0

                tv0 = time.perf_counter()
                if detected_type == "parquet":
                    schema = _parquet_schema_from_bytes(data_mv)
                    names = list(schema.names)
                    _validate_columns_or_raise(names)
                    validate_parquet_schema(schema, fpath)
                    nrows = _parquet_num_rows_from_bytes(data_mv)
                    if nrows <= 0:
                        raise ValueError(
                            f"Parquet file has zero rows: {fpath}")
                    fields = [(fld.name, fld.type) for fld in schema]
                    if ref_schema_fields["value"] is None:
                        ref_schema_fields["value"] = fields
                    elif fields != ref_schema_fields["value"]:
                        ref_names = [n for n, _ in ref_schema_fields["value"]]
                        raise ValueError(
                            "Parquet schema mismatch across files. "
                            f"First file columns: {ref_names}; mismatched "
                            f"file: {fpath}")
                    part_name = f"{name}.parquet/part_{idx:04d}.parquet"

                else:
                    head_len = min(50000, data_mv.nbytes)
                    # bounded small copy for sniffing
                    head = bytes(data_mv[:head_len])
                    delimiter = _detect_and_validate_csv(head)
                    if header_line is None:
                        # Bounded fallback (64 KiB) to extract header without
                        # copying whole file
                        prefix_len = min(64 * 1024, data_mv.nbytes)
                        prefix = data_mv[:prefix_len]
                        acc = bytearray()
                        for b in (prefix.cast('B')
                                  if prefix.format != 'B' else prefix):
                            if b == 10:  # '\n'
                                break
                            acc.append(b)
                        header_line = bytes(acc)

                    new_buf, new_mv, new_header, cols, rename_map, changed = (
                        _maybe_rewrite_csv_header_buffer(
                            data_mv, header_line, delimiter))
                    if changed:
                        try:
                            buf.close()
                        except Exception:
                            pass
                        buf = new_buf  # type: ignore[assignment]
                        data_mv = new_mv
                        header_line = new_header
                        if rename_map:
                            with rename_aggregate_lock:
                                rename_aggregate.update(rename_map)

                    if ref_cols["value"] is None:
                        ref_cols["value"] = cols
                    elif cols != ref_cols["value"]:
                        raise ValueError(
                            "CSV header mismatch across files. "
                            f"Expected: {ref_cols['value']}; mismatched file: "
                            f"{fpath} has: {cols}")
                    if not _csv_has_data_rows(data_mv):
                        raise ValueError(
                            f"CSV file has zero data rows: {fpath}")
                    part_name = f"{name}.csv/part_{idx:04d}.csv"
                tval = time.perf_counter() - tv0

                size_bytes = data_mv.nbytes
                num_parts = max(1, math.ceil(size_bytes / st.part_size))
                upload_res = _start_table_upload_with_parts(
                    table_name=part_name,
                    file_type=detected_type,
                    file_size_bytes=size_bytes,
                    num_parts=num_parts,
                )
                try:
                    urls = [
                        u for k, u in sorted(
                            upload_res.presigned_part_urls.items(),
                            key=lambda kv: int(kv[0]))
                    ]
                except Exception:
                    urls = list(upload_res.presigned_part_urls.values())

                loop_inner = _KUMO_EVENT_LOOP
                part_metadata_list_fut = asyncio.run_coroutine_threadsafe(
                    multi_put_bounded(
                        urls=urls,
                        data_iter=_iter_mv_chunks(data_mv, st.part_size),
                        tqdm_bar_position=3,
                        concurrency=max(1, min(st.part_conc, len(urls))),
                        upload_progress_cb=lambda n: _safe_bar_update(
                            uploaded_bar, n),
                        upload_subchunk_bytes=UPLOAD_CHUNK_BYTES,
                    ),
                    loop_inner,
                )
                part_metadata_list = part_metadata_list_fut.result()

                for i in range(5):
                    try:
                        _complete_table_upload(
                            table_name=part_name,
                            file_type=detected_type,
                            upload_path=upload_res.temp_upload_path,
                            upload_id=upload_res.upload_id,
                            parts_metadata=part_metadata_list,
                        )
                    except HTTPException as e:
                        if e.status_code == 500 and i < 4:
                            time.sleep(2**(i - 1))
                            continue
                        else:
                            raise
                    else:
                        break

                try:
                    if buf:
                        buf.close()
                except Exception:
                    pass
                del buf, data_mv, header_line
                gc.collect()

                _safe_bar_update(file_bar, 1)
                _merge_status_update(fpath)
                _log_file_timing("dir-file(multipart)", fpath, fsize, tread,
                                 tval, 0.0)

            finally:
                mem_budget.release(need_bytes)

        indexed = list(enumerate(files, start=1))
        first_ex = None
        with ThreadPoolExecutor(max_workers=par) as ex:
            futures = {
                ex.submit(_worker, idx, fmeta): (idx, fmeta["path"])
                for idx, fmeta in indexed
            }
            for fut in as_completed(futures):
                try:
                    fut.result()
                except Exception as e:
                    first_ex = e
                    for f2 in futures:
                        f2.cancel()
                    break
        if first_ex:
            raise first_ex

    # after bars close, log any header renames once
    if detected_type == "csv" and rename_aggregate:
        pairs = ", ".join(f"{k}->{v}" for k, v in rename_aggregate.items())
        logger.info("CSV header sanitized (renamed): %s", pairs)

    logger.info("Upload complete. Validated table %s.", name)

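Each directory worker above reserves memory from the shared MemoryBudget before reading its file: twice the file size plus the configured overhead for CSV (the header rewrite can briefly hold two copies of the data), and the file size plus overhead for Parquet. A worked example with the default overhead (illustrative only):

overhead = 128 * 1024**2   # default KUMO_UPLOAD_OVERHEAD_MB
fsize = 200 * 1024**2      # a 200 MiB CSV part
assert 2 * fsize + overhead == 528 * 1024**2  # bytes reserved while in flight
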
def _upload_table_remote(
    name: str,
    path: str,
    auto_partition: bool = True,
    partition_size_mb: int = 250,
    parallelism: Optional[int] = None,
    file_type: Optional[str] = None,
) -> None:
    """Dispatch remote upload to file or directory paths."""
    fs, url = _get_fs_and_path(path)
    info = _remote_info(fs, url)
    st = _make_remote_settings(parallelism)

    if info.get("type") == "file":
        return _remote_upload_file(name, fs, url, info, st, file_type)
    if info.get("type") == "directory":
        return _remote_upload_directory(name, fs, url, info, st, file_type)
    raise ValueError(f"Unsupported remote object type for {path}: {info}")

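The dispatcher routes to the single-file or directory path based on what the filesystem reports for the URL. A hypothetical call, for illustration only (the bucket, prefix, and table name below are made up):

_upload_table_remote(
    name="users",
    path="s3://example-bucket/tables/users/",  # hypothetical location
    parallelism=4,
    file_type="parquet",
)
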
# -----------------------
# Column name validator
# -----------------------
def _validate_columns_or_raise(names: List[str]) -> None:
    # Ensure sanitized form equals original to enforce our header rules (for
    # parquet), but don't modify parquet; for CSV we already sanitize header
    # proactively.
    new, changed = _sanitize_columns(names)
    if changed:
        diffs = [f"{o}->{n}" for o, n in zip(names, new) if o != n]
        raise ValueError(
            "Column names contain invalid characters or duplicates. "
            "Please rename the following columns:\n " + ", ".join(diffs))
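A brief sketch of the contract (illustrative only; the exact renaming rules live in _sanitize_columns earlier in this module): names that _sanitize_columns would leave unchanged pass silently, while anything it would rename raises ValueError listing the offending columns.

_validate_columns_or_raise(["user_id", "event_ts"])  # passes if no rename is needed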