waveflowdb-client 1.0.2__tar.gz → 1.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {waveflowdb_client-1.0.2/waveflowdb_client.egg-info → waveflowdb_client-1.0.4}/PKG-INFO +1 -1
- {waveflowdb_client-1.0.2 → waveflowdb_client-1.0.4}/pyproject.toml +1 -1
- {waveflowdb_client-1.0.2 → waveflowdb_client-1.0.4}/waveflowdb_client/client.py +61 -54
- {waveflowdb_client-1.0.2 → waveflowdb_client-1.0.4}/waveflowdb_client/models.py +3 -3
- {waveflowdb_client-1.0.2 → waveflowdb_client-1.0.4}/waveflowdb_client/utils.py +12 -2
- {waveflowdb_client-1.0.2 → waveflowdb_client-1.0.4/waveflowdb_client.egg-info}/PKG-INFO +1 -1
- {waveflowdb_client-1.0.2 → waveflowdb_client-1.0.4}/LICENSE +0 -0
- {waveflowdb_client-1.0.2 → waveflowdb_client-1.0.4}/readme.md +0 -0
- {waveflowdb_client-1.0.2 → waveflowdb_client-1.0.4}/setup.cfg +0 -0
- {waveflowdb_client-1.0.2 → waveflowdb_client-1.0.4}/waveflowdb_client/__init__.py +0 -0
- {waveflowdb_client-1.0.2 → waveflowdb_client-1.0.4}/waveflowdb_client/config.py +0 -0
- {waveflowdb_client-1.0.2 → waveflowdb_client-1.0.4}/waveflowdb_client/exceptions.py +0 -0
- {waveflowdb_client-1.0.2 → waveflowdb_client-1.0.4}/waveflowdb_client.egg-info/SOURCES.txt +0 -0
- {waveflowdb_client-1.0.2 → waveflowdb_client-1.0.4}/waveflowdb_client.egg-info/dependency_links.txt +0 -0
- {waveflowdb_client-1.0.2 → waveflowdb_client-1.0.4}/waveflowdb_client.egg-info/requires.txt +0 -0
- {waveflowdb_client-1.0.2 → waveflowdb_client-1.0.4}/waveflowdb_client.egg-info/top_level.txt +0 -0
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "waveflowdb_client" # pip install name
|
|
7
|
-
version = "1.0.2"
|
|
7
|
+
version = "1.0.4"
|
|
8
8
|
description = "VectorLake SDK — Deterministic backend engine powering agent workflows"
|
|
9
9
|
readme = "readme.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -87,6 +87,24 @@ def _divider(label: str = "", width: int = 60) -> str:
|
|
|
87
87
|
return f"\n{'─' * width}\n {label}\n{'─' * width}" if label else f"{'─' * width}"
|
|
88
88
|
|
|
89
89
|
|
|
90
|
+
def _source_ext(chunk_name: str, base_path: str) -> str:
|
|
91
|
+
"""
|
|
92
|
+
Recover the original source file extension from a chunk filename.
|
|
93
|
+
|
|
94
|
+
Parses the chunk stem via VersionedChunkName, then looks for the source
|
|
95
|
+
file in base_path to get its real extension. Falls back to "txt" if the
|
|
96
|
+
source file is not found (e.g. direct-upload mode).
|
|
97
|
+
"""
|
|
98
|
+
parsed = VersionedChunkName.parse(chunk_name)
|
|
99
|
+
if parsed:
|
|
100
|
+
candidates = list(Path(base_path).glob(f"{parsed.stem}.*"))
|
|
101
|
+
# Exclude other chunk files in the same dir
|
|
102
|
+
candidates = [c for c in candidates if not VersionedChunkName.is_versioned(c.name)]
|
|
103
|
+
if candidates:
|
|
104
|
+
return candidates[0].suffix.lstrip(".").lower()
|
|
105
|
+
return Path(chunk_name).suffix.lstrip(".").lower() or "txt"
|
|
106
|
+
|
|
107
|
+
|
|
90
108
|
class VectorLakeClient:
|
|
91
109
|
"""
|
|
92
110
|
Main entry point for the VectorLake SDK.
|
|
@@ -128,7 +146,6 @@ class VectorLakeClient:
|
|
|
128
146
|
headers = self._headers()
|
|
129
147
|
request_kb = len(json.dumps(payload).encode()) / 1024
|
|
130
148
|
logger.debug("Batch %d — payload %.1f KB", batch_num, request_kb)
|
|
131
|
-
print("making request")
|
|
132
149
|
last_error: Optional[APIError] = None
|
|
133
150
|
|
|
134
151
|
for attempt in range(self.config.max_retries):
|
|
@@ -230,7 +247,7 @@ class VectorLakeClient:
|
|
|
230
247
|
user_id: str,
|
|
231
248
|
vector_lake_description: str,
|
|
232
249
|
start_from_batch: int = 1,
|
|
233
|
-
end_batch: Optional[int] = None,
|
|
250
|
+
end_batch: Optional[int] = None,
|
|
234
251
|
intelligent_segmentation: bool = True,
|
|
235
252
|
session_id: Optional[str] = None,
|
|
236
253
|
files: Optional[List[str]] = None,
|
|
@@ -289,7 +306,7 @@ class VectorLakeClient:
|
|
|
289
306
|
|
|
290
307
|
active = [
|
|
291
308
|
(i, b) for i, b in enumerate(batches)
|
|
292
|
-
if start_from_batch <= (i + 1) <= resolved_end
|
|
309
|
+
if start_from_batch <= (i + 1) <= resolved_end
|
|
293
310
|
]
|
|
294
311
|
initial = start_from_batch - 1
|
|
295
312
|
|
|
@@ -305,19 +322,22 @@ class VectorLakeClient:
|
|
|
305
322
|
t0 = time.time()
|
|
306
323
|
|
|
307
324
|
try:
|
|
308
|
-
# ── per-batch START marker ─────────────────────────────
|
|
309
325
|
logger.info(
|
|
310
326
|
"[BATCH START] op=%s batch=%d/%d files=%s",
|
|
311
327
|
operation, batch_num, display_total, batch,
|
|
312
328
|
)
|
|
313
|
-
# ──────────────────────────────────────────────────────
|
|
314
329
|
|
|
315
330
|
file_contents = []
|
|
331
|
+
files_meta = []
|
|
316
332
|
for fname in batch:
|
|
317
333
|
chunk_path = os.path.join(chunks_dir, fname)
|
|
318
334
|
if not os.path.exists(chunk_path):
|
|
319
335
|
chunk_path = os.path.join(base_path, fname)
|
|
320
336
|
file_contents.append(self._file_processor.read_content(chunk_path))
|
|
337
|
+
files_meta.append({
|
|
338
|
+
"filename": fname,
|
|
339
|
+
"extension": _source_ext(fname, base_path),
|
|
340
|
+
})
|
|
321
341
|
|
|
322
342
|
payload = {
|
|
323
343
|
"session_id": session_id,
|
|
@@ -325,9 +345,9 @@ class VectorLakeClient:
|
|
|
325
345
|
"vector_lake_description": vector_lake_description,
|
|
326
346
|
"files_name": batch,
|
|
327
347
|
"files_data": file_contents,
|
|
348
|
+
"files_meta": files_meta,
|
|
328
349
|
"intelligent_segmentation": intelligent_segmentation,
|
|
329
350
|
}
|
|
330
|
-
|
|
331
351
|
request_start_ts = time.time()
|
|
332
352
|
result = self._make_request(endpoint, payload, operation, batch_num)
|
|
333
353
|
elapsed_ms = round((time.time() - request_start_ts) * 1000, 1)
|
|
@@ -345,12 +365,10 @@ class VectorLakeClient:
|
|
|
345
365
|
batch_num, result.get("message", result.get("error")),
|
|
346
366
|
)
|
|
347
367
|
|
|
348
|
-
# ── per-batch END marker ───────────────────────────────
|
|
349
368
|
logger.info(
|
|
350
369
|
"[BATCH END] op=%s batch=%d/%d success=%s elapsed_ms=%.1f files=%s",
|
|
351
370
|
operation, batch_num, display_total, succeeded, elapsed_ms, batch,
|
|
352
371
|
)
|
|
353
|
-
# ──────────────────────────────────────────────────────
|
|
354
372
|
|
|
355
373
|
batch_outputs.append({
|
|
356
374
|
"batch_number": batch_num,
|
|
@@ -440,15 +458,13 @@ class VectorLakeClient:
|
|
|
440
458
|
chunks_dir: str,
|
|
441
459
|
) -> SyncPlan:
|
|
442
460
|
"""
|
|
443
|
-
Classify each source file by
|
|
444
|
-
its existing chunk(s) in chunks_dir.
|
|
461
|
+
Classify each source file by checking whether chunks already exist on disk.
|
|
445
462
|
|
|
446
463
|
Rules
|
|
447
464
|
─────
|
|
448
|
-
No chunk exists on disk
|
|
449
|
-
Chunk
|
|
450
|
-
|
|
451
|
-
Source file missing from disk → UNKNOWN → add_docs (safe default)
|
|
465
|
+
No chunk exists on disk → NEW → add_docs
|
|
466
|
+
Chunk(s) exist on disk → CHANGED → refresh_docs
|
|
467
|
+
Source file missing → UNKNOWN → add_docs (safe default)
|
|
452
468
|
"""
|
|
453
469
|
to_add, to_refresh, to_skip = [], [], []
|
|
454
470
|
classifications = []
|
|
@@ -460,50 +476,35 @@ class VectorLakeClient:
|
|
|
460
476
|
bar.set_postfix_str(fname[:40])
|
|
461
477
|
|
|
462
478
|
if not src.exists():
|
|
463
|
-
|
|
479
|
+
classifications.append(FileClassification(
|
|
464
480
|
filename=fname,
|
|
465
481
|
status=FileStatus.UNKNOWN,
|
|
466
482
|
endpoint="add_docs",
|
|
467
483
|
reason="Source file not on disk — defaulting to add_docs",
|
|
468
|
-
)
|
|
484
|
+
))
|
|
469
485
|
to_add.append(fname)
|
|
470
|
-
classifications.append(c)
|
|
471
486
|
continue
|
|
472
487
|
|
|
473
|
-
|
|
474
|
-
|
|
488
|
+
existing_chunks = sorted(
|
|
489
|
+
Path(chunks_dir).glob(f"{src.stem}_part*.txt")
|
|
490
|
+
)
|
|
475
491
|
|
|
476
492
|
if not existing_chunks:
|
|
477
|
-
|
|
493
|
+
classifications.append(FileClassification(
|
|
478
494
|
filename=fname,
|
|
479
495
|
status=FileStatus.NEW,
|
|
480
496
|
endpoint="add_docs",
|
|
481
497
|
reason="No chunks on disk — first upload",
|
|
482
|
-
)
|
|
498
|
+
))
|
|
483
499
|
to_add.append(fname)
|
|
484
500
|
else:
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
status=FileStatus.UNCHANGED,
|
|
493
|
-
endpoint="skip",
|
|
494
|
-
reason="Content matches existing chunks — skipping",
|
|
495
|
-
)
|
|
496
|
-
to_skip.append(fname)
|
|
497
|
-
else:
|
|
498
|
-
c = FileClassification(
|
|
499
|
-
filename=fname,
|
|
500
|
-
status=FileStatus.CHANGED,
|
|
501
|
-
endpoint="refresh_docs",
|
|
502
|
-
reason="Content differs from existing chunks — refresh required",
|
|
503
|
-
)
|
|
504
|
-
to_refresh.append(fname)
|
|
505
|
-
|
|
506
|
-
classifications.append(c)
|
|
501
|
+
classifications.append(FileClassification(
|
|
502
|
+
filename=fname,
|
|
503
|
+
status=FileStatus.CHANGED,
|
|
504
|
+
endpoint="refresh_docs",
|
|
505
|
+
reason=f"Chunks exist on disk ({len(existing_chunks)}) — routing to refresh",
|
|
506
|
+
))
|
|
507
|
+
to_refresh.append(fname)
|
|
507
508
|
|
|
508
509
|
print(f"\n NEW={len(to_add)} CHANGED={len(to_refresh)} UNCHANGED={len(to_skip)}")
|
|
509
510
|
print(_divider())
|
|
@@ -522,7 +523,7 @@ class VectorLakeClient:
|
|
|
522
523
|
user_id: str,
|
|
523
524
|
vector_lake_description: str,
|
|
524
525
|
start_from_batch: int = 1,
|
|
525
|
-
end_batch: Optional[int] = None,
|
|
526
|
+
end_batch: Optional[int] = None,
|
|
526
527
|
intelligent_segmentation: bool = True,
|
|
527
528
|
session_id: Optional[str] = None,
|
|
528
529
|
files: Optional[List[str]] = None,
|
|
@@ -556,7 +557,7 @@ class VectorLakeClient:
|
|
|
556
557
|
grand_total = self._compute_grand_total(self.config.vector_lake_path)
|
|
557
558
|
return self._process_files_in_batches(
|
|
558
559
|
"add_docs", user_id, vector_lake_description,
|
|
559
|
-
start_from_batch, end_batch,
|
|
560
|
+
start_from_batch, end_batch,
|
|
560
561
|
intelligent_segmentation, session_id, files,
|
|
561
562
|
grand_total_batches=grand_total,
|
|
562
563
|
)
|
|
@@ -570,7 +571,7 @@ class VectorLakeClient:
|
|
|
570
571
|
user_id: str,
|
|
571
572
|
vector_lake_description: str,
|
|
572
573
|
start_from_batch: int = 1,
|
|
573
|
-
end_batch: Optional[int] = None,
|
|
574
|
+
end_batch: Optional[int] = None,
|
|
574
575
|
intelligent_segmentation: bool = True,
|
|
575
576
|
session_id: Optional[str] = None,
|
|
576
577
|
files: Optional[List[str]] = None,
|
|
@@ -604,7 +605,7 @@ class VectorLakeClient:
|
|
|
604
605
|
grand_total = self._compute_grand_total(self.config.vector_lake_path)
|
|
605
606
|
return self._process_files_in_batches(
|
|
606
607
|
"refresh_docs", user_id, vector_lake_description,
|
|
607
|
-
start_from_batch, end_batch,
|
|
608
|
+
start_from_batch, end_batch,
|
|
608
609
|
intelligent_segmentation, session_id, files,
|
|
609
610
|
grand_total_batches=grand_total,
|
|
610
611
|
)
|
|
@@ -618,7 +619,7 @@ class VectorLakeClient:
|
|
|
618
619
|
user_id: str,
|
|
619
620
|
vector_lake_description: str,
|
|
620
621
|
start_from_batch: int = 1,
|
|
621
|
-
end_batch: Optional[int] = None,
|
|
622
|
+
end_batch: Optional[int] = None,
|
|
622
623
|
intelligent_segmentation: bool = True,
|
|
623
624
|
session_id: Optional[str] = None,
|
|
624
625
|
files: Optional[List[str]] = None,
|
|
@@ -629,9 +630,8 @@ class VectorLakeClient:
|
|
|
629
630
|
|
|
630
631
|
Classification (based on chunks already on disk)
|
|
631
632
|
────────────────────────────────────────────────
|
|
632
|
-
No chunk on disk → NEW
|
|
633
|
-
Chunk
|
|
634
|
-
Chunk exists, differs→ CHANGED → refresh_docs
|
|
633
|
+
No chunk on disk → NEW → add_docs
|
|
634
|
+
Chunk(s) exist → CHANGED → refresh_docs
|
|
635
635
|
|
|
636
636
|
dry_run=True returns the classification plan without uploading anything.
|
|
637
637
|
"""
|
|
@@ -684,7 +684,7 @@ class VectorLakeClient:
|
|
|
684
684
|
if plan.to_add:
|
|
685
685
|
add_result = self._process_files_in_batches(
|
|
686
686
|
"add_docs", user_id, vector_lake_description,
|
|
687
|
-
start_from_batch, end_batch,
|
|
687
|
+
start_from_batch, end_batch,
|
|
688
688
|
intelligent_segmentation, session_id,
|
|
689
689
|
files=plan.to_add,
|
|
690
690
|
grand_total_batches=grand_total,
|
|
@@ -693,7 +693,7 @@ class VectorLakeClient:
|
|
|
693
693
|
if plan.to_refresh:
|
|
694
694
|
refresh_result = self._process_files_in_batches(
|
|
695
695
|
"refresh_docs", user_id, vector_lake_description,
|
|
696
|
-
start_from_batch, end_batch,
|
|
696
|
+
start_from_batch, end_batch,
|
|
697
697
|
intelligent_segmentation, session_id,
|
|
698
698
|
files=plan.to_refresh,
|
|
699
699
|
grand_total_batches=grand_total,
|
|
@@ -755,12 +755,19 @@ class VectorLakeClient:
|
|
|
755
755
|
"files_name and files_data must have the same length"
|
|
756
756
|
).to_response()
|
|
757
757
|
|
|
758
|
+
basenames = [os.path.basename(n) for n in files_name]
|
|
759
|
+
files_meta = [
|
|
760
|
+
{"filename": b, "extension": Path(b).suffix.lstrip(".").lower()}
|
|
761
|
+
for b in basenames
|
|
762
|
+
]
|
|
763
|
+
|
|
758
764
|
payload = {
|
|
759
765
|
"session_id": session_id,
|
|
760
766
|
"user_id": user_id,
|
|
761
767
|
"vector_lake_description": vector_lake_description,
|
|
762
|
-
"files_name":
|
|
768
|
+
"files_name": basenames,
|
|
763
769
|
"files_data": files_data,
|
|
770
|
+
"files_meta": files_meta,
|
|
764
771
|
"intelligent_segmentation": intelligent_segmentation,
|
|
765
772
|
}
|
|
766
773
|
return self._make_request(
|
|
@@ -68,9 +68,9 @@ class FileStatus(Enum):
|
|
|
68
68
|
Classified intent for a single source file before upload.
|
|
69
69
|
|
|
70
70
|
NEW → never uploaded; use add_docs
|
|
71
|
-
CHANGED → uploaded before
|
|
72
|
-
UNCHANGED →
|
|
73
|
-
UNKNOWN →
|
|
71
|
+
CHANGED → uploaded before, chunks exist on disk; use refresh_docs
|
|
72
|
+
UNCHANGED → reserved, not currently used
|
|
73
|
+
UNKNOWN → source file missing from disk; caller decides
|
|
74
74
|
"""
|
|
75
75
|
NEW = "new"
|
|
76
76
|
CHANGED = "changed"
|
|
@@ -260,8 +260,18 @@ class FileProcessor:
|
|
|
260
260
|
raise UnsupportedFileTypeError(filename, self.allowed_extensions)
|
|
261
261
|
|
|
262
262
|
def read_content(self, filepath: str) -> str:
|
|
263
|
-
"""
|
|
263
|
+
"""
|
|
264
|
+
Read and return full text content of any supported file.
|
|
265
|
+
|
|
266
|
+
Chunks written by BatchManager are always plain .txt files regardless
|
|
267
|
+
of the original source format. Running a format-specific extractor
|
|
268
|
+
(e.g. _extract_pdf) on an already-extracted .txt chunk would either
|
|
269
|
+
fail or produce garbage, so versioned chunks are always read as plain
|
|
270
|
+
text directly.
|
|
271
|
+
"""
|
|
264
272
|
self.assert_supported(filepath)
|
|
273
|
+
if VersionedChunkName.is_versioned(filepath):
|
|
274
|
+
return _read_plain(filepath)
|
|
265
275
|
return extract_text(filepath)
|
|
266
276
|
|
|
267
277
|
|
|
@@ -274,7 +284,7 @@ class BatchManager:
|
|
|
274
284
|
Converts a list of source files into upload-ready batches.
|
|
275
285
|
|
|
276
286
|
Chunk naming follows the SDK convention:
|
|
277
|
-
{stem}
|
|
287
|
+
{stem}_part{part}.txt
|
|
278
288
|
Chunks already on disk are reused — safe to re-run after a crash.
|
|
279
289
|
"""
|
|
280
290
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{waveflowdb_client-1.0.2 → waveflowdb_client-1.0.4}/waveflowdb_client.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
{waveflowdb_client-1.0.2 → waveflowdb_client-1.0.4}/waveflowdb_client.egg-info/top_level.txt
RENAMED
|
File without changes
|