waveflowdb-client 1.0.2__tar.gz → 1.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {waveflowdb_client-1.0.2/waveflowdb_client.egg-info → waveflowdb_client-1.0.3}/PKG-INFO +1 -1
- {waveflowdb_client-1.0.2 → waveflowdb_client-1.0.3}/pyproject.toml +1 -1
- {waveflowdb_client-1.0.2 → waveflowdb_client-1.0.3}/waveflowdb_client/client.py +62 -53
- {waveflowdb_client-1.0.2 → waveflowdb_client-1.0.3}/waveflowdb_client/models.py +3 -3
- {waveflowdb_client-1.0.2 → waveflowdb_client-1.0.3}/waveflowdb_client/utils.py +12 -2
- {waveflowdb_client-1.0.2 → waveflowdb_client-1.0.3/waveflowdb_client.egg-info}/PKG-INFO +1 -1
- {waveflowdb_client-1.0.2 → waveflowdb_client-1.0.3}/LICENSE +0 -0
- {waveflowdb_client-1.0.2 → waveflowdb_client-1.0.3}/readme.md +0 -0
- {waveflowdb_client-1.0.2 → waveflowdb_client-1.0.3}/setup.cfg +0 -0
- {waveflowdb_client-1.0.2 → waveflowdb_client-1.0.3}/waveflowdb_client/__init__.py +0 -0
- {waveflowdb_client-1.0.2 → waveflowdb_client-1.0.3}/waveflowdb_client/config.py +0 -0
- {waveflowdb_client-1.0.2 → waveflowdb_client-1.0.3}/waveflowdb_client/exceptions.py +0 -0
- {waveflowdb_client-1.0.2 → waveflowdb_client-1.0.3}/waveflowdb_client.egg-info/SOURCES.txt +0 -0
- {waveflowdb_client-1.0.2 → waveflowdb_client-1.0.3}/waveflowdb_client.egg-info/dependency_links.txt +0 -0
- {waveflowdb_client-1.0.2 → waveflowdb_client-1.0.3}/waveflowdb_client.egg-info/requires.txt +0 -0
- {waveflowdb_client-1.0.2 → waveflowdb_client-1.0.3}/waveflowdb_client.egg-info/top_level.txt +0 -0
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "waveflowdb_client" # pip install name
|
|
7
|
-
version = "1.0.2"
|
|
7
|
+
version = "1.0.3"
|
|
8
8
|
description = "VectorLake SDK — Deterministic backend engine powering agent workflows"
|
|
9
9
|
readme = "readme.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -87,6 +87,24 @@ def _divider(label: str = "", width: int = 60) -> str:
|
|
|
87
87
|
return f"\n{'─' * width}\n {label}\n{'─' * width}" if label else f"{'─' * width}"
|
|
88
88
|
|
|
89
89
|
|
|
90
|
+
def _source_ext(chunk_name: str, base_path: str) -> str:
|
|
91
|
+
"""
|
|
92
|
+
Recover the original source file extension from a chunk filename.
|
|
93
|
+
|
|
94
|
+
Parses the chunk stem via VersionedChunkName, then looks for the source
|
|
95
|
+
file in base_path to get its real extension. Falls back to "txt" if the
|
|
96
|
+
source file is not found (e.g. direct-upload mode).
|
|
97
|
+
"""
|
|
98
|
+
parsed = VersionedChunkName.parse(chunk_name)
|
|
99
|
+
if parsed:
|
|
100
|
+
candidates = list(Path(base_path).glob(f"{parsed.stem}.*"))
|
|
101
|
+
# Exclude other chunk files in the same dir
|
|
102
|
+
candidates = [c for c in candidates if not VersionedChunkName.is_versioned(c.name)]
|
|
103
|
+
if candidates:
|
|
104
|
+
return candidates[0].suffix.lstrip(".").lower()
|
|
105
|
+
return Path(chunk_name).suffix.lstrip(".").lower() or "txt"
|
|
106
|
+
|
|
107
|
+
|
|
90
108
|
class VectorLakeClient:
|
|
91
109
|
"""
|
|
92
110
|
Main entry point for the VectorLake SDK.
|
|
@@ -230,7 +248,7 @@ class VectorLakeClient:
|
|
|
230
248
|
user_id: str,
|
|
231
249
|
vector_lake_description: str,
|
|
232
250
|
start_from_batch: int = 1,
|
|
233
|
-
end_batch: Optional[int] = None,
|
|
251
|
+
end_batch: Optional[int] = None,
|
|
234
252
|
intelligent_segmentation: bool = True,
|
|
235
253
|
session_id: Optional[str] = None,
|
|
236
254
|
files: Optional[List[str]] = None,
|
|
@@ -289,7 +307,7 @@ class VectorLakeClient:
|
|
|
289
307
|
|
|
290
308
|
active = [
|
|
291
309
|
(i, b) for i, b in enumerate(batches)
|
|
292
|
-
if start_from_batch <= (i + 1) <= resolved_end
|
|
310
|
+
if start_from_batch <= (i + 1) <= resolved_end
|
|
293
311
|
]
|
|
294
312
|
initial = start_from_batch - 1
|
|
295
313
|
|
|
@@ -305,19 +323,22 @@ class VectorLakeClient:
|
|
|
305
323
|
t0 = time.time()
|
|
306
324
|
|
|
307
325
|
try:
|
|
308
|
-
# ── per-batch START marker ─────────────────────────────
|
|
309
326
|
logger.info(
|
|
310
327
|
"[BATCH START] op=%s batch=%d/%d files=%s",
|
|
311
328
|
operation, batch_num, display_total, batch,
|
|
312
329
|
)
|
|
313
|
-
# ──────────────────────────────────────────────────────
|
|
314
330
|
|
|
315
331
|
file_contents = []
|
|
332
|
+
files_meta = []
|
|
316
333
|
for fname in batch:
|
|
317
334
|
chunk_path = os.path.join(chunks_dir, fname)
|
|
318
335
|
if not os.path.exists(chunk_path):
|
|
319
336
|
chunk_path = os.path.join(base_path, fname)
|
|
320
337
|
file_contents.append(self._file_processor.read_content(chunk_path))
|
|
338
|
+
files_meta.append({
|
|
339
|
+
"filename": fname,
|
|
340
|
+
"extension": _source_ext(fname, base_path),
|
|
341
|
+
})
|
|
321
342
|
|
|
322
343
|
payload = {
|
|
323
344
|
"session_id": session_id,
|
|
@@ -325,9 +346,10 @@ class VectorLakeClient:
|
|
|
325
346
|
"vector_lake_description": vector_lake_description,
|
|
326
347
|
"files_name": batch,
|
|
327
348
|
"files_data": file_contents,
|
|
349
|
+
"files_meta": files_meta,
|
|
328
350
|
"intelligent_segmentation": intelligent_segmentation,
|
|
329
351
|
}
|
|
330
|
-
|
|
352
|
+
print("payload",payload)
|
|
331
353
|
request_start_ts = time.time()
|
|
332
354
|
result = self._make_request(endpoint, payload, operation, batch_num)
|
|
333
355
|
elapsed_ms = round((time.time() - request_start_ts) * 1000, 1)
|
|
@@ -345,12 +367,10 @@ class VectorLakeClient:
|
|
|
345
367
|
batch_num, result.get("message", result.get("error")),
|
|
346
368
|
)
|
|
347
369
|
|
|
348
|
-
# ── per-batch END marker ───────────────────────────────
|
|
349
370
|
logger.info(
|
|
350
371
|
"[BATCH END] op=%s batch=%d/%d success=%s elapsed_ms=%.1f files=%s",
|
|
351
372
|
operation, batch_num, display_total, succeeded, elapsed_ms, batch,
|
|
352
373
|
)
|
|
353
|
-
# ──────────────────────────────────────────────────────
|
|
354
374
|
|
|
355
375
|
batch_outputs.append({
|
|
356
376
|
"batch_number": batch_num,
|
|
@@ -440,15 +460,13 @@ class VectorLakeClient:
|
|
|
440
460
|
chunks_dir: str,
|
|
441
461
|
) -> SyncPlan:
|
|
442
462
|
"""
|
|
443
|
-
Classify each source file by
|
|
444
|
-
its existing chunk(s) in chunks_dir.
|
|
463
|
+
Classify each source file by checking whether chunks already exist on disk.
|
|
445
464
|
|
|
446
465
|
Rules
|
|
447
466
|
─────
|
|
448
|
-
No chunk exists on disk
|
|
449
|
-
Chunk
|
|
450
|
-
|
|
451
|
-
Source file missing from disk → UNKNOWN → add_docs (safe default)
|
|
467
|
+
No chunk exists on disk → NEW → add_docs
|
|
468
|
+
Chunk(s) exist on disk → CHANGED → refresh_docs
|
|
469
|
+
Source file missing → UNKNOWN → add_docs (safe default)
|
|
452
470
|
"""
|
|
453
471
|
to_add, to_refresh, to_skip = [], [], []
|
|
454
472
|
classifications = []
|
|
@@ -460,50 +478,35 @@ class VectorLakeClient:
|
|
|
460
478
|
bar.set_postfix_str(fname[:40])
|
|
461
479
|
|
|
462
480
|
if not src.exists():
|
|
463
|
-
|
|
481
|
+
classifications.append(FileClassification(
|
|
464
482
|
filename=fname,
|
|
465
483
|
status=FileStatus.UNKNOWN,
|
|
466
484
|
endpoint="add_docs",
|
|
467
485
|
reason="Source file not on disk — defaulting to add_docs",
|
|
468
|
-
)
|
|
486
|
+
))
|
|
469
487
|
to_add.append(fname)
|
|
470
|
-
classifications.append(c)
|
|
471
488
|
continue
|
|
472
489
|
|
|
473
|
-
|
|
474
|
-
|
|
490
|
+
existing_chunks = sorted(
|
|
491
|
+
Path(chunks_dir).glob(f"{src.stem}_part*.txt")
|
|
492
|
+
)
|
|
475
493
|
|
|
476
494
|
if not existing_chunks:
|
|
477
|
-
|
|
495
|
+
classifications.append(FileClassification(
|
|
478
496
|
filename=fname,
|
|
479
497
|
status=FileStatus.NEW,
|
|
480
498
|
endpoint="add_docs",
|
|
481
499
|
reason="No chunks on disk — first upload",
|
|
482
|
-
)
|
|
500
|
+
))
|
|
483
501
|
to_add.append(fname)
|
|
484
502
|
else:
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
status=FileStatus.UNCHANGED,
|
|
493
|
-
endpoint="skip",
|
|
494
|
-
reason="Content matches existing chunks — skipping",
|
|
495
|
-
)
|
|
496
|
-
to_skip.append(fname)
|
|
497
|
-
else:
|
|
498
|
-
c = FileClassification(
|
|
499
|
-
filename=fname,
|
|
500
|
-
status=FileStatus.CHANGED,
|
|
501
|
-
endpoint="refresh_docs",
|
|
502
|
-
reason="Content differs from existing chunks — refresh required",
|
|
503
|
-
)
|
|
504
|
-
to_refresh.append(fname)
|
|
505
|
-
|
|
506
|
-
classifications.append(c)
|
|
503
|
+
classifications.append(FileClassification(
|
|
504
|
+
filename=fname,
|
|
505
|
+
status=FileStatus.CHANGED,
|
|
506
|
+
endpoint="refresh_docs",
|
|
507
|
+
reason=f"Chunks exist on disk ({len(existing_chunks)}) — routing to refresh",
|
|
508
|
+
))
|
|
509
|
+
to_refresh.append(fname)
|
|
507
510
|
|
|
508
511
|
print(f"\n NEW={len(to_add)} CHANGED={len(to_refresh)} UNCHANGED={len(to_skip)}")
|
|
509
512
|
print(_divider())
|
|
@@ -522,7 +525,7 @@ class VectorLakeClient:
|
|
|
522
525
|
user_id: str,
|
|
523
526
|
vector_lake_description: str,
|
|
524
527
|
start_from_batch: int = 1,
|
|
525
|
-
end_batch: Optional[int] = None,
|
|
528
|
+
end_batch: Optional[int] = None,
|
|
526
529
|
intelligent_segmentation: bool = True,
|
|
527
530
|
session_id: Optional[str] = None,
|
|
528
531
|
files: Optional[List[str]] = None,
|
|
@@ -556,7 +559,7 @@ class VectorLakeClient:
|
|
|
556
559
|
grand_total = self._compute_grand_total(self.config.vector_lake_path)
|
|
557
560
|
return self._process_files_in_batches(
|
|
558
561
|
"add_docs", user_id, vector_lake_description,
|
|
559
|
-
start_from_batch, end_batch,
|
|
562
|
+
start_from_batch, end_batch,
|
|
560
563
|
intelligent_segmentation, session_id, files,
|
|
561
564
|
grand_total_batches=grand_total,
|
|
562
565
|
)
|
|
@@ -570,7 +573,7 @@ class VectorLakeClient:
|
|
|
570
573
|
user_id: str,
|
|
571
574
|
vector_lake_description: str,
|
|
572
575
|
start_from_batch: int = 1,
|
|
573
|
-
end_batch: Optional[int] = None,
|
|
576
|
+
end_batch: Optional[int] = None,
|
|
574
577
|
intelligent_segmentation: bool = True,
|
|
575
578
|
session_id: Optional[str] = None,
|
|
576
579
|
files: Optional[List[str]] = None,
|
|
@@ -604,7 +607,7 @@ class VectorLakeClient:
|
|
|
604
607
|
grand_total = self._compute_grand_total(self.config.vector_lake_path)
|
|
605
608
|
return self._process_files_in_batches(
|
|
606
609
|
"refresh_docs", user_id, vector_lake_description,
|
|
607
|
-
start_from_batch, end_batch,
|
|
610
|
+
start_from_batch, end_batch,
|
|
608
611
|
intelligent_segmentation, session_id, files,
|
|
609
612
|
grand_total_batches=grand_total,
|
|
610
613
|
)
|
|
@@ -618,7 +621,7 @@ class VectorLakeClient:
|
|
|
618
621
|
user_id: str,
|
|
619
622
|
vector_lake_description: str,
|
|
620
623
|
start_from_batch: int = 1,
|
|
621
|
-
end_batch: Optional[int] = None,
|
|
624
|
+
end_batch: Optional[int] = None,
|
|
622
625
|
intelligent_segmentation: bool = True,
|
|
623
626
|
session_id: Optional[str] = None,
|
|
624
627
|
files: Optional[List[str]] = None,
|
|
@@ -629,9 +632,8 @@ class VectorLakeClient:
|
|
|
629
632
|
|
|
630
633
|
Classification (based on chunks already on disk)
|
|
631
634
|
────────────────────────────────────────────────
|
|
632
|
-
No chunk on disk → NEW
|
|
633
|
-
Chunk
|
|
634
|
-
Chunk exists, differs→ CHANGED → refresh_docs
|
|
635
|
+
No chunk on disk → NEW → add_docs
|
|
636
|
+
Chunk(s) exist → CHANGED → refresh_docs
|
|
635
637
|
|
|
636
638
|
dry_run=True returns the classification plan without uploading anything.
|
|
637
639
|
"""
|
|
@@ -684,7 +686,7 @@ class VectorLakeClient:
|
|
|
684
686
|
if plan.to_add:
|
|
685
687
|
add_result = self._process_files_in_batches(
|
|
686
688
|
"add_docs", user_id, vector_lake_description,
|
|
687
|
-
start_from_batch, end_batch,
|
|
689
|
+
start_from_batch, end_batch,
|
|
688
690
|
intelligent_segmentation, session_id,
|
|
689
691
|
files=plan.to_add,
|
|
690
692
|
grand_total_batches=grand_total,
|
|
@@ -693,7 +695,7 @@ class VectorLakeClient:
|
|
|
693
695
|
if plan.to_refresh:
|
|
694
696
|
refresh_result = self._process_files_in_batches(
|
|
695
697
|
"refresh_docs", user_id, vector_lake_description,
|
|
696
|
-
start_from_batch, end_batch,
|
|
698
|
+
start_from_batch, end_batch,
|
|
697
699
|
intelligent_segmentation, session_id,
|
|
698
700
|
files=plan.to_refresh,
|
|
699
701
|
grand_total_batches=grand_total,
|
|
@@ -755,12 +757,19 @@ class VectorLakeClient:
|
|
|
755
757
|
"files_name and files_data must have the same length"
|
|
756
758
|
).to_response()
|
|
757
759
|
|
|
760
|
+
basenames = [os.path.basename(n) for n in files_name]
|
|
761
|
+
files_meta = [
|
|
762
|
+
{"filename": b, "extension": Path(b).suffix.lstrip(".").lower()}
|
|
763
|
+
for b in basenames
|
|
764
|
+
]
|
|
765
|
+
|
|
758
766
|
payload = {
|
|
759
767
|
"session_id": session_id,
|
|
760
768
|
"user_id": user_id,
|
|
761
769
|
"vector_lake_description": vector_lake_description,
|
|
762
|
-
"files_name": files_name,
|
|
770
|
+
"files_name": basenames,
|
|
763
771
|
"files_data": files_data,
|
|
772
|
+
"files_meta": files_meta,
|
|
764
773
|
"intelligent_segmentation": intelligent_segmentation,
|
|
765
774
|
}
|
|
766
775
|
return self._make_request(
|
|
@@ -68,9 +68,9 @@ class FileStatus(Enum):
|
|
|
68
68
|
Classified intent for a single source file before upload.
|
|
69
69
|
|
|
70
70
|
NEW → never uploaded; use add_docs
|
|
71
|
-
CHANGED → uploaded before
|
|
72
|
-
UNCHANGED →
|
|
73
|
-
UNKNOWN →
|
|
71
|
+
CHANGED → uploaded before, chunks exist on disk; use refresh_docs
|
|
72
|
+
UNCHANGED → reserved, not currently used
|
|
73
|
+
UNKNOWN → source file missing from disk; caller decides
|
|
74
74
|
"""
|
|
75
75
|
NEW = "new"
|
|
76
76
|
CHANGED = "changed"
|
|
@@ -260,8 +260,18 @@ class FileProcessor:
|
|
|
260
260
|
raise UnsupportedFileTypeError(filename, self.allowed_extensions)
|
|
261
261
|
|
|
262
262
|
def read_content(self, filepath: str) -> str:
|
|
263
|
-
"""
|
|
263
|
+
"""
|
|
264
|
+
Read and return full text content of any supported file.
|
|
265
|
+
|
|
266
|
+
Chunks written by BatchManager are always plain .txt files regardless
|
|
267
|
+
of the original source format. Running a format-specific extractor
|
|
268
|
+
(e.g. _extract_pdf) on an already-extracted .txt chunk would either
|
|
269
|
+
fail or produce garbage, so versioned chunks are always read as plain
|
|
270
|
+
text directly.
|
|
271
|
+
"""
|
|
264
272
|
self.assert_supported(filepath)
|
|
273
|
+
if VersionedChunkName.is_versioned(filepath):
|
|
274
|
+
return _read_plain(filepath)
|
|
265
275
|
return extract_text(filepath)
|
|
266
276
|
|
|
267
277
|
|
|
@@ -274,7 +284,7 @@ class BatchManager:
|
|
|
274
284
|
Converts a list of source files into upload-ready batches.
|
|
275
285
|
|
|
276
286
|
Chunk naming follows the SDK convention:
|
|
277
|
-
{stem}
|
|
287
|
+
{stem}_part{part}.txt
|
|
278
288
|
Chunks already on disk are reused — safe to re-run after a crash.
|
|
279
289
|
"""
|
|
280
290
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{waveflowdb_client-1.0.2 → waveflowdb_client-1.0.3}/waveflowdb_client.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
{waveflowdb_client-1.0.2 → waveflowdb_client-1.0.3}/waveflowdb_client.egg-info/top_level.txt
RENAMED
|
File without changes
|