waveflowdb-client 1.0.2__tar.gz → 1.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: waveflowdb_client
3
- Version: 1.0.2
3
+ Version: 1.0.3
4
4
  Summary: VectorLake SDK — Deterministic backend engine powering agent workflows
5
5
  Author-email: "agentanalytics.ai" <nitin@agentanalytics.ai>
6
6
  License: MIT License
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "waveflowdb_client" # pip install name
7
- version = "1.0.2"
7
+ version = "1.0.3"
8
8
  description = "VectorLake SDK — Deterministic backend engine powering agent workflows"
9
9
  readme = "readme.md"
10
10
  requires-python = ">=3.8"
@@ -87,6 +87,24 @@ def _divider(label: str = "", width: int = 60) -> str:
87
87
  return f"\n{'─' * width}\n {label}\n{'─' * width}" if label else f"{'─' * width}"
88
88
 
89
89
 
90
+ def _source_ext(chunk_name: str, base_path: str) -> str:
91
+ """
92
+ Recover the original source file extension from a chunk filename.
93
+
94
+ Parses the chunk stem via VersionedChunkName, then looks for the source
95
+ file in base_path to get its real extension. Falls back to "txt" if the
96
+ source file is not found (e.g. direct-upload mode).
97
+ """
98
+ parsed = VersionedChunkName.parse(chunk_name)
99
+ if parsed:
100
+ candidates = list(Path(base_path).glob(f"{parsed.stem}.*"))
101
+ # Exclude other chunk files in the same dir
102
+ candidates = [c for c in candidates if not VersionedChunkName.is_versioned(c.name)]
103
+ if candidates:
104
+ return candidates[0].suffix.lstrip(".").lower()
105
+ return Path(chunk_name).suffix.lstrip(".").lower() or "txt"
106
+
107
+
90
108
  class VectorLakeClient:
91
109
  """
92
110
  Main entry point for the VectorLake SDK.
@@ -230,7 +248,7 @@ class VectorLakeClient:
230
248
  user_id: str,
231
249
  vector_lake_description: str,
232
250
  start_from_batch: int = 1,
233
- end_batch: Optional[int] = None, # ← NEW
251
+ end_batch: Optional[int] = None,
234
252
  intelligent_segmentation: bool = True,
235
253
  session_id: Optional[str] = None,
236
254
  files: Optional[List[str]] = None,
@@ -289,7 +307,7 @@ class VectorLakeClient:
289
307
 
290
308
  active = [
291
309
  (i, b) for i, b in enumerate(batches)
292
- if start_from_batch <= (i + 1) <= resolved_end # ← respects both bounds
310
+ if start_from_batch <= (i + 1) <= resolved_end
293
311
  ]
294
312
  initial = start_from_batch - 1
295
313
 
@@ -305,19 +323,22 @@ class VectorLakeClient:
305
323
  t0 = time.time()
306
324
 
307
325
  try:
308
- # ── per-batch START marker ─────────────────────────────
309
326
  logger.info(
310
327
  "[BATCH START] op=%s batch=%d/%d files=%s",
311
328
  operation, batch_num, display_total, batch,
312
329
  )
313
- # ──────────────────────────────────────────────────────
314
330
 
315
331
  file_contents = []
332
+ files_meta = []
316
333
  for fname in batch:
317
334
  chunk_path = os.path.join(chunks_dir, fname)
318
335
  if not os.path.exists(chunk_path):
319
336
  chunk_path = os.path.join(base_path, fname)
320
337
  file_contents.append(self._file_processor.read_content(chunk_path))
338
+ files_meta.append({
339
+ "filename": fname,
340
+ "extension": _source_ext(fname, base_path),
341
+ })
321
342
 
322
343
  payload = {
323
344
  "session_id": session_id,
@@ -325,9 +346,10 @@ class VectorLakeClient:
325
346
  "vector_lake_description": vector_lake_description,
326
347
  "files_name": batch,
327
348
  "files_data": file_contents,
349
+ "files_meta": files_meta,
328
350
  "intelligent_segmentation": intelligent_segmentation,
329
351
  }
330
-
352
+ print("payload",payload)
331
353
  request_start_ts = time.time()
332
354
  result = self._make_request(endpoint, payload, operation, batch_num)
333
355
  elapsed_ms = round((time.time() - request_start_ts) * 1000, 1)
@@ -345,12 +367,10 @@ class VectorLakeClient:
345
367
  batch_num, result.get("message", result.get("error")),
346
368
  )
347
369
 
348
- # ── per-batch END marker ───────────────────────────────
349
370
  logger.info(
350
371
  "[BATCH END] op=%s batch=%d/%d success=%s elapsed_ms=%.1f files=%s",
351
372
  operation, batch_num, display_total, succeeded, elapsed_ms, batch,
352
373
  )
353
- # ──────────────────────────────────────────────────────
354
374
 
355
375
  batch_outputs.append({
356
376
  "batch_number": batch_num,
@@ -440,15 +460,13 @@ class VectorLakeClient:
440
460
  chunks_dir: str,
441
461
  ) -> SyncPlan:
442
462
  """
443
- Classify each source file by comparing its MD5 against the MD5 of
444
- its existing chunk(s) in chunks_dir.
463
+ Classify each source file by checking whether chunks already exist on disk.
445
464
 
446
465
  Rules
447
466
  ─────
448
- No chunk exists on disk → NEW → add_docs
449
- Chunk exists, MD5 matches → UNCHANGED → skip
450
- Chunk exists, MD5 differs → CHANGED → refresh_docs
451
- Source file missing from disk → UNKNOWN → add_docs (safe default)
467
+ No chunk exists on disk → NEW → add_docs
468
+ Chunk(s) exist on disk → CHANGED → refresh_docs
469
+ Source file missing → UNKNOWN → add_docs (safe default)
452
470
  """
453
471
  to_add, to_refresh, to_skip = [], [], []
454
472
  classifications = []
@@ -460,50 +478,35 @@ class VectorLakeClient:
460
478
  bar.set_postfix_str(fname[:40])
461
479
 
462
480
  if not src.exists():
463
- c = FileClassification(
481
+ classifications.append(FileClassification(
464
482
  filename=fname,
465
483
  status=FileStatus.UNKNOWN,
466
484
  endpoint="add_docs",
467
485
  reason="Source file not on disk — defaulting to add_docs",
468
- )
486
+ ))
469
487
  to_add.append(fname)
470
- classifications.append(c)
471
488
  continue
472
489
 
473
- stem = src.stem
474
- existing_chunks = sorted(Path(chunks_dir).glob(f"{stem}_part*.txt"))
490
+ existing_chunks = sorted(
491
+ Path(chunks_dir).glob(f"{src.stem}_part*.txt")
492
+ )
475
493
 
476
494
  if not existing_chunks:
477
- c = FileClassification(
495
+ classifications.append(FileClassification(
478
496
  filename=fname,
479
497
  status=FileStatus.NEW,
480
498
  endpoint="add_docs",
481
499
  reason="No chunks on disk — first upload",
482
- )
500
+ ))
483
501
  to_add.append(fname)
484
502
  else:
485
- src_md5 = _md5(str(src))
486
- chunk_content = b"".join(Path(cp).read_bytes() for cp in existing_chunks)
487
- chunk_md5 = hashlib.md5(chunk_content).hexdigest()
488
-
489
- if src_md5 == chunk_md5:
490
- c = FileClassification(
491
- filename=fname,
492
- status=FileStatus.UNCHANGED,
493
- endpoint="skip",
494
- reason="Content matches existing chunks — skipping",
495
- )
496
- to_skip.append(fname)
497
- else:
498
- c = FileClassification(
499
- filename=fname,
500
- status=FileStatus.CHANGED,
501
- endpoint="refresh_docs",
502
- reason="Content differs from existing chunks — refresh required",
503
- )
504
- to_refresh.append(fname)
505
-
506
- classifications.append(c)
503
+ classifications.append(FileClassification(
504
+ filename=fname,
505
+ status=FileStatus.CHANGED,
506
+ endpoint="refresh_docs",
507
+ reason=f"Chunks exist on disk ({len(existing_chunks)}) — routing to refresh",
508
+ ))
509
+ to_refresh.append(fname)
507
510
 
508
511
  print(f"\n NEW={len(to_add)} CHANGED={len(to_refresh)} UNCHANGED={len(to_skip)}")
509
512
  print(_divider())
@@ -522,7 +525,7 @@ class VectorLakeClient:
522
525
  user_id: str,
523
526
  vector_lake_description: str,
524
527
  start_from_batch: int = 1,
525
- end_batch: Optional[int] = None,
528
+ end_batch: Optional[int] = None,
526
529
  intelligent_segmentation: bool = True,
527
530
  session_id: Optional[str] = None,
528
531
  files: Optional[List[str]] = None,
@@ -556,7 +559,7 @@ class VectorLakeClient:
556
559
  grand_total = self._compute_grand_total(self.config.vector_lake_path)
557
560
  return self._process_files_in_batches(
558
561
  "add_docs", user_id, vector_lake_description,
559
- start_from_batch, end_batch, # ← NEW
562
+ start_from_batch, end_batch,
560
563
  intelligent_segmentation, session_id, files,
561
564
  grand_total_batches=grand_total,
562
565
  )
@@ -570,7 +573,7 @@ class VectorLakeClient:
570
573
  user_id: str,
571
574
  vector_lake_description: str,
572
575
  start_from_batch: int = 1,
573
- end_batch: Optional[int] = None, # ← NEW
576
+ end_batch: Optional[int] = None,
574
577
  intelligent_segmentation: bool = True,
575
578
  session_id: Optional[str] = None,
576
579
  files: Optional[List[str]] = None,
@@ -604,7 +607,7 @@ class VectorLakeClient:
604
607
  grand_total = self._compute_grand_total(self.config.vector_lake_path)
605
608
  return self._process_files_in_batches(
606
609
  "refresh_docs", user_id, vector_lake_description,
607
- start_from_batch, end_batch, # ← NEW
610
+ start_from_batch, end_batch,
608
611
  intelligent_segmentation, session_id, files,
609
612
  grand_total_batches=grand_total,
610
613
  )
@@ -618,7 +621,7 @@ class VectorLakeClient:
618
621
  user_id: str,
619
622
  vector_lake_description: str,
620
623
  start_from_batch: int = 1,
621
- end_batch: Optional[int] = None, # ← NEW
624
+ end_batch: Optional[int] = None,
622
625
  intelligent_segmentation: bool = True,
623
626
  session_id: Optional[str] = None,
624
627
  files: Optional[List[str]] = None,
@@ -629,9 +632,8 @@ class VectorLakeClient:
629
632
 
630
633
  Classification (based on chunks already on disk)
631
634
  ────────────────────────────────────────────────
632
- No chunk on disk → NEW → add_docs
633
- Chunk exists, same → UNCHANGED → skipped (no server call)
634
- Chunk exists, differs→ CHANGED → refresh_docs
635
+ No chunk on disk → NEW → add_docs
636
+ Chunk(s) exist → CHANGED → refresh_docs
635
637
 
636
638
  dry_run=True returns the classification plan without uploading anything.
637
639
  """
@@ -684,7 +686,7 @@ class VectorLakeClient:
684
686
  if plan.to_add:
685
687
  add_result = self._process_files_in_batches(
686
688
  "add_docs", user_id, vector_lake_description,
687
- start_from_batch, end_batch, # ← NEW
689
+ start_from_batch, end_batch,
688
690
  intelligent_segmentation, session_id,
689
691
  files=plan.to_add,
690
692
  grand_total_batches=grand_total,
@@ -693,7 +695,7 @@ class VectorLakeClient:
693
695
  if plan.to_refresh:
694
696
  refresh_result = self._process_files_in_batches(
695
697
  "refresh_docs", user_id, vector_lake_description,
696
- start_from_batch, end_batch, # ← NEW
698
+ start_from_batch, end_batch,
697
699
  intelligent_segmentation, session_id,
698
700
  files=plan.to_refresh,
699
701
  grand_total_batches=grand_total,
@@ -755,12 +757,19 @@ class VectorLakeClient:
755
757
  "files_name and files_data must have the same length"
756
758
  ).to_response()
757
759
 
760
+ basenames = [os.path.basename(n) for n in files_name]
761
+ files_meta = [
762
+ {"filename": b, "extension": Path(b).suffix.lstrip(".").lower()}
763
+ for b in basenames
764
+ ]
765
+
758
766
  payload = {
759
767
  "session_id": session_id,
760
768
  "user_id": user_id,
761
769
  "vector_lake_description": vector_lake_description,
762
- "files_name": [os.path.basename(n) for n in files_name],
770
+ "files_name": basenames,
763
771
  "files_data": files_data,
772
+ "files_meta": files_meta,
764
773
  "intelligent_segmentation": intelligent_segmentation,
765
774
  }
766
775
  return self._make_request(
@@ -68,9 +68,9 @@ class FileStatus(Enum):
68
68
  Classified intent for a single source file before upload.
69
69
 
70
70
  NEW → never uploaded; use add_docs
71
- CHANGED → uploaded before but content differs; use refresh_docs
72
- UNCHANGED → content identical to last upload; skip
73
- UNKNOWN → no chunks on disk to compare against; caller decides
71
+ CHANGED → uploaded before, chunks exist on disk; use refresh_docs
72
+ UNCHANGED → reserved, not currently used
73
+ UNKNOWN → source file missing from disk; caller decides
74
74
  """
75
75
  NEW = "new"
76
76
  CHANGED = "changed"
@@ -260,8 +260,18 @@ class FileProcessor:
260
260
  raise UnsupportedFileTypeError(filename, self.allowed_extensions)
261
261
 
262
262
  def read_content(self, filepath: str) -> str:
263
- """Read and return full text content of any supported file."""
263
+ """
264
+ Read and return full text content of any supported file.
265
+
266
+ Chunks written by BatchManager are always plain .txt files regardless
267
+ of the original source format. Running a format-specific extractor
268
+ (e.g. _extract_pdf) on an already-extracted .txt chunk would either
269
+ fail or produce garbage, so versioned chunks are always read as plain
270
+ text directly.
271
+ """
264
272
  self.assert_supported(filepath)
273
+ if VersionedChunkName.is_versioned(filepath):
274
+ return _read_plain(filepath)
265
275
  return extract_text(filepath)
266
276
 
267
277
 
@@ -274,7 +284,7 @@ class BatchManager:
274
284
  Converts a list of source files into upload-ready batches.
275
285
 
276
286
  Chunk naming follows the SDK convention:
277
- {stem}__v0001_part{part:04d}{ext}
287
+ {stem}_part{part}.txt
278
288
  Chunks already on disk are reused — safe to re-run after a crash.
279
289
  """
280
290
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: waveflowdb_client
3
- Version: 1.0.2
3
+ Version: 1.0.3
4
4
  Summary: VectorLake SDK — Deterministic backend engine powering agent workflows
5
5
  Author-email: "agentanalytics.ai" <nitin@agentanalytics.ai>
6
6
  License: MIT License