waveflowdb-client 1.0.2__tar.gz → 1.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: waveflowdb_client
3
- Version: 1.0.2
3
+ Version: 1.0.4
4
4
  Summary: VectorLake SDK — Deterministic backend engine powering agent workflows
5
5
  Author-email: "agentanalytics.ai" <nitin@agentanalytics.ai>
6
6
  License: MIT License
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "waveflowdb_client" # pip install name
7
- version = "1.0.2"
7
+ version = "1.0.4"
8
8
  description = "VectorLake SDK — Deterministic backend engine powering agent workflows"
9
9
  readme = "readme.md"
10
10
  requires-python = ">=3.8"
@@ -87,6 +87,24 @@ def _divider(label: str = "", width: int = 60) -> str:
87
87
  return f"\n{'─' * width}\n {label}\n{'─' * width}" if label else f"{'─' * width}"
88
88
 
89
89
 
90
+ def _source_ext(chunk_name: str, base_path: str) -> str:
91
+ """
92
+ Recover the original source file extension from a chunk filename.
93
+
94
+ Parses the chunk stem via VersionedChunkName, then looks for the source
95
+ file in base_path to get its real extension. Falls back to "txt" if the
96
+ source file is not found (e.g. direct-upload mode).
97
+ """
98
+ parsed = VersionedChunkName.parse(chunk_name)
99
+ if parsed:
100
+ candidates = list(Path(base_path).glob(f"{parsed.stem}.*"))
101
+ # Exclude other chunk files in the same dir
102
+ candidates = [c for c in candidates if not VersionedChunkName.is_versioned(c.name)]
103
+ if candidates:
104
+ return candidates[0].suffix.lstrip(".").lower()
105
+ return Path(chunk_name).suffix.lstrip(".").lower() or "txt"
106
+
107
+
90
108
  class VectorLakeClient:
91
109
  """
92
110
  Main entry point for the VectorLake SDK.
@@ -128,7 +146,6 @@ class VectorLakeClient:
128
146
  headers = self._headers()
129
147
  request_kb = len(json.dumps(payload).encode()) / 1024
130
148
  logger.debug("Batch %d — payload %.1f KB", batch_num, request_kb)
131
- print("making request")
132
149
  last_error: Optional[APIError] = None
133
150
 
134
151
  for attempt in range(self.config.max_retries):
@@ -230,7 +247,7 @@ class VectorLakeClient:
230
247
  user_id: str,
231
248
  vector_lake_description: str,
232
249
  start_from_batch: int = 1,
233
- end_batch: Optional[int] = None, # ← NEW
250
+ end_batch: Optional[int] = None,
234
251
  intelligent_segmentation: bool = True,
235
252
  session_id: Optional[str] = None,
236
253
  files: Optional[List[str]] = None,
@@ -289,7 +306,7 @@ class VectorLakeClient:
289
306
 
290
307
  active = [
291
308
  (i, b) for i, b in enumerate(batches)
292
- if start_from_batch <= (i + 1) <= resolved_end # ← respects both bounds
309
+ if start_from_batch <= (i + 1) <= resolved_end
293
310
  ]
294
311
  initial = start_from_batch - 1
295
312
 
@@ -305,19 +322,22 @@ class VectorLakeClient:
305
322
  t0 = time.time()
306
323
 
307
324
  try:
308
- # ── per-batch START marker ─────────────────────────────
309
325
  logger.info(
310
326
  "[BATCH START] op=%s batch=%d/%d files=%s",
311
327
  operation, batch_num, display_total, batch,
312
328
  )
313
- # ──────────────────────────────────────────────────────
314
329
 
315
330
  file_contents = []
331
+ files_meta = []
316
332
  for fname in batch:
317
333
  chunk_path = os.path.join(chunks_dir, fname)
318
334
  if not os.path.exists(chunk_path):
319
335
  chunk_path = os.path.join(base_path, fname)
320
336
  file_contents.append(self._file_processor.read_content(chunk_path))
337
+ files_meta.append({
338
+ "filename": fname,
339
+ "extension": _source_ext(fname, base_path),
340
+ })
321
341
 
322
342
  payload = {
323
343
  "session_id": session_id,
@@ -325,9 +345,9 @@ class VectorLakeClient:
325
345
  "vector_lake_description": vector_lake_description,
326
346
  "files_name": batch,
327
347
  "files_data": file_contents,
348
+ "files_meta": files_meta,
328
349
  "intelligent_segmentation": intelligent_segmentation,
329
350
  }
330
-
331
351
  request_start_ts = time.time()
332
352
  result = self._make_request(endpoint, payload, operation, batch_num)
333
353
  elapsed_ms = round((time.time() - request_start_ts) * 1000, 1)
@@ -345,12 +365,10 @@ class VectorLakeClient:
345
365
  batch_num, result.get("message", result.get("error")),
346
366
  )
347
367
 
348
- # ── per-batch END marker ───────────────────────────────
349
368
  logger.info(
350
369
  "[BATCH END] op=%s batch=%d/%d success=%s elapsed_ms=%.1f files=%s",
351
370
  operation, batch_num, display_total, succeeded, elapsed_ms, batch,
352
371
  )
353
- # ──────────────────────────────────────────────────────
354
372
 
355
373
  batch_outputs.append({
356
374
  "batch_number": batch_num,
@@ -440,15 +458,13 @@ class VectorLakeClient:
440
458
  chunks_dir: str,
441
459
  ) -> SyncPlan:
442
460
  """
443
- Classify each source file by comparing its MD5 against the MD5 of
444
- its existing chunk(s) in chunks_dir.
461
+ Classify each source file by checking whether chunks already exist on disk.
445
462
 
446
463
  Rules
447
464
  ─────
448
- No chunk exists on disk → NEW → add_docs
449
- Chunk exists, MD5 matches → UNCHANGED → skip
450
- Chunk exists, MD5 differs → CHANGED → refresh_docs
451
- Source file missing from disk → UNKNOWN → add_docs (safe default)
465
+ No chunk exists on disk → NEW → add_docs
466
+ Chunk(s) exist on disk → CHANGED → refresh_docs
467
+ Source file missing → UNKNOWN → add_docs (safe default)
452
468
  """
453
469
  to_add, to_refresh, to_skip = [], [], []
454
470
  classifications = []
@@ -460,50 +476,35 @@ class VectorLakeClient:
460
476
  bar.set_postfix_str(fname[:40])
461
477
 
462
478
  if not src.exists():
463
- c = FileClassification(
479
+ classifications.append(FileClassification(
464
480
  filename=fname,
465
481
  status=FileStatus.UNKNOWN,
466
482
  endpoint="add_docs",
467
483
  reason="Source file not on disk — defaulting to add_docs",
468
- )
484
+ ))
469
485
  to_add.append(fname)
470
- classifications.append(c)
471
486
  continue
472
487
 
473
- stem = src.stem
474
- existing_chunks = sorted(Path(chunks_dir).glob(f"{stem}_part*.txt"))
488
+ existing_chunks = sorted(
489
+ Path(chunks_dir).glob(f"{src.stem}_part*.txt")
490
+ )
475
491
 
476
492
  if not existing_chunks:
477
- c = FileClassification(
493
+ classifications.append(FileClassification(
478
494
  filename=fname,
479
495
  status=FileStatus.NEW,
480
496
  endpoint="add_docs",
481
497
  reason="No chunks on disk — first upload",
482
- )
498
+ ))
483
499
  to_add.append(fname)
484
500
  else:
485
- src_md5 = _md5(str(src))
486
- chunk_content = b"".join(Path(cp).read_bytes() for cp in existing_chunks)
487
- chunk_md5 = hashlib.md5(chunk_content).hexdigest()
488
-
489
- if src_md5 == chunk_md5:
490
- c = FileClassification(
491
- filename=fname,
492
- status=FileStatus.UNCHANGED,
493
- endpoint="skip",
494
- reason="Content matches existing chunks — skipping",
495
- )
496
- to_skip.append(fname)
497
- else:
498
- c = FileClassification(
499
- filename=fname,
500
- status=FileStatus.CHANGED,
501
- endpoint="refresh_docs",
502
- reason="Content differs from existing chunks — refresh required",
503
- )
504
- to_refresh.append(fname)
505
-
506
- classifications.append(c)
501
+ classifications.append(FileClassification(
502
+ filename=fname,
503
+ status=FileStatus.CHANGED,
504
+ endpoint="refresh_docs",
505
+ reason=f"Chunks exist on disk ({len(existing_chunks)}) — routing to refresh",
506
+ ))
507
+ to_refresh.append(fname)
507
508
 
508
509
  print(f"\n NEW={len(to_add)} CHANGED={len(to_refresh)} UNCHANGED={len(to_skip)}")
509
510
  print(_divider())
@@ -522,7 +523,7 @@ class VectorLakeClient:
522
523
  user_id: str,
523
524
  vector_lake_description: str,
524
525
  start_from_batch: int = 1,
525
- end_batch: Optional[int] = None,
526
+ end_batch: Optional[int] = None,
526
527
  intelligent_segmentation: bool = True,
527
528
  session_id: Optional[str] = None,
528
529
  files: Optional[List[str]] = None,
@@ -556,7 +557,7 @@ class VectorLakeClient:
556
557
  grand_total = self._compute_grand_total(self.config.vector_lake_path)
557
558
  return self._process_files_in_batches(
558
559
  "add_docs", user_id, vector_lake_description,
559
- start_from_batch, end_batch, # ← NEW
560
+ start_from_batch, end_batch,
560
561
  intelligent_segmentation, session_id, files,
561
562
  grand_total_batches=grand_total,
562
563
  )
@@ -570,7 +571,7 @@ class VectorLakeClient:
570
571
  user_id: str,
571
572
  vector_lake_description: str,
572
573
  start_from_batch: int = 1,
573
- end_batch: Optional[int] = None, # ← NEW
574
+ end_batch: Optional[int] = None,
574
575
  intelligent_segmentation: bool = True,
575
576
  session_id: Optional[str] = None,
576
577
  files: Optional[List[str]] = None,
@@ -604,7 +605,7 @@ class VectorLakeClient:
604
605
  grand_total = self._compute_grand_total(self.config.vector_lake_path)
605
606
  return self._process_files_in_batches(
606
607
  "refresh_docs", user_id, vector_lake_description,
607
- start_from_batch, end_batch, # ← NEW
608
+ start_from_batch, end_batch,
608
609
  intelligent_segmentation, session_id, files,
609
610
  grand_total_batches=grand_total,
610
611
  )
@@ -618,7 +619,7 @@ class VectorLakeClient:
618
619
  user_id: str,
619
620
  vector_lake_description: str,
620
621
  start_from_batch: int = 1,
621
- end_batch: Optional[int] = None, # ← NEW
622
+ end_batch: Optional[int] = None,
622
623
  intelligent_segmentation: bool = True,
623
624
  session_id: Optional[str] = None,
624
625
  files: Optional[List[str]] = None,
@@ -629,9 +630,8 @@ class VectorLakeClient:
629
630
 
630
631
  Classification (based on chunks already on disk)
631
632
  ────────────────────────────────────────────────
632
- No chunk on disk → NEW → add_docs
633
- Chunk exists, same → UNCHANGED → skipped (no server call)
634
- Chunk exists, differs→ CHANGED → refresh_docs
633
+ No chunk on disk → NEW → add_docs
634
+ Chunk(s) exist → CHANGED → refresh_docs
635
635
 
636
636
  dry_run=True returns the classification plan without uploading anything.
637
637
  """
@@ -684,7 +684,7 @@ class VectorLakeClient:
684
684
  if plan.to_add:
685
685
  add_result = self._process_files_in_batches(
686
686
  "add_docs", user_id, vector_lake_description,
687
- start_from_batch, end_batch, # ← NEW
687
+ start_from_batch, end_batch,
688
688
  intelligent_segmentation, session_id,
689
689
  files=plan.to_add,
690
690
  grand_total_batches=grand_total,
@@ -693,7 +693,7 @@ class VectorLakeClient:
693
693
  if plan.to_refresh:
694
694
  refresh_result = self._process_files_in_batches(
695
695
  "refresh_docs", user_id, vector_lake_description,
696
- start_from_batch, end_batch, # ← NEW
696
+ start_from_batch, end_batch,
697
697
  intelligent_segmentation, session_id,
698
698
  files=plan.to_refresh,
699
699
  grand_total_batches=grand_total,
@@ -755,12 +755,19 @@ class VectorLakeClient:
755
755
  "files_name and files_data must have the same length"
756
756
  ).to_response()
757
757
 
758
+ basenames = [os.path.basename(n) for n in files_name]
759
+ files_meta = [
760
+ {"filename": b, "extension": Path(b).suffix.lstrip(".").lower()}
761
+ for b in basenames
762
+ ]
763
+
758
764
  payload = {
759
765
  "session_id": session_id,
760
766
  "user_id": user_id,
761
767
  "vector_lake_description": vector_lake_description,
762
- "files_name": [os.path.basename(n) for n in files_name],
768
+ "files_name": basenames,
763
769
  "files_data": files_data,
770
+ "files_meta": files_meta,
764
771
  "intelligent_segmentation": intelligent_segmentation,
765
772
  }
766
773
  return self._make_request(
@@ -68,9 +68,9 @@ class FileStatus(Enum):
68
68
  Classified intent for a single source file before upload.
69
69
 
70
70
  NEW → never uploaded; use add_docs
71
- CHANGED → uploaded before but content differs; use refresh_docs
72
- UNCHANGED → content identical to last upload; skip
73
- UNKNOWN → no chunks on disk to compare against; caller decides
71
+ CHANGED → uploaded before, chunks exist on disk; use refresh_docs
72
+ UNCHANGED → reserved, not currently used
73
+ UNKNOWN → source file missing from disk; caller decides
74
74
  """
75
75
  NEW = "new"
76
76
  CHANGED = "changed"
@@ -260,8 +260,18 @@ class FileProcessor:
260
260
  raise UnsupportedFileTypeError(filename, self.allowed_extensions)
261
261
 
262
262
  def read_content(self, filepath: str) -> str:
263
- """Read and return full text content of any supported file."""
263
+ """
264
+ Read and return full text content of any supported file.
265
+
266
+ Chunks written by BatchManager are always plain .txt files regardless
267
+ of the original source format. Running a format-specific extractor
268
+ (e.g. _extract_pdf) on an already-extracted .txt chunk would either
269
+ fail or produce garbage, so versioned chunks are always read as plain
270
+ text directly.
271
+ """
264
272
  self.assert_supported(filepath)
273
+ if VersionedChunkName.is_versioned(filepath):
274
+ return _read_plain(filepath)
265
275
  return extract_text(filepath)
266
276
 
267
277
 
@@ -274,7 +284,7 @@ class BatchManager:
274
284
  Converts a list of source files into upload-ready batches.
275
285
 
276
286
  Chunk naming follows the SDK convention:
277
- {stem}__v0001_part{part:04d}{ext}
287
+ {stem}_part{part}.txt
278
288
  Chunks already on disk are reused — safe to re-run after a crash.
279
289
  """
280
290
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: waveflowdb_client
3
- Version: 1.0.2
3
+ Version: 1.0.4
4
4
  Summary: VectorLake SDK — Deterministic backend engine powering agent workflows
5
5
  Author-email: "agentanalytics.ai" <nitin@agentanalytics.ai>
6
6
  License: MIT License