PyPI - waveflowdb-client - Versions diffs - 1.0.2__tar.gz → 1.0.3__tar.gz - Mend

waveflowdb-client 1.0.2.tar.gz → 1.0.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

{waveflowdb_client-1.0.2/waveflowdb_client.egg-info → waveflowdb_client-1.0.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: waveflowdb_client
-Version: 1.0.2
+Version: 1.0.3
 Summary: VectorLake SDK — Deterministic backend engine powering agent workflows
 Author-email: "agentanalytics.ai" <nitin@agentanalytics.ai>
 License: MIT License

{waveflowdb_client-1.0.2 → waveflowdb_client-1.0.3}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "waveflowdb_client"                   # pip install name
-version = "1.0.2"
+version = "1.0.3"
 description = "VectorLake SDK — Deterministic backend engine powering agent workflows"
 readme = "readme.md"
 requires-python = ">=3.8"

{waveflowdb_client-1.0.2 → waveflowdb_client-1.0.3}/waveflowdb_client/client.py RENAMED Viewed

@@ -87,6 +87,24 @@ def _divider(label: str = "", width: int = 60) -> str:
     return f"\n{'─' * width}\n  {label}\n{'─' * width}" if label else f"{'─' * width}"
+def _source_ext(chunk_name: str, base_path: str) -> str:
+    """
+    Recover the original source file extension from a chunk filename.
+    Parses the chunk stem via VersionedChunkName, then looks for the source
+    file in base_path to get its real extension. Falls back to "txt" if the
+    source file is not found (e.g. direct-upload mode).
+    """
+    parsed = VersionedChunkName.parse(chunk_name)
+    if parsed:
+        candidates = list(Path(base_path).glob(f"{parsed.stem}.*"))
+        # Exclude other chunk files in the same dir
+        candidates = [c for c in candidates if not VersionedChunkName.is_versioned(c.name)]
+        if candidates:
+            return candidates[0].suffix.lstrip(".").lower()
+    return Path(chunk_name).suffix.lstrip(".").lower() or "txt"
 class VectorLakeClient:
     """
     Main entry point for the VectorLake SDK.
@@ -230,7 +248,7 @@ class VectorLakeClient:
         user_id: str,
         vector_lake_description: str,
         start_from_batch: int = 1,
-        end_batch: Optional[int] = None,                      # ← NEW
+        end_batch: Optional[int] = None,
         intelligent_segmentation: bool = True,
         session_id: Optional[str] = None,
         files: Optional[List[str]] = None,
@@ -289,7 +307,7 @@ class VectorLakeClient:
         active  = [
             (i, b) for i, b in enumerate(batches)
-            if start_from_batch <= (i + 1) <= resolved_end   # ← respects both bounds
+            if start_from_batch <= (i + 1) <= resolved_end
         ]
         initial = start_from_batch - 1
@@ -305,19 +323,22 @@ class VectorLakeClient:
                 t0        = time.time()
                 try:
-                    # ── per-batch START marker ─────────────────────────────
                     logger.info(
                         "[BATCH START] op=%s batch=%d/%d files=%s",
                         operation, batch_num, display_total, batch,
                     )
-                    # ──────────────────────────────────────────────────────
                     file_contents = []
+                    files_meta    = []
                     for fname in batch:
                         chunk_path = os.path.join(chunks_dir, fname)
                         if not os.path.exists(chunk_path):
                             chunk_path = os.path.join(base_path, fname)
                         file_contents.append(self._file_processor.read_content(chunk_path))
+                        files_meta.append({
+                            "filename":  fname,
+                            "extension": _source_ext(fname, base_path),
+                        })
                     payload = {
                         "session_id":               session_id,
@@ -325,9 +346,10 @@ class VectorLakeClient:
                         "vector_lake_description":  vector_lake_description,
                         "files_name":               batch,
                         "files_data":               file_contents,
+                        "files_meta":               files_meta,
                         "intelligent_segmentation": intelligent_segmentation,
                     }
+                    print("payload",payload)
                     request_start_ts = time.time()
                     result           = self._make_request(endpoint, payload, operation, batch_num)
                     elapsed_ms       = round((time.time() - request_start_ts) * 1000, 1)
@@ -345,12 +367,10 @@ class VectorLakeClient:
                             batch_num, result.get("message", result.get("error")),
                         )
-                    # ── per-batch END marker ───────────────────────────────
                     logger.info(
                         "[BATCH END] op=%s batch=%d/%d success=%s elapsed_ms=%.1f files=%s",
                         operation, batch_num, display_total, succeeded, elapsed_ms, batch,
                     )
-                    # ──────────────────────────────────────────────────────
                     batch_outputs.append({
                         "batch_number":    batch_num,
@@ -440,15 +460,13 @@ class VectorLakeClient:
         chunks_dir: str,
     ) -> SyncPlan:
         """
-        Classify each source file by comparing its MD5 against the MD5 of
-        its existing chunk(s) in chunks_dir.
+        Classify each source file by checking whether chunks already exist on disk.
         Rules
         ─────
-        No chunk exists on disk        → NEW       → add_docs
-        Chunk exists, MD5 matches      → UNCHANGED → skip
-        Chunk exists, MD5 differs      → CHANGED   → refresh_docs
-        Source file missing from disk  → UNKNOWN   → add_docs (safe default)
+        No chunk exists on disk   → NEW     → add_docs
+        Chunk(s) exist on disk    → CHANGED → refresh_docs
+        Source file missing       → UNKNOWN → add_docs (safe default)
         """
         to_add, to_refresh, to_skip = [], [], []
         classifications = []
@@ -460,50 +478,35 @@ class VectorLakeClient:
                 bar.set_postfix_str(fname[:40])
                 if not src.exists():
-                    c = FileClassification(
+                    classifications.append(FileClassification(
                         filename=fname,
                         status=FileStatus.UNKNOWN,
                         endpoint="add_docs",
                         reason="Source file not on disk — defaulting to add_docs",
-                    )
+                    ))
                     to_add.append(fname)
-                    classifications.append(c)
                     continue
-                stem            = src.stem
-                existing_chunks = sorted(Path(chunks_dir).glob(f"{stem}_part*.txt"))
+                existing_chunks = sorted(
+                    Path(chunks_dir).glob(f"{src.stem}_part*.txt")
+                )
                 if not existing_chunks:
-                    c = FileClassification(
+                    classifications.append(FileClassification(
                         filename=fname,
                         status=FileStatus.NEW,
                         endpoint="add_docs",
                         reason="No chunks on disk — first upload",
-                    )
+                    ))
                     to_add.append(fname)
                 else:
-                    src_md5       = _md5(str(src))
-                    chunk_content = b"".join(Path(cp).read_bytes() for cp in existing_chunks)
-                    chunk_md5     = hashlib.md5(chunk_content).hexdigest()
-                    if src_md5 == chunk_md5:
-                        c = FileClassification(
-                            filename=fname,
-                            status=FileStatus.UNCHANGED,
-                            endpoint="skip",
-                            reason="Content matches existing chunks — skipping",
-                        )
-                        to_skip.append(fname)
-                    else:
-                        c = FileClassification(
-                            filename=fname,
-                            status=FileStatus.CHANGED,
-                            endpoint="refresh_docs",
-                            reason="Content differs from existing chunks — refresh required",
-                        )
-                        to_refresh.append(fname)
-                classifications.append(c)
+                    classifications.append(FileClassification(
+                        filename=fname,
+                        status=FileStatus.CHANGED,
+                        endpoint="refresh_docs",
+                        reason=f"Chunks exist on disk ({len(existing_chunks)}) — routing to refresh",
+                    ))
+                    to_refresh.append(fname)
         print(f"\n   NEW={len(to_add)}  CHANGED={len(to_refresh)}  UNCHANGED={len(to_skip)}")
         print(_divider())
@@ -522,7 +525,7 @@ class VectorLakeClient:
         user_id: str,
         vector_lake_description: str,
         start_from_batch: int = 1,
-        end_batch: Optional[int] = None,
+        end_batch: Optional[int] = None,
         intelligent_segmentation: bool = True,
         session_id: Optional[str] = None,
         files: Optional[List[str]] = None,
@@ -556,7 +559,7 @@ class VectorLakeClient:
             grand_total = self._compute_grand_total(self.config.vector_lake_path)
             return self._process_files_in_batches(
                 "add_docs", user_id, vector_lake_description,
-                start_from_batch, end_batch,                  # ← NEW
+                start_from_batch, end_batch,
                 intelligent_segmentation, session_id, files,
                 grand_total_batches=grand_total,
             )
@@ -570,7 +573,7 @@ class VectorLakeClient:
         user_id: str,
         vector_lake_description: str,
         start_from_batch: int = 1,
-        end_batch: Optional[int] = None,                      # ← NEW
+        end_batch: Optional[int] = None,
         intelligent_segmentation: bool = True,
         session_id: Optional[str] = None,
         files: Optional[List[str]] = None,
@@ -604,7 +607,7 @@ class VectorLakeClient:
             grand_total = self._compute_grand_total(self.config.vector_lake_path)
             return self._process_files_in_batches(
                 "refresh_docs", user_id, vector_lake_description,
-                start_from_batch, end_batch,                  # ← NEW
+                start_from_batch, end_batch,
                 intelligent_segmentation, session_id, files,
                 grand_total_batches=grand_total,
             )
@@ -618,7 +621,7 @@ class VectorLakeClient:
         user_id: str,
         vector_lake_description: str,
         start_from_batch: int = 1,
-        end_batch: Optional[int] = None,                      # ← NEW
+        end_batch: Optional[int] = None,
         intelligent_segmentation: bool = True,
         session_id: Optional[str] = None,
         files: Optional[List[str]] = None,
@@ -629,9 +632,8 @@ class VectorLakeClient:
         Classification (based on chunks already on disk)
         ────────────────────────────────────────────────
-        No chunk on disk     → NEW       → add_docs
-        Chunk exists, same   → UNCHANGED → skipped (no server call)
-        Chunk exists, differs→ CHANGED   → refresh_docs
+        No chunk on disk     → NEW     → add_docs
+        Chunk(s) exist       → CHANGED → refresh_docs
         dry_run=True returns the classification plan without uploading anything.
         """
@@ -684,7 +686,7 @@ class VectorLakeClient:
             if plan.to_add:
                 add_result = self._process_files_in_batches(
                     "add_docs", user_id, vector_lake_description,
-                    start_from_batch, end_batch,              # ← NEW
+                    start_from_batch, end_batch,
                     intelligent_segmentation, session_id,
                     files=plan.to_add,
                     grand_total_batches=grand_total,
@@ -693,7 +695,7 @@ class VectorLakeClient:
             if plan.to_refresh:
                 refresh_result = self._process_files_in_batches(
                     "refresh_docs", user_id, vector_lake_description,
-                    start_from_batch, end_batch,              # ← NEW
+                    start_from_batch, end_batch,
                     intelligent_segmentation, session_id,
                     files=plan.to_refresh,
                     grand_total_batches=grand_total,
@@ -755,12 +757,19 @@ class VectorLakeClient:
                 "files_name and files_data must have the same length"
             ).to_response()
+        basenames  = [os.path.basename(n) for n in files_name]
+        files_meta = [
+            {"filename": b, "extension": Path(b).suffix.lstrip(".").lower()}
+            for b in basenames
+        ]
         payload = {
             "session_id":               session_id,
             "user_id":                  user_id,
             "vector_lake_description":  vector_lake_description,
-            "files_name":               [os.path.basename(n) for n in files_name],
+            "files_name":               basenames,
             "files_data":               files_data,
+            "files_meta":               files_meta,
             "intelligent_segmentation": intelligent_segmentation,
         }
         return self._make_request(

{waveflowdb_client-1.0.2 → waveflowdb_client-1.0.3}/waveflowdb_client/models.py RENAMED Viewed

@@ -68,9 +68,9 @@ class FileStatus(Enum):
     Classified intent for a single source file before upload.
     NEW       → never uploaded; use add_docs
-    CHANGED   → uploaded before but content differs; use refresh_docs
-    UNCHANGED → content identical to last upload; skip
-    UNKNOWN   → no chunks on disk to compare against; caller decides
+    CHANGED   → uploaded before, chunks exist on disk; use refresh_docs
+    UNCHANGED → reserved, not currently used
+    UNKNOWN   → source file missing from disk; caller decides
     """
     NEW       = "new"
     CHANGED   = "changed"

{waveflowdb_client-1.0.2 → waveflowdb_client-1.0.3}/waveflowdb_client/utils.py RENAMED Viewed

@@ -260,8 +260,18 @@ class FileProcessor:
             raise UnsupportedFileTypeError(filename, self.allowed_extensions)
     def read_content(self, filepath: str) -> str:
-        """Read and return full text content of any supported file."""
+        """
+        Read and return full text content of any supported file.
+        Chunks written by BatchManager are always plain .txt files regardless
+        of the original source format. Running a format-specific extractor
+        (e.g. _extract_pdf) on an already-extracted .txt chunk would either
+        fail or produce garbage, so versioned chunks are always read as plain
+        text directly.
+        """
         self.assert_supported(filepath)
+        if VersionedChunkName.is_versioned(filepath):
+            return _read_plain(filepath)
         return extract_text(filepath)
@@ -274,7 +284,7 @@ class BatchManager:
     Converts a list of source files into upload-ready batches.
     Chunk naming follows the SDK convention:
-        {stem}__v0001_part{part:04d}{ext}
+        {stem}_part{part}.txt
     Chunks already on disk are reused — safe to re-run after a crash.
     """

{waveflowdb_client-1.0.2 → waveflowdb_client-1.0.3/waveflowdb_client.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: waveflowdb_client
-Version: 1.0.2
+Version: 1.0.3
 Summary: VectorLake SDK — Deterministic backend engine powering agent workflows
 Author-email: "agentanalytics.ai" <nitin@agentanalytics.ai>
 License: MIT License