PyPI - waveflowdb-client - Versions diffs - 0.0.4__tar.gz → 1.0.0__tar.gz - Mend

waveflowdb-client 0.0.4tar.gz → 1.0.0tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

{waveflowdb_client-0.0.4/waveflowdb_client.egg-info → waveflowdb_client-1.0.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: waveflowdb_client
-Version: 0.0.4
+Version: 1.0.0
 Summary: VectorLake SDK — Deterministic backend engine powering agent workflows
 Author-email: "agentanalytics.ai" <nitin@agentanalytics.ai>
 License: MIT License
@@ -27,7 +27,7 @@ License: MIT License
 Project-URL: Homepage, https://agentanalytics.ai
 Project-URL: Documentation, https://www.agentanalytics.ai/docs/waveflow-db
-Keywords: vector db,VECTOR QUERY LANGUAGE,waveflow,agentanalytics,VQL
+Keywords: vector db,VECTOR QUERY LANGUAGE,waveflowdb,agentanalytics,VQL
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE

{waveflowdb_client-0.0.4 → waveflowdb_client-1.0.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "waveflowdb_client"                   # pip install name
-version = "0.0.4"
+version = "1.0.0"
 description = "VectorLake SDK — Deterministic backend engine powering agent workflows"
 readme = "readme.md"
 requires-python = ">=3.8"
@@ -14,7 +14,7 @@ authors = [
   { name = "agentanalytics.ai", email = "nitin@agentanalytics.ai" }
 ]
-keywords = ["vector db", "VECTOR QUERY LANGUAGE", "waveflow", "agentanalytics", "VQL"]
+keywords = ["vector db", "VECTOR QUERY LANGUAGE", "waveflowdb", "agentanalytics", "VQL"]
 dependencies = [
   "requests",

{waveflowdb_client-0.0.4 → waveflowdb_client-1.0.0}/waveflowdb_client/__init__.py RENAMED Viewed

@@ -1,4 +1,4 @@
-__version__ = "0.0.4"
+__version__ = "1.0.0"
 from .client import VectorLakeClient
 from .config import Config

{waveflowdb_client-0.0.4 → waveflowdb_client-1.0.0}/waveflowdb_client/client.py RENAMED Viewed

@@ -3,13 +3,12 @@ import logging
 import json
 import requests
 import os
-from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import List, Optional, Dict, Any
 from .config import Config
 from .utils import FileProcessor, Logger, BatchManager
-from .exceptions import APIError
-from .models import HealthResponse  # removed BatchResult import
+from .exceptions import APIError, FileProcessingError
+from .models import HealthResponse
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
@@ -24,36 +23,42 @@ class VectorLakeClient:
         self.logger = Logger(config.log_dir)
         self.batch_manager = BatchManager(config.max_files_per_batch, config.max_batch_size_mb)
         self.file_processor = FileProcessor()
-        self.perf_csv = "performance_logs.csv"
     def _get_headers(self) -> Dict[str, str]:
-        return {
-            'Content-Type': 'application/json',
-            'x-api-key': self.config.api_key
-        }
+        return {'Content-Type': 'application/json', 'x-api-key': self.config.api_key}
     def _make_request(self, endpoint: str, payload: Dict[str, Any], operation: str = "", batch_num: int = 0) -> Dict[str, Any]:
         headers = self._get_headers()
-        request_size = len(json.dumps(payload).encode('utf-8')) / 1024 if payload is not None else 0
+        request_size = len(json.dumps(payload).encode('utf-8')) / 1024 if payload else 0
         for attempt in range(self.config.max_retries):
             try:
                 start_time = time.time()
                 response = requests.post(endpoint, json=payload, headers=headers, timeout=self.config.timeout)
                 latency = (time.time() - start_time) * 1000
                 try:
                     result = response.json()
                 except Exception:
                     result = {"status_code": response.status_code, "text": response.text}
                 if operation:
-                    response_size = len(response.content) / 1024 if response.content is not None else 0
+                    response_size = len(response.content) / 1024 if response.content else 0
                     result_count = len(result.get("results", [])) if isinstance(result, dict) else "N/A"
-                    self.logger.log_performance(operation, batch_num, latency, request_size, response_size, result_count)
+                    self.logger.log_performance(
+                        operation=operation,
+                        batch_num=batch_num,
+                        latency=latency,
+                        request_size=request_size,
+                        response_size=response_size,
+                        result_count=result_count
+                    )
                 if response.status_code >= 400:
                     raise APIError(result.get('message', f'HTTP {response.status_code}'), status_code=response.status_code, response_text=response.text)
                 return result
             except requests.exceptions.RequestException as e:
                 if attempt == self.config.max_retries - 1:
                     error_msg = f"Request failed after {self.config.max_retries} attempts: {str(e)}"
@@ -62,19 +67,38 @@ class VectorLakeClient:
                     raise APIError(error_msg, getattr(e.response, 'status_code', None), getattr(e.response, 'text', None))
                 time.sleep(2 ** attempt)
-    def _read_files(self, filenames: List[str]) -> List[str]:
+    def _make_request_with_backoff(self, endpoint, payload, operation, batch_num, retries=5, base_delay=1):
+        delay = base_delay
+        for attempt in range(retries):
+            try:
+                return self._make_request(endpoint, payload, operation, batch_num)
+            except APIError as e:
+                if getattr(e, "status_code", None) == 429:
+                    logging.warning(f"Batch {batch_num} throttled, retrying in {delay}s...")
+                    time.sleep(delay)
+                    delay *= 2
+                    continue
+                raise
+            except Exception:
+                raise
+    def _read_files(self, filenames: List[str], chunks_dir: Optional[str] = None) -> List[str]:
+        """
+        Reads files safely. If chunks_dir is provided, reads files from that folder.
+        """
         contents = []
-        for filename in filenames:
-            filepath = os.path.join(self.config.vector_lake_path, filename)
+        for fname in filenames:
+            path_base = chunks_dir if chunks_dir else self.config.vector_lake_path
+            filepath = os.path.join(path_base, fname)
             try:
-                if self.file_processor.is_supported_file(filename):
+                if self.file_processor.is_supported_file(fname):
                     content = self.file_processor.read_file_content(filepath)
                     contents.append(content)
                 else:
-                    self.logger.log_skipped_file(filename, "Unsupported file type")
+                    self.logger.log_skipped_file(fname, "Unsupported file type")
                     contents.append("")
             except Exception as e:
-                self.logger.log_skipped_file(filename, f"Read error: {str(e)}")
+                self.logger.log_skipped_file(fname, f"Read error: {str(e)}")
                 contents.append("")
         return contents
@@ -106,36 +130,23 @@ class VectorLakeClient:
             "pattern": pattern
         }
-        # DIRECT MODE: user provided files_name + files_data -> return raw server response
+        # Direct mode
         if files_name and files_data:
             if len(files_name) != len(files_data):
                 raise ValueError("files_name and files_data must be same length")
-            # ensure names are sanitized (no full paths)
             clean_names = [os.path.basename(n) for n in files_name]
-            payload.update({
-                "files_name": clean_names,
-                "files_data": files_data,
-                "pattern": "dynamic"
-            })
+            payload.update({"files_name": clean_names, "files_data": files_data, "pattern": "dynamic"})
             return self._make_request(endpoint, payload, endpoint_key)
-        # DYNAMIC MODE: read from local filesystem, then return raw server response
+        # Dynamic mode: read from filesystem
         if pattern == "dynamic" and files:
-            # sanitize file names (server expects names, not full paths)
-            clean_names = [os.path.basename(f) for f in files]
-            file_contents = self._read_files(clean_names) if all(os.path.exists(os.path.join(self.config.vector_lake_path, os.path.basename(f))) for f in files) else self._read_files(files)
-            payload.update({
-                "files_name": clean_names,
-                "files_data": file_contents,
-                "pattern": "dynamic"
-            })
-            # debug print — remove or comment out in prod if not needed
-            logging.debug("MATCHING DOCS DYNAMIC PAYLOAD: %s", json.dumps(payload, indent=2)[:2000])
+            batches, chunks_dir = self.batch_manager.create_batches(files, self.config.vector_lake_path)
+            flat_files = [fname for batch in batches for fname in batch]
+            file_contents = self._read_files(flat_files, chunks_dir)
+            payload.update({"files_name": flat_files, "files_data": file_contents, "pattern": "dynamic"})
             return self._make_request(endpoint, payload, endpoint_key)
-        # STATIC MODE: just run the request and return raw response
+        # Static mode
         return self._make_request(endpoint, payload, endpoint_key)
     def add_documents(self,
@@ -149,11 +160,9 @@ class VectorLakeClient:
                       files_data: Optional[List[str]] = None,
                       max_workers=5) -> Any:
         """
-        Direct mode returns raw server response.
-        Filesystem batch mode returns a 'batch' envelope.
+        Add documents either in direct mode (names + data) or batch mode (filesystem).
         """
-        # Direct mode: user supplied names + data -> single request (raw server response)
+        # Direct mode
         if files_name and files_data:
             if len(files_name) != len(files_data):
                 raise ValueError("files_name and files_data must be same length")
@@ -166,13 +175,12 @@ class VectorLakeClient:
                 "intelligent_segmentation": intelligent_segmentation
             }
             endpoint = self.config.endpoints["add_docs"]
-            # batch_num is OK for add_docs (server batch logging); set to 1 for this single request
-            result = self._make_request(endpoint, payload, "add_docs", batch_num=1)
-            return result
+            return self._make_request(endpoint, payload, "add_docs", batch_num=1)
-        # Batch mode: filesystem-driven -> process in batches and return envelope
+        # Batch mode
         return self._process_files_in_batches(
-            "add_docs", user_id, vector_lake_description, start_from_batch, intelligent_segmentation, session_id, files, max_workers=max_workers
+            "add_docs", user_id, vector_lake_description, start_from_batch,
+            intelligent_segmentation, session_id, files, max_workers=max_workers
         )
     def refresh_documents(self,
@@ -186,16 +194,11 @@ class VectorLakeClient:
                           files_data: Optional[List[str]] = None,
                           max_workers=5) -> Any:
         """
-        Same semantics as add_documents:
-        - Direct mode returns raw server response
-        - Batch mode returns batch envelope
+        Same semantics as add_documents
         """
-        # Direct mode
         if files_name and files_data:
             if len(files_name) != len(files_data):
                 raise ValueError("files_name and files_data must be same length")
             payload = {
                 "session_id": session_id,
                 "user_id": user_id,
@@ -205,23 +208,130 @@ class VectorLakeClient:
                 "intelligent_segmentation": intelligent_segmentation
             }
             endpoint = self.config.endpoints["refresh_docs"]
-            result = self._make_request(endpoint, payload, "refresh_docs", batch_num=1)
-            return result
+            return self._make_request(endpoint, payload, "refresh_docs", batch_num=1)
-        # Batch mode — NOTE: call using positional 'operation' arg (operation first)
         return self._process_files_in_batches(
-            "refresh_docs", user_id, vector_lake_description, start_from_batch, intelligent_segmentation, session_id, files, max_workers=max_workers
+            "refresh_docs", user_id, vector_lake_description, start_from_batch,
+            intelligent_segmentation, session_id, files, max_workers=max_workers
         )
-    def health_check(self, user_id: str, vector_lake_description: str, session_id: Optional[str] = None) -> Dict[str, Any]:
+    def health_check(
+        self,
+        user_id: str,
+        vector_lake_description: str,
+        session_id: Optional[str] = None
+    ) -> Dict[str, Any]:
         endpoint = self.config.endpoints["health"]
-        payload = {"session_id": session_id, "user_id": user_id, "vector_lake_description": vector_lake_description}
+        payload = {
+            "user_id": user_id,
+            "vector_lake_description": vector_lake_description,
+            "session_id": session_id
+        }
         try:
-            result = self._make_request(endpoint, payload, "health")
-            return HealthResponse(status="success", message=result.get("message", "ok"), timestamp=time.time(), details=result)
+            result = self._make_request(
+                endpoint=endpoint,
+                payload=payload,
+                operation="health"
+            )
+            return result
         except Exception as e:
-            return HealthResponse(status="error", message=str(e), timestamp=time.time())
+            return {
+                "status": "error",
+                "message": str(e),
+                "timestamp": time.time()
+            }
+    def _process_files_in_batches(self, operation: str, user_id: str, vector_lake_description: str,
+                              start_from_batch: int = 1, intelligent_segmentation: bool = False,
+                              session_id: Optional[str] = None, files: Optional[List[str]] = None,
+                              max_workers: int = 5, batch_delay: float = 2) -> dict:
+        """
+        Processes files from the filesystem in batches using BatchManager.
+        Returns a standardized envelope.
+        """
+        base_path = self.config.vector_lake_path
+        # 1. Gather files if not provided
+        if files is None:
+            files = [f for f in os.listdir(base_path)
+                    if os.path.isfile(os.path.join(base_path, f)) and self.file_processor.is_supported_file(f)]
+        # 2. Create batches using BatchManager
+        batches, chunks_dir = self.batch_manager.create_batches(files, base_path)
+        logging.info(f"Batches created: {batches}, chunks_dir: {chunks_dir}")
+        batch_outputs = []
+        start_index = start_from_batch - 1
+        endpoint = self.config.endpoints[operation]
+        for i, batch in enumerate(batches):
+            batch_num = i + 1
+            # Skip batches if resuming
+            if i < start_index:
+                logging.info(f"Skipping batch {batch_num}")
+                continue
+            start_time = time.time()
+            try:
+                # 3. Read file contents from chunks folder
+                file_contents = []
+                for fname in batch:
+                    full_path = os.path.join(chunks_dir, fname)
+                    if not os.path.exists(full_path):
+                        full_path = os.path.join(base_path, fname)
+                    content = self.file_processor.read_file_content(full_path)
+                    file_contents.append(content)
+                payload = {
+                    "session_id": session_id,
+                    "user_id": user_id,
+                    "vector_lake_description": vector_lake_description,
+                    "files_name": batch,      # API expects basenames
+                    "files_data": file_contents,
+                    "intelligent_segmentation": intelligent_segmentation
+                }
+                # logging.info(f"payload is {payload}")
+                 # 4. Make request with backoff
+                result = self._make_request_with_backoff(endpoint, payload, operation, batch_num)
+                processing_time = time.time() - start_time
+                logging.info(f"Batch {batch_num} done")
+                batch_outputs.append({
+                    "batch_number": batch_num,
+                    "files": batch,
+                    "success": True,
+                    "processing_time": round(processing_time, 3),
+                    "response": result
+                })
+            except Exception as e:
+                processing_time = time.time() - start_time
+                logging.error(f"Batch {batch_num} failed: {str(e)}")
+                batch_outputs.append({
+                    "batch_number": batch_num,
+                    "files": batch,
+                    "success": False,
+                    "processing_time": round(processing_time, 3),
+                    "response": None,
+                    "error": str(e)
+                })
+            # 5. Delay between batches
+            time.sleep(batch_delay)
+        return {
+            "mode": "batch",
+            "total_batches": len(batches),
+            "batches": sorted(batch_outputs, key=lambda x: x["batch_number"])
+        }
     def get_namespace_details(self, user_id: str, session_id: Optional[str] = None, vector_lake_description: Optional[str] = None) -> Dict[str, Any]:
         endpoint = self.config.endpoints["get_namespace_details_by_userid"]
         payload = {"session_id": session_id, "user_id": user_id}
@@ -252,84 +362,3 @@ class VectorLakeClient:
             return result
         except Exception as e:
             return {"status": "error", "message": str(e)}
-    def _make_request_with_backoff(self, endpoint, payload, operation, batch_num, retries=5, base_delay=1):
-        delay = base_delay
-        for attempt in range(retries):
-            try:
-                result = self._make_request(endpoint, payload, operation, batch_num)
-                return result
-            except APIError as e:
-                if getattr(e, "status_code", None) == 429:
-                    logging.warning(f"Batch {batch_num} throttled, retrying in {delay}s...")
-                    time.sleep(delay)
-                    delay *= 2
-                    continue
-                raise
-            except Exception:
-                raise
-    def _process_files_in_batches(self, operation: str, user_id: str, vector_lake_description: str, start_from_batch, intelligent_segmentation: bool = False, session_id: Optional[str] = None, files: Optional[List[str]] = None, max_workers: int = 1, batch_delay: float = 2) -> Dict[str, Any]:
-        """
-        Processes files from the filesystem in batches and returns a standardized
-        envelope:
-        {
-          "mode": "batch",
-          "total_batches": N,
-          "batches": [
-             { "batch_number": i, "files": [...], "success": True, "processing_time": x, "response": {...} },
-             ...
-          ]
-        }
-        """
-        if files is None:
-            files = [f for f in os.listdir(self.config.vector_lake_path) if os.path.isfile(os.path.join(self.config.vector_lake_path, f)) and self.file_processor.is_supported_file(f)]
-        batches = self.batch_manager.create_batches(files, self.config.vector_lake_path)
-        batch_outputs: List[Dict[str, Any]] = []
-        start_batch_index = start_from_batch - 1
-        if start_from_batch > 1:
-            logging.info(f"Resuming from batch {start_from_batch}, skipping first {start_from_batch - 1} batches")
-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            futures = {}
-            for i, batch in enumerate(batches):
-                if i < start_batch_index:
-                    logging.info(f"Skipping batch {i + 1}")
-                    continue
-                batch_num = i + 1
-                file_contents = self._read_files(batch)
-                payload = {"session_id": session_id, "user_id": user_id, "vector_lake_description": vector_lake_description, "files_name": batch, "files_data": file_contents, "intelligent_segmentation": intelligent_segmentation}
-                endpoint = self.config.endpoints[operation]
-                futures[executor.submit(self._make_request_with_backoff, endpoint, payload, operation, batch_num)] = (batch_num, batch, time.time())
-                time.sleep(batch_delay)
-            for future in as_completed(futures):
-                batch_num, batch, start_time = futures[future]
-                processing_time = time.time() - start_time
-                try:
-                    result = future.result()
-                    logging.info(f"Batch {batch_num} done")
-                    batch_outputs.append({
-                        "batch_number": batch_num,
-                        "files": batch,
-                        "success": True,
-                        "processing_time": round(processing_time, 3),
-                        "response": result
-                    })
-                except Exception as e:
-                    logging.error(f"Batch {batch_num} failed: {str(e)}")
-                    batch_outputs.append({
-                        "batch_number": batch_num,
-                        "files": batch,
-                        "success": False,
-                        "processing_time": round(processing_time, 3),
-                        "response": None,
-                        "error": str(e)
-                    })
-        return {
-            "mode": "batch",
-            "total_batches": len(batch_outputs),
-            "batches": sorted(batch_outputs, key=lambda x: x["batch_number"])
-        }

{waveflowdb_client-0.0.4 → waveflowdb_client-1.0.0}/waveflowdb_client/config.py RENAMED Viewed

@@ -8,10 +8,10 @@ class Config:
         self,
         api_key: Optional[str] = None,
         host: str = "https://waveflow-analytics.com",
-        timeout: int = 60,
+        timeout: int = 240,
         max_retries: int = 2,
         max_files_per_batch: int = 100,
-        max_batch_size_mb: int = 20,
+        max_batch_size_mb: int = 1,
         vector_lake_path: str = "upload",
         log_dir: str = "logs",
         service_port: int = None,

waveflowdb_client-1.0.0/waveflowdb_client/utils.py ADDED Viewed

@@ -0,0 +1,551 @@
+import os
+import csv
+import logging
+from pathlib import Path
+from datetime import datetime
+import json
+import shutil
+from typing import List,Tuple
+from .exceptions import FileProcessingError
+# logger = logging.getLogger(__name__)
+# logging.basicConfig(level=logging.INFO)
+import logging
+# Configure the logging settings
+logging.basicConfig(
+    level=logging.DEBUG,  # Set to INFO to see only high-level actions, or DEBUG for detail
+    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+    filename='batch_creation.log',  # Log output to a file
+    filemode='w'  # Overwrite the log file each run
+)
+# If you also want output to the console:
+console_handler = logging.StreamHandler()
+console_handler.setLevel(logging.INFO)
+formatter = logging.Formatter('%(levelname)s: %(message)s')
+console_handler.setFormatter(formatter)
+logging.getLogger().addHandler(console_handler)
+# Get the logger instance for your class (if needed, though basicConfig covers the root)
+logger = logging.getLogger(__name__)
+# -------------------------------------------------------------------------
+#                UNIVERSAL HELPERS FOR FILE CONVERSION & CHUNKING
+# -------------------------------------------------------------------------
+def paragraph_chunk(text: str, max_chars: int) -> list:
+    """
+    Strict paragraph-based chunking.
+    No paragraph is ever split.
+    """
+    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
+    chunks = []
+    current = ""
+    for p in paragraphs:
+        if len(current) + len(p) + 2 > max_chars:
+            if current:
+                chunks.append(current.strip())
+            current = p + "\n\n"
+        else:
+            current += p + "\n\n"
+    if current.strip():
+        chunks.append(current.strip())
+    return chunks
+def json_to_csv(json_path: str) -> str:
+    out_path = json_path.replace(".json", ".csv")
+    with open(json_path, "r", encoding="utf-8") as f:
+        data = json.load(f)
+    if not isinstance(data, list):
+        raise FileProcessingError("JSON must be an array of objects.")
+    # Extract all keys across JSON objects
+    headers = sorted({k for row in data for k in row.keys()})
+    with open(out_path, "w", newline="", encoding="utf-8") as csvfile:
+        writer = csv.DictWriter(csvfile, fieldnames=headers)
+        writer.writeheader()
+        writer.writerows(data)
+    return out_path
+def convert_to_txt(file_path: str) -> str:
+    base, _ = os.path.splitext(file_path)
+    out_path = base + ".txt"
+    if file_path.endswith(".py"):
+        with open(file_path, "r", encoding="utf-8") as f:
+            content = f.read()
+    elif file_path.endswith(".ipynb"):
+        import json
+        with open(file_path, "r", encoding="utf-8") as f:
+            nb = json.load(f)
+        content = "\n\n".join(
+            cell["source"] if isinstance(cell["source"], str)
+            else "".join(cell["source"])
+            for cell in nb.get("cells", []) if cell.get("cell_type") == "code"
+        )
+    else:
+        raise FileProcessingError("Unsupported file for text conversion")
+    with open(out_path, "w", encoding="utf-8") as out:
+        out.write(content)
+    return out_path
+def extract_text(file_path: str) -> str:
+    """
+    Extract text from PDF, DOCX, or TXT.
+    """
+    ext = file_path.lower().split(".")[-1]
+    if ext == "pdf":
+        try:
+            import PyPDF2
+        except Exception as e:
+            raise FileProcessingError(f"PyPDF2 not available: {e}")
+        text = ""
+        with open(file_path, "rb") as f:
+            reader = PyPDF2.PdfReader(f)
+            for p in reader.pages:
+                text += (p.extract_text() or "") + "\n\n"
+        return text
+    if ext == "docx":
+        try:
+            from docx import Document
+        except Exception as e:
+            raise FileProcessingError(f"python-docx not available: {e}")
+        doc = Document(file_path)
+        return "\n\n".join(p.text for p in doc.paragraphs)
+    if ext == "txt":
+        with open(file_path, "r", encoding="utf-8") as f:
+            return f.read()
+    raise FileProcessingError("Unsupported extraction type.")
+def split_csv_if_needed(file_path: str, max_bytes: int) -> list:
+    """
+    Split CSV into smaller parts using 20% safety buffer.
+    No partial rows.
+    """
+    SAFE_LIMIT = int(max_bytes * 0.8)
+    size = os.path.getsize(file_path)
+    if size <= SAFE_LIMIT:
+        return [file_path]
+    base, ext = os.path.splitext(file_path)
+    output_files = []
+    with open(file_path, "r", newline="", encoding="utf-8") as f:
+        reader = csv.reader(f)
+        header = next(reader)
+        header_bytes = len(",".join(header).encode("utf-8"))
+        part = 1
+        rows = []
+        current_size = header_bytes
+        for row in reader:
+            row_bytes = len(",".join(row).encode("utf-8"))
+            if row_bytes > SAFE_LIMIT:
+                raise FileProcessingError(f"A CSV row exceeds the safe max size.")
+            if current_size + row_bytes > SAFE_LIMIT:
+                out_file = f"{base}_part_{part}{ext}"
+                with open(out_file, "w", newline="", encoding="utf-8") as out:
+                    writer = csv.writer(out)
+                    writer.writerow(header)
+                    writer.writerows(rows)
+                output_files.append(out_file)
+                part += 1
+                rows = []
+                current_size = header_bytes
+            rows.append(row)
+            current_size += row_bytes
+        # last chunk
+        if rows:
+            out_file = f"{base}_part_{part}{ext}"
+            with open(out_file, "w", newline="", encoding="utf-8") as out:
+                writer = csv.writer(out)
+                writer.writerow(header)
+                writer.writerows(rows)
+            output_files.append(out_file)
+    return output_files
+# -------------------------------------------------------------------------
+#                            FILE PROCESSOR
+# -------------------------------------------------------------------------
+class FileProcessor:
+    SUPPORTED_EXTENSIONS = ['txt', 'csv', 'json', 'py', 'docx', 'pdf', 'ipynb']
+    @staticmethod
+    def read_file_content(filepath: str) -> str:
+        ext = filepath.lower().split('.')[-1]
+        if ext in ['txt', 'csv', 'py', 'json']:
+            with open(filepath, encoding='utf-8') as f:
+                return f.read()
+        if ext == 'docx':
+            from docx import Document
+            doc = Document(filepath)
+            return '\n'.join(p.text for p in doc.paragraphs)
+        if ext == 'pdf':
+            import PyPDF2
+            with open(filepath, 'rb') as f:
+                reader = PyPDF2.PdfReader(f)
+                return '\n'.join(p.extract_text() or '' for p in reader.pages)
+        raise FileProcessingError(f'Unsupported extension: {ext}')
+    @staticmethod
+    def get_file_size(filepath: str) -> int:
+        return os.path.getsize(filepath)
+    @staticmethod
+    def is_supported_file(filename: str) -> bool:
+        ext = filename.lower().split('.')[-1]
+        return ext in FileProcessor.SUPPORTED_EXTENSIONS
+# -------------------------------------------------------------------------
+#                               LOGGER
+# -------------------------------------------------------------------------
+class Logger:
+    """Append-only CSV logs (skipped files, API errors, performance metrics) kept in one directory."""
+    def __init__(self, log_dir: str):
+        """Create ``log_dir`` if it is missing and record the path of each CSV log inside it."""
+        Path(log_dir).mkdir(parents=True, exist_ok=True)
+        self.skipped_log = Path(log_dir) / 'skipped_files_log.csv'
+        self.api_error_log = Path(log_dir) / 'api_error_log.csv'
+        self.performance_log = Path(log_dir) / 'api_performance_log.csv'
+    @staticmethod
+    def _utc_timestamp() -> str:
+        """Return the current UTC time as a naive ISO-8601 string (the format ``utcnow().isoformat()`` produced)."""
+        # datetime.utcnow() is deprecated since Python 3.12. Dropping tzinfo keeps the
+        # existing format (no "+00:00" suffix) so old and new rows stay uniform.
+        from datetime import timezone
+        return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
+    @staticmethod
+    def _read_csv_header(path: Path):
+        """Return the first CSV row of ``path`` as a list, or None when the file is missing or empty."""
+        try:
+            with open(path, 'r', newline='', encoding='utf-8') as f:
+                return next(csv.reader(f), None)
+        except FileNotFoundError:
+            return None
+    @staticmethod
+    def _widen_csv_header(path: Path, fieldnames: List[str]):
+        """Rewrite ``path`` so its header row is ``fieldnames``, padding existing rows with empty cells."""
+        with open(path, 'r', newline='', encoding='utf-8') as f:
+            existing = list(csv.reader(f))
+        tmp_path = path.with_name(path.name + '.tmp')
+        with open(tmp_path, 'w', newline='', encoding='utf-8') as f:
+            writer = csv.writer(f)
+            writer.writerow(fieldnames)
+            for old_row in existing[1:]:
+                writer.writerow(old_row + [''] * (len(fieldnames) - len(old_row)))
+        # Replace the log only after the rewritten copy is complete.
+        tmp_path.replace(path)
+    def _write_csv_log(self, path: Path, header: List[str], row: List):
+        """Append ``row`` to ``path``, writing ``header`` first when the file is missing or still empty."""
+        needs_header = not path.exists() or path.stat().st_size == 0
+        with open(path, 'a', newline='', encoding='utf-8') as f:
+            writer = csv.writer(f)
+            if needs_header:
+                writer.writerow(header)
+            writer.writerow(row)
+    def log_skipped_file(self, filename: str, reason: str):
+        """Record that ``filename`` was skipped and why."""
+        self._write_csv_log(self.skipped_log,
+                            ['ts', 'filename', 'reason'],
+                            [self._utc_timestamp(), filename, reason])
+    def log_api_error(self, operation: str, batch_num: int, error_message: str):
+        """Record an API error for ``operation`` on batch ``batch_num``."""
+        self._write_csv_log(self.api_error_log,
+                            ['ts', 'operation', 'batch_num', 'err'],
+                            [self._utc_timestamp(), operation, batch_num, error_message])
+    def log_performance(self, **kwargs):
+        """
+        SAFE CSV logger for performance metrics.
+        Accepts ANY keyword args and serializes them safely: lists, tuples and sets are comma-joined,
+        None becomes an empty cell, and a value whose str() fails is stored as "UNSERIALIZABLE".
+        Columns stay aligned across calls: a row is written under the header already in the file,
+        and when a call brings keys the file has not seen, the header is widened and older rows padded.
+        """
+        # Start row with timestamp (a "timestamp" keyword argument overrides it, as before)
+        row = {"timestamp": self._utc_timestamp()}
+        for k, v in kwargs.items():
+            try:
+                if isinstance(v, (list, tuple, set)):
+                    row[k] = ",".join(str(i) for i in v)
+                else:
+                    row[k] = str(v) if v is not None else ""
+            except Exception:
+                # Deliberate best-effort: a broken __str__ must not break logging.
+                row[k] = "UNSERIALIZABLE"
+        logfile = self.performance_log
+        header = self._read_csv_header(logfile)
+        if header is None:
+            fieldnames = list(row)
+        else:
+            added = [k for k in row if k not in header]
+            fieldnames = header + added
+            if added:
+                self._widen_csv_header(logfile, fieldnames)
+        with open(logfile, "a", newline="", encoding="utf-8") as f:
+            # restval fills columns this call did not supply.
+            writer = csv.DictWriter(f, fieldnames=fieldnames, restval="")
+            if header is None:
+                writer.writeheader()
+            writer.writerow(row)
+# -------------------------------------------------------------------------
+#                         BATCH MANAGER (UPDATED)
+# -------------------------------------------------------------------------
+class BatchManager:
+    """Turn input files into size-bounded chunk files and group those chunks into batches."""
+    def __init__(self, max_files: int, max_size_mb: int):
+        """Store the per-batch limits: at most ``max_files`` files and ``max_size_mb`` megabytes."""
+        self.max_files = max_files
+        self.max_bytes = max_size_mb * 1024 * 1024  # convert MB to bytes
+    def create_batches(self, files: List[str], base_path: str) -> Tuple[List[List[str]], str]:
+        """
+        Process files into batches based on max files & max bytes, with detailed logging.
+        Each name in ``files`` is resolved against ``base_path``; names that are not regular files are skipped
+        with a warning. JSON goes through ``json_to_csv`` and .py/.ipynb through ``convert_to_txt``;
+        PDF/DOCX/TXT text is split with ``paragraph_chunk``; CSV is split on row boundaries; any other file is
+        copied unchanged, or split as text when larger than the safe limit (80% of ``max_bytes``).
+        All outputs are written to ``<base_path>/chunks``.
+        Returns ``(batches, chunks_dir)``; each batch is a list of file names (not paths) inside ``chunks_dir``.
+        Raises FileProcessingError when an oversized file cannot be chunked or a chunk still exceeds ``max_bytes``.
+        """
+        logger.info(f"Starting batch creation. Max files: {self.max_files}, Max bytes: {self.max_bytes}")
+        safe_limit = int(self.max_bytes * 0.8)
+        logger.debug(f"Calculated SAFE_LIMIT (80% of max_bytes): {safe_limit} bytes")
+        chunks_dir = os.path.join(base_path, "chunks")
+        os.makedirs(chunks_dir, exist_ok=True)
+        logger.info(f"Chunks directory created: {chunks_dir}")
+        file_info = []
+        # -----------------------------
+        # Step 1: Preprocess files
+        # -----------------------------
+        logger.info("--- Step 1: Preprocessing and Chunking Files ---")
+        for f in files:
+            filepath = os.path.join(base_path, f)
+            if not os.path.isfile(filepath):
+                logger.warning(f"File not found, skipping: {filepath}")
+                continue
+            ext = f.lower().split(".")[-1]
+            logger.debug(f"Processing file: {f}, detected extension: {ext}")
+            # JSON → CSV
+            if ext == "json":
+                original_path = filepath
+                filepath = json_to_csv(filepath)
+                ext = "csv"
+                logger.info(f"Converted JSON file {original_path} to CSV at {filepath}")
+            # PY/IPYNB → TXT
+            elif ext in ("py", "ipynb"):
+                original_path = filepath
+                filepath = convert_to_txt(filepath)
+                ext = "txt"
+                logger.info(f"Converted code file {original_path} to TXT at {filepath}")
+            # PDF/DOCX/TXT → paragraph chunks
+            if ext in ("pdf", "docx", "txt"):
+                text = extract_text(filepath)
+                # NOTE(review): the byte budget is passed as max_chars; if paragraph_chunk counts
+                # characters, multi-byte UTF-8 text may yield parts larger than SAFE_LIMIT - confirm.
+                chunks = paragraph_chunk(text, max_chars=safe_limit)
+                base_fname, _ = os.path.splitext(os.path.basename(filepath))
+                part_files = self._write_text_chunks(chunks, base_fname, chunks_dir)
+                for i, (out_file, size) in enumerate(part_files, 1):
+                    logger.debug(f"Chunked file {f} into part {i}: {os.path.basename(out_file)}, Size: {size} bytes")
+                file_info.extend(part_files)
+                logger.info(f"Successfully chunked {f} into {len(part_files)} parts.")
+                continue
+            # CSV → safe split
+            if ext == "csv":
+                parts = self._split_csv_safe(filepath, safe_limit, chunks_dir)
+                file_info.extend((p, os.path.getsize(p)) for p in parts)
+                logger.info(f"Successfully split CSV file {f} into {len(parts)} safe chunks.")
+                continue
+            # Other files → copy to chunks folder, or chunk as text if large
+            size = os.path.getsize(filepath)
+            if size > safe_limit:
+                logger.warning(f"File {f} ({size} bytes) exceeds SAFE_LIMIT ({safe_limit} bytes). Attempting generic TXT chunking.")
+                try:
+                    with open(filepath, "r", encoding="utf-8", errors="ignore") as f_in:
+                        content = f_in.read()
+                    chunks = paragraph_chunk(content, max_chars=safe_limit)
+                    base_fname, _ = os.path.splitext(os.path.basename(filepath))
+                    part_files = self._write_text_chunks(chunks, base_fname, chunks_dir)
+                except Exception as e:
+                    logger.error(f"Failed to chunk large file {filepath} as TXT. Error: {e}")
+                    raise FileProcessingError(f"File {filepath} exceeds batch limit and cannot be chunked") from e
+                for i, (out_file, _part_size) in enumerate(part_files, 1):
+                    logger.debug(f"Generic chunked file {f} into part {i}: {os.path.basename(out_file)}")
+                file_info.extend(part_files)
+            else:
+                new_path = os.path.join(chunks_dir, os.path.basename(filepath))
+                # Always refresh the copy: a stale file left by an earlier run would otherwise be
+                # batched under the size measured from the current source. Part files are
+                # overwritten on every run as well, so this keeps both paths consistent.
+                if os.path.abspath(new_path) != os.path.abspath(filepath):
+                    shutil.copy2(filepath, new_path)
+                file_info.append((new_path, size))
+                logger.debug(f"Copied small file {f} to chunks_dir. Size: {size} bytes.")
+        logger.info(f"Preprocessing complete. Total items ready for batching: {len(file_info)}")
+        # -----------------------------
+        # Step 2: Create dynamic batches
+        # -----------------------------
+        logger.info("--- Step 2: Creating Dynamic Batches ---")
+        batches = self._pack_batches(file_info)
+        logger.info(f"Batch creation complete. Total batches created: {len(batches)}")
+        return batches, chunks_dir
+    @staticmethod
+    def _write_text_chunks(chunks, base_fname: str, chunks_dir: str) -> List[Tuple[str, int]]:
+        """Write each text block to ``<base_fname>_part_<i>.txt`` in ``chunks_dir``; return (path, size) pairs."""
+        written = []
+        for i, block in enumerate(chunks, 1):
+            out_file = os.path.join(chunks_dir, f"{base_fname}_part_{i}.txt")
+            with open(out_file, "w", encoding="utf-8") as out:
+                out.write(block)
+            written.append((out_file, os.path.getsize(out_file)))
+        return written
+    def _pack_batches(self, file_info: List[Tuple[str, int]]) -> List[List[str]]:
+        """Group (path, size) pairs, in order, into batches of file names that respect both limits."""
+        batches = []
+        cur_batch = []
+        cur_size = 0
+        for filepath, size in file_info:
+            filename = os.path.basename(filepath)
+            logger.debug(f"Considering file: {filename}, Size: {size} bytes. Current batch size: {cur_size} bytes, files: {len(cur_batch)}")
+            # Absolute safety check (shouldn't happen if Step 1 worked)
+            if size > self.max_bytes:
+                logger.critical(f"Chunked file {filename} ({size} bytes) still exceeds max batch size ({self.max_bytes} bytes).")
+                raise FileProcessingError(f"File {filepath} exceeds max batch size ({self.max_bytes} bytes)")
+            # Close the current batch before adding a file that would break either limit
+            if cur_batch and (len(cur_batch) >= self.max_files or (cur_size + size) > self.max_bytes):
+                logger.info(f"Batch limit hit. Flushing batch with {len(cur_batch)} files and {cur_size} bytes.")
+                batches.append(cur_batch)
+                cur_batch = []
+                cur_size = 0
+            cur_batch.append(filename)
+            cur_size += size
+            logger.debug(f"Added file {filename}. New batch size: {cur_size} bytes, files: {len(cur_batch)}")
+        if cur_batch:
+            logger.info(f"Flushing final batch with {len(cur_batch)} files and {cur_size} bytes.")
+            batches.append(cur_batch)
+        return batches
+    @staticmethod
+    def _detect_csv_encoding(file_path: str, candidates: List[str]):
+        """Return the first encoding in ``candidates`` that decodes the whole file, or None if none does."""
+        for encoding in candidates:
+            try:
+                # Decode the entire file, not just the header: bad bytes often sit deep inside.
+                with open(file_path, "r", newline="", encoding=encoding) as probe:
+                    while probe.read(1024 * 1024):
+                        pass
+            except UnicodeDecodeError:
+                logger.warning(f"Failed to decode {file_path} with {encoding}. Trying next encoding...")
+                continue
+            except Exception as e:
+                raise FileProcessingError(f"Error opening or reading file {file_path}: {e}") from e
+            return encoding
+        return None
+    @staticmethod
+    def _write_csv_chunk(out_path: Path, header: List[str], rows: List[List[str]]):
+        """Write ``header`` followed by ``rows`` to ``out_path`` as UTF-8 CSV."""
+        with open(out_path, "w", newline="", encoding="utf-8") as out:
+            writer = csv.writer(out)
+            writer.writerow(header)
+            writer.writerows(rows)
+    def _split_csv_safe(self, file_path: str, safe_limit: int, chunks_dir: str, preferred_encoding: str = "utf-8") -> List[str]:
+        """
+        Split CSV safely into ``<stem>_part_<n><suffix>`` files inside ``chunks_dir``, respecting row boundaries.
+        The header row is repeated at the top of every part and parts are written as UTF-8.
+        The input encoding is the first of ``preferred_encoding``, cp1252 and latin-1 that decodes the
+        whole file, so the result is the same on every operating system.
+        Row sizes are measured as csv.writer serializes them (quoting and line terminator included).
+        Returns the part paths in order (an empty list when the file has a header but no data rows).
+        Raises FileProcessingError when the file is empty, unreadable, malformed, undecodable, or
+        contains a single row larger than ``safe_limit``.
+        """
+        import io  # local import: only used to measure serialized rows
+        input_file = Path(file_path)
+        output_dir = Path(chunks_dir)
+        output_files = []
+        output_dir.mkdir(parents=True, exist_ok=True)
+        base_fname = input_file.stem
+        ext = input_file.suffix
+        encodings_to_try = [preferred_encoding, "cp1252", "latin-1"]
+        effective_encoding = self._detect_csv_encoding(file_path, encodings_to_try)
+        if not effective_encoding:
+            raise FileProcessingError(f"Could not decode CSV file {file_path} using any of the tested encodings: {', '.join(encodings_to_try)}")
+        scratch = io.StringIO()
+        scratch_writer = csv.writer(scratch)
+        def serialized_size(cells) -> int:
+            # Exact UTF-8 size of the row as it will appear in the output file.
+            scratch.seek(0)
+            scratch.truncate(0)
+            scratch_writer.writerow(cells)
+            return len(scratch.getvalue().encode("utf-8"))
+        try:
+            # errors='replace' stays as a safety net in case the file changes after the probe.
+            with open(input_file, "r", newline="", encoding=effective_encoding, errors='replace') as f:
+                reader = csv.reader(f)
+                try:
+                    header = next(reader)
+                except StopIteration:
+                    raise FileProcessingError(f"CSV file {file_path} is empty.") from None
+                header_bytes = serialized_size(header)
+                part = 1
+                rows = []
+                current_size = header_bytes
+                for row in reader:
+                    try:
+                        row_bytes = serialized_size(row)
+                    except (UnicodeEncodeError, csv.Error) as e:
+                        logger.warning(f"Skipping problematic row during size calculation: {e}")
+                        continue
+                    if row_bytes > safe_limit:
+                        raise FileProcessingError(f"A CSV row exceeds the safe max size ({safe_limit} bytes) at part {part}")
+                    # Flush only a part that already holds rows; otherwise a header-only file would be emitted.
+                    if rows and current_size + row_bytes > safe_limit:
+                        out_path = output_dir / f"{base_fname}_part_{part}{ext}"
+                        self._write_csv_chunk(out_path, header, rows)
+                        output_files.append(str(out_path))
+                        part += 1
+                        rows = []
+                        current_size = header_bytes
+                    rows.append(row)
+                    current_size += row_bytes
+                if rows:
+                    out_path = output_dir / f"{base_fname}_part_{part}{ext}"
+                    self._write_csv_chunk(out_path, header, rows)
+                    output_files.append(str(out_path))
+        except csv.Error as e:
+            # Malformed CSV (e.g. an over-long field) surfaces as the package's own error type.
+            raise FileProcessingError(f"Error opening or reading file {file_path}: {e}") from e
+        return output_files

{waveflowdb_client-0.0.4 → waveflowdb_client-1.0.0/waveflowdb_client.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: waveflowdb_client
-Version: 0.0.4
+Version: 1.0.0
 Summary: VectorLake SDK — Deterministic backend engine powering agent workflows
 Author-email: "agentanalytics.ai" <nitin@agentanalytics.ai>
 License: MIT License
@@ -27,7 +27,7 @@ License: MIT License
 Project-URL: Homepage, https://agentanalytics.ai
 Project-URL: Documentation, https://www.agentanalytics.ai/docs/waveflow-db
-Keywords: vector db,VECTOR QUERY LANGUAGE,waveflow,agentanalytics,VQL
+Keywords: vector db,VECTOR QUERY LANGUAGE,waveflowdb,agentanalytics,VQL
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE

waveflowdb_client-0.0.4/waveflowdb_client/utils.py DELETED Viewed

@@ -1,149 +0,0 @@
-import os
-import csv
-import logging
-from pathlib import Path
-from datetime import datetime
-import json
-import traceback
-import time
-from typing import List
-# Heavy libs are imported at runtime when needed in real environments.
-from .exceptions import FileProcessingError
-logger = logging.getLogger(__name__)
-logging.basicConfig(level=logging.INFO)
-class FileProcessor:
-    SUPPORTED_EXTENSIONS = ['txt', 'csv', 'json', 'py', 'docx', 'pdf']
-    @staticmethod
-    def read_file_content(filepath: str) -> str:
-        ext = filepath.lower().split('.')[-1]
-        if ext in ['txt', 'csv', 'py', 'json']:
-            with open(filepath, encoding='utf-8') as f:
-                return f.read()
-        # Defer complex parsing to runtime imports to avoid import-time failures.
-        if ext == 'docx':
-            try:
-                from docx import Document
-            except Exception as e:
-                raise FileProcessingError(f"python-docx not available: {e}")
-            doc = Document(filepath)
-            return '\n'.join(p.text for p in doc.paragraphs)
-        if ext == 'pdf':
-            try:
-                import PyPDF2
-            except Exception as e:
-                raise FileProcessingError(f"PyPDF2 not available: {e}")
-            with open(filepath, 'rb') as f:
-                reader = PyPDF2.PdfReader(f)
-                return '\n'.join(p.extract_text() or '' for p in reader.pages)
-        raise FileProcessingError(f'Unsupported extension: {ext}')
-    @staticmethod
-    def get_file_size(filepath: str) -> int:
-        return os.path.getsize(filepath)
-    @staticmethod
-    def is_supported_file(filename: str) -> bool:
-        ext = filename.lower().split('.')[-1]
-        return ext in FileProcessor.SUPPORTED_EXTENSIONS
-class Logger:
-    def __init__(self, log_dir: str):
-        Path(log_dir).mkdir(parents=True, exist_ok=True)
-        self.skipped_log = Path(log_dir) / 'skipped_files_log.csv'
-        self.api_error_log = Path(log_dir) / 'api_error_log.csv'
-        self.performance_log = Path(log_dir) / 'api_performance_log.csv'
-    def _write_csv_log(self, path: Path, header: List[str], row: List):
-        exists = path.exists()
-        with open(path, 'a', newline='', encoding='utf-8') as f:
-            writer = csv.writer(f)
-            if not exists:
-                writer.writerow(header)
-            writer.writerow(row)
-    def log_skipped_file(self, filename: str, reason: str):
-        self._write_csv_log(self.skipped_log, ['ts', 'filename', 'reason'], [datetime.utcnow().isoformat(), filename, reason])
-    def log_api_error(self, operation: str, batch_num: int, error_message: str):
-        self._write_csv_log(self.api_error_log, ['ts', 'operation', 'batch_num', 'err'], [datetime.utcnow().isoformat(), operation, batch_num, error_message])
-    def log_performance(self, operation=None, batch_num=None, latency=None,
-                    request_size=None, response_size=None, result_count=None,
-                    files_processed=None, error: Exception = None):
-        """
-        SAFE CSV logger for all SDK operations.
-        It will never throw TypeError regardless of input type.
-        """
-        logfile = getattr(self, "perf_csv", "performance_logs.csv")
-        # --- Safe stringify for ANY type ---
-        def safe_str(value):
-            try:
-                if isinstance(value, (list, tuple, set)):
-                    return ",".join(str(v) for v in value)
-                return str(value) if value is not None else ""
-            except Exception:
-                return "UNSERIALIZABLE"
-        row = {
-            "timestamp": datetime.utcnow().isoformat(),
-            "operation": safe_str(operation),
-            "batch_num": safe_str(batch_num),
-            "latency_ms": safe_str(latency),
-            "request_size": safe_str(request_size),
-            "response_size": safe_str(response_size),
-            "result_count": safe_str(result_count),
-            "files_processed": safe_str(files_processed),
-            "error": safe_str(error),
-        }
-        try:
-            file_exists = os.path.isfile(logfile)
-            with open(logfile, mode="a", newline="", encoding="utf-8") as f:
-                writer = csv.DictWriter(f, fieldnames=row.keys())
-                if not file_exists:
-                    writer.writeheader()
-                writer.writerow(row)
-        except Exception:
-            print("[CSV LOGGING ERROR] Could not write performance log:")
-            print(traceback.format_exc())
-class BatchManager:
-    def __init__(self, max_files: int, max_size_mb: int):
-        self.max_files = max_files
-        self.max_bytes = max_size_mb * 1024 * 1024
-    def create_batches(self, files: List[str], base_path: str) -> List[List[str]]:
-        file_info = []
-        for f in files:
-            p = os.path.join(base_path, f)
-            try:
-                size = os.path.getsize(p)
-                if size <= self.max_bytes:
-                    file_info.append((f, size))
-            except Exception:
-                continue
-        batches = []
-        cur = []
-        cur_size = 0
-        for fname, size in file_info:
-            if len(cur) >= self.max_files or (cur_size + size) > self.max_bytes:
-                if cur:
-                    batches.append(cur)
-                cur = []
-                cur_size = 0
-            cur.append(fname)
-            cur_size += size
-        if cur:
-            batches.append(cur)
-        return batches