projectdavid 1.29.9__py3-none-any.whl → 1.38.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -112,7 +112,6 @@ class AssistantsClient(BaseAPIClient):
         description: str = "",
         instructions: str = "",
         tools: Optional[List[Dict[str, Any]]] = None,
-        platform_tools: Optional[List[Dict[str, Any]]] = None,
         tool_resources: Optional[Dict[str, Dict[str, Any]]] = None,
         meta_data: Optional[Dict[str, Any]] = None,
         top_p: float = 1.0,
@@ -133,7 +132,6 @@ class AssistantsClient(BaseAPIClient):
             "model": model,
             "instructions": instructions,
             "tools": tools,
-            "platform_tools": platform_tools,
             "tool_resources": tool_resources,
             "meta_data": meta_data,
             "top_p": top_p,
@@ -263,14 +261,10 @@ class AssistantsClient(BaseAPIClient):
         )
         return {"message": "Assistant disassociated from user successfully"}

-    def list_assistants_by_user(
-        self, user_id: str
-    ) -> List[ent_validator.AssistantRead]:
-        logging_utility.info("Listing assistants for user id=%s", user_id)
-        try:
-            resp = self._request_with_retries("GET", f"/v1/users/{user_id}/assistants")
-            raw_list = self._parse_response(resp)
-            return [ent_validator.AssistantRead(**a) for a in raw_list]
-        except ValidationError as e:
-            logging_utility.error("Validation error: %s", e.json())
-            raise AssistantsClientError(f"Validation error: {e}") from e
+    def list(self) -> list[ent_validator.AssistantRead]:
+        """Return every assistant owned by *this* API key."""
+        logging_utility.info("Listing assistants")
+
+        resp = self._request_with_retries("GET", "/v1/assistants")
+        raw = self._parse_response(resp)
+        return [ent_validator.AssistantRead(**a) for a in raw]
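
The per-user listing endpoint is gone; assistants are now listed per API key. A usage sketch, assuming an already-configured client (constructor arguments are illustrative, not part of this diff):

```python
# Hypothetical setup; real constructor arguments are not shown in this diff.
client = AssistantsClient(base_url="https://api.example.com", api_key="pd-...")

# Old (removed): client.list_assistants_by_user(user_id="user_123")
# New: assistants are scoped to the calling API key.
for assistant in client.list():
    print(assistant)  # validated AssistantRead models
```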
@@ -1,80 +1,184 @@
 import asyncio
 import csv
+import json
 import re
+import textwrap
 from concurrent.futures import ThreadPoolExecutor
-from datetime import datetime
 from pathlib import Path
 from typing import Any, Dict, List, Tuple, Union

-try:
-    from typing import LiteralString  # Python 3.11+
-except ImportError:
+try:  # Python 3.11+
+    from typing import LiteralString
+except ImportError:  # 3.9–3.10
     from typing_extensions import LiteralString

 import numpy as np
 import pdfplumber
-import validators
+from docx import Document
+from pptx import Presentation
 from projectdavid_common import UtilsInterface
-from sentence_transformers import SentenceTransformer

 log = UtilsInterface.LoggingUtility()


 class FileProcessor:
+    # ------------------------------------------------------------------ #
+    # Construction
+    # ------------------------------------------------------------------ #
     def __init__(self, max_workers: int = 4, chunk_size: int = 512):
-        self.embedding_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
         self.embedding_model_name = "paraphrase-MiniLM-L6-v2"
+        self._embedding_model = None
         self._executor = ThreadPoolExecutor(max_workers=max_workers)

-        # compute token limits
-        self.max_seq_length = self.embedding_model.get_max_seq_length()
-        self.special_tokens_count = 2
-        self.effective_max_length = self.max_seq_length - self.special_tokens_count
+        # Lazy-initialized attributes
+        self._requested_chunk_size = chunk_size
+        self._max_seq_length = None
+        self._effective_max_length = None
+        self._chunk_size = None

-        # chunk_size cannot exceed 4× model max
-        self.chunk_size = min(chunk_size, self.effective_max_length * 4)
+        log.info("Initialized Lazy-Loaded FileProcessor")

-        log.info("Initialized optimized FileProcessor")
+    def _ensure_model(self):
+        """
+        Internal helper to load the model and calculate limits only once.
+        This prevents heavy imports (scipy, torch) until actually needed.
+        """
+        if self._embedding_model is None:
+            try:
+                from sentence_transformers import SentenceTransformer
+
+                log.info(f"Lazy-loading model: {self.embedding_model_name}")
+
+                self._embedding_model = SentenceTransformer(self.embedding_model_name)
+
+                # Ported limit calculations
+                self._max_seq_length = self._embedding_model.get_max_seq_length()
+                special_tokens_count = 2
+                self._effective_max_length = self._max_seq_length - special_tokens_count
+                self._chunk_size = min(
+                    self._requested_chunk_size, self._effective_max_length * 4
+                )
+
+            except ImportError:
+                log.error(
+                    "sentence-transformers not found. Ensure 'pip install projectdavid[vision]' is installed."
+                )
+                raise ImportError(
+                    "Model-based features require 'sentence-transformers'. Install with [vision] extra."
+                )
+        return self._embedding_model
+
+    # Properties to maintain access to derived attributes
+    @property
+    def chunk_size(self):
+        if self._chunk_size is None:
+            self._ensure_model()
+        return self._chunk_size
+
+    @property
+    def effective_max_length(self):
+        if self._effective_max_length is None:
+            self._ensure_model()
+        return self._effective_max_length
+
+    # ------------------------------------------------------------------ #
+    # Embeddings
+    # ------------------------------------------------------------------ #
+    def encode_text(self, text: str):
+        model = self._ensure_model()
+        return model.encode(
+            [text],
+            convert_to_numpy=True,
+            truncate="model_max_length",
+            normalize_embeddings=True,
+        )[0]
+
+    async def _encode_chunk_async(self, chunk: str) -> np.ndarray:
+        model = self._ensure_model()
+        return await asyncio.get_event_loop().run_in_executor(
+            self._executor,
+            lambda: model.encode(
+                [chunk],
+                convert_to_numpy=True,
+                truncate="model_max_length",
+                normalize_embeddings=True,
+                show_progress_bar=False,
+            )[0],
+        )

+    # ------------------------------------------------------------------ #
+    # Generic validators
+    # ------------------------------------------------------------------ #
     def validate_file(self, file_path: Path):
-        """Ensure file exists and is under 100 MB."""
+        """Ensure file exists and is under 100 MB."""
         max_size = 100 * 1024 * 1024
         if not file_path.exists():
             raise FileNotFoundError(f"File not found: {file_path}")
         if file_path.stat().st_size > max_size:
             mb = max_size // (1024 * 1024)
-            raise ValueError(f"{file_path.name} > {mb} MB limit")
+            raise ValueError(f"{file_path.name} > {mb} MB limit")

+    # ------------------------------------------------------------------ #
+    # File-type detection
+    # ------------------------------------------------------------------ #
     def _detect_file_type(self, file_path: Path) -> str:
-        """Return 'pdf', 'text', or 'csv'."""
         suffix = file_path.suffix.lower()
         if suffix == ".pdf":
             return "pdf"
         if suffix == ".csv":
             return "csv"
-        if suffix in {".txt", ".md", ".rst"}:
+        if suffix == ".json":
+            return "json"
+        if suffix in {".doc", ".docx", ".pptx"}:
+            return "office"
+
+        text_exts = {
+            ".txt",
+            ".md",
+            ".rst",
+            ".c",
+            ".cpp",
+            ".cs",
+            ".go",
+            ".java",
+            ".js",
+            ".ts",
+            ".php",
+            ".py",
+            ".rb",
+            ".sh",
+            ".tex",
+            ".html",
+            ".css",
+        }
+        if suffix in text_exts:
             return "text"
-        return "unknown"
+        raise ValueError(f"Unsupported file type: {file_path.name} (ext={suffix})")

+    # ------------------------------------------------------------------ #
+    # Public entry-point
+    # ------------------------------------------------------------------ #
     async def process_file(self, file_path: Union[str, Path]) -> Dict[str, Any]:
-        """
-        Async entrypoint: validate, detect type, then dispatch to the
-        appropriate processor (_process_pdf, _process_text, or _process_csv).
-        """
+        """Validate → detect → dispatch to the appropriate processor."""
         file_path = Path(file_path)
         self.validate_file(file_path)
         ftype = self._detect_file_type(file_path)

-        if ftype == "pdf":
-            return await self._process_pdf(file_path)
-        if ftype == "text":
-            return await self._process_text(file_path)
-        if ftype == "csv":
-            return await self._process_csv(file_path)
-        raise ValueError(f"Unsupported extension: {file_path.suffix}")
+        dispatch_map = {
+            "pdf": self._process_pdf,
+            "text": self._process_text,
+            "csv": self._process_csv,
+            "office": self._process_office,
+            "json": self._process_json,
+        }
+        if ftype not in dispatch_map:
+            raise ValueError(f"Unsupported file type: {file_path.suffix}")

-    # ——— PDF / TEXT pipelines unchanged ——— #
+        return await dispatch_map[ftype](file_path)

+    # ------------------------------------------------------------------ #
+    # PDF
+    # ------------------------------------------------------------------ #
     async def _process_pdf(self, file_path: Path) -> Dict[str, Any]:
         page_chunks, doc_meta = await self._extract_text(file_path)
         all_chunks, line_data = [], []
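
The `_ensure_model` / property pattern above defers the `sentence_transformers` import (and its torch/scipy dependency chain) until the first call that actually needs the model. A minimal standalone sketch of the same pattern (class and names illustrative, not part of the package):

```python
class LazyEncoder:
    """Defers a heavy dependency until the first call that needs it."""

    def __init__(self, model_name: str = "paraphrase-MiniLM-L6-v2"):
        self.model_name = model_name
        self._model = None  # nothing heavy imported yet

    def _ensure(self):
        if self._model is None:
            # Import inside the method so importing this module stays cheap.
            from sentence_transformers import SentenceTransformer
            self._model = SentenceTransformer(self.model_name)
        return self._model

    def encode(self, text: str):
        return self._ensure().encode([text], convert_to_numpy=True)[0]


encoder = LazyEncoder()        # instant: no torch import yet
vec = encoder.encode("hello")  # first call pays the model-load cost
```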
@@ -82,7 +186,6 @@ class FileProcessor:
         for page_text, page_num, line_nums in page_chunks:
             lines = page_text.split("\n")
             buf, buf_lines, length = [], [], 0
-
             for line, ln in zip(lines, line_nums):
                 l = len(line) + 1
                 if length + l <= self.chunk_size:
@@ -94,12 +197,9 @@ class FileProcessor:
                     all_chunks.append("\n".join(buf))
                     line_data.append({"page": page_num, "lines": buf_lines})
                     buf, buf_lines, length = [], [], 0
-
-                    # split any oversized line
                     for piece in self._split_oversized_chunk(line):
                         all_chunks.append(piece)
                         line_data.append({"page": page_num, "lines": [ln]})
-
             if buf:
                 all_chunks.append("\n".join(buf))
                 line_data.append({"page": page_num, "lines": buf_lines})
@@ -107,7 +207,6 @@ class FileProcessor:
         vectors = await asyncio.gather(
            *[self._encode_chunk_async(c) for c in all_chunks]
         )
-
         return {
             "content": "\n\n".join(all_chunks),
             "metadata": {
@@ -121,6 +220,9 @@ class FileProcessor:
             "line_data": line_data,
         }

+    # ------------------------------------------------------------------ #
+    # Plain-text / code / markup
+    # ------------------------------------------------------------------ #
     async def _process_text(self, file_path: Path) -> Dict[str, Any]:
         text, extra_meta, _ = await self._extract_text(file_path)
         chunks = self._chunk_text(text)
@@ -137,17 +239,13 @@ class FileProcessor:
             "vectors": [v.tolist() for v in vectors],
         }

-    # ——— NEW: CSV pipeline ——— #
-
+    # ------------------------------------------------------------------ #
+    # CSV
+    # ------------------------------------------------------------------ #
     async def _process_csv(
         self, file_path: Path, text_field: str = "description"
     ) -> Dict[str, Any]:
-        """
-        Read each row, embed the `text_field`, and collect per-row metadata
-        from all other columns.
-        """
-        # load rows synchronously
-        rows, texts, metas = [], [], []
+        texts, metas = [], []
         with file_path.open(newline="", encoding="utf-8") as f:
             reader = csv.DictReader(f)
             for row in reader:
@@ -155,27 +253,67 @@ class FileProcessor:
                 if not txt:
                     continue
                 texts.append(txt)
-                # all other columns become metadata
-                row_meta = {k: v for k, v in row.items() if k != text_field and v}
-                metas.append(row_meta)
+                metas.append({k: v for k, v in row.items() if k != text_field and v})

-        # embed in parallel
         vectors = await asyncio.gather(*[self._encode_chunk_async(t) for t in texts])
+        return {
+            "content": None,
+            "metadata": {"source": str(file_path), "rows": len(texts), "type": "csv"},
+            "chunks": texts,
+            "vectors": [v.tolist() for v in vectors],
+            "csv_row_metadata": metas,
+        }
+
+    # ------------------------------------------------------------------ #
+    # Office docs (.doc/.docx/.pptx)
+    # ------------------------------------------------------------------ #
+    async def _process_office(self, file_path: Path) -> Dict[str, Any]:
+        loop = asyncio.get_event_loop()
+        if file_path.suffix.lower() in {".doc", ".docx"}:
+            text = await loop.run_in_executor(
+                self._executor, self._read_docx, file_path
+            )
+        else:  # .pptx
+            text = await loop.run_in_executor(
+                self._executor, self._read_pptx, file_path
+            )

+        chunks = self._chunk_text(text)
+        vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
         return {
-            "content": None,  # CSVs may not have monolithic text
+            "content": text,
             "metadata": {
                 "source": str(file_path),
-                "rows": len(texts),
-                "type": "csv",
+                "chunks": len(chunks),
+                "type": "office",
             },
-            "chunks": texts,
+            "chunks": chunks,
             "vectors": [v.tolist() for v in vectors],
-            "csv_row_metadata": metas,
         }

-    # ——— shared helpers ——— #
+    # ------------------------------------------------------------------ #
+    # JSON
+    # ------------------------------------------------------------------ #
+    async def _process_json(self, file_path: Path) -> Dict[str, Any]:
+        text = await asyncio.get_event_loop().run_in_executor(
+            self._executor, self._read_json, file_path
+        )
+        chunks = self._chunk_text(text)
+        vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
+        return {
+            "content": text,
+            "metadata": {
+                "source": str(file_path),
+                "chunks": len(chunks),
+                "type": "json",
+            },
+            "chunks": chunks,
+            "vectors": [v.tolist() for v in vectors],
+        }

+    # ------------------------------------------------------------------ #
+    # Shared helpers
+    # ------------------------------------------------------------------ #
     async def _extract_text(self, file_path: Path) -> Union[
         Tuple[List[Tuple[str, int, List[int]]], Dict[str, Any]],
         Tuple[str, Dict[str, Any], List[int]],
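
Each processor returns a dict with `content`, `metadata`, `chunks`, and `vectors`; CSV additionally carries `csv_row_metadata` and leaves `content` as `None`. A usage sketch (file name illustrative):

```python
import asyncio

fp = FileProcessor(max_workers=4, chunk_size=512)
result = asyncio.run(fp.process_file("notes.md"))

print(result["metadata"])      # keys vary by type: source/type plus chunks or rows
print(len(result["chunks"]))   # text chunks, aligned one-to-one with...
print(len(result["vectors"]))  # ...their embedding vectors
```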
@@ -203,10 +341,8 @@ class FileProcessor:
                 )
                 for i, page in enumerate(pdf.pages, start=1):
                     lines = page.extract_text_lines()
-                    txts, nums = [], []
-                    # sort by vertical position
                     sorted_lines = sorted(lines, key=lambda x: x["top"])
-                    # enumerate to get a reliable line number
+                    txts, nums = [], []
                     for ln_idx, L in enumerate(sorted_lines, start=1):
                         t = L.get("text", "").strip()
                         if t:
@@ -222,23 +358,29 @@ class FileProcessor:
         except UnicodeDecodeError:
             return file_path.read_text(encoding="latin-1")

-    async def _encode_chunk_async(self, chunk: str) -> np.ndarray:
-        return await asyncio.get_event_loop().run_in_executor(
-            self._executor,
-            lambda: self.embedding_model.encode(
-                [chunk],
-                convert_to_numpy=True,
-                truncate="model_max_length",
-                normalize_embeddings=True,
-                show_progress_bar=False,
-            )[0],
-        )
-
+    def _read_docx(self, path: Path) -> str:
+        doc = Document(path)
+        return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
+
+    def _read_pptx(self, path: Path) -> str:
+        prs = Presentation(path)
+        slides = []
+        for slide in prs.slides:
+            chunks = [sh.text for sh in slide.shapes if hasattr(sh, "text")]
+            slides.append("\n".join(filter(None, chunks)))
+        return "\n\n".join(slides)
+
+    def _read_json(self, path: Path) -> str:
+        obj = json.loads(path.read_text(encoding="utf-8"))
+        pretty = json.dumps(obj, indent=2, ensure_ascii=False)
+        return "\n".join(textwrap.wrap(pretty, width=120))
+
+    # ------------------------------------------------------------------ #
+    # Text chunking helpers
+    # ------------------------------------------------------------------ #
     def _chunk_text(self, text: str) -> List[str]:
-        # split into sentences, then re-chunk to token limits
         sentences = re.split(r"(?<=[\.!?])\s+", text)
         chunks, buf, length = [], [], 0
-
         for sent in sentences:
             slen = len(sent) + 1
             if length + slen <= self.chunk_size:
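
One subtlety in `_read_json` above: `textwrap.wrap` treats the pretty-printed JSON as prose, replacing each newline with a space before wrapping, so the two-space indentation survives only as extra spaces inside the re-flowed lines. A small demonstration:

```python
import json
import textwrap

obj = {"title": "example", "tags": ["a", "b"]}
pretty = json.dumps(obj, indent=2, ensure_ascii=False)

# wrap() re-flows the text: newlines become spaces, and lines are
# broken at the 40-column budget rather than at JSON structure.
wrapped = "\n".join(textwrap.wrap(pretty, width=40))
print(wrapped)
```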
@@ -248,22 +390,20 @@ class FileProcessor:
                 if buf:
                     chunks.append(" ".join(buf))
                     buf, length = [], 0
-                # sentence itself may be too big
                 while len(sent) > self.chunk_size:
                     part, sent = sent[: self.chunk_size], sent[self.chunk_size :]
                     chunks.append(part)
                 buf, length = [sent], len(sent)
-
         if buf:
             chunks.append(" ".join(buf))
-
         return chunks

     def _split_oversized_chunk(self, chunk: str, tokens: List[str] = None) -> List[str]:
+        model = self._ensure_model()  # Ensure model is loaded to access tokenizer
         if tokens is None:
-            tokens = self.embedding_model.tokenizer.tokenize(chunk)
+            tokens = model.tokenizer.tokenize(chunk)
         out = []
         for i in range(0, len(tokens), self.effective_max_length):
             seg = tokens[i : i + self.effective_max_length]
-            out.append(self.embedding_model.tokenizer.convert_tokens_to_string(seg))
+            out.append(model.tokenizer.convert_tokens_to_string(seg))
         return out
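
`_chunk_text` greedily packs whole sentences up to `chunk_size` characters and hard-splits any single sentence that exceeds the budget on its own. A standalone sketch of the same greedy algorithm, simplified to a pure character budget (no model or tokenizer):

```python
import re


def chunk_text(text: str, chunk_size: int = 40) -> list:
    sentences = re.split(r"(?<=[\.!?])\s+", text)
    chunks, buf, length = [], [], 0
    for sent in sentences:
        slen = len(sent) + 1
        if length + slen <= chunk_size:
            buf.append(sent)
            length += slen
        else:
            if buf:
                chunks.append(" ".join(buf))
                buf, length = [], 0
            # A single sentence longer than the budget is hard-split.
            while len(sent) > chunk_size:
                part, sent = sent[:chunk_size], sent[chunk_size:]
                chunks.append(part)
            buf, length = [sent], len(sent)
    if buf:
        chunks.append(" ".join(buf))
    return chunks


print(chunk_text("One. Two is longer. Three!", chunk_size=20))
# ['One. Two is longer.', 'Three!']
```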
@@ -153,18 +153,21 @@ class MessagesClient(BaseAPIClient):
             raise

     def list_messages(
-        self, thread_id: str, limit: int = 20, order: str = "asc"
-    ) -> List[Dict[str, Any]]:
+        self,
+        thread_id: str,
+        limit: int = 20,
+        order: str = "asc",
+    ) -> ent_validator.MessagesList:
         """
-        List messages for a given thread.
+        Fetch messages for a thread and return an OpenAI-style envelope.

         Args:
-            thread_id (str): The thread ID.
-            limit (int): Maximum number of messages to retrieve.
-            order (str): Order of messages ('asc' or 'desc').
+            thread_id (str): Target thread ID.
+            limit (int): Max messages to fetch.
+            order (str): 'asc' or 'desc'.

         Returns:
-            List[Dict[str, Any]]: A list of messages as dictionaries.
+            MessagesList: Wrapper containing .data[], .first_id, .last_id, .has_more …
         """
         logging_utility.info(
             "Listing messages for thread_id: %s, limit: %d, order: %s",
@@ -178,24 +181,19 @@
                 f"/v1/threads/{thread_id}/messages", params=params
             )
             response.raise_for_status()
-            messages = response.json()
-            validated_messages = [
-                ent_validator.MessageRead(**message) for message in messages
-            ]
-            logging_utility.info("Retrieved %d messages", len(validated_messages))
-            return [message.dict() for message in validated_messages]
+
+            envelope = ent_validator.MessagesList(**response.json())
+            logging_utility.info("Retrieved %d messages", len(envelope.data))
+            return envelope
+
         except ValidationError as e:
             logging_utility.error("Validation error: %s", e.json())
-            raise ValueError(f"Validation error: {e}")
+            raise ValueError(f"Validation error: {e}") from e
         except httpx.HTTPStatusError as e:
-            logging_utility.error(
-                "HTTP error occurred while listing messages: %s", str(e)
-            )
+            logging_utility.error("HTTP error while listing messages: %s", str(e))
             raise
         except Exception as e:
-            logging_utility.error(
-                "An error occurred while listing messages: %s", str(e)
-            )
+            logging_utility.error("Unexpected error while listing messages: %s", str(e))
             raise

     def get_formatted_messages(
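
`list_messages` now returns a typed `MessagesList` envelope instead of a list of dicts. A consumption sketch, assuming a configured `MessagesClient` instance; the envelope field names come from the docstring above, and the per-message attributes are assumptions based on the removed `MessageRead` usage:

```python
envelope = client.list_messages(thread_id="thread_abc123", limit=20, order="asc")

for msg in envelope.data:      # validated message models, not raw dicts
    print(msg.id, msg.role)    # attribute names assumed, not shown in this diff

if envelope.has_more:          # cursor fields enable pagination
    print("continue after", envelope.last_id)
```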
@@ -294,32 +292,19 @@
             logging_utility.error("An error occurred: %s", str(e))
             raise RuntimeError(f"An error occurred: {str(e)}")

-    def delete_message(self, message_id: str) -> Dict[str, Any]:
-        """
-        Delete a message by its ID.
-
-        Args:
-            message_id (str): The ID of the message.
-
-        Returns:
-            Dict[str, Any]: The deletion result.
-        """
+    def delete_message(self, message_id: str) -> ent_validator.MessageDeleted:
+        """Delete a message and return deletion envelope."""
         logging_utility.info("Deleting message with id: %s", message_id)
         try:
             response = self.client.delete(f"/v1/messages/{message_id}")
             response.raise_for_status()
-            result = response.json()
-            logging_utility.info("Message deleted successfully")
-            return result
+            return ent_validator.MessageDeleted(**response.json())
+
         except httpx.HTTPStatusError as e:
-            logging_utility.error(
-                "HTTP error occurred while deleting message: %s", str(e)
-            )
+            logging_utility.error("HTTP error while deleting message: %s", str(e))
             raise
         except Exception as e:
-            logging_utility.error(
-                "An error occurred while deleting message: %s", str(e)
-            )
+            logging_utility.error("Unexpected error while deleting message: %s", str(e))
             raise

     def save_assistant_message_chunk(
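
`delete_message` likewise swaps its raw-dict return for a typed `MessageDeleted` envelope. A sketch; the `deleted` flag follows the OpenAI-style deletion convention and is an assumption, since the model definition is not part of this diff:

```python
result = client.delete_message("msg_abc123")

# MessageDeleted is a Pydantic model, not a plain dict; the exact fields
# live in projectdavid's ent_validator schemas (assumed here).
print(result.id, result.deleted)
```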