projectdavid 1.29.9__py3-none-any.whl → 1.38.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- projectdavid/clients/assistants_client.py +7 -13
- projectdavid/clients/file_processor.py +216 -76
- projectdavid/clients/messages_client.py +24 -39
- projectdavid/clients/runs.py +156 -211
- projectdavid/clients/synchronous_inference_wrapper.py +52 -24
- projectdavid/clients/threads_client.py +32 -12
- projectdavid/clients/vector_store_manager.py +110 -21
- projectdavid/clients/vectors.py +250 -96
- projectdavid/clients/vision-file_processor.py +462 -0
- projectdavid/clients/vision_vectors.py +1058 -0
- projectdavid/decorators.py +64 -0
- projectdavid/entity.py +24 -5
- projectdavid/synthesis/reranker.py +4 -2
- projectdavid/utils/function_call_suppressor.py +40 -0
- {projectdavid-1.29.9.dist-info → projectdavid-1.38.1.dist-info}/METADATA +8 -6
- {projectdavid-1.29.9.dist-info → projectdavid-1.38.1.dist-info}/RECORD +19 -15
- {projectdavid-1.29.9.dist-info → projectdavid-1.38.1.dist-info}/WHEEL +1 -1
- {projectdavid-1.29.9.dist-info → projectdavid-1.38.1.dist-info}/licenses/LICENSE +0 -0
- {projectdavid-1.29.9.dist-info → projectdavid-1.38.1.dist-info}/top_level.txt +0 -0
projectdavid/clients/vectors.py
CHANGED
@@ -61,13 +61,16 @@ class VectorStoreClient:
     • create_vector_store() no longer takes user_id; ownership from token.
     """
 
-    #
+    # ------------------------------------------------------------------ #
+    # Construction / cleanup
+    # ------------------------------------------------------------------ #
     def __init__(
         self,
         base_url: Optional[str] = None,
         api_key: Optional[str] = None,
         *,
         vector_store_host: str = "localhost",
+        file_processor_kwargs: Optional[dict] = None,  # 🔶 add arg
     ):
         self.base_url = (base_url or os.getenv("BASE_URL", "")).rstrip("/")
         self.api_key = api_key or os.getenv("API_KEY")
@@ -84,9 +87,13 @@ class VectorStoreClient:
             base_url=self.base_url, headers=self._base_headers, timeout=30.0
         )
 
-        # Local helpers
+        # Local helpers ---------------------------------------------------
         self.vector_manager = VectorStoreManager(vector_store_host=vector_store_host)
         self.identifier_service = UtilsInterface.IdentifierService()
+
+        # 🔶 forward kwargs into the upgraded FileProcessor
+        # self.file_processor = FileProcessor(**(file_processor_kwargs or {}))
+        # Using Stripped down version for now until we move forward with multi-modal stores
        self.file_processor = FileProcessor()
 
         log.info("VectorStoreClient → %s", self.base_url)
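The constructor now accepts a `file_processor_kwargs` argument, but as the commented-out forwarding line above suggests, the kwargs are not yet used: `FileProcessor()` is still built with no arguments until the multi-modal stores land. A minimal usage sketch under that assumption; the default URL and the example kwarg are placeholders, not values taken from the package:

import os

from projectdavid.clients.vectors import VectorStoreClient

# Sketch only: base_url / api_key fall back to the BASE_URL and API_KEY
# environment variables, mirroring the __init__ body shown above.
client = VectorStoreClient(
    base_url=os.getenv("BASE_URL", "http://localhost:8000"),  # placeholder default
    api_key=os.getenv("API_KEY"),
    vector_store_host="localhost",
    # Accepted by the new signature but currently ignored by the client,
    # since the forwarding line is commented out in this release.
    file_processor_kwargs={"chunk_size": 512},  # hypothetical kwarg
)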
@@ -240,32 +247,31 @@ class VectorStoreClient:
     ) -> ValidationInterface.VectorStoreFileRead:
         processed = await self.file_processor.process_file(p)
         texts, vectors = processed["chunks"], processed["vectors"]
-        line_data = processed.get("line_data") or []
-
-        base_md = meta or {}
-        base_md.update({"source": str(p), "file_name": p.name})
+        line_data = processed.get("line_data") or []
 
+        base_md = (meta or {}) | {"source": str(p), "file_name": p.name}
         file_record_id = f"vsf_{uuid.uuid4()}"
 
-        # Build per‑chunk payload, now including page/lines if present
         chunk_md = []
-        for i in
-            payload = {
-
-
-                "file_id": file_record_id,
-            }
-            if i < len(line_data):  # ← NEW
-                payload.update(line_data[i])  # {'page': …, 'lines': …}
+        for i, txt in enumerate(texts):
+            payload = {**base_md, "chunk_index": i, "file_id": file_record_id}
+            if i < len(line_data):
+                payload.update(line_data[i])  # {'page':…, 'lines':…}
             chunk_md.append(payload)
 
+        # 🔑 1. look up the backend store to get its *collection* name
+        store = self.retrieve_vector_store_sync(vector_store_id)
+        collection_name = store.collection_name
+
+        # 🔑 2. upsert via VectorStoreManager (auto-detects vector field)
         self.vector_manager.add_to_store(
-            store_name=
+            store_name=collection_name,
             texts=texts,
             vectors=vectors,
             metadata=chunk_md,
         )
 
+        # 3. register the file with the API
         resp = await self._request(
             "POST",
             f"/v1/vector-stores/{vector_store_id}/files",
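The rewritten chunk loop merges the caller's metadata with the dict-union operator (`|`, which requires Python 3.9+) and attaches `page`/`lines` entries only for chunks the file processor reported them for. A standalone sketch of the same payload logic, with made-up inputs standing in for the FileProcessor output and caller metadata:

import uuid

# Made-up inputs: two text chunks, line data available for only the first.
texts = ["first chunk of text", "second chunk of text"]
line_data = [{"page": 1, "lines": [1, 14]}]
meta = {"author": "alice"}

base_md = (meta or {}) | {"source": "/tmp/report.pdf", "file_name": "report.pdf"}
file_record_id = f"vsf_{uuid.uuid4()}"

chunk_md = []
for i, txt in enumerate(texts):
    payload = {**base_md, "chunk_index": i, "file_id": file_record_id}
    if i < len(line_data):  # page/lines only where available
        payload.update(line_data[i])
    chunk_md.append(payload)

# chunk_md[0] carries page/lines; chunk_md[1] only the shared base metadata.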
@@ -287,26 +293,36 @@ class VectorStoreClient:
         filters: Optional[Dict] = None,
         vector_store_host: Optional[str] = None,
     ) -> List[Dict[str, Any]]:
-
-
-
-
-
+
+        # pick local vs. override host
+        vector_manager = (
+            VectorStoreManager(vector_store_host=vector_store_host)
+            if vector_store_host
+            else self.vector_manager
+        )
 
         store = self.retrieve_vector_store_sync(vector_store_id)
-
+
+        # 🔶 choose encoder by vector_size
+        if store.vector_size == 1024:  # images collection
+            vec = self.file_processor.encode_clip_text(query_text).tolist()
+            vector_field = "caption_vector"  # field name in Qdrant
+        else:  # 384-D text collection
+            vec = self.file_processor.encode_text(query_text).tolist()
+            vector_field = None  # default field
 
         return vector_manager.query_store(
             store_name=store.collection_name,
             query_vector=vec,
             top_k=top_k,
             filters=filters,
+            vector_field=vector_field,
         )
 
-    async def _delete_vs_async(
-
-
-        qres = self.vector_manager.delete_store(
+    async def _delete_vs_async(self, vector_store_id: str, permanent: bool):
+        # collection deletion must use the *collection* name
+        store = self.retrieve_vector_store_sync(vector_store_id)
+        qres = self.vector_manager.delete_store(store.collection_name)
         await self._request(
             "DELETE",
             f"/v1/vector-stores/{vector_store_id}",
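The query path now dispatches on the store's `vector_size`: 1024-dimensional stores are treated as image/caption collections and searched through a CLIP text embedding against the named `caption_vector` field, while everything else goes through the default 384-dimensional text encoder. A standalone sketch of that dispatch; `file_processor` and `store` stand in for the objects used in the hunk above, and this is not the package's own code:

from typing import Optional, Tuple

def pick_query_vector(file_processor, store, query_text: str) -> Tuple[list, Optional[str]]:
    """Mirror of the encoder dispatch shown in the diff (sketch only)."""
    if store.vector_size == 1024:  # images / caption collection
        vec = file_processor.encode_clip_text(query_text).tolist()
        vector_field = "caption_vector"  # named vector field in Qdrant
    else:  # 384-D text collection
        vec = file_processor.encode_text(query_text).tolist()
        vector_field = None  # unnamed/default vector
    return vec, vector_field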
@@ -319,10 +335,11 @@ class VectorStoreClient:
             "qdrant_result": qres,
         }
 
-    async def _delete_file_async(
-        self
-
-
+    async def _delete_file_async(self, vector_store_id: str, file_path: str):
+        store = self.retrieve_vector_store_sync(vector_store_id)
+        fres = self.vector_manager.delete_file_from_store(
+            store.collection_name, file_path
+        )
         await self._request(
             "DELETE",
             f"/v1/vector-stores/{vector_store_id}/files",
@@ -454,19 +471,101 @@ class VectorStoreClient:
             )
         )
 
+    # ───────────────────────────────────────────────────────────────
+    # Convenience: ensure a per-user “file_search” store exists
+    # ───────────────────────────────────────────────────────────────
+    # unchanged … (get_or_create_file_search_store)
+
     def list_my_vector_stores(self) -> List[ValidationInterface.VectorStoreRead]:
-        """List all non-deleted stores owned by
+        """List all non-deleted stores owned by *this* API-key’s user."""
         return self._run_sync(self._list_my_vs_async())
 
+    # ───────────────────────────────────────────────────────────────
+    # NEW: real per-user listing (admin-only)
+    # ───────────────────────────────────────────────────────────────
+    async def _list_vs_by_user_async(self, user_id: str):
+        resp = await self._request(
+            "GET",
+            "/v1/vector-stores/admin/by-user",
+            params={"owner_id": user_id},
+        )
+        return [ValidationInterface.VectorStoreRead.model_validate(r) for r in resp]
+
     def get_stores_by_user(
-        self,
+        self,
+        _user_id: str,
     ) -> List[ValidationInterface.VectorStoreRead]:  # noqa: ARG002
+        """
+        ⚠️ **Deprecated** – prefer impersonating the user’s API-key or using
+        the newer RBAC endpoints, but keep working for legacy code.
+        """
         warnings.warn(
-            "`get_stores_by_user()` is deprecated; use `list_my_vector_stores()
+            "`get_stores_by_user()` is deprecated; use `list_my_vector_stores()` or "
+            "`VectorStoreClient(list_my_vector_stores)` with an impersonated key.",
            DeprecationWarning,
            stacklevel=2,
        )
-        return self.
+        return self._run_sync(self._list_vs_by_user_async(_user_id))
+
+    # ───────────────────────────────────────────────────────────────
+    # Convenience: ensure a per-user “file_search” store exists
+    # ───────────────────────────────────────────────────────────────
+    def get_or_create_file_search_store(self, user_id: Optional[str] = None) -> str:
+        """
+        Return the *oldest* vector-store named **file_search** for ``user_id``;
+        create one if none exist.
+
+        Parameters
+        ----------
+        user_id : Optional[str]
+            • If **None** → operate on *this* API-key’s stores
+            • If not None → *admin-only* – look up / create on behalf of ``user_id``
+
+        Returns
+        -------
+        str
+            The vector-store **id**.
+        """
+
+        # 1️⃣ Fetch candidate stores
+        if user_id is None:
+            # Normal user context – only see caller-owned stores
+            stores = self.list_my_vector_stores()
+        else:
+            # Admin context – may inspect another user’s stores
+            stores = self.get_stores_by_user(_user_id=user_id)
+
+        file_search_stores = [s for s in stores if s.name == "file_search"]
+
+        if file_search_stores:
+            # 2️⃣ Pick the *earliest* (oldest created_at) to keep things stable
+            chosen = min(
+                file_search_stores,
+                key=lambda s: (s.created_at or 0),
+            )
+            log.info(
+                "Re-using existing 'file_search' store %s for user %s",
+                chosen.id,
+                user_id or "<self>",
+            )
+            return chosen.id
+
+        # 3️⃣ Nothing found → create a fresh store
+        if user_id is None:
+            new_store = self.create_vector_store(name="file_search")
+        else:
+            # Requires admin API-key
+            new_store = self.create_vector_store_for_user(
+                owner_id=user_id,
+                name="file_search",
+            )
+
+        log.info(
+            "Created new 'file_search' store %s for user %s",
+            new_store.id,
+            user_id or "<self>",
+        )
+        return new_store.id
 
     def add_file_to_vector_store(
         self,
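A hedged usage sketch for the helpers added above, reusing the `client` instance from the earlier constructor sketch: calling `get_or_create_file_search_store()` with no argument reuses or creates the caller's own "file_search" store, while passing a `user_id` requires an admin key and routes through the deprecated `get_stores_by_user()` path. The user id below is a placeholder:

import warnings

# Caller-owned store: reuse the oldest "file_search" store or create one.
store_id = client.get_or_create_file_search_store()

# Admin context: act on behalf of another user (placeholder id).
other_store_id = client.get_or_create_file_search_store(user_id="user_123")

# The legacy listing still works but now emits a DeprecationWarning.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    stores = client.get_stores_by_user(_user_id="user_123")
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)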
@@ -479,6 +578,67 @@ class VectorStoreClient:
             raise FileNotFoundError(f"File not found: {p}")
         return self._run_sync(self._add_file_async(vector_store_id, p, user_metadata))
 
+    def delete_vector_store(
+        self,
+        vector_store_id: str,
+        permanent: bool = False,
+    ) -> Dict[str, Any]:
+        return self._run_sync(self._delete_vs_async(vector_store_id, permanent))
+
+    def delete_file_from_vector_store(
+        self,
+        vector_store_id: str,
+        file_path: str,
+    ) -> Dict[str, Any]:
+        return self._run_sync(self._delete_file_async(vector_store_id, file_path))
+
+    def list_store_files(
+        self,
+        vector_store_id: str,
+    ) -> List[ValidationInterface.VectorStoreFileRead]:
+        return self._run_sync(self._list_store_files_async(vector_store_id))
+
+    def update_vector_store_file_status(
+        self,
+        vector_store_id: str,
+        file_id: str,
+        status: ValidationInterface.StatusEnum,
+        error_message: Optional[str] = None,
+    ) -> ValidationInterface.VectorStoreFileRead:
+        return self._run_sync(
+            self._update_file_status_async(
+                vector_store_id, file_id, status, error_message
+            )
+        )
+
+    def get_vector_stores_for_assistant(
+        self,
+        assistant_id: str,
+    ) -> List[ValidationInterface.VectorStoreRead]:
+        return self._run_sync(self._get_assistant_vs_async(assistant_id))
+
+    def attach_vector_store_to_assistant(
+        self,
+        vector_store_id: str,
+        assistant_id: str,
+    ) -> bool:
+        return self._run_sync(self._attach_vs_async(vector_store_id, assistant_id))
+
+    def detach_vector_store_from_assistant(
+        self,
+        vector_store_id: str,
+        assistant_id: str,
+    ) -> bool:
+        return self._run_sync(self._detach_vs_async(vector_store_id, assistant_id))
+
+    def retrieve_vector_store_sync(
+        self,
+        vector_store_id: str,
+    ) -> ValidationInterface.VectorStoreRead:
+        resp = self._sync_api_client.get(f"/v1/vector-stores/{vector_store_id}")
+        resp.raise_for_status()
+        return ValidationInterface.VectorStoreRead.model_validate(resp.json())
+
     def vector_file_search_raw(
         self,
         vector_store_id: str,
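The synchronous wrappers above are relocated earlier in the file rather than newly written; the old copies are removed in the next hunk. A short sketch of the deletion flow they expose, which now resolves the backend collection name before touching Qdrant; the identifiers are placeholders and `client` is the instance from the earlier sketch:

# Inspect the files currently registered in a store (placeholder id).
files = client.list_store_files("vs_abc123")

# Remove one file's vectors and its API record.
client.delete_file_from_vector_store("vs_abc123", file_path="/tmp/report.pdf")

# Drop the store itself; the returned dict includes the Qdrant result
# under "qdrant_result", as shown in the _delete_vs_async hunk above.
result = client.delete_vector_store("vs_abc123", permanent=False)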
@@ -545,71 +705,10 @@ class VectorStoreClient:
         # 4️⃣ Wrap everything into an OpenAI envelope
         return make_envelope(query_text, hits, answer_text)
 
-    def delete_vector_store(
-        self,
-        vector_store_id: str,
-        permanent: bool = False,
-    ) -> Dict[str, Any]:
-        return self._run_sync(self._delete_vs_async(vector_store_id, permanent))
-
-    def delete_file_from_vector_store(
-        self,
-        vector_store_id: str,
-        file_path: str,
-    ) -> Dict[str, Any]:
-        return self._run_sync(self._delete_file_async(vector_store_id, file_path))
-
-    def list_store_files(
-        self,
-        vector_store_id: str,
-    ) -> List[ValidationInterface.VectorStoreFileRead]:
-        return self._run_sync(self._list_store_files_async(vector_store_id))
-
-    def update_vector_store_file_status(
-        self,
-        vector_store_id: str,
-        file_id: str,
-        status: ValidationInterface.StatusEnum,
-        error_message: Optional[str] = None,
-    ) -> ValidationInterface.VectorStoreFileRead:
-        return self._run_sync(
-            self._update_file_status_async(
-                vector_store_id, file_id, status, error_message
-            )
-        )
-
-    def get_vector_stores_for_assistant(
-        self,
-        assistant_id: str,
-    ) -> List[ValidationInterface.VectorStoreRead]:
-        return self._run_sync(self._get_assistant_vs_async(assistant_id))
-
-    def attach_vector_store_to_assistant(
-        self,
-        vector_store_id: str,
-        assistant_id: str,
-    ) -> bool:
-        return self._run_sync(self._attach_vs_async(vector_store_id, assistant_id))
-
-    def detach_vector_store_from_assistant(
-        self,
-        vector_store_id: str,
-        assistant_id: str,
-    ) -> bool:
-        return self._run_sync(self._detach_vs_async(vector_store_id, assistant_id))
-
-    def retrieve_vector_store_sync(
-        self,
-        vector_store_id: str,
-    ) -> ValidationInterface.VectorStoreRead:
-        resp = self._sync_api_client.get(f"/v1/vector-stores/{vector_store_id}")
-        resp.raise_for_status()
-        return ValidationInterface.VectorStoreRead.model_validate(resp.json())
-
     # ────────────────────────────────────────────────────────────────
     # End‑to‑end: retrieve → (rerank) → synthesize → envelope
     # ────────────────────────────────────────────────────────────────
-    def
+    def attended_file_search(
         self,
         vector_store_id: str,
         query_text: str,
@@ -659,3 +758,58 @@ class VectorStoreClient:
             base_url=self.base_url,  # Same backend
             provider_api_key=os.getenv("HYPERBOLIC_API_KEY"),  # Hyperbolic key
         )
+
+    # ────────────────────────────────────────────────────────────────
+    # End‑to‑end: retrieve → (rerank) → synthesize → envelope
+    # ────────────────────────────────────────────────────────────────
+    def unattended_file_search(
+        self,
+        vector_store_id: str,
+        query_text: str,
+        k: int = 20,
+        vector_store_host: Optional[str] = None,
+    ) -> Dict[str, Any]:
+        """
+        Perform a search over the file vector store and return normalized retrieval hits.
+
+        This method executes a bare search pipeline: it retrieves vector-based candidates
+        using semantic similarity, optionally applies reranking (e.g., cross-encoder or LLM-based),
+        and normalizes the result schema. It does not perform synthesis or construct an OpenAI-style envelope.
+
+        Use this when you want direct access to retrieved content for custom downstream handling,
+        logging, inspection, or separate orchestration logic.
+
+        Parameters
+        ----------
+        vector_store_id : str
+            The ID of the vector store to search within.
+        query_text : str
+            The user query in natural language.
+        k : int, optional
+            The number of top hits to retrieve (default is 20).
+        vector_store_host : Optional[str], optional
+            Optional override for the vector store host (e.g., when calling remote Qdrant).
+
+        Returns
+        -------
+        Dict[str, Any]
+            A normalized list of retrieval results (each with metadata and score),
+            without abstraction, synthesis, or formatting.
+        """
+
+        # 1️⃣ Retrieve initial candidates (now with optional vector_store_host passthrough)
+        hits = retriever.retrieve(
+            self,
+            vector_store_id=vector_store_id,
+            query=query_text,
+            k=k,
+            vector_store_host=vector_store_host,
+        )
+
+        # 2️⃣ Optional cross-encoder / LLM rerank
+        hits = reranker.rerank(query_text, hits, top_k=min(len(hits), 10))
+
+        # 3️⃣ Normalize schema (guarantee 'meta_data')
+        hits = self._normalise_hits(hits)
+
+        return hits
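`unattended_file_search()` stops after retrieval, optional rerank, and hit normalization, while `attended_file_search()` (the renamed method earlier in the diff) continues through synthesis and returns an OpenAI-style envelope. A hedged usage sketch; the ids and query are placeholders, `client` is the instance from the earlier sketch, and the hit fields accessed are only those the docstring promises (a score and `meta_data`):

# Raw hits for custom downstream handling.
hits = client.unattended_file_search(
    vector_store_id="vs_abc123",
    query_text="termination clauses in the services agreement",
    k=20,
)
for hit in hits:
    print(hit.get("score"), hit.get("meta_data"))

# Same retrieval/rerank pipeline, plus synthesis and an OpenAI-style envelope.
envelope = client.attended_file_search(
    vector_store_id="vs_abc123",
    query_text="termination clauses in the services agreement",
)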