projectdavid-1.30.4-py3-none-any.whl → projectdavid-1.31.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of projectdavid might be problematic.
- projectdavid/clients/file_processor.py +150 -54
- projectdavid/clients/vectors.py +61 -61
- {projectdavid-1.30.4.dist-info → projectdavid-1.31.1.dist-info}/METADATA +3 -1
- {projectdavid-1.30.4.dist-info → projectdavid-1.31.1.dist-info}/RECORD +7 -7
- {projectdavid-1.30.4.dist-info → projectdavid-1.31.1.dist-info}/WHEEL +0 -0
- {projectdavid-1.30.4.dist-info → projectdavid-1.31.1.dist-info}/licenses/LICENSE +0 -0
- {projectdavid-1.30.4.dist-info → projectdavid-1.31.1.dist-info}/top_level.txt +0 -0
projectdavid/clients/file_processor.py CHANGED
@@ -1,19 +1,21 @@
 import asyncio
 import csv
+import json
 import re
+import textwrap
 from concurrent.futures import ThreadPoolExecutor
-from datetime import datetime
 from pathlib import Path
 from typing import Any, Dict, List, Tuple, Union
 
-try:
-    from typing import LiteralString
-except ImportError:
+try:  # Python 3.11+
+    from typing import LiteralString
+except ImportError:  # 3.9–3.10
     from typing_extensions import LiteralString
 
 import numpy as np
 import pdfplumber
-import
+from docx import Document
+from pptx import Presentation
 from projectdavid_common import UtilsInterface
 from sentence_transformers import SentenceTransformer
 
@@ -21,60 +23,106 @@ log = UtilsInterface.LoggingUtility()
 
 
 class FileProcessor:
+    # ------------------------------------------------------------------ #
+    # Construction
+    # ------------------------------------------------------------------ #
     def __init__(self, max_workers: int = 4, chunk_size: int = 512):
         self.embedding_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
         self.embedding_model_name = "paraphrase-MiniLM-L6-v2"
         self._executor = ThreadPoolExecutor(max_workers=max_workers)
 
-        #
+        # token limits
         self.max_seq_length = self.embedding_model.get_max_seq_length()
         self.special_tokens_count = 2
         self.effective_max_length = self.max_seq_length - self.special_tokens_count
-
-        # chunk_size cannot exceed 4× model max
         self.chunk_size = min(chunk_size, self.effective_max_length * 4)
 
         log.info("Initialized optimized FileProcessor")
 
+    # ------------------------------------------------------------------ #
+    # Generic validators
+    # ------------------------------------------------------------------ #
     def validate_file(self, file_path: Path):
-        """Ensure file exists and is under 100
+        """Ensure file exists and is under 100 MB."""
         max_size = 100 * 1024 * 1024
         if not file_path.exists():
             raise FileNotFoundError(f"File not found: {file_path}")
         if file_path.stat().st_size > max_size:
             mb = max_size // (1024 * 1024)
-            raise ValueError(f"{file_path.name} > {mb}
+            raise ValueError(f"{file_path.name} > {mb} MB limit")
 
+    # ------------------------------------------------------------------ #
+    # File-type detection (simple extension map – NO libmagic)
+    # ------------------------------------------------------------------ #
     def _detect_file_type(self, file_path: Path) -> str:
-        """
+        """
+        Return one of:
+
+          • 'pdf'    • 'csv'    • 'json'
+          • 'office' (.doc/.docx/.pptx)
+          • 'text'   (code / markup / plain text)
+
+        Raises *ValueError* if the extension is not recognised.
+        """
         suffix = file_path.suffix.lower()
+
         if suffix == ".pdf":
             return "pdf"
         if suffix == ".csv":
             return "csv"
-        if suffix
+        if suffix == ".json":
+            return "json"
+        if suffix in {".doc", ".docx", ".pptx"}:
+            return "office"
+
+        text_exts = {
+            ".txt",
+            ".md",
+            ".rst",
+            ".c",
+            ".cpp",
+            ".cs",
+            ".go",
+            ".java",
+            ".js",
+            ".ts",
+            ".php",
+            ".py",
+            ".rb",
+            ".sh",
+            ".tex",
+            ".html",
+            ".css",
+        }
+        if suffix in text_exts:
             return "text"
-        return "unknown"
 
+        raise ValueError(f"Unsupported file type: {file_path.name} (ext={suffix})")
+
+    # ------------------------------------------------------------------ #
+    # Public entry-point
+    # ------------------------------------------------------------------ #
     async def process_file(self, file_path: Union[str, Path]) -> Dict[str, Any]:
-        """
-        Async entrypoint: validate, detect type, then dispatch to the
-        appropriate processor (_process_pdf, _process_text, or _process_csv).
-        """
+        """Validate → detect → dispatch to the appropriate processor."""
        file_path = Path(file_path)
         self.validate_file(file_path)
         ftype = self._detect_file_type(file_path)
 
-
-
-
-
-
-
-
+        dispatch_map = {
+            "pdf": self._process_pdf,
+            "text": self._process_text,
+            "csv": self._process_csv,
+            "office": self._process_office,
+            "json": self._process_json,
+        }
+        if ftype not in dispatch_map:
+            raise ValueError(f"Unsupported file type: {file_path.suffix}")
 
-
+        return await dispatch_map[ftype](file_path)
 
+    # ------------------------------------------------------------------ #
+    # PDF
+    # ------------------------------------------------------------------ #
     async def _process_pdf(self, file_path: Path) -> Dict[str, Any]:
         page_chunks, doc_meta = await self._extract_text(file_path)
         all_chunks, line_data = [], []
@@ -82,7 +130,6 @@ class FileProcessor:
         for page_text, page_num, line_nums in page_chunks:
             lines = page_text.split("\n")
             buf, buf_lines, length = [], [], 0
-
             for line, ln in zip(lines, line_nums):
                 l = len(line) + 1
                 if length + l <= self.chunk_size:
@@ -94,12 +141,9 @@ class FileProcessor:
                         all_chunks.append("\n".join(buf))
                         line_data.append({"page": page_num, "lines": buf_lines})
                         buf, buf_lines, length = [], [], 0
-
-                    # split any oversized line
                     for piece in self._split_oversized_chunk(line):
                         all_chunks.append(piece)
                         line_data.append({"page": page_num, "lines": [ln]})
-
             if buf:
                 all_chunks.append("\n".join(buf))
                 line_data.append({"page": page_num, "lines": buf_lines})
@@ -107,7 +151,6 @@ class FileProcessor:
         vectors = await asyncio.gather(
             *[self._encode_chunk_async(c) for c in all_chunks]
         )
-
         return {
             "content": "\n\n".join(all_chunks),
             "metadata": {
@@ -121,6 +164,9 @@ class FileProcessor:
                 "line_data": line_data,
         }
 
+    # ------------------------------------------------------------------ #
+    # Plain-text / code / markup
+    # ------------------------------------------------------------------ #
     async def _process_text(self, file_path: Path) -> Dict[str, Any]:
         text, extra_meta, _ = await self._extract_text(file_path)
         chunks = self._chunk_text(text)
@@ -137,15 +183,12 @@ class FileProcessor:
             "vectors": [v.tolist() for v in vectors],
         }
 
-    #
+    # ------------------------------------------------------------------ #
+    # CSV
+    # ------------------------------------------------------------------ #
     async def _process_csv(
         self, file_path: Path, text_field: str = "description"
     ) -> Dict[str, Any]:
-        """
-        Read each row, embed the `text_field`, and collect per-row metadata
-        from all other columns.
-        """
-        # load rows synchronously
         rows, texts, metas = [], [], []
         with file_path.open(newline="", encoding="utf-8") as f:
             reader = csv.DictReader(f)
@@ -154,27 +197,67 @@ class FileProcessor:
                 if not txt:
                     continue
                 texts.append(txt)
-
-                row_meta = {k: v for k, v in row.items() if k != text_field and v}
-                metas.append(row_meta)
+                metas.append({k: v for k, v in row.items() if k != text_field and v})
 
-        # embed in parallel
         vectors = await asyncio.gather(*[self._encode_chunk_async(t) for t in texts])
+        return {
+            "content": None,
+            "metadata": {"source": str(file_path), "rows": len(texts), "type": "csv"},
+            "chunks": texts,
+            "vectors": [v.tolist() for v in vectors],
+            "csv_row_metadata": metas,
+        }
+
+    # ------------------------------------------------------------------ #
+    # Office docs (.doc/.docx/.pptx)
+    # ------------------------------------------------------------------ #
+    async def _process_office(self, file_path: Path) -> Dict[str, Any]:
+        loop = asyncio.get_event_loop()
+        if file_path.suffix.lower() in {".doc", ".docx"}:
+            text = await loop.run_in_executor(
+                self._executor, self._read_docx, file_path
+            )
+        else:  # .pptx
+            text = await loop.run_in_executor(
+                self._executor, self._read_pptx, file_path
+            )
 
+        chunks = self._chunk_text(text)
+        vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
         return {
-            "content":
+            "content": text,
             "metadata": {
                 "source": str(file_path),
-                "
-                "type": "
+                "chunks": len(chunks),
+                "type": "office",
             },
-            "chunks":
+            "chunks": chunks,
             "vectors": [v.tolist() for v in vectors],
-            "csv_row_metadata": metas,
         }
 
-    #
+    # ------------------------------------------------------------------ #
+    # JSON
+    # ------------------------------------------------------------------ #
+    async def _process_json(self, file_path: Path) -> Dict[str, Any]:
+        text = await asyncio.get_event_loop().run_in_executor(
+            self._executor, self._read_json, file_path
+        )
+        chunks = self._chunk_text(text)
+        vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
+        return {
+            "content": text,
+            "metadata": {
+                "source": str(file_path),
+                "chunks": len(chunks),
+                "type": "json",
+            },
+            "chunks": chunks,
+            "vectors": [v.tolist() for v in vectors],
+        }
 
+    # ------------------------------------------------------------------ #
+    # Shared helpers
+    # ------------------------------------------------------------------ #
     async def _extract_text(self, file_path: Path) -> Union[
         Tuple[List[Tuple[str, int, List[int]]], Dict[str, Any]],
         Tuple[str, Dict[str, Any], List[int]],
@@ -202,10 +285,8 @@ class FileProcessor:
             )
             for i, page in enumerate(pdf.pages, start=1):
                 lines = page.extract_text_lines()
-                txts, nums = [], []
-                # sort by vertical position
                 sorted_lines = sorted(lines, key=lambda x: x["top"])
-
+                txts, nums = [], []
                 for ln_idx, L in enumerate(sorted_lines, start=1):
                     t = L.get("text", "").strip()
                     if t:
@@ -221,6 +302,23 @@ class FileProcessor:
         except UnicodeDecodeError:
             return file_path.read_text(encoding="latin-1")
 
+    def _read_docx(self, path: Path) -> str:
+        doc = Document(path)
+        return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
+
+    def _read_pptx(self, path: Path) -> str:
+        prs = Presentation(path)
+        slides = []
+        for slide in prs.slides:
+            chunks = [sh.text for sh in slide.shapes if hasattr(sh, "text")]
+            slides.append("\n".join(filter(None, chunks)))
+        return "\n\n".join(slides)
+
+    def _read_json(self, path: Path) -> str:
+        obj = json.loads(path.read_text(encoding="utf-8"))
+        pretty = json.dumps(obj, indent=2, ensure_ascii=False)
+        return "\n".join(textwrap.wrap(pretty, width=120))
+
     async def _encode_chunk_async(self, chunk: str) -> np.ndarray:
         return await asyncio.get_event_loop().run_in_executor(
             self._executor,
@@ -233,11 +331,12 @@ class FileProcessor:
             )[0],
         )
 
+    # ------------------------------------------------------------------ #
+    # Text chunking helpers
+    # ------------------------------------------------------------------ #
     def _chunk_text(self, text: str) -> List[str]:
-        # split into sentences, then re-chunk to token limits
        sentences = re.split(r"(?<=[\.!?])\s+", text)
         chunks, buf, length = [], [], 0
-
         for sent in sentences:
             slen = len(sent) + 1
             if length + slen <= self.chunk_size:
@@ -247,15 +346,12 @@ class FileProcessor:
                 if buf:
                     chunks.append(" ".join(buf))
                     buf, length = [], 0
-                # sentence itself may be too big
                 while len(sent) > self.chunk_size:
                     part, sent = sent[: self.chunk_size], sent[self.chunk_size :]
                     chunks.append(part)
                 buf, length = [sent], len(sent)
-
         if buf:
             chunks.append(" ".join(buf))
-
         return chunks
 
     def _split_oversized_chunk(self, chunk: str, tokens: List[str] = None) -> List[str]:
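With the new dispatch map, process_file is a single entry point for all five supported types. A minimal usage sketch (the import path follows the package layout in RECORD below; the file name is hypothetical, and the new python-docx/python-pptx dependencies must be installed):

    import asyncio

    from projectdavid.clients.file_processor import FileProcessor

    async def main() -> None:
        processor = FileProcessor(max_workers=4, chunk_size=512)
        # .docx now routes through _process_office; an unrecognised extension
        # raises ValueError instead of falling through as "unknown".
        result = await processor.process_file("quarterly_report.docx")
        print(result["metadata"])       # {'source': ..., 'chunks': ..., 'type': 'office'}
        print(len(result["vectors"]))   # one embedding vector per chunk

    asyncio.run(main())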
projectdavid/clients/vectors.py CHANGED
@@ -561,6 +561,67 @@ class VectorStoreClient:
             raise FileNotFoundError(f"File not found: {p}")
         return self._run_sync(self._add_file_async(vector_store_id, p, user_metadata))
 
+    def delete_vector_store(
+        self,
+        vector_store_id: str,
+        permanent: bool = False,
+    ) -> Dict[str, Any]:
+        return self._run_sync(self._delete_vs_async(vector_store_id, permanent))
+
+    def delete_file_from_vector_store(
+        self,
+        vector_store_id: str,
+        file_path: str,
+    ) -> Dict[str, Any]:
+        return self._run_sync(self._delete_file_async(vector_store_id, file_path))
+
+    def list_store_files(
+        self,
+        vector_store_id: str,
+    ) -> List[ValidationInterface.VectorStoreFileRead]:
+        return self._run_sync(self._list_store_files_async(vector_store_id))
+
+    def update_vector_store_file_status(
+        self,
+        vector_store_id: str,
+        file_id: str,
+        status: ValidationInterface.StatusEnum,
+        error_message: Optional[str] = None,
+    ) -> ValidationInterface.VectorStoreFileRead:
+        return self._run_sync(
+            self._update_file_status_async(
+                vector_store_id, file_id, status, error_message
+            )
+        )
+
+    def get_vector_stores_for_assistant(
+        self,
+        assistant_id: str,
+    ) -> List[ValidationInterface.VectorStoreRead]:
+        return self._run_sync(self._get_assistant_vs_async(assistant_id))
+
+    def attach_vector_store_to_assistant(
+        self,
+        vector_store_id: str,
+        assistant_id: str,
+    ) -> bool:
+        return self._run_sync(self._attach_vs_async(vector_store_id, assistant_id))
+
+    def detach_vector_store_from_assistant(
+        self,
+        vector_store_id: str,
+        assistant_id: str,
+    ) -> bool:
+        return self._run_sync(self._detach_vs_async(vector_store_id, assistant_id))
+
+    def retrieve_vector_store_sync(
+        self,
+        vector_store_id: str,
+    ) -> ValidationInterface.VectorStoreRead:
+        resp = self._sync_api_client.get(f"/v1/vector-stores/{vector_store_id}")
+        resp.raise_for_status()
+        return ValidationInterface.VectorStoreRead.model_validate(resp.json())
+
     def vector_file_search_raw(
         self,
         vector_store_id: str,
@@ -627,67 +688,6 @@ class VectorStoreClient:
         # 4️⃣ Wrap everything into an OpenAI envelope
         return make_envelope(query_text, hits, answer_text)
 
-    def delete_vector_store(
-        self,
-        vector_store_id: str,
-        permanent: bool = False,
-    ) -> Dict[str, Any]:
-        return self._run_sync(self._delete_vs_async(vector_store_id, permanent))
-
-    def delete_file_from_vector_store(
-        self,
-        vector_store_id: str,
-        file_path: str,
-    ) -> Dict[str, Any]:
-        return self._run_sync(self._delete_file_async(vector_store_id, file_path))
-
-    def list_store_files(
-        self,
-        vector_store_id: str,
-    ) -> List[ValidationInterface.VectorStoreFileRead]:
-        return self._run_sync(self._list_store_files_async(vector_store_id))
-
-    def update_vector_store_file_status(
-        self,
-        vector_store_id: str,
-        file_id: str,
-        status: ValidationInterface.StatusEnum,
-        error_message: Optional[str] = None,
-    ) -> ValidationInterface.VectorStoreFileRead:
-        return self._run_sync(
-            self._update_file_status_async(
-                vector_store_id, file_id, status, error_message
-            )
-        )
-
-    def get_vector_stores_for_assistant(
-        self,
-        assistant_id: str,
-    ) -> List[ValidationInterface.VectorStoreRead]:
-        return self._run_sync(self._get_assistant_vs_async(assistant_id))
-
-    def attach_vector_store_to_assistant(
-        self,
-        vector_store_id: str,
-        assistant_id: str,
-    ) -> bool:
-        return self._run_sync(self._attach_vs_async(vector_store_id, assistant_id))
-
-    def detach_vector_store_from_assistant(
-        self,
-        vector_store_id: str,
-        assistant_id: str,
-    ) -> bool:
-        return self._run_sync(self._detach_vs_async(vector_store_id, assistant_id))
-
-    def retrieve_vector_store_sync(
-        self,
-        vector_store_id: str,
-    ) -> ValidationInterface.VectorStoreRead:
-        resp = self._sync_api_client.get(f"/v1/vector-stores/{vector_store_id}")
-        resp.raise_for_status()
-        return ValidationInterface.VectorStoreRead.model_validate(resp.json())
-
     # ────────────────────────────────────────────────────────────────
     # End‑to‑end: retrieve → (rerank) → synthesize → envelope
     # ────────────────────────────────────────────────────────────────
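The vectors.py change is pure code motion: the second hunk removes the identical sync-wrapper block from its old position below vector_file_search_raw, and each wrapper still delegates to its async counterpart via _run_sync. A hedged sketch of the wrapper surface from the caller's side (client construction and the IDs are assumed, not shown in this diff):

    # Assumes an already-configured VectorStoreClient instance named `client`
    # and illustrative IDs; only the method names and signatures come from the
    # diff above.
    store = client.retrieve_vector_store_sync("vs_123")

    client.attach_vector_store_to_assistant(
        vector_store_id=store.id,
        assistant_id="asst_456",
    )

    for f in client.list_store_files(store.id):
        print(f.id, f.status)

    client.delete_vector_store(store.id, permanent=False)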
{projectdavid-1.30.4.dist-info → projectdavid-1.31.1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: projectdavid
-Version: 1.30.4
+Version: 1.31.1
 Summary: Python SDK for interacting with the Entities Assistant API.
 Author-email: Francis Neequaye Armah <francis.neequaye@projectdavid.co.uk>
 License: PolyForm Noncommercial License 1.0.0
@@ -27,6 +27,8 @@ Requires-Dist: validators<0.35.0,>=0.29.0
 Requires-Dist: sentence-transformers<5.0,>=3.4.0
 Requires-Dist: sseclient-py
 Requires-Dist: requests
+Requires-Dist: python-docx
+Requires-Dist: python-pptx
 Provides-Extra: dev
 Requires-Dist: black>=23.3; extra == "dev"
 Requires-Dist: isort>=5.12; extra == "dev"
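The two new Requires-Dist lines pull in python-docx and python-pptx to back the Office readers added in file_processor.py. A quick way to confirm they are declared on an installed copy (stdlib only; assumes projectdavid 1.31.1 is installed):

    from importlib.metadata import requires, version

    # Prints the installed version and its Requires-Dist entries;
    # python-docx and python-pptx should appear for 1.31.1.
    print(version("projectdavid"))
    for req in requires("projectdavid") or []:
        print(req)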
{projectdavid-1.30.4.dist-info → projectdavid-1.31.1.dist-info}/RECORD CHANGED
@@ -9,7 +9,7 @@ projectdavid/clients/assistants_client.py,sha256=SsIGa5wPr7ga9WX0ywam3djUF-uWFdk
 projectdavid/clients/base_client.py,sha256=UWl6nr6sxD1_xC6iyptQDR1tnNdFCOrEx5cEUPCRqJE,3417
 projectdavid/clients/base_vector_store.py,sha256=jXivmqAW1bgYcLgIeW-hPxOiWZbs2hCsLy4oWzSvpNI,2061
 projectdavid/clients/event_handler.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-projectdavid/clients/file_processor.py,sha256=
+projectdavid/clients/file_processor.py,sha256=t-Uw-kBP_VmlguMxO9PWY6ANuMAY0PstQDW37wLPF0Q,13980
 projectdavid/clients/file_search.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 projectdavid/clients/files_client.py,sha256=XkIDzbQFGDrd88taf0Kouc_4YJOPIYEHiIyWYLKDofI,15581
 projectdavid/clients/inference_client.py,sha256=xz4ACPv5Tkis604QxO5mJX1inH_TGDfQP-31geETYpE,6609
@@ -20,7 +20,7 @@ projectdavid/clients/threads_client.py,sha256=ekzU5w14zftmtmFkiec3NC90Of-_KVSUY1
 projectdavid/clients/tools_client.py,sha256=GkCVOmwpAoPqVt6aYmH0G1HIFha3iEwR9IIf9teR0j8,11487
 projectdavid/clients/users_client.py,sha256=eCuUb9qvyH1GUFhZu6TRL9zdoK-qzHSs8-Vmrk_0mmg,13729
 projectdavid/clients/vector_store_manager.py,sha256=lk-sWJjo6Z0EHZzjRoKiHPr0GpEXfE4bJBQzmKV8ezc,11372
-projectdavid/clients/vectors.py,sha256=
+projectdavid/clients/vectors.py,sha256=1UNnLN5nsMvVHXK4Yf7iTXGWZfgIjQ9eLQtCBe0Cqew,30986
 projectdavid/constants/platform.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 projectdavid/services/logging_service.py,sha256=jdoRL46E42Ar8JFTDOV-xVD67CulcHSN-xhcEqA5CXQ,2643
 projectdavid/synthesis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -32,8 +32,8 @@ projectdavid/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuF
 projectdavid/utils/monitor_launcher.py,sha256=3YAgJdeuaUvq3JGvpA4ymqFsAnk29nH5q93cwStP4hc,2836
 projectdavid/utils/run_monitor.py,sha256=F_WkqIP-qnWH-4llIbileWWLfRj2Q1Cg-ni23SR1rec,3786
 projectdavid/utils/vector_search_formatter.py,sha256=YTe3HPGec26qGY7uxY8_GS8lc4QaN6aNXMzkl29nZpI,1735
-projectdavid-1.
-projectdavid-1.
-projectdavid-1.
-projectdavid-1.
-projectdavid-1.
+projectdavid-1.31.1.dist-info/licenses/LICENSE,sha256=_8yjiEGttpS284BkfhXxfERqTRZW_tUaHiBB0GTJTMg,4563
+projectdavid-1.31.1.dist-info/METADATA,sha256=f-SkJ06HipWaVJZ0W-bECBP7-2OjCNqTNc58kN7A0qw,10781
+projectdavid-1.31.1.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
+projectdavid-1.31.1.dist-info/top_level.txt,sha256=kil8GU4s7qYRfNnzGnFHhZnSNRSxgNG-J4HLgQMmMtw,13
+projectdavid-1.31.1.dist-info/RECORD,,
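Each RECORD row has the form path,sha256=<digest>,<size>, where the digest is the urlsafe-base64-encoded SHA-256 of the file with trailing "=" padding stripped (the standard wheel RECORD format). A small sketch for recomputing a row, e.g. to check the refreshed file_processor.py entry against a local checkout (the path is illustrative):

    import base64
    import hashlib
    from pathlib import Path

    def record_row(path: Path) -> str:
        """Rebuild a wheel RECORD row: urlsafe-b64 SHA-256 digest, no padding."""
        data = path.read_bytes()
        digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
        return f"{path},sha256={digest.decode()},{len(data)}"

    print(record_row(Path("projectdavid/clients/file_processor.py")))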
{projectdavid-1.30.4.dist-info → projectdavid-1.31.1.dist-info}/WHEEL: file without changes
{projectdavid-1.30.4.dist-info → projectdavid-1.31.1.dist-info}/licenses/LICENSE: file without changes
{projectdavid-1.30.4.dist-info → projectdavid-1.31.1.dist-info}/top_level.txt: file without changes