projectdavid 1.33.13__py3-none-any.whl → 1.33.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of projectdavid might be problematic.
- projectdavid/clients/file_processor.py +46 -232
- projectdavid/clients/vision-file_processor.py +438 -0
- {projectdavid-1.33.13.dist-info → projectdavid-1.33.14.dist-info}/METADATA +1 -1
- {projectdavid-1.33.13.dist-info → projectdavid-1.33.14.dist-info}/RECORD +7 -6
- {projectdavid-1.33.13.dist-info → projectdavid-1.33.14.dist-info}/WHEEL +0 -0
- {projectdavid-1.33.13.dist-info → projectdavid-1.33.14.dist-info}/licenses/LICENSE +0 -0
- {projectdavid-1.33.13.dist-info → projectdavid-1.33.14.dist-info}/top_level.txt +0 -0
projectdavid/clients/file_processor.py
@@ -1,8 +1,6 @@
 import asyncio
 import csv
-import hashlib
 import json
-import math
 import re
 import textwrap
 from concurrent.futures import ThreadPoolExecutor
@@ -15,124 +13,34 @@ except ImportError:  # 3.9–3.10
     from typing_extensions import LiteralString
 
 import numpy as np
-import open_clip
 import pdfplumber
-import torch
 from docx import Document
-from PIL import Image
 from pptx import Presentation
-from transformers import Blip2ForConditionalGeneration, Blip2Processor
-from ultralytics import YOLO
-
-# OCR fallback – optional
-try:
-    import pytesseract  # noqa: F401  # pylint: disable=unused-import
-except ImportError:
-    pytesseract = None
-
 from projectdavid_common import UtilsInterface
 from sentence_transformers import SentenceTransformer
 
 log = UtilsInterface.LoggingUtility()
 
 
-def latlon_to_unit_vec(lat: float, lon: float) -> List[float]:
-    """Convert geographic lat/lon (deg) to a 3-D unit vector for Qdrant."""
-    lat_r = math.radians(lat)
-    lon_r = math.radians(lon)
-    return [
-        math.cos(lat_r) * math.cos(lon_r),
-        math.cos(lat_r) * math.sin(lon_r),
-        math.sin(lat_r),
-    ]
-
-
 class FileProcessor:
-    """Unified processor for text, tabular, office, JSON, **and image** files.
-
-    Each modality is embedded with its optimal model:
-      • Text → paraphrase‑MiniLM‑L6‑v2 (384‑D)
-      • Image → OpenCLIP ViT‑H/14 (1024‑D)
-      • Caption→ OpenCLIP text head (1024‑D)
-
-    Rich captions are generated via BLIP‑2 Flan‑T5‑XL.
-    GPU usage is optional; pass `use_gpu=False` to stay on CPU.
-    """
-
     # ------------------------------------------------------------------ #
     # Construction
     # ------------------------------------------------------------------ #
-    def __init__(
-        self
-        *,
-        max_workers: int = 4,
-        chunk_size: int = 512,
-        use_gpu: bool = True,
-        use_ocr: bool = True,
-        use_detection: bool = False,
-        image_model_name: str = "ViT-H-14",
-        caption_model_name: str = "Salesforce/blip2-flan-t5-xl",
-    ):
-        # Device selection
-        if use_gpu and torch.cuda.is_available():
-            self.device = torch.device("cuda")
-            self.torch_dtype = torch.float16
-        else:
-            self.device = torch.device("cpu")
-            self.torch_dtype = torch.float32
-
-        # Feature flags
-        self.use_ocr = use_ocr and pytesseract is not None
-        self.use_detection = use_detection
-        if use_ocr and pytesseract is None:
-            log.warning("OCR requested but pytesseract not installed – skipping.")
-        if self.use_detection:
-            self.detector = YOLO("yolov8x.pt").to(self.device)
-
-        # Text embedder
+    def __init__(self, max_workers: int = 4, chunk_size: int = 512):
+        self.embedding_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
         self.embedding_model_name = "paraphrase-MiniLM-L6-v2"
-        self.
-        self.embedding_model.to(str(self.device))
+        self._executor = ThreadPoolExecutor(max_workers=max_workers)
 
-        #
+        # token limits
         self.max_seq_length = self.embedding_model.get_max_seq_length()
         self.special_tokens_count = 2
         self.effective_max_length = self.max_seq_length - self.special_tokens_count
         self.chunk_size = min(chunk_size, self.effective_max_length * 4)
 
-
-        self.clip_model, _, self.clip_preprocess = (
-            open_clip.create_model_and_transforms(
-                image_model_name,
-                pretrained="laion2b_s32b_b79k",
-                precision="fp16" if self.device.type == "cuda" else "fp32",
-            )
-        )
-        self.clip_model = self.clip_model.to(self.device).eval()
-        self.clip_tokenizer = open_clip.get_tokenizer(image_model_name)
-
-        # Caption generator
-        self.blip_processor = Blip2Processor.from_pretrained(caption_model_name)
-        self.blip_model = (
-            Blip2ForConditionalGeneration.from_pretrained(
-                caption_model_name,
-                torch_dtype=self.torch_dtype,
-            )
-            .to(self.device)
-            .eval()
-        )
-
-        # Executor & logging
-        self._executor = ThreadPoolExecutor(max_workers=max_workers)
-        log.info(
-            "FileProcessor ready (device=%s, OCR=%s, detection=%s)",
-            self.device,
-            self.use_ocr,
-            self.use_detection,
-        )
+        log.info("Initialized optimized FileProcessor")
 
     # ------------------------------------------------------------------ #
-    # Generic validators
+    # Generic validators
     # ------------------------------------------------------------------ #
     def validate_file(self, file_path: Path):
         """Ensure file exists and is under 100 MB."""
@@ -144,10 +52,20 @@ class FileProcessor:
             raise ValueError(f"{file_path.name} > {mb} MB limit")
 
     # ------------------------------------------------------------------ #
-    # File
+    # File-type detection (simple extension map – NO libmagic)
     # ------------------------------------------------------------------ #
    def _detect_file_type(self, file_path: Path) -> str:
+        """
+        Return one of:
+
+          • 'pdf'  • 'csv'  • 'json'
+          • 'office'  (.doc/.docx/.pptx)
+          • 'text'  (code / markup / plain text)
+
+        Raises *ValueError* if the extension is not recognised.
+        """
         suffix = file_path.suffix.lower()
+
         if suffix == ".pdf":
             return "pdf"
         if suffix == ".csv":
@@ -156,8 +74,7 @@ class FileProcessor:
             return "json"
         if suffix in {".doc", ".docx", ".pptx"}:
             return "office"
-
-        return "image"
+
         text_exts = {
             ".txt",
             ".md",
@@ -179,100 +96,29 @@ class FileProcessor:
         }
         if suffix in text_exts:
             return "text"
+
         raise ValueError(f"Unsupported file type: {file_path.name} (ext={suffix})")
 
     # ------------------------------------------------------------------ #
-    #
+    # Public entry-point
     # ------------------------------------------------------------------ #
     async def process_file(self, file_path: Union[str, Path]) -> Dict[str, Any]:
-
-
-
-
-
-
-
-
-
-
-
-
-        # 1) Image vector
-        def enc_img():
-            with torch.no_grad():
-                t = self.clip_preprocess(img).unsqueeze(0).to(self.device)
-                v = self.clip_model.encode_image(t).squeeze()
-                return (v / v.norm()).float().cpu().numpy()
-
-        image_vec = await loop.run_in_executor(self._executor, enc_img)
-
-        # 2) Caption
-        def gen_cap():
-            inp = self.blip_processor(images=img, return_tensors="pt").to(self.device)
-            with torch.no_grad():
-                ids = self.blip_model.generate(**inp, max_new_tokens=50)
-            return self.blip_processor.decode(ids[0], skip_special_tokens=True)
-
-        caption = await loop.run_in_executor(self._executor, gen_cap)
-
-        # 3) OCR
-        if self.use_ocr:
-            text = await loop.run_in_executor(
-                self._executor, pytesseract.image_to_string, img
-            )
-            if t := text.strip():
-                caption += "\n" + t
-
-        # 4) Caption vector
-        def enc_txt():
-            with torch.no_grad():
-                tok = self.clip_tokenizer(caption).unsqueeze(0).to(self.device)
-                v = self.clip_model.encode_text(tok).squeeze()
-                return (v / v.norm()).float().cpu().numpy()
-
-        caption_vec = await loop.run_in_executor(self._executor, enc_txt)
-
-        # 5) YOLO regions
-        region_vectors = []
-        if self.use_detection:
-            dets = self.detector(img)[0]
-            for box in dets.boxes:
-                x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().tolist())
-                crop = img.crop((x1, y1, x2, y2))
-                vec = self.encode_image(crop)
-                region_vectors.append(
-                    {
-                        "vector": vec.tolist(),
-                        "bbox": [x1, y1, x2, y2],
-                        "label": dets.names[int(box.cls)],
-                        "conf": float(box.conf),
-                    }
-                )
-
-        # Metadata
-        sha = hashlib.sha256(file_path.read_bytes()).hexdigest()
-        w, h = img.size
-        meta = {
-            "source": str(file_path),
-            "type": "image",
-            "width": w,
-            "height": h,
-            "mime": f"image/{file_path.suffix.lstrip('.')}",
-            "sha256": sha,
-            "embedding_model": "openclip-vit-h-14",
-            "caption": caption,
+        """Validate → detect → dispatch to the appropriate processor."""
+        file_path = Path(file_path)
+        self.validate_file(file_path)
+        ftype = self._detect_file_type(file_path)
+
+        dispatch_map = {
+            "pdf": self._process_pdf,
+            "text": self._process_text,
+            "csv": self._process_csv,
+            "office": self._process_office,
+            "json": self._process_json,
         }
+        if ftype not in dispatch_map:
+            raise ValueError(f"Unsupported file type: {file_path.suffix}")
 
-
-        "content": None,
-        "metadata": meta,
-        "chunks": [caption],
-        "vectors": [image_vec.tolist()],
-        "caption_vector": caption_vec.tolist(),
-        }
-        if region_vectors:
-            result["region_vectors"] = region_vectors
-        return result
+        return await dispatch_map[ftype](file_path)
 
     # ------------------------------------------------------------------ #
     # PDF
@@ -280,6 +126,7 @@ class FileProcessor:
     async def _process_pdf(self, file_path: Path) -> Dict[str, Any]:
         page_chunks, doc_meta = await self._extract_text(file_path)
         all_chunks, line_data = [], []
+
         for page_text, page_num, line_nums in page_chunks:
             lines = page_text.split("\n")
             buf, buf_lines, length = [], [], 0
@@ -318,7 +165,7 @@ class FileProcessor:
         }
 
     # ------------------------------------------------------------------ #
-    # Plain
+    # Plain-text / code / markup
     # ------------------------------------------------------------------ #
     async def _process_text(self, file_path: Path) -> Dict[str, Any]:
         text, extra_meta, _ = await self._extract_text(file_path)
@@ -351,6 +198,7 @@ class FileProcessor:
                     continue
                 texts.append(txt)
                 metas.append({k: v for k, v in row.items() if k != text_field and v})
+
         vectors = await asyncio.gather(*[self._encode_chunk_async(t) for t in texts])
         return {
             "content": None,
@@ -361,7 +209,7 @@ class FileProcessor:
         }
 
     # ------------------------------------------------------------------ #
-    # Office docs
+    # Office docs (.doc/.docx/.pptx)
     # ------------------------------------------------------------------ #
     async def _process_office(self, file_path: Path) -> Dict[str, Any]:
         loop = asyncio.get_event_loop()
@@ -369,10 +217,11 @@ class FileProcessor:
             text = await loop.run_in_executor(
                 self._executor, self._read_docx, file_path
             )
-        else:
+        else:  # .pptx
            text = await loop.run_in_executor(
                self._executor, self._read_pptx, file_path
            )
+
         chunks = self._chunk_text(text)
         vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
         return {
@@ -418,25 +267,11 @@ class FileProcessor:
             return await loop.run_in_executor(
                 self._executor, self._extract_pdf_text, file_path
             )
-
-
-
-        return text, {}, []
-
-    # ------------------------------------------------------------------ #
-    # util: clip‑text encoder (public)
-    # ------------------------------------------------------------------ #
-    def encode_clip_text(self, text: Union[str, List[str]]) -> np.ndarray:
-        with torch.no_grad():
-            toks = (
-                self.clip_tokenizer(text)
-                if isinstance(text, str)
-                else self.clip_tokenizer(text, truncate=True)
+        else:
+            text = await loop.run_in_executor(
+                self._executor, self._read_text_file, file_path
             )
-
-            feat = self.clip_model.encode_text(tensor).squeeze()
-            feat = feat / feat.norm()
-            return feat.float().cpu().numpy()
+        return text, {}, []
 
     def _extract_pdf_text(self, file_path: Path):
         page_chunks, meta = [], {}
@@ -452,8 +287,8 @@ class FileProcessor:
                 lines = page.extract_text_lines()
                 sorted_lines = sorted(lines, key=lambda x: x["top"])
                 txts, nums = [], []
-                for ln_idx,
-                t =
+                for ln_idx, L in enumerate(sorted_lines, start=1):
+                    t = L.get("text", "").strip()
                     if t:
                         txts.append(t)
                         nums.append(ln_idx)
@@ -527,24 +362,3 @@ class FileProcessor:
             seg = tokens[i : i + self.effective_max_length]
             out.append(self.embedding_model.tokenizer.convert_tokens_to_string(seg))
         return out
-
-    # ------------------------------------------------------------------ #
-    # Retrieval helpers (optional use)
-    # ------------------------------------------------------------------ #
-    def encode_text(self, text: Union[str, List[str]]) -> np.ndarray:
-        """Embed raw text with the SentenceTransformer model."""
-        single = isinstance(text, str)
-        out = self.embedding_model.encode(
-            text,
-            convert_to_numpy=True,
-            normalize_embeddings=True,
-            show_progress_bar=False,
-        )
-        return out if not single else out[0]
-
-    def encode_image(self, img: Image.Image) -> np.ndarray:
-        with torch.no_grad():
-            tensor = self.clip_preprocess(img).unsqueeze(0).to(self.device)
-            feat = self.clip_model.encode_image(tensor).squeeze()
-            feat = feat / feat.norm()
-            return feat.float().cpu().numpy()
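Aside: the shape of the change above is easier to see in isolation. 1.33.14 strips the CLIP/BLIP-2/YOLO image pipeline out of file_processor.py and reduces process_file to a dispatch table keyed on the detected file type. Below is a minimal, self-contained sketch of that pattern; the class and handler names are illustrative only, not the package's code.

import asyncio
from pathlib import Path

class MiniDispatcher:
    # Hypothetical stand-ins for the package's _process_* coroutines.
    async def _process_pdf(self, path: Path) -> dict:
        return {"type": "pdf", "source": str(path)}

    async def _process_text(self, path: Path) -> dict:
        return {"type": "text", "source": str(path)}

    async def process_file(self, file_path) -> dict:
        path = Path(file_path)
        # Map the extension to a handler, then await exactly one of them.
        dispatch_map = {".pdf": self._process_pdf, ".txt": self._process_text}
        handler = dispatch_map.get(path.suffix.lower())
        if handler is None:
            raise ValueError(f"Unsupported file type: {path.suffix}")
        return await handler(path)

print(asyncio.run(MiniDispatcher().process_file("notes.txt")))

As in the new process_file, unsupported extensions fail fast before any I/O or embedding work is attempted.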
projectdavid/clients/vision-file_processor.py
@@ -0,0 +1,438 @@
+import asyncio
+import csv
+import hashlib
+import json
+import math
+import re
+import textwrap
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+from typing import Any, Dict, List, Tuple, Union
+
+try:  # Python 3.11+
+    from typing import LiteralString
+except ImportError:  # 3.9–3.10
+    from typing_extensions import LiteralString
+
+import numpy as np
+import open_clip
+import pdfplumber
+import torch
+from docx import Document
+from PIL import Image
+from pptx import Presentation
+from projectdavid_common import UtilsInterface
+from sentence_transformers import SentenceTransformer
+
+# from transformers import Blip2ForConditionalGeneration, Blip2Processor
+
+# from ultralytics import YOLO
+
+# OCR fallback – optional
+# try:
+#     import pytesseract  # noqa: F401  # pylint: disable=unused-import
+# except ImportError:
+#     pytesseract = None
+
+
+log = UtilsInterface.LoggingUtility()
+
+
+def latlon_to_unit_vec(lat: float, lon: float) -> List[float]:
+    """Convert geographic lat/lon (deg) to a 3-D unit vector for Qdrant."""
+    lat_r = math.radians(lat)
+    lon_r = math.radians(lon)
+    return [
+        math.cos(lat_r) * math.cos(lon_r),
+        math.cos(lat_r) * math.sin(lon_r),
+        math.sin(lat_r),
+    ]
+
+
+class FileProcessor:
+    """Unified processor for text, tabular, office, JSON, **and image** files.
+
+    Each modality is embedded with its optimal model:
+      • Text → paraphrase‑MiniLM‑L6‑v2 (384‑D)
+      • Image → OpenCLIP ViT‑H/14 (1024‑D)
+      • Caption→ OpenCLIP text head (1024‑D)
+
+    Rich captions are generated via BLIP‑2 Flan‑T5‑XL.
+    GPU usage is optional; pass `use_gpu=False` to stay on CPU.
+    """
+
+    # ------------------------------------------------------------------ #
+    # Construction
+    # ------------------------------------------------------------------ #
+    def __init__(
+        self,
+        *,
+        max_workers: int = 4,
+        chunk_size: int = 512,
+        use_gpu: bool = True,
+        use_ocr: bool = True,
+        use_detection: bool = False,
+        image_model_name: str = "ViT-H-14",
+        caption_model_name: str = "Salesforce/blip2-flan-t5-xl",
+    ):
+        # Device selection
+        if use_gpu and torch.cuda.is_available():
+            self.device = torch.device("cuda")
+            self.torch_dtype = torch.float16
+        else:
+            self.device = torch.device("cpu")
+            self.torch_dtype = torch.float32
+
+        # Text embedder
+        self.embedding_model_name = "paraphrase-MiniLM-L6-v2"
+        self.embedding_model = SentenceTransformer(self.embedding_model_name)
+        self.embedding_model.to(str(self.device))
+
+        # Chunking parameters
+        self.max_seq_length = self.embedding_model.get_max_seq_length()
+        self.special_tokens_count = 2
+        self.effective_max_length = self.max_seq_length - self.special_tokens_count
+        self.chunk_size = min(chunk_size, self.effective_max_length * 4)
+
+        # Executor & logging
+        self._executor = ThreadPoolExecutor(max_workers=max_workers)
+        log.info(
+            "FileProcessor ready (device=%s, OCR=%s, detection=%s)",
+            self.device,
+            # self.use_ocr,
+            # self.use_detection,
+        )
+
+    # ------------------------------------------------------------------ #
+    # Generic validators *
+    # ------------------------------------------------------------------ #
+    def validate_file(self, file_path: Path):
+        """Ensure file exists and is under 100 MB."""
+        max_size = 100 * 1024 * 1024
+        if not file_path.exists():
+            raise FileNotFoundError(f"File not found: {file_path}")
+        if file_path.stat().st_size > max_size:
+            mb = max_size // (1024 * 1024)
+            raise ValueError(f"{file_path.name} > {mb} MB limit")
+
+    # ------------------------------------------------------------------ #
+    # File‑type detection (extension‑based – no libmagic)
+    # ------------------------------------------------------------------ #
+    def _detect_file_type(self, file_path: Path) -> str:
+        suffix = file_path.suffix.lower()
+        if suffix == ".pdf":
+            return "pdf"
+        if suffix == ".csv":
+            return "csv"
+        if suffix == ".json":
+            return "json"
+        if suffix in {".doc", ".docx", ".pptx"}:
+            return "office"
+        if suffix in {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff"}:
+            return "image"
+        text_exts = {
+            ".txt",
+            ".md",
+            ".rst",
+            ".c",
+            ".cpp",
+            ".cs",
+            ".go",
+            ".java",
+            ".js",
+            ".ts",
+            ".php",
+            ".py",
+            ".rb",
+            ".sh",
+            ".tex",
+            ".html",
+            ".css",
+        }
+        if suffix in text_exts:
+            return "text"
+        raise ValueError(f"Unsupported file type: {file_path.name} (ext={suffix})")
+
+    # ------------------------------------------------------------------ #
+    # Dispatcher
+    # ------------------------------------------------------------------ #
+    async def process_file(self, file_path: Union[str, Path]) -> Dict[str, Any]:
+        path = Path(file_path)
+        self.validate_file(path)
+        ftype = self._detect_file_type(path)
+        return await getattr(self, f"_process_{ftype}")(path)
+
+    # ------------------------------------------------------------------ #
+    # PDF
+    # ------------------------------------------------------------------ #
+    async def _process_pdf(self, file_path: Path) -> Dict[str, Any]:
+        page_chunks, doc_meta = await self._extract_text(file_path)
+        all_chunks, line_data = [], []
+        for page_text, page_num, line_nums in page_chunks:
+            lines = page_text.split("\n")
+            buf, buf_lines, length = [], [], 0
+            for line, ln in zip(lines, line_nums):
+                l = len(line) + 1
+                if length + l <= self.chunk_size:
+                    buf.append(line)
+                    buf_lines.append(ln)
+                    length += l
+                else:
+                    if buf:
+                        all_chunks.append("\n".join(buf))
+                        line_data.append({"page": page_num, "lines": buf_lines})
+                        buf, buf_lines, length = [], [], 0
+                    for piece in self._split_oversized_chunk(line):
+                        all_chunks.append(piece)
+                        line_data.append({"page": page_num, "lines": [ln]})
+            if buf:
+                all_chunks.append("\n".join(buf))
+                line_data.append({"page": page_num, "lines": buf_lines})
+
+        vectors = await asyncio.gather(
+            *[self._encode_chunk_async(c) for c in all_chunks]
+        )
+        return {
+            "content": "\n\n".join(all_chunks),
+            "metadata": {
+                **doc_meta,
+                "source": str(file_path),
+                "chunks": len(all_chunks),
+                "type": "pdf",
+            },
+            "chunks": all_chunks,
+            "vectors": [v.tolist() for v in vectors],
+            "line_data": line_data,
+        }
+
+    # ------------------------------------------------------------------ #
+    # Plain‑text / code / markup
+    # ------------------------------------------------------------------ #
+    async def _process_text(self, file_path: Path) -> Dict[str, Any]:
+        text, extra_meta, _ = await self._extract_text(file_path)
+        chunks = self._chunk_text(text)
+        vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
+        return {
+            "content": text,
+            "metadata": {
+                **extra_meta,
+                "source": str(file_path),
+                "chunks": len(chunks),
+                "type": "text",
+            },
+            "chunks": chunks,
+            "vectors": [v.tolist() for v in vectors],
+        }
+
+    # ------------------------------------------------------------------ #
+    # CSV
+    # ------------------------------------------------------------------ #
+    async def _process_csv(
+        self, file_path: Path, text_field: str = "description"
+    ) -> Dict[str, Any]:
+        rows, texts, metas = [], [], []
+        with file_path.open(newline="", encoding="utf-8") as f:
+            reader = csv.DictReader(f)
+            for row in reader:
+                txt = row.get(text_field, "").strip()
+                if not txt:
+                    continue
+                texts.append(txt)
+                metas.append({k: v for k, v in row.items() if k != text_field and v})
+        vectors = await asyncio.gather(*[self._encode_chunk_async(t) for t in texts])
+        return {
+            "content": None,
+            "metadata": {"source": str(file_path), "rows": len(texts), "type": "csv"},
+            "chunks": texts,
+            "vectors": [v.tolist() for v in vectors],
+            "csv_row_metadata": metas,
+        }
+
+    # ------------------------------------------------------------------ #
+    # Office docs
+    # ------------------------------------------------------------------ #
+    async def _process_office(self, file_path: Path) -> Dict[str, Any]:
+        loop = asyncio.get_event_loop()
+        if file_path.suffix.lower() in {".doc", ".docx"}:
+            text = await loop.run_in_executor(
+                self._executor, self._read_docx, file_path
+            )
+        else:
+            text = await loop.run_in_executor(
+                self._executor, self._read_pptx, file_path
+            )
+        chunks = self._chunk_text(text)
+        vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
+        return {
+            "content": text,
+            "metadata": {
+                "source": str(file_path),
+                "chunks": len(chunks),
+                "type": "office",
+            },
+            "chunks": chunks,
+            "vectors": [v.tolist() for v in vectors],
+        }
+
+    # ------------------------------------------------------------------ #
+    # JSON
+    # ------------------------------------------------------------------ #
+    async def _process_json(self, file_path: Path) -> Dict[str, Any]:
+        text = await asyncio.get_event_loop().run_in_executor(
+            self._executor, self._read_json, file_path
+        )
+        chunks = self._chunk_text(text)
+        vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
+        return {
+            "content": text,
+            "metadata": {
+                "source": str(file_path),
+                "chunks": len(chunks),
+                "type": "json",
+            },
+            "chunks": chunks,
+            "vectors": [v.tolist() for v in vectors],
+        }
+
+    # ------------------------------------------------------------------ #
+    # Shared helpers
+    # ------------------------------------------------------------------ #
+    async def _extract_text(self, file_path: Path) -> Union[
+        Tuple[List[Tuple[str, int, List[int]]], Dict[str, Any]],
+        Tuple[str, Dict[str, Any], List[int]],
+    ]:
+        loop = asyncio.get_event_loop()
+        if file_path.suffix.lower() == ".pdf":
+            return await loop.run_in_executor(
+                self._executor, self._extract_pdf_text, file_path
+            )
+        text = await loop.run_in_executor(
+            self._executor, self._read_text_file, file_path
+        )
+        return text, {}, []
+
+    # ------------------------------------------------------------------ #
+    # util: clip‑text encoder (public)
+    # ------------------------------------------------------------------ #
+    def encode_clip_text(self, text: Union[str, List[str]]) -> np.ndarray:
+        with torch.no_grad():
+            toks = (
+                self.clip_tokenizer(text)
+                if isinstance(text, str)
+                else self.clip_tokenizer(text, truncate=True)
+            )
+            tensor = toks.unsqueeze(0).to(self.device)
+            feat = self.clip_model.encode_text(tensor).squeeze()
+            feat = feat / feat.norm()
+            return feat.float().cpu().numpy()
+
+    def _extract_pdf_text(self, file_path: Path):
+        page_chunks, meta = [], {}
+        with pdfplumber.open(file_path) as pdf:
+            meta.update(
+                {
+                    "author": pdf.metadata.get("Author", ""),
+                    "title": pdf.metadata.get("Title", file_path.stem),
+                    "page_count": len(pdf.pages),
+                }
+            )
+            for i, page in enumerate(pdf.pages, start=1):
+                lines = page.extract_text_lines()
+                sorted_lines = sorted(lines, key=lambda x: x["top"])
+                txts, nums = [], []
+                for ln_idx, line in enumerate(sorted_lines, start=1):
+                    t = line.get("text", "").strip()
+                    if t:
+                        txts.append(t)
+                        nums.append(ln_idx)
+                if txts:
+                    page_chunks.append(("\n".join(txts), i, nums))
+        return page_chunks, meta
+
+    def _read_text_file(self, file_path: Path) -> str:
+        try:
+            return file_path.read_text(encoding="utf-8")
+        except UnicodeDecodeError:
+            return file_path.read_text(encoding="latin-1")
+
+    def _read_docx(self, path: Path) -> str:
+        doc = Document(path)
+        return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
+
+    def _read_pptx(self, path: Path) -> str:
+        prs = Presentation(path)
+        slides = []
+        for slide in prs.slides:
+            chunks = [sh.text for sh in slide.shapes if hasattr(sh, "text")]
+            slides.append("\n".join(filter(None, chunks)))
+        return "\n\n".join(slides)
+
+    def _read_json(self, path: Path) -> str:
+        obj = json.loads(path.read_text(encoding="utf-8"))
+        pretty = json.dumps(obj, indent=2, ensure_ascii=False)
+        return "\n".join(textwrap.wrap(pretty, width=120))
+
+    async def _encode_chunk_async(self, chunk: str) -> np.ndarray:
+        return await asyncio.get_event_loop().run_in_executor(
+            self._executor,
+            lambda: self.embedding_model.encode(
+                [chunk],
+                convert_to_numpy=True,
+                truncate="model_max_length",
+                normalize_embeddings=True,
+                show_progress_bar=False,
+            )[0],
+        )
+
+    # ------------------------------------------------------------------ #
+    # Text chunking helpers
+    # ------------------------------------------------------------------ #
+    def _chunk_text(self, text: str) -> List[str]:
+        sentences = re.split(r"(?<=[\.!?])\s+", text)
+        chunks, buf, length = [], [], 0
+        for sent in sentences:
+            slen = len(sent) + 1
+            if length + slen <= self.chunk_size:
+                buf.append(sent)
+                length += slen
+            else:
+                if buf:
+                    chunks.append(" ".join(buf))
+                    buf, length = [], 0
+                while len(sent) > self.chunk_size:
+                    part, sent = sent[: self.chunk_size], sent[self.chunk_size :]
+                    chunks.append(part)
+                buf, length = [sent], len(sent)
+        if buf:
+            chunks.append(" ".join(buf))
+        return chunks
+
+    def _split_oversized_chunk(self, chunk: str, tokens: List[str] = None) -> List[str]:
+        if tokens is None:
+            tokens = self.embedding_model.tokenizer.tokenize(chunk)
+        out = []
+        for i in range(0, len(tokens), self.effective_max_length):
+            seg = tokens[i : i + self.effective_max_length]
+            out.append(self.embedding_model.tokenizer.convert_tokens_to_string(seg))
+        return out
+
+    # ------------------------------------------------------------------ #
+    # Retrieval helpers (optional use)
+    # ------------------------------------------------------------------ #
+    def encode_text(self, text: Union[str, List[str]]) -> np.ndarray:
+        """Embed raw text with the SentenceTransformer model."""
+        single = isinstance(text, str)
+        out = self.embedding_model.encode(
+            text,
+            convert_to_numpy=True,
+            normalize_embeddings=True,
+            show_progress_bar=False,
+        )
+        return out if not single else out[0]
+
+    def encode_image(self, img: Image.Image) -> np.ndarray:
+        with torch.no_grad():
+            tensor = self.clip_preprocess(img).unsqueeze(0).to(self.device)
+            feat = self.clip_model.encode_image(tensor).squeeze()
+            feat = feat / feat.norm()
+            return feat.float().cpu().numpy()
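The latlon_to_unit_vec helper carried over into this new module is plain spherical geometry: since cos²(lat)·cos²(lon) + cos²(lat)·sin²(lon) + sin²(lat) = 1, the returned vector always has unit norm. A standalone check (the example coordinates are ours, not the package's):

import math

def latlon_to_unit_vec(lat: float, lon: float) -> list:
    # Same formula as the helper above: (cos·cos, cos·sin, sin).
    lat_r, lon_r = math.radians(lat), math.radians(lon)
    return [
        math.cos(lat_r) * math.cos(lon_r),
        math.cos(lat_r) * math.sin(lon_r),
        math.sin(lat_r),
    ]

v = latlon_to_unit_vec(51.5074, -0.1278)  # example: central London
assert abs(sum(c * c for c in v) - 1.0) < 1e-12  # unit length by the identity above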
{projectdavid-1.33.13.dist-info → projectdavid-1.33.14.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: projectdavid
-Version: 1.33.13
+Version: 1.33.14
 Summary: Python SDK for interacting with the Entities Assistant API.
 Author-email: Francis Neequaye Armah <francis.neequaye@projectdavid.co.uk>
 License: PolyForm Noncommercial License 1.0.0
{projectdavid-1.33.13.dist-info → projectdavid-1.33.14.dist-info}/RECORD
@@ -10,7 +10,7 @@ projectdavid/clients/assistants_client.py,sha256=SsIGa5wPr7ga9WX0ywam3djUF-uWFdk
 projectdavid/clients/base_client.py,sha256=UWl6nr6sxD1_xC6iyptQDR1tnNdFCOrEx5cEUPCRqJE,3417
 projectdavid/clients/base_vector_store.py,sha256=jXivmqAW1bgYcLgIeW-hPxOiWZbs2hCsLy4oWzSvpNI,2061
 projectdavid/clients/event_handler.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-projectdavid/clients/file_processor.py,sha256=
+projectdavid/clients/file_processor.py,sha256=t-Uw-kBP_VmlguMxO9PWY6ANuMAY0PstQDW37wLPF0Q,13980
 projectdavid/clients/file_search.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 projectdavid/clients/files_client.py,sha256=XkIDzbQFGDrd88taf0Kouc_4YJOPIYEHiIyWYLKDofI,15581
 projectdavid/clients/inference_client.py,sha256=xz4ACPv5Tkis604QxO5mJX1inH_TGDfQP-31geETYpE,6609
@@ -22,6 +22,7 @@ projectdavid/clients/tools_client.py,sha256=GkCVOmwpAoPqVt6aYmH0G1HIFha3iEwR9IIf
 projectdavid/clients/users_client.py,sha256=eCuUb9qvyH1GUFhZu6TRL9zdoK-qzHSs8-Vmrk_0mmg,13729
 projectdavid/clients/vector_store_manager.py,sha256=q-ZgRQVX_S3nMrKYhmvkVrDjDRzM3ZFzUF55HBGRTe8,12861
 projectdavid/clients/vectors.py,sha256=cysPVbUzW3byB82MTqG2X1Iz5ZAe82WTS1JfQcoqVhE,40229
+projectdavid/clients/vision-file_processor.py,sha256=19ft9IUeY5x9_22vC4JqndiFlpDYyUn6z1ygv-EV2NE,16852
 projectdavid/constants/platform.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 projectdavid/services/logging_service.py,sha256=jdoRL46E42Ar8JFTDOV-xVD67CulcHSN-xhcEqA5CXQ,2643
 projectdavid/synthesis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -35,8 +36,8 @@ projectdavid/utils/monitor_launcher.py,sha256=3YAgJdeuaUvq3JGvpA4ymqFsAnk29nH5q9
 projectdavid/utils/peek_gate.py,sha256=5whMRnDOQjATRpThWDJkvY9ScXuJ7Sd_-9rvGgXeTAQ,2532
 projectdavid/utils/run_monitor.py,sha256=F_WkqIP-qnWH-4llIbileWWLfRj2Q1Cg-ni23SR1rec,3786
 projectdavid/utils/vector_search_formatter.py,sha256=YTe3HPGec26qGY7uxY8_GS8lc4QaN6aNXMzkl29nZpI,1735
-projectdavid-1.33.
-projectdavid-1.33.
-projectdavid-1.33.
-projectdavid-1.33.
-projectdavid-1.33.
+projectdavid-1.33.14.dist-info/licenses/LICENSE,sha256=_8yjiEGttpS284BkfhXxfERqTRZW_tUaHiBB0GTJTMg,4563
+projectdavid-1.33.14.dist-info/METADATA,sha256=jFWdJGL8LYBQNEoEqBZ6DhLJ-HnVgLsvQ06K7PAkpRA,11555
+projectdavid-1.33.14.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+projectdavid-1.33.14.dist-info/top_level.txt,sha256=kil8GU4s7qYRfNnzGnFHhZnSNRSxgNG-J4HLgQMmMtw,13
+projectdavid-1.33.14.dist-info/RECORD,,
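For readers unfamiliar with RECORD files: per the wheel spec, each row above is path,sha256=<digest>,<size>, where the digest is the urlsafe-base64 SHA-256 of the file with trailing = padding stripped. A small sketch of recomputing such a row to verify a packaged file (hypothetical helper, not part of projectdavid):

import base64
import hashlib
from pathlib import Path

def record_entry(path: str) -> str:
    # RECORD digests use the urlsafe base64 alphabet with '=' padding removed.
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"{path},sha256={digest.decode()},{len(data)}"

print(record_entry("projectdavid/clients/file_processor.py"))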
File without changes: {projectdavid-1.33.13.dist-info → projectdavid-1.33.14.dist-info}/WHEEL
File without changes: {projectdavid-1.33.13.dist-info → projectdavid-1.33.14.dist-info}/licenses/LICENSE
File without changes: {projectdavid-1.33.13.dist-info → projectdavid-1.33.14.dist-info}/top_level.txt