projectdavid 1.32.20__py3-none-any.whl → 1.33.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of projectdavid might be problematic.

@@ -1,6 +1,8 @@
  import asyncio
  import csv
+ import hashlib
  import json
+ import math
  import re
  import textwrap
  from concurrent.futures import ThreadPoolExecutor
@@ -13,34 +15,124 @@ except ImportError: # 3.9–3.10
      from typing_extensions import LiteralString

  import numpy as np
+ import open_clip
  import pdfplumber
+ import torch
  from docx import Document
+ from PIL import Image
  from pptx import Presentation
+ from transformers import Blip2ForConditionalGeneration, Blip2Processor
+ from ultralytics import YOLO
+
+ # OCR fallback – optional
+ try:
+     import pytesseract  # noqa: F401  # pylint: disable=unused-import
+ except ImportError:
+     pytesseract = None
+
  from projectdavid_common import UtilsInterface
  from sentence_transformers import SentenceTransformer

  log = UtilsInterface.LoggingUtility()


+ def latlon_to_unit_vec(lat: float, lon: float) -> List[float]:
+     """Convert geographic lat/lon (deg) to a 3-D unit vector for Qdrant."""
+     lat_r = math.radians(lat)
+     lon_r = math.radians(lon)
+     return [
+         math.cos(lat_r) * math.cos(lon_r),
+         math.cos(lat_r) * math.sin(lon_r),
+         math.sin(lat_r),
+     ]
+
+
  class FileProcessor:
+     """Unified processor for text, tabular, office, JSON, **and image** files.
+
+     Each modality is embedded with its optimal model:
+         • Text    → paraphrase‑MiniLM‑L6‑v2 (384‑D)
+         • Image   → OpenCLIP ViT‑H/14 (1024‑D)
+         • Caption → OpenCLIP text head (1024‑D)
+
+     Rich captions are generated via BLIP‑2 Flan‑T5‑XL.
+     GPU usage is optional; pass `use_gpu=False` to stay on CPU.
+     """
+
      # ------------------------------------------------------------------ #
      # Construction
      # ------------------------------------------------------------------ #
-     def __init__(self, max_workers: int = 4, chunk_size: int = 512):
-         self.embedding_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
+     def __init__(
+         self,
+         *,
+         max_workers: int = 4,
+         chunk_size: int = 512,
+         use_gpu: bool = True,
+         use_ocr: bool = True,
+         use_detection: bool = False,
+         image_model_name: str = "ViT-H-14",
+         caption_model_name: str = "Salesforce/blip2-flan-t5-xl",
+     ):
+         # Device selection
+         if use_gpu and torch.cuda.is_available():
+             self.device = torch.device("cuda")
+             self.torch_dtype = torch.float16
+         else:
+             self.device = torch.device("cpu")
+             self.torch_dtype = torch.float32
+
+         # Feature flags
+         self.use_ocr = use_ocr and pytesseract is not None
+         self.use_detection = use_detection
+         if use_ocr and pytesseract is None:
+             log.warning("OCR requested but pytesseract not installed – skipping.")
+         if self.use_detection:
+             self.detector = YOLO("yolov8x.pt").to(self.device)
+
+         # Text embedder
          self.embedding_model_name = "paraphrase-MiniLM-L6-v2"
-         self._executor = ThreadPoolExecutor(max_workers=max_workers)
+         self.embedding_model = SentenceTransformer(self.embedding_model_name)
+         self.embedding_model.to(str(self.device))

-         # token limits
+         # Chunking parameters
          self.max_seq_length = self.embedding_model.get_max_seq_length()
          self.special_tokens_count = 2
          self.effective_max_length = self.max_seq_length - self.special_tokens_count
          self.chunk_size = min(chunk_size, self.effective_max_length * 4)

-         log.info("Initialized optimized FileProcessor")
+         # Image embedder
+         self.clip_model, _, self.clip_preprocess = (
+             open_clip.create_model_and_transforms(
+                 image_model_name,
+                 pretrained="laion2b_s32b_b79k",
+                 precision="fp16" if self.device.type == "cuda" else "fp32",
+             )
+         )
+         self.clip_model = self.clip_model.to(self.device).eval()
+         self.clip_tokenizer = open_clip.get_tokenizer(image_model_name)
+
+         # Caption generator
+         self.blip_processor = Blip2Processor.from_pretrained(caption_model_name)
+         self.blip_model = (
+             Blip2ForConditionalGeneration.from_pretrained(
+                 caption_model_name,
+                 torch_dtype=self.torch_dtype,
+             )
+             .to(self.device)
+             .eval()
+         )
+
+         # Executor & logging
+         self._executor = ThreadPoolExecutor(max_workers=max_workers)
+         log.info(
+             "FileProcessor ready (device=%s, OCR=%s, detection=%s)",
+             self.device,
+             self.use_ocr,
+             self.use_detection,
+         )

      # ------------------------------------------------------------------ #
-     # Generic validators
+     # Generic validators *
      # ------------------------------------------------------------------ #
      def validate_file(self, file_path: Path):
          """Ensure file exists and is under 100 MB."""
@@ -52,20 +144,10 @@ class FileProcessor:
              raise ValueError(f"{file_path.name} > {mb} MB limit")

      # ------------------------------------------------------------------ #
-     # File-type detection (simple extension map, NO libmagic)
+     # Filetype detection (extension‑based, no libmagic)
      # ------------------------------------------------------------------ #
      def _detect_file_type(self, file_path: Path) -> str:
-         """
-         Return one of:
-
-             • 'pdf'    • 'csv'    • 'json'
-             • 'office' (.doc/.docx/.pptx)
-             • 'text'   (code / markup / plain text)
-
-         Raises *ValueError* if the extension is not recognised.
-         """
          suffix = file_path.suffix.lower()
-
          if suffix == ".pdf":
              return "pdf"
          if suffix == ".csv":
@@ -74,7 +156,8 @@ class FileProcessor:
              return "json"
          if suffix in {".doc", ".docx", ".pptx"}:
              return "office"
-
+         if suffix in {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff"}:
+             return "image"
          text_exts = {
              ".txt",
              ".md",
@@ -96,29 +179,100 @@ class FileProcessor:
          }
          if suffix in text_exts:
              return "text"
-
          raise ValueError(f"Unsupported file type: {file_path.name} (ext={suffix})")

      # ------------------------------------------------------------------ #
-     # Public entry-point
+     # Dispatcher
      # ------------------------------------------------------------------ #
      async def process_file(self, file_path: Union[str, Path]) -> Dict[str, Any]:
-         """Validate → detect → dispatch to the appropriate processor."""
-         file_path = Path(file_path)
-         self.validate_file(file_path)
-         ftype = self._detect_file_type(file_path)
-
-         dispatch_map = {
-             "pdf": self._process_pdf,
-             "text": self._process_text,
-             "csv": self._process_csv,
-             "office": self._process_office,
-             "json": self._process_json,
+         path = Path(file_path)
+         self.validate_file(path)
+         ftype = self._detect_file_type(path)
+         return await getattr(self, f"_process_{ftype}")(path)
+
+     # ------------------------------------------------------------------ #
+     # Image processing (OpenCLIP + BLIP-2 + OCR + YOLO)
+     # ------------------------------------------------------------------ #
+     async def _process_image(self, file_path: Path) -> Dict[str, Any]:
+         loop = asyncio.get_event_loop()
+         img = await loop.run_in_executor(self._executor, Image.open, file_path)
+
+         # 1) Image vector
+         def enc_img():
+             with torch.no_grad():
+                 t = self.clip_preprocess(img).unsqueeze(0).to(self.device)
+                 v = self.clip_model.encode_image(t).squeeze()
+                 return (v / v.norm()).float().cpu().numpy()
+
+         image_vec = await loop.run_in_executor(self._executor, enc_img)
+
+         # 2) Caption
+         def gen_cap():
+             inp = self.blip_processor(images=img, return_tensors="pt").to(self.device)
+             with torch.no_grad():
+                 ids = self.blip_model.generate(**inp, max_new_tokens=50)
+             return self.blip_processor.decode(ids[0], skip_special_tokens=True)
+
+         caption = await loop.run_in_executor(self._executor, gen_cap)
+
+         # 3) OCR
+         if self.use_ocr:
+             text = await loop.run_in_executor(
+                 self._executor, pytesseract.image_to_string, img
+             )
+             if t := text.strip():
+                 caption += "\n" + t
+
+         # 4) Caption vector
+         def enc_txt():
+             with torch.no_grad():
+                 tok = self.clip_tokenizer(caption).unsqueeze(0).to(self.device)
+                 v = self.clip_model.encode_text(tok).squeeze()
+                 return (v / v.norm()).float().cpu().numpy()
+
+         caption_vec = await loop.run_in_executor(self._executor, enc_txt)
+
+         # 5) YOLO regions
+         region_vectors = []
+         if self.use_detection:
+             dets = self.detector(img)[0]
+             for box in dets.boxes:
+                 x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().tolist())
+                 crop = img.crop((x1, y1, x2, y2))
+                 vec = self.encode_image(crop)
+                 region_vectors.append(
+                     {
+                         "vector": vec.tolist(),
+                         "bbox": [x1, y1, x2, y2],
+                         "label": dets.names[int(box.cls)],
+                         "conf": float(box.conf),
+                     }
+                 )
+
+         # Metadata
+         sha = hashlib.sha256(file_path.read_bytes()).hexdigest()
+         w, h = img.size
+         meta = {
+             "source": str(file_path),
+             "type": "image",
+             "width": w,
+             "height": h,
+             "mime": f"image/{file_path.suffix.lstrip('.')}",
+             "sha256": sha,
+             "embedding_model": "openclip-vit-h-14",
+             "caption": caption,
          }
-         if ftype not in dispatch_map:
-             raise ValueError(f"Unsupported file type: {file_path.suffix}")

-         return await dispatch_map[ftype](file_path)
+         result = {
+             "content": None,
+             "metadata": meta,
+             "chunks": [caption],
+             "vectors": [image_vec.tolist()],
+             "caption_vector": caption_vec.tolist(),
+         }
+         if region_vectors:
+             result["region_vectors"] = region_vectors
+         return result

      # ------------------------------------------------------------------ #
      # PDF
@@ -126,7 +280,6 @@ class FileProcessor:
      async def _process_pdf(self, file_path: Path) -> Dict[str, Any]:
          page_chunks, doc_meta = await self._extract_text(file_path)
          all_chunks, line_data = [], []
-
          for page_text, page_num, line_nums in page_chunks:
              lines = page_text.split("\n")
              buf, buf_lines, length = [], [], 0
@@ -165,7 +318,7 @@ class FileProcessor:
          }

      # ------------------------------------------------------------------ #
-     # Plain-text / code / markup
+     # Plaintext / code / markup
      # ------------------------------------------------------------------ #
      async def _process_text(self, file_path: Path) -> Dict[str, Any]:
          text, extra_meta, _ = await self._extract_text(file_path)
@@ -198,7 +351,6 @@ class FileProcessor:
                  continue
              texts.append(txt)
              metas.append({k: v for k, v in row.items() if k != text_field and v})
-
          vectors = await asyncio.gather(*[self._encode_chunk_async(t) for t in texts])
          return {
              "content": None,
@@ -209,7 +361,7 @@ class FileProcessor:
          }

      # ------------------------------------------------------------------ #
-     # Office docs (.doc/.docx/.pptx)
+     # Office docs
      # ------------------------------------------------------------------ #
      async def _process_office(self, file_path: Path) -> Dict[str, Any]:
          loop = asyncio.get_event_loop()
@@ -217,11 +369,10 @@ class FileProcessor:
              text = await loop.run_in_executor(
                  self._executor, self._read_docx, file_path
              )
-         else:  # .pptx
+         else:
              text = await loop.run_in_executor(
                  self._executor, self._read_pptx, file_path
              )
-
          chunks = self._chunk_text(text)
          vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
          return {
@@ -267,11 +418,25 @@ class FileProcessor:
              return await loop.run_in_executor(
                  self._executor, self._extract_pdf_text, file_path
              )
-         else:
-             text = await loop.run_in_executor(
-                 self._executor, self._read_text_file, file_path
+         text = await loop.run_in_executor(
+             self._executor, self._read_text_file, file_path
+         )
+         return text, {}, []
+
+     # ------------------------------------------------------------------ #
+     # util: clip‑text encoder (public)
+     # ------------------------------------------------------------------ #
+     def encode_clip_text(self, text: Union[str, List[str]]) -> np.ndarray:
+         with torch.no_grad():
+             toks = (
+                 self.clip_tokenizer(text)
+                 if isinstance(text, str)
+                 else self.clip_tokenizer(text, truncate=True)
              )
-         return text, {}, []
+             tensor = toks.unsqueeze(0).to(self.device)
+             feat = self.clip_model.encode_text(tensor).squeeze()
+             feat = feat / feat.norm()
+         return feat.float().cpu().numpy()

      def _extract_pdf_text(self, file_path: Path):
          page_chunks, meta = [], {}
@@ -287,8 +452,8 @@ class FileProcessor:
              lines = page.extract_text_lines()
              sorted_lines = sorted(lines, key=lambda x: x["top"])
              txts, nums = [], []
-             for ln_idx, L in enumerate(sorted_lines, start=1):
-                 t = L.get("text", "").strip()
+             for ln_idx, line in enumerate(sorted_lines, start=1):
+                 t = line.get("text", "").strip()
                  if t:
                      txts.append(t)
                      nums.append(ln_idx)
@@ -362,3 +527,24 @@ class FileProcessor:
              seg = tokens[i : i + self.effective_max_length]
              out.append(self.embedding_model.tokenizer.convert_tokens_to_string(seg))
          return out
+
+     # ------------------------------------------------------------------ #
+     # Retrieval helpers (optional use)
+     # ------------------------------------------------------------------ #
+     def encode_text(self, text: Union[str, List[str]]) -> np.ndarray:
+         """Embed raw text with the SentenceTransformer model."""
+         single = isinstance(text, str)
+         out = self.embedding_model.encode(
+             text,
+             convert_to_numpy=True,
+             normalize_embeddings=True,
+             show_progress_bar=False,
+         )
+         return out if not single else out[0]
+
+     def encode_image(self, img: Image.Image) -> np.ndarray:
+         with torch.no_grad():
+             tensor = self.clip_preprocess(img).unsqueeze(0).to(self.device)
+             feat = self.clip_model.encode_image(tensor).squeeze()
+             feat = feat / feat.norm()
+         return feat.float().cpu().numpy()
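
The new module-level `latlon_to_unit_vec` helper maps a latitude/longitude pair onto the 3-D unit sphere, so that cosine similarity in Qdrant tracks angular (great-circle) proximity. A minimal worked sketch of the arithmetic; the standalone copy below simply mirrors the function added in the hunk above (per the RECORD entries it lives in projectdavid/clients/file_processor.py):

    import math

    def latlon_to_unit_vec(lat: float, lon: float) -> list:
        # Same arithmetic as the helper added in the diff above.
        lat_r = math.radians(lat)
        lon_r = math.radians(lon)
        return [
            math.cos(lat_r) * math.cos(lon_r),
            math.cos(lat_r) * math.sin(lon_r),
            math.sin(lat_r),
        ]

    v = latlon_to_unit_vec(51.5074, -0.1278)        # London
    assert abs(sum(c * c for c in v) - 1.0) < 1e-9  # always unit length
    print(latlon_to_unit_vec(90.0, 0.0))            # north pole ≈ [0.0, 0.0, 1.0]
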
@@ -106,7 +106,14 @@ class SynchronousInferenceStream:
              # Always attach run_id
              chunk["run_id"] = self.run_id

-             if chunk.get("type") in ("hot_code", "hot_code_output", "status"):
+             # -------------------------------------------------------
+             # allow status chunks to bypass suppression
+             # -------------------------------------------------------
+             if chunk.get("type") == "status":
+                 yield chunk
+                 continue
+
+             if chunk.get("type") in ("hot_code", "hot_code_output"):
                  yield chunk
                  continue

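
Seen from the consumer side, the change above means `status` chunks are now always yielded, even when other chunk types would be suppressed. A small sketch under assumed chunk shapes (the dicts below are stand-ins for what the wrapper actually streams):

    # Stand-in chunks; real ones come from SynchronousInferenceStream.
    stream = [
        {"type": "status", "run_id": "run_1", "value": "started"},
        {"type": "hot_code", "run_id": "run_1", "code": "print('hi')"},
    ]

    for chunk in stream:
        if chunk.get("type") == "status":
            print("status:", chunk)      # never suppressed after this release
        elif chunk.get("type") in ("hot_code", "hot_code_output"):
            print("code chunk:", chunk)  # still passed through as before
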
@@ -50,11 +50,18 @@ class VectorStoreManager(BaseVectorStore):
      def create_store(
          self,
          collection_name: str,
+         *,
          vector_size: int = 384,
          distance: str = "COSINE",
+         vectors_config: Optional[Dict[str, qdrant.VectorParams]] = None,
      ) -> dict:
+         """
+         Create or recreate a Qdrant collection. By default creates a single-vector
+         collection with `vector_size`. To define multi-vector schema, pass
+         `vectors_config` mapping field names to VectorParams.
+         """
          try:
-             # quick existence check
+             # existence check
              if any(
                  col.name == collection_name
                  for col in self.client.get_collections().collections
@@ -65,16 +72,27 @@ class VectorStoreManager(BaseVectorStore):
              if dist not in qdrant.Distance.__members__:
                  raise ValueError(f"Invalid distance metric '{distance}'")

+             # choose schema
+             if vectors_config:
+                 config = vectors_config
+             else:
+                 config = {
+                     "_default": qdrant.VectorParams(
+                         size=vector_size, distance=qdrant.Distance[dist]
+                     )
+                 }
+
+             # recreate with full schema
              self.client.recreate_collection(
                  collection_name=collection_name,
-                 vectors_config=qdrant.VectorParams(
-                     size=vector_size, distance=qdrant.Distance[dist]
-                 ),
+                 vectors_config=config,
              )
+             # record metadata for each field
              self.active_stores[collection_name] = {
                  "created_at": int(time.time()),
                  "vector_size": vector_size,
                  "distance": dist,
+                 "fields": list(config.keys()),
              }
              log.info("Created Qdrant collection %s", collection_name)
              return {"collection_name": collection_name, "status": "created"}
@@ -103,8 +121,9 @@ class VectorStoreManager(BaseVectorStore):
                  "name": store_name,
                  "status": "active",
                  "vectors_count": info.points_count,
-                 "configuration": info.config.params["default"],
+                 "configuration": info.config.params,
                  "created_at": self.active_stores[store_name]["created_at"],
+                 "fields": self.active_stores[store_name].get("fields"),
              }
          except Exception as e:
              log.error("Store info failed: %s", e)
@@ -119,6 +138,8 @@ class VectorStoreManager(BaseVectorStore):
          texts: List[str],
          vectors: List[List[float]],
          metadata: List[dict],
+         *,
+         vector_name: Optional[str] = None,  # NEW
      ):
          if not vectors:
              raise ValueError("Empty vectors list")
@@ -136,7 +157,13 @@ class VectorStoreManager(BaseVectorStore):
              for txt, vec, meta in zip(texts, vectors, metadata)
          ]
          try:
-             self.client.upsert(collection_name=store_name, points=points, wait=True)
+             # pass vector_name if multi-column
+             self.client.upsert(
+                 collection_name=store_name,
+                 points=points,
+                 wait=True,
+                 vector_name=vector_name,  # ignored if None
+             )
              return {"status": "success", "points_inserted": len(points)}
          except Exception as e:
              log.error("Add‑to‑store failed: %s", e)
@@ -189,15 +216,25 @@ class VectorStoreManager(BaseVectorStore):
          query_vector: List[float],
          top_k: int = 5,
          filters: Optional[dict] = None,
+         *,
+         vector_field: Optional[str] = None,  # ← NEW
          score_threshold: float = 0.0,
          offset: int = 0,
          limit: Optional[int] = None,
      ) -> List[dict]:
-         """Run a similarity search that works with any 1.x qdrant‑client."""
+         """
+         Run a similarity search against *store_name*.
+
+         • Works with any Qdrant-client ≥ 1.0
+         • `vector_field` lets you target a non-default vector column
+           (e.g. ``\"caption_vector\"`` for image stores). Pass **None**
+           to use the collection’s default vector.
+         """

          limit = limit or top_k
          flt = self._dict_to_filter(filters) if filters else None

+         # ── shared kwargs ----------------------------------------------------
          common: Dict[str, Any] = dict(
              collection_name=store_name,
              query_vector=query_vector,
@@ -207,20 +244,21 @@ class VectorStoreManager(BaseVectorStore):
              with_payload=True,
              with_vectors=False,
          )
+         if vector_field:  # ← inject when requested
+             common["vector_name"] = vector_field

+         # ── call search (new client first, fallback to old) ------------------
          try:
-             # Newer clients (≥ 1.6) use `filter=`
-             res = self.client.search(**common, filter=flt)  # type: ignore[arg-type]
+             res = self.client.search(**common, filter=flt)  # ≥ 1.6
          except AssertionError as ae:
              if "Unknown arguments" not in str(ae):
                  raise
-             # Older clients use `query_filter=`
-             res = self.client.search(**common, query_filter=flt)  # type: ignore[arg-type]
-
+             res = self.client.search(**common, query_filter=flt)  # < 1.6
          except Exception as e:
              log.error("Query failed: %s", e)
              raise VectorStoreError(f"Query failed: {e}") from e

+         # ── normalise result -------------------------------------------------
          return [
              {
                  "id": p.id,
@@ -13,8 +13,10 @@ from typing import Any, Dict, List, Optional, Union

  import httpx
  from dotenv import load_dotenv
+ from PIL import Image
  from projectdavid_common import UtilsInterface, ValidationInterface
  from pydantic import BaseModel, Field
+ from qdrant_client.http import models as qdrant

  from projectdavid.clients.file_processor import FileProcessor
  from projectdavid.clients.vector_store_manager import VectorStoreManager
@@ -61,13 +63,16 @@ class VectorStoreClient:
      • create_vector_store() no longer takes user_id; ownership from token.
      """

-     # Construction / cleanup
+     # ------------------------------------------------------------------ #
+     # Construction / cleanup
+     # ------------------------------------------------------------------ #
      def __init__(
          self,
          base_url: Optional[str] = None,
          api_key: Optional[str] = None,
          *,
          vector_store_host: str = "localhost",
+         file_processor_kwargs: Optional[dict] = None,  # 🔶 add arg
      ):
          self.base_url = (base_url or os.getenv("BASE_URL", "")).rstrip("/")
          self.api_key = api_key or os.getenv("API_KEY")
@@ -84,10 +89,12 @@ class VectorStoreClient:
              base_url=self.base_url, headers=self._base_headers, timeout=30.0
          )

-         # Local helpers
+         # Local helpers ---------------------------------------------------
          self.vector_manager = VectorStoreManager(vector_store_host=vector_store_host)
          self.identifier_service = UtilsInterface.IdentifierService()
-         self.file_processor = FileProcessor()
+
+         # 🔶 forward kwargs into the upgraded FileProcessor
+         self.file_processor = FileProcessor(**(file_processor_kwargs or {}))

          log.info("VectorStoreClient → %s", self.base_url)

@@ -180,12 +187,15 @@ class VectorStoreClient:
          vector_size: int,
          distance_metric: str,
          config: Optional[Dict[str, Any]],
+         vectors_config: Optional[Dict[str, qdrant.VectorParams]] = None,  # ← NEW
      ) -> ValidationInterface.VectorStoreRead:
          shared_id = self.identifier_service.generate_vector_id()
+         # forward multi-vector schema if given
          self.vector_manager.create_store(
              collection_name=shared_id,
              vector_size=vector_size,
              distance=distance_metric.upper(),
+             vectors_config=vectors_config,
          )

          payload = {
@@ -198,10 +208,6 @@ class VectorStoreClient:
          resp = await self._request("POST", "/v1/vector-stores", json=payload)
          return ValidationInterface.VectorStoreRead.model_validate(resp)

-     async def _list_my_vs_async(self) -> List[ValidationInterface.VectorStoreRead]:
-         resp = await self._request("GET", "/v1/vector-stores")
-         return [ValidationInterface.VectorStoreRead.model_validate(r) for r in resp]
-
      # ------------------------------------------------------------------ #
      # NEW admin‑aware creation helper
      # ------------------------------------------------------------------ #
@@ -212,13 +218,17 @@ class VectorStoreClient:
          vector_size: int,
          distance_metric: str,
          config: Optional[Dict[str, Any]],
+         vectors_config: Optional[Dict[str, qdrant.VectorParams]] = None,  # ← NEW
      ) -> ValidationInterface.VectorStoreRead:
          shared_id = self.identifier_service.generate_vector_id()
+         # forward multi-vector schema if given
          self.vector_manager.create_store(
              collection_name=shared_id,
              vector_size=vector_size,
              distance=distance_metric.upper(),
+             vectors_config=vectors_config,
          )
+
          payload = {
              "shared_id": shared_id,
              "name": name,
@@ -226,7 +236,6 @@ class VectorStoreClient:
              "distance_metric": distance_metric.upper(),
              "config": config or {},
          }
-         # pass owner_id as query‑param (backend enforces admin‑only)
          resp = await self._request(
              "POST",
              "/v1/vector-stores",
@@ -282,25 +291,63 @@ class VectorStoreClient:
      async def _search_vs_async(
          self,
          vector_store_id: str,
-         query_text: str,
+         query_text: Union[str, List[float]],
          top_k: int,
          filters: Optional[Dict] = None,
          vector_store_host: Optional[str] = None,
+         vector_field: Optional[str] = None,  # allow caller override
      ) -> List[Dict[str, Any]]:
-         # Use the provided vector_store_host if specified, otherwise fall back to the default
-         if vector_store_host:
-             vector_manager = VectorStoreManager(vector_store_host=vector_store_host)
-         else:
-             vector_manager = self.vector_manager
+         """
+         Internal: run ANN search against the specified vector field or auto-detect by store size.
+
+         If `vector_field` is provided, it will be used directly. Otherwise:
+             • 1024-D → caption_vector
+             • 3-D    → geo_vector
+             • others → default vector (text)
+         """
+         # pick local vs. override host
+         vector_manager = (
+             VectorStoreManager(vector_store_host=vector_store_host)
+             if vector_store_host
+             else self.vector_manager
+         )

+         # fetch store info to inspect schema
          store = self.retrieve_vector_store_sync(vector_store_id)
-         vec = self.file_processor.embedding_model.encode(query_text).tolist()

+         # determine the query vector and target field
+         if vector_field is not None:
+             # if caller passed a raw vector list, use it; otherwise treat as caption search
+             if isinstance(query_text, list):
+                 vec = query_text
+             else:
+                 vec = self.file_processor.encode_clip_text(query_text).tolist()
+         else:
+             # auto-detect based on stored vector dimensionality
+             if store.vector_size == 1024:
+                 # image/caption space
+                 vec = self.file_processor.encode_clip_text(query_text).tolist()
+                 vector_field = "caption_vector"
+             elif store.vector_size == 3:
+                 # geo space; query_text must be a raw 3-D list
+                 if not isinstance(query_text, list):
+                     raise VectorStoreClientError(
+                         "Geo search requires a 3-element vector; pass raw unit-sphere list"
+                     )
+                 vec = query_text
+                 vector_field = "geo_vector"
+             else:
+                 # fallback to text embedding
+                 vec = self.file_processor.encode_text(query_text).tolist()
+                 vector_field = None  # use default
+
+         # perform the search on the selected vector column
          return vector_manager.query_store(
              store_name=store.collection_name,
              query_vector=vec,
              top_k=top_k,
              filters=filters,
+             vector_field=vector_field,
          )

      async def _delete_vs_async(
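
The auto-detection above dovetails with the geo helper: a 3-D store must be queried with a raw unit vector, which `latlon_to_unit_vec` produces. A hedged usage sketch (the client instance and store id are placeholders; the import path follows the RECORD listing):

    from projectdavid.clients.file_processor import latlon_to_unit_vec

    query_vec = latlon_to_unit_vec(48.8566, 2.3522)  # Paris → 3-D unit vector

    # `client` is an already-constructed VectorStoreClient; the id is a placeholder.
    hits = client.vector_file_search_raw(
        vector_store_id="vs_geo_demo",
        query_text=query_vec,     # raw 3-element list → geo branch above
        top_k=5,
        vector_field="geo_vector",
    )
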
@@ -427,12 +474,48 @@ class VectorStoreClient:
          vector_size: int = 384,
          distance_metric: str = "Cosine",
          config: Optional[Dict[str, Any]] = None,
+         vectors_config: Optional[Dict[str, qdrant.VectorParams]] = None,  # ← NEW
      ) -> ValidationInterface.VectorStoreRead:
-         """Create a new store owned by *this* API key."""
+         """
+         Create a new store owned by this API key.
+
+         If `vectors_config` is provided, it should map each vector
+         field name to its Qdrant VectorParams (size + distance).
+         """
          return self._run_sync(
-             self._create_vs_async(name, vector_size, distance_metric, config)
+             self._create_vs_async(
+                 name,
+                 vector_size,
+                 distance_metric,
+                 config,
+                 vectors_config,
+             )
          )

+     def create_vector_vision_store(
+         self,
+         name: str = "vision",
+     ):
+
+         vectors_config = {
+             # Raw visual embeddings (OpenCLIP ViT-H/14 → 1024-D)
+             "image_vector": qdrant.VectorParams(
+                 size=1024, distance=qdrant.Distance.COSINE
+             ),
+             # Language embeddings of your BLIP-2 captions → 1024-D
+             "caption_vector": qdrant.VectorParams(
+                 size=1024, distance=qdrant.Distance.COSINE
+             ),
+             # Object-region embeddings (YOLO crop + Sentence-BERT) → 1024-D
+             "region_vector": qdrant.VectorParams(
+                 size=1024, distance=qdrant.Distance.COSINE
+             ),
+             # Geo-location unit vectors (RegioNet) → 3-D
+             "geo_vector": qdrant.VectorParams(size=3, distance=qdrant.Distance.COSINE),
+         }
+
+         return self.create_vector_store(name=name, vectors_config=vectors_config)
+
      def create_vector_store_for_user(
          self,
          owner_id: str,
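
Usage of the new vision-store helper is then a one-liner that provisions all four named columns at once (a sketch, assuming BASE_URL/API_KEY are set in the environment and that `VectorStoreClient` is imported from projectdavid.clients.vectors, as the RECORD suggests):

    from projectdavid.clients.vectors import VectorStoreClient

    client = VectorStoreClient()  # reads BASE_URL / API_KEY from the environment
    store = client.create_vector_vision_store(name="vision")
    # The backing Qdrant collection now carries image_vector, caption_vector and
    # region_vector (1024-D each) plus the 3-D geo_vector column.
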
@@ -441,16 +524,20 @@ class VectorStoreClient:
          vector_size: int = 384,
          distance_metric: str = "Cosine",
          config: Optional[Dict[str, Any]] = None,
+         vectors_config: Optional[Dict[str, qdrant.VectorParams]] = None,  # ← NEW
      ) -> ValidationInterface.VectorStoreRead:
          """
-         **Admin‑only** helper → create a store on behalf of *owner_id*.
-
-         The caller’s API‑key must belong to an admin; otherwise the
-         request will be rejected by the server with HTTP 403.
+         Admin-only: create a store on behalf of another user.
+         Pass `vectors_config` to define a multi-vector schema.
          """
          return self._run_sync(
              self._create_vs_for_user_async(
-                 owner_id, name, vector_size, distance_metric, config
+                 owner_id,
+                 name,
+                 vector_size,
+                 distance_metric,
+                 config,
+                 vectors_config,
              )
          )

@@ -629,10 +716,16 @@ class VectorStoreClient:
          top_k: int = 5,
          filters: Optional[Dict] = None,
          vector_store_host: Optional[str] = None,
+         vector_field: Optional[str] = None,  # ← NEW
      ) -> List[Dict[str, Any]]:
          return self._run_sync(
              self._search_vs_async(
-                 vector_store_id, query_text, top_k, filters, vector_store_host
+                 vector_store_id,
+                 query_text,
+                 top_k,
+                 filters,
+                 vector_store_host,
+                 vector_field,
              )
          )

@@ -796,3 +889,91 @@ class VectorStoreClient:
          hits = self._normalise_hits(hits)

          return hits
+
+     def image_similarity_search(
+         self,
+         vector_store_id: str,
+         img: Image.Image,
+         k: int = 10,
+         vector_store_host: Optional[str] = None,
+     ) -> List[Dict[str, Any]]:
+         vec = self.file_processor.encode_image(img).tolist()
+         return self.vector_file_search_raw(
+             vector_store_id=vector_store_id,
+             query_text=vec,
+             top_k=k,
+             filters=None,
+             vector_store_host=vector_store_host,
+             vector_field="image_vector",
+         )
+
+     def search_images(
+         self,
+         vector_store_id: str,
+         query: Union[str, Image.Image, List[float]],
+         *,
+         modality: Optional[str] = None,
+         k: int = 10,
+         vector_store_host: Optional[str] = None,
+     ) -> List[Dict[str, Any]]:
+         """
+         Unified image search across multiple modalities, with appropriate reranking:
+
+         - If `query` is a str → caption search (reranked)
+         - If `query` is a PIL.Image.Image → visual search (no rerank)
+         - If `query` is a list[float] → raw vector search
+         - `modality` override: one of 'caption', 'image', 'region', 'geo'
+         """
+         # Map modality to (vector_field, encoder)
+         field_map = {
+             "caption": (
+                 "caption_vector",
+                 lambda q: self.file_processor.encode_clip_text(q).tolist(),
+             ),
+             "image": (
+                 "image_vector",
+                 lambda q: self.file_processor.encode_image(q).tolist(),
+             ),
+             "region": (
+                 "region_vector",
+                 lambda q: self.file_processor.encode_text(q).tolist(),
+             ),
+             "geo": ("geo_vector", lambda q: q),  # assume q is raw 3-D vector
+         }
+
+         # Auto-detect if not provided
+         if modality is None:
+             if isinstance(query, str):
+                 modality = "caption"
+             elif isinstance(query, Image.Image):
+                 modality = "image"
+             elif isinstance(query, list):
+                 modality = "image"
+             else:
+                 raise VectorStoreClientError(f"Unsupported query type: {type(query)}")
+
+         modality = modality.lower()
+         if modality not in field_map:
+             raise VectorStoreClientError(f"Unknown modality '{modality}'")
+
+         vector_field, encoder = field_map[modality]
+         vec = encoder(query)
+
+         # 1️⃣ ANN search
+         hits = self.vector_file_search_raw(
+             vector_store_id=vector_store_id,
+             query_text=vec,
+             top_k=k,
+             filters=None,
+             vector_store_host=vector_store_host,
+             vector_field=vector_field,
+         )
+
+         # 2️⃣ Rerank for text-based modalities
+         if modality in ("caption", "region"):
+             hits = reranker.rerank(
+                 query if isinstance(query, str) else "", hits, top_k=min(len(hits), k)
+             )
+
+         # 3️⃣ Normalize and return
+         return self._normalise_hits(hits)
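
A sketch of the three query forms `search_images` accepts (client and store id are placeholders). Note also that the rerank step references a module-level `reranker` that never appears in this diff, so text-modality searches presumably depend on it being defined elsewhere in vectors.py:

    from PIL import Image

    store_id = "vs_vision_demo"  # placeholder

    # 1) Text → caption_vector search, reranked
    hits = client.search_images(store_id, "a red bicycle near a fountain", k=5)

    # 2) PIL image → image_vector search, no rerank
    hits = client.search_images(store_id, Image.open("photo.jpg"), k=5)

    # 3) Raw 3-D vector with explicit modality → geo_vector search
    hits = client.search_images(store_id, [0.0, 0.0, 1.0], modality="geo", k=5)
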
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: projectdavid
- Version: 1.32.20
+ Version: 1.33.0
  Summary: Python SDK for interacting with the Entities Assistant API.
  Author-email: Francis Neequaye Armah <francis.neequaye@projectdavid.co.uk>
  License: PolyForm Noncommercial License 1.0.0
@@ -29,6 +29,13 @@ Requires-Dist: sseclient-py
  Requires-Dist: requests
  Requires-Dist: python-docx
  Requires-Dist: python-pptx
+ Requires-Dist: open_clip_torch>=2.24
+ Requires-Dist: pillow>=10.2
+ Requires-Dist: transformers>=4.41
+ Requires-Dist: accelerate>=0.28
+ Requires-Dist: sentencepiece>=0.2
+ Requires-Dist: ultralytics>=8.2.21
+ Requires-Dist: pytesseract>=0.3
  Provides-Extra: dev
  Requires-Dist: black>=23.3; extra == "dev"
  Requires-Dist: isort>=5.12; extra == "dev"
@@ -36,6 +43,17 @@ Requires-Dist: pytest>=7.2; extra == "dev"
  Requires-Dist: mypy>=1.0; extra == "dev"
  Requires-Dist: build; extra == "dev"
  Requires-Dist: twine; extra == "dev"
+ Provides-Extra: vision
+ Requires-Dist: torch>=2.2.1; extra == "vision"
+ Requires-Dist: torchvision>=0.17.1; extra == "vision"
+ Requires-Dist: torchaudio>=2.2.1; extra == "vision"
+ Requires-Dist: open_clip_torch>=2.24; extra == "vision"
+ Requires-Dist: pillow>=10.2; extra == "vision"
+ Requires-Dist: transformers>=4.41; extra == "vision"
+ Requires-Dist: accelerate>=0.28; extra == "vision"
+ Requires-Dist: sentencepiece>=0.2; extra == "vision"
+ Requires-Dist: ultralytics>=8.2.21; extra == "vision"
+ Requires-Dist: pytesseract>=0.3; extra == "vision"
  Dynamic: license-file

  # Entity — by Project David
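
With the new `vision` extra, the heavyweight stack can presumably be pulled in via `pip install "projectdavid[vision]"`. Note, however, that this release also adds open_clip_torch, transformers, ultralytics, pytesseract and friends as unconditional Requires-Dist entries in the earlier hunk, so they are installed even without the extra.
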
@@ -9,18 +9,18 @@ projectdavid/clients/assistants_client.py,sha256=SsIGa5wPr7ga9WX0ywam3djUF-uWFdk
  projectdavid/clients/base_client.py,sha256=UWl6nr6sxD1_xC6iyptQDR1tnNdFCOrEx5cEUPCRqJE,3417
  projectdavid/clients/base_vector_store.py,sha256=jXivmqAW1bgYcLgIeW-hPxOiWZbs2hCsLy4oWzSvpNI,2061
  projectdavid/clients/event_handler.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- projectdavid/clients/file_processor.py,sha256=t-Uw-kBP_VmlguMxO9PWY6ANuMAY0PstQDW37wLPF0Q,13980
+ projectdavid/clients/file_processor.py,sha256=nFccQmiow3lkjv1-Pdgv_2WQAtSy0FRN7oJlTKt4fs4,21114
  projectdavid/clients/file_search.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  projectdavid/clients/files_client.py,sha256=XkIDzbQFGDrd88taf0Kouc_4YJOPIYEHiIyWYLKDofI,15581
  projectdavid/clients/inference_client.py,sha256=xz4ACPv5Tkis604QxO5mJX1inH_TGDfQP-31geETYpE,6609
  projectdavid/clients/messages_client.py,sha256=467xeIt3VYs6cG8-bl-eDRi_auWOPmfd5tSJDmQSJUI,17232
  projectdavid/clients/runs.py,sha256=-fXOq5L9w2efDPmZkNxb0s2yjl6oN0XN4_aLXqaeceo,25270
- projectdavid/clients/synchronous_inference_wrapper.py,sha256=M0Z8YvOIcYQsYKZ7m5U7edq-OwY1wU9BKH6EIlG3WHI,4769
+ projectdavid/clients/synchronous_inference_wrapper.py,sha256=mN5WAHmv0aRoeMIb7XPgv3cuqrMPzu378UsZ02jEvRY,5090
  projectdavid/clients/threads_client.py,sha256=ekzU5w14zftmtmFkiec3NC90Of-_KVSUY1qH9cmfSFg,6771
  projectdavid/clients/tools_client.py,sha256=GkCVOmwpAoPqVt6aYmH0G1HIFha3iEwR9IIf9teR0j8,11487
  projectdavid/clients/users_client.py,sha256=eCuUb9qvyH1GUFhZu6TRL9zdoK-qzHSs8-Vmrk_0mmg,13729
- projectdavid/clients/vector_store_manager.py,sha256=lk-sWJjo6Z0EHZzjRoKiHPr0GpEXfE4bJBQzmKV8ezc,11372
- projectdavid/clients/vectors.py,sha256=1UNnLN5nsMvVHXK4Yf7iTXGWZfgIjQ9eLQtCBe0Cqew,30986
+ projectdavid/clients/vector_store_manager.py,sha256=q-ZgRQVX_S3nMrKYhmvkVrDjDRzM3ZFzUF55HBGRTe8,12861
+ projectdavid/clients/vectors.py,sha256=sDbT1GbsIuwzvInuDDcyX4scuuLJDWWgL7Fmc-pguRQ,37672
  projectdavid/constants/platform.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  projectdavid/services/logging_service.py,sha256=jdoRL46E42Ar8JFTDOV-xVD67CulcHSN-xhcEqA5CXQ,2643
  projectdavid/synthesis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -34,8 +34,8 @@ projectdavid/utils/monitor_launcher.py,sha256=3YAgJdeuaUvq3JGvpA4ymqFsAnk29nH5q9
  projectdavid/utils/peek_gate.py,sha256=5whMRnDOQjATRpThWDJkvY9ScXuJ7Sd_-9rvGgXeTAQ,2532
  projectdavid/utils/run_monitor.py,sha256=F_WkqIP-qnWH-4llIbileWWLfRj2Q1Cg-ni23SR1rec,3786
  projectdavid/utils/vector_search_formatter.py,sha256=YTe3HPGec26qGY7uxY8_GS8lc4QaN6aNXMzkl29nZpI,1735
- projectdavid-1.32.20.dist-info/licenses/LICENSE,sha256=_8yjiEGttpS284BkfhXxfERqTRZW_tUaHiBB0GTJTMg,4563
- projectdavid-1.32.20.dist-info/METADATA,sha256=OpxxEI8F42M3uCPtloApoabA5c_AG4Yvnb_81ix0aHE,10782
- projectdavid-1.32.20.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- projectdavid-1.32.20.dist-info/top_level.txt,sha256=kil8GU4s7qYRfNnzGnFHhZnSNRSxgNG-J4HLgQMmMtw,13
- projectdavid-1.32.20.dist-info/RECORD,,
+ projectdavid-1.33.0.dist-info/licenses/LICENSE,sha256=_8yjiEGttpS284BkfhXxfERqTRZW_tUaHiBB0GTJTMg,4563
+ projectdavid-1.33.0.dist-info/METADATA,sha256=1Yhr24U_T9-Wv6Ladi67ZjTqwvcmpP4C241DtcCJMRk,11554
+ projectdavid-1.33.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ projectdavid-1.33.0.dist-info/top_level.txt,sha256=kil8GU4s7qYRfNnzGnFHhZnSNRSxgNG-J4HLgQMmMtw,13
+ projectdavid-1.33.0.dist-info/RECORD,,