projectdavid 1.32.21__py3-none-any.whl → 1.33.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- projectdavid/clients/file_processor.py +232 -46
- projectdavid/clients/vector_store_manager.py +50 -12
- projectdavid/clients/vectors.py +266 -23
- {projectdavid-1.32.21.dist-info → projectdavid-1.33.1.dist-info}/METADATA +19 -1
- {projectdavid-1.32.21.dist-info → projectdavid-1.33.1.dist-info}/RECORD +8 -8
- {projectdavid-1.32.21.dist-info → projectdavid-1.33.1.dist-info}/WHEEL +0 -0
- {projectdavid-1.32.21.dist-info → projectdavid-1.33.1.dist-info}/licenses/LICENSE +0 -0
- {projectdavid-1.32.21.dist-info → projectdavid-1.33.1.dist-info}/top_level.txt +0 -0
projectdavid/clients/file_processor.py CHANGED

@@ -1,6 +1,8 @@
 import asyncio
 import csv
+import hashlib
 import json
+import math
 import re
 import textwrap
 from concurrent.futures import ThreadPoolExecutor
@@ -13,34 +15,124 @@ except ImportError: # 3.9–3.10
     from typing_extensions import LiteralString

 import numpy as np
+import open_clip
 import pdfplumber
+import torch
 from docx import Document
+from PIL import Image
 from pptx import Presentation
+from transformers import Blip2ForConditionalGeneration, Blip2Processor
+from ultralytics import YOLO
+
+# OCR fallback – optional
+try:
+    import pytesseract  # noqa: F401  # pylint: disable=unused-import
+except ImportError:
+    pytesseract = None
+
 from projectdavid_common import UtilsInterface
 from sentence_transformers import SentenceTransformer

 log = UtilsInterface.LoggingUtility()


+def latlon_to_unit_vec(lat: float, lon: float) -> List[float]:
+    """Convert geographic lat/lon (deg) to a 3-D unit vector for Qdrant."""
+    lat_r = math.radians(lat)
+    lon_r = math.radians(lon)
+    return [
+        math.cos(lat_r) * math.cos(lon_r),
+        math.cos(lat_r) * math.sin(lon_r),
+        math.sin(lat_r),
+    ]
+
+
 class FileProcessor:
+    """Unified processor for text, tabular, office, JSON, **and image** files.
+
+    Each modality is embedded with its optimal model:
+        • Text   → paraphrase‑MiniLM‑L6‑v2 (384‑D)
+        • Image  → OpenCLIP ViT‑H/14 (1024‑D)
+        • Caption→ OpenCLIP text head (1024‑D)
+
+    Rich captions are generated via BLIP‑2 Flan‑T5‑XL.
+    GPU usage is optional; pass `use_gpu=False` to stay on CPU.
+    """
+
     # ------------------------------------------------------------------ #
     # Construction
     # ------------------------------------------------------------------ #
-    def __init__(
-        self
+    def __init__(
+        self,
+        *,
+        max_workers: int = 4,
+        chunk_size: int = 512,
+        use_gpu: bool = True,
+        use_ocr: bool = True,
+        use_detection: bool = False,
+        image_model_name: str = "ViT-H-14",
+        caption_model_name: str = "Salesforce/blip2-flan-t5-xl",
+    ):
+        # Device selection
+        if use_gpu and torch.cuda.is_available():
+            self.device = torch.device("cuda")
+            self.torch_dtype = torch.float16
+        else:
+            self.device = torch.device("cpu")
+            self.torch_dtype = torch.float32
+
+        # Feature flags
+        self.use_ocr = use_ocr and pytesseract is not None
+        self.use_detection = use_detection
+        if use_ocr and pytesseract is None:
+            log.warning("OCR requested but pytesseract not installed – skipping.")
+        if self.use_detection:
+            self.detector = YOLO("yolov8x.pt").to(self.device)
+
+        # Text embedder
         self.embedding_model_name = "paraphrase-MiniLM-L6-v2"
-        self.
+        self.embedding_model = SentenceTransformer(self.embedding_model_name)
+        self.embedding_model.to(str(self.device))

-        #
+        # Chunking parameters
         self.max_seq_length = self.embedding_model.get_max_seq_length()
         self.special_tokens_count = 2
         self.effective_max_length = self.max_seq_length - self.special_tokens_count
         self.chunk_size = min(chunk_size, self.effective_max_length * 4)

-
+        # Image embedder
+        self.clip_model, _, self.clip_preprocess = (
+            open_clip.create_model_and_transforms(
+                image_model_name,
+                pretrained="laion2b_s32b_b79k",
+                precision="fp16" if self.device.type == "cuda" else "fp32",
+            )
+        )
+        self.clip_model = self.clip_model.to(self.device).eval()
+        self.clip_tokenizer = open_clip.get_tokenizer(image_model_name)
+
+        # Caption generator
+        self.blip_processor = Blip2Processor.from_pretrained(caption_model_name)
+        self.blip_model = (
+            Blip2ForConditionalGeneration.from_pretrained(
+                caption_model_name,
+                torch_dtype=self.torch_dtype,
+            )
+            .to(self.device)
+            .eval()
+        )
+
+        # Executor & logging
+        self._executor = ThreadPoolExecutor(max_workers=max_workers)
+        log.info(
+            "FileProcessor ready (device=%s, OCR=%s, detection=%s)",
+            self.device,
+            self.use_ocr,
+            self.use_detection,
+        )

     # ------------------------------------------------------------------ #
-    # Generic validators
+    # Generic validators *
     # ------------------------------------------------------------------ #
     def validate_file(self, file_path: Path):
         """Ensure file exists and is under 100 MB."""
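A minimal usage sketch of the additions above, assuming the package is installed with its vision dependencies; `latlon_to_unit_vec` and `FileProcessor` are the names defined in this hunk:

```python
import math

from projectdavid.clients.file_processor import FileProcessor, latlon_to_unit_vec

# The geo helper projects lat/lon (degrees) onto the unit sphere, so cosine
# distance in Qdrant approximates angular (great-circle) proximity.
vec = latlon_to_unit_vec(51.4779, -0.0015)  # Greenwich Observatory
assert abs(math.sqrt(sum(c * c for c in vec)) - 1.0) < 1e-9  # unit length

# CPU-only construction. Note the constructor is eager: it loads MiniLM,
# OpenCLIP ViT-H/14, and BLIP-2 Flan-T5-XL, several GB of weights in total.
fp = FileProcessor(use_gpu=False, use_ocr=False, use_detection=False)
```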
@@ -52,20 +144,10 @@ class FileProcessor:
         raise ValueError(f"{file_path.name} > {mb} MB limit")

     # ------------------------------------------------------------------ #
-    # File
+    # File‑type detection (extension‑based – no libmagic)
     # ------------------------------------------------------------------ #
     def _detect_file_type(self, file_path: Path) -> str:
-        """
-        Return one of:
-
-        • 'pdf' • 'csv' • 'json'
-        • 'office' (.doc/.docx/.pptx)
-        • 'text' (code / markup / plain text)
-
-        Raises *ValueError* if the extension is not recognised.
-        """
         suffix = file_path.suffix.lower()
-
         if suffix == ".pdf":
             return "pdf"
         if suffix == ".csv":
@@ -74,7 +156,8 @@
             return "json"
         if suffix in {".doc", ".docx", ".pptx"}:
             return "office"
-
+        if suffix in {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff"}:
+            return "image"
         text_exts = {
             ".txt",
             ".md",
@@ -96,29 +179,100 @@
         }
         if suffix in text_exts:
             return "text"
-
         raise ValueError(f"Unsupported file type: {file_path.name} (ext={suffix})")

     # ------------------------------------------------------------------ #
-    #
+    # Dispatcher
     # ------------------------------------------------------------------ #
     async def process_file(self, file_path: Union[str, Path]) -> Dict[str, Any]:
-
-
-        self.
-
-
-
-
-
-
-
+        path = Path(file_path)
+        self.validate_file(path)
+        ftype = self._detect_file_type(path)
+        return await getattr(self, f"_process_{ftype}")(path)
+
+    # ------------------------------------------------------------------ #
+    # Image processing (OpenCLIP + BLIP-2 + OCR + YOLO)
+    # ------------------------------------------------------------------ #
+    async def _process_image(self, file_path: Path) -> Dict[str, Any]:
+        loop = asyncio.get_event_loop()
+        img = await loop.run_in_executor(self._executor, Image.open, file_path)
+
+        # 1) Image vector
+        def enc_img():
+            with torch.no_grad():
+                t = self.clip_preprocess(img).unsqueeze(0).to(self.device)
+                v = self.clip_model.encode_image(t).squeeze()
+                return (v / v.norm()).float().cpu().numpy()
+
+        image_vec = await loop.run_in_executor(self._executor, enc_img)
+
+        # 2) Caption
+        def gen_cap():
+            inp = self.blip_processor(images=img, return_tensors="pt").to(self.device)
+            with torch.no_grad():
+                ids = self.blip_model.generate(**inp, max_new_tokens=50)
+            return self.blip_processor.decode(ids[0], skip_special_tokens=True)
+
+        caption = await loop.run_in_executor(self._executor, gen_cap)
+
+        # 3) OCR
+        if self.use_ocr:
+            text = await loop.run_in_executor(
+                self._executor, pytesseract.image_to_string, img
+            )
+            if t := text.strip():
+                caption += "\n" + t
+
+        # 4) Caption vector
+        def enc_txt():
+            with torch.no_grad():
+                tok = self.clip_tokenizer(caption).unsqueeze(0).to(self.device)
+                v = self.clip_model.encode_text(tok).squeeze()
+                return (v / v.norm()).float().cpu().numpy()
+
+        caption_vec = await loop.run_in_executor(self._executor, enc_txt)
+
+        # 5) YOLO regions
+        region_vectors = []
+        if self.use_detection:
+            dets = self.detector(img)[0]
+            for box in dets.boxes:
+                x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().tolist())
+                crop = img.crop((x1, y1, x2, y2))
+                vec = self.encode_image(crop)
+                region_vectors.append(
+                    {
+                        "vector": vec.tolist(),
+                        "bbox": [x1, y1, x2, y2],
+                        "label": dets.names[int(box.cls)],
+                        "conf": float(box.conf),
+                    }
+                )
+
+        # Metadata
+        sha = hashlib.sha256(file_path.read_bytes()).hexdigest()
+        w, h = img.size
+        meta = {
+            "source": str(file_path),
+            "type": "image",
+            "width": w,
+            "height": h,
+            "mime": f"image/{file_path.suffix.lstrip('.')}",
+            "sha256": sha,
+            "embedding_model": "openclip-vit-h-14",
+            "caption": caption,
         }
-        if ftype not in dispatch_map:
-            raise ValueError(f"Unsupported file type: {file_path.suffix}")

-
+        result = {
+            "content": None,
+            "metadata": meta,
+            "chunks": [caption],
+            "vectors": [image_vec.tolist()],
+            "caption_vector": caption_vec.tolist(),
+        }
+        if region_vectors:
+            result["region_vectors"] = region_vectors
+        return result

     # ------------------------------------------------------------------ #
     # PDF
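A sketch of the new image path end to end, assuming a local JPEG (the file name is a placeholder); the keys match the `result` dict assembled in `_process_image` above:

```python
import asyncio

from projectdavid.clients.file_processor import FileProcessor

async def main() -> None:
    fp = FileProcessor(use_gpu=False, use_detection=False)
    # Dispatches via _detect_file_type() to _process_image()
    res = await fp.process_file("photo.jpg")  # hypothetical local file
    print(res["metadata"]["caption"])   # BLIP-2 caption (plus OCR text, if any)
    print(len(res["vectors"][0]))       # 1024 – OpenCLIP image embedding
    print(len(res["caption_vector"]))   # 1024 – OpenCLIP text embedding

asyncio.run(main())
```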
@@ -126,7 +280,6 @@
     async def _process_pdf(self, file_path: Path) -> Dict[str, Any]:
         page_chunks, doc_meta = await self._extract_text(file_path)
         all_chunks, line_data = [], []
-
         for page_text, page_num, line_nums in page_chunks:
             lines = page_text.split("\n")
             buf, buf_lines, length = [], [], 0
@@ -165,7 +318,7 @@
         }

     # ------------------------------------------------------------------ #
-    # Plain
+    # Plain‑text / code / markup
     # ------------------------------------------------------------------ #
     async def _process_text(self, file_path: Path) -> Dict[str, Any]:
         text, extra_meta, _ = await self._extract_text(file_path)
@@ -198,7 +351,6 @@
                 continue
             texts.append(txt)
             metas.append({k: v for k, v in row.items() if k != text_field and v})
-
         vectors = await asyncio.gather(*[self._encode_chunk_async(t) for t in texts])
         return {
             "content": None,
@@ -209,7 +361,7 @@
         }

     # ------------------------------------------------------------------ #
-    # Office docs
+    # Office docs
     # ------------------------------------------------------------------ #
     async def _process_office(self, file_path: Path) -> Dict[str, Any]:
         loop = asyncio.get_event_loop()
@@ -217,11 +369,10 @@
             text = await loop.run_in_executor(
                 self._executor, self._read_docx, file_path
             )
-        else:
+        else:
             text = await loop.run_in_executor(
                 self._executor, self._read_pptx, file_path
             )
-
         chunks = self._chunk_text(text)
         vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
         return {
@@ -267,11 +418,25 @@
             return await loop.run_in_executor(
                 self._executor, self._extract_pdf_text, file_path
             )
-
-
-
+        text = await loop.run_in_executor(
+            self._executor, self._read_text_file, file_path
+        )
+        return text, {}, []
+
+    # ------------------------------------------------------------------ #
+    # util: clip‑text encoder (public)
+    # ------------------------------------------------------------------ #
+    def encode_clip_text(self, text: Union[str, List[str]]) -> np.ndarray:
+        with torch.no_grad():
+            toks = (
+                self.clip_tokenizer(text)
+                if isinstance(text, str)
+                else self.clip_tokenizer(text, truncate=True)
             )
-
+            tensor = toks.unsqueeze(0).to(self.device)
+            feat = self.clip_model.encode_text(tensor).squeeze()
+            feat = feat / feat.norm()
+            return feat.float().cpu().numpy()

     def _extract_pdf_text(self, file_path: Path):
         page_chunks, meta = [], {}
@@ -287,8 +452,8 @@
             lines = page.extract_text_lines()
             sorted_lines = sorted(lines, key=lambda x: x["top"])
             txts, nums = [], []
-            for ln_idx,
-                t =
+            for ln_idx, line in enumerate(sorted_lines, start=1):
+                t = line.get("text", "").strip()
                 if t:
                     txts.append(t)
                     nums.append(ln_idx)
@@ -362,3 +527,24 @@
             seg = tokens[i : i + self.effective_max_length]
             out.append(self.embedding_model.tokenizer.convert_tokens_to_string(seg))
         return out
+
+    # ------------------------------------------------------------------ #
+    # Retrieval helpers (optional use)
+    # ------------------------------------------------------------------ #
+    def encode_text(self, text: Union[str, List[str]]) -> np.ndarray:
+        """Embed raw text with the SentenceTransformer model."""
+        single = isinstance(text, str)
+        out = self.embedding_model.encode(
+            text,
+            convert_to_numpy=True,
+            normalize_embeddings=True,
+            show_progress_bar=False,
+        )
+        return out if not single else out[0]
+
+    def encode_image(self, img: Image.Image) -> np.ndarray:
+        with torch.no_grad():
+            tensor = self.clip_preprocess(img).unsqueeze(0).to(self.device)
+            feat = self.clip_model.encode_image(tensor).squeeze()
+            feat = feat / feat.norm()
+            return feat.float().cpu().numpy()
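The two retrieval helpers close the loop for query-time encoding. A short sketch, assuming a constructed processor (the image file is a placeholder):

```python
from PIL import Image

from projectdavid.clients.file_processor import FileProcessor

fp = FileProcessor(use_gpu=False)
text_vec = fp.encode_text("a red bicycle")            # shape (384,), L2-normalised
image_vec = fp.encode_image(Image.open("bike.png"))   # shape (1024,), L2-normalised
print(text_vec.shape, image_vec.shape)
```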
projectdavid/clients/vector_store_manager.py CHANGED

@@ -50,11 +50,18 @@ class VectorStoreManager(BaseVectorStore):
     def create_store(
         self,
         collection_name: str,
+        *,
         vector_size: int = 384,
         distance: str = "COSINE",
+        vectors_config: Optional[Dict[str, qdrant.VectorParams]] = None,
     ) -> dict:
+        """
+        Create or recreate a Qdrant collection. By default creates a single-vector
+        collection with `vector_size`. To define multi-vector schema, pass
+        `vectors_config` mapping field names to VectorParams.
+        """
         try:
-            #
+            # existence check
             if any(
                 col.name == collection_name
                 for col in self.client.get_collections().collections
@@ -65,16 +72,27 @@
             if dist not in qdrant.Distance.__members__:
                 raise ValueError(f"Invalid distance metric '{distance}'")

+            # choose schema
+            if vectors_config:
+                config = vectors_config
+            else:
+                config = {
+                    "_default": qdrant.VectorParams(
+                        size=vector_size, distance=qdrant.Distance[dist]
+                    )
+                }
+
+            # recreate with full schema
             self.client.recreate_collection(
                 collection_name=collection_name,
-                vectors_config=
-                    size=vector_size, distance=qdrant.Distance[dist]
-                ),
+                vectors_config=config,
             )
+            # record metadata for each field
             self.active_stores[collection_name] = {
                 "created_at": int(time.time()),
                 "vector_size": vector_size,
                 "distance": dist,
+                "fields": list(config.keys()),
            }
             log.info("Created Qdrant collection %s", collection_name)
             return {"collection_name": collection_name, "status": "created"}
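With `vectors_config`, one collection can now hold several named vector columns. A sketch of a two-column schema, assuming a reachable local Qdrant instance (the store name is a placeholder):

```python
from qdrant_client.http import models as qdrant

from projectdavid.clients.vector_store_manager import VectorStoreManager

manager = VectorStoreManager(vector_store_host="localhost")
manager.create_store(
    "demo_store",  # hypothetical collection name
    vectors_config={
        "caption_vector": qdrant.VectorParams(
            size=1024, distance=qdrant.Distance.COSINE
        ),
        "geo_vector": qdrant.VectorParams(size=3, distance=qdrant.Distance.COSINE),
    },
)
```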
@@ -103,8 +121,9 @@
                 "name": store_name,
                 "status": "active",
                 "vectors_count": info.points_count,
-                "configuration": info.config.params
+                "configuration": info.config.params,
                 "created_at": self.active_stores[store_name]["created_at"],
+                "fields": self.active_stores[store_name].get("fields"),
             }
         except Exception as e:
             log.error("Store info failed: %s", e)
@@ -119,6 +138,8 @@
         texts: List[str],
         vectors: List[List[float]],
         metadata: List[dict],
+        *,
+        vector_name: Optional[str] = None,  # NEW
     ):
         if not vectors:
             raise ValueError("Empty vectors list")
@@ -136,7 +157,13 @@
             for txt, vec, meta in zip(texts, vectors, metadata)
         ]
         try:
-
+            # pass vector_name if multi-column
+            self.client.upsert(
+                collection_name=store_name,
+                points=points,
+                wait=True,
+                vector_name=vector_name,  # ignored if None
+            )
             return {"status": "success", "points_inserted": len(points)}
         except Exception as e:
             log.error("Add‑to‑store failed: %s", e)
@@ -189,15 +216,25 @@
         query_vector: List[float],
         top_k: int = 5,
         filters: Optional[dict] = None,
+        *,
+        vector_field: Optional[str] = None,  # ← NEW
         score_threshold: float = 0.0,
         offset: int = 0,
         limit: Optional[int] = None,
     ) -> List[dict]:
-        """
+        """
+        Run a similarity search against *store_name*.
+
+        • Works with any Qdrant-client ≥ 1.0
+        • `vector_field` lets you target a non-default vector column
+          (e.g. ``\"caption_vector\"`` for image stores). Pass **None**
+          to use the collection’s default vector.
+        """

         limit = limit or top_k
         flt = self._dict_to_filter(filters) if filters else None

+        # ── shared kwargs ----------------------------------------------------
         common: Dict[str, Any] = dict(
             collection_name=store_name,
             query_vector=query_vector,
@@ -207,20 +244,21 @@
             with_payload=True,
             with_vectors=False,
         )
+        if vector_field:  # ← inject when requested
+            common["vector_name"] = vector_field

+        # ── call search (new client first, fallback to old) ------------------
         try:
-
-            res = self.client.search(**common, filter=flt)  # type: ignore[arg-type]
+            res = self.client.search(**common, filter=flt)  # ≥ 1.6
         except AssertionError as ae:
             if "Unknown arguments" not in str(ae):
                 raise
-
-            res = self.client.search(**common, query_filter=flt)  # type: ignore[arg-type]
-
+            res = self.client.search(**common, query_filter=flt)  # < 1.6
         except Exception as e:
             log.error("Query failed: %s", e)
             raise VectorStoreError(f"Query failed: {e}") from e

+        # ── normalise result -------------------------------------------------
         return [
             {
                 "id": p.id,
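Targeting a named column at query time then looks like this; a sketch assuming the `demo_store` collection from the earlier example has been populated:

```python
hits = manager.query_store(
    store_name="demo_store",
    query_vector=[0.0] * 1024,      # stand-in 1024-D query embedding
    top_k=3,
    vector_field="caption_vector",  # omit (None) to search the default column
)
for hit in hits:
    print(hit["id"])
```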
projectdavid/clients/vectors.py CHANGED

@@ -13,8 +13,10 @@ from typing import Any, Dict, List, Optional, Union

 import httpx
 from dotenv import load_dotenv
+from PIL import Image
 from projectdavid_common import UtilsInterface, ValidationInterface
 from pydantic import BaseModel, Field
+from qdrant_client.http import models as qdrant

 from projectdavid.clients.file_processor import FileProcessor
 from projectdavid.clients.vector_store_manager import VectorStoreManager
@@ -61,13 +63,16 @@
     • create_vector_store() no longer takes user_id; ownership from token.
     """

-    #
+    # ------------------------------------------------------------------ #
+    # Construction / cleanup
+    # ------------------------------------------------------------------ #
     def __init__(
         self,
         base_url: Optional[str] = None,
         api_key: Optional[str] = None,
         *,
         vector_store_host: str = "localhost",
+        file_processor_kwargs: Optional[dict] = None,  # 🔶 add arg
     ):
         self.base_url = (base_url or os.getenv("BASE_URL", "")).rstrip("/")
         self.api_key = api_key or os.getenv("API_KEY")
@@ -84,10 +89,12 @@
             base_url=self.base_url, headers=self._base_headers, timeout=30.0
         )

-        # Local helpers
+        # Local helpers ---------------------------------------------------
         self.vector_manager = VectorStoreManager(vector_store_host=vector_store_host)
         self.identifier_service = UtilsInterface.IdentifierService()
-
+
+        # 🔶 forward kwargs into the upgraded FileProcessor
+        self.file_processor = FileProcessor(**(file_processor_kwargs or {}))

         log.info("VectorStoreClient → %s", self.base_url)

@@ -180,12 +187,15 @@
         vector_size: int,
         distance_metric: str,
         config: Optional[Dict[str, Any]],
+        vectors_config: Optional[Dict[str, qdrant.VectorParams]] = None,  # ← NEW
     ) -> ValidationInterface.VectorStoreRead:
         shared_id = self.identifier_service.generate_vector_id()
+        # forward multi-vector schema if given
         self.vector_manager.create_store(
             collection_name=shared_id,
             vector_size=vector_size,
             distance=distance_metric.upper(),
+            vectors_config=vectors_config,
         )

         payload = {
@@ -198,10 +208,6 @@
         resp = await self._request("POST", "/v1/vector-stores", json=payload)
         return ValidationInterface.VectorStoreRead.model_validate(resp)

-    async def _list_my_vs_async(self) -> List[ValidationInterface.VectorStoreRead]:
-        resp = await self._request("GET", "/v1/vector-stores")
-        return [ValidationInterface.VectorStoreRead.model_validate(r) for r in resp]
-
     # ------------------------------------------------------------------ #
     # NEW admin‑aware creation helper
     # ------------------------------------------------------------------ #
@@ -212,13 +218,17 @@
         vector_size: int,
         distance_metric: str,
         config: Optional[Dict[str, Any]],
+        vectors_config: Optional[Dict[str, qdrant.VectorParams]] = None,  # ← NEW
     ) -> ValidationInterface.VectorStoreRead:
         shared_id = self.identifier_service.generate_vector_id()
+        # forward multi-vector schema if given
         self.vector_manager.create_store(
             collection_name=shared_id,
             vector_size=vector_size,
             distance=distance_metric.upper(),
+            vectors_config=vectors_config,
         )
+
         payload = {
             "shared_id": shared_id,
             "name": name,
@@ -226,7 +236,6 @@
             "distance_metric": distance_metric.upper(),
             "config": config or {},
         }
-        # pass owner_id as query‑param (backend enforces admin‑only)
         resp = await self._request(
             "POST",
             "/v1/vector-stores",
@@ -282,25 +291,63 @@
     async def _search_vs_async(
         self,
         vector_store_id: str,
-        query_text: str,
+        query_text: Union[str, List[float]],
         top_k: int,
         filters: Optional[Dict] = None,
         vector_store_host: Optional[str] = None,
+        vector_field: Optional[str] = None,  # allow caller override
     ) -> List[Dict[str, Any]]:
-
-
-            vector_manager = VectorStoreManager(vector_store_host=vector_store_host)
-        else:
-            vector_manager = self.vector_manager
+        """
+        Internal: run ANN search against the specified vector field or auto-detect by store size.

+        If `vector_field` is provided, it will be used directly. Otherwise:
+            • 1024-D → caption_vector
+            • 3-D → geo_vector
+            • others → default vector (text)
+        """
+        # pick local vs. override host
+        vector_manager = (
+            VectorStoreManager(vector_store_host=vector_store_host)
+            if vector_store_host
+            else self.vector_manager
+        )
+
+        # fetch store info to inspect schema
         store = self.retrieve_vector_store_sync(vector_store_id)
-        vec = self.file_processor.embedding_model.encode(query_text).tolist()

+        # determine the query vector and target field
+        if vector_field is not None:
+            # if caller passed a raw vector list, use it; otherwise treat as caption search
+            if isinstance(query_text, list):
+                vec = query_text
+            else:
+                vec = self.file_processor.encode_clip_text(query_text).tolist()
+        else:
+            # auto-detect based on stored vector dimensionality
+            if store.vector_size == 1024:
+                # image/caption space
+                vec = self.file_processor.encode_clip_text(query_text).tolist()
+                vector_field = "caption_vector"
+            elif store.vector_size == 3:
+                # geo space; query_text must be a raw 3-D list
+                if not isinstance(query_text, list):
+                    raise VectorStoreClientError(
+                        "Geo search requires a 3-element vector; pass raw unit-sphere list"
+                    )
+                vec = query_text
+                vector_field = "geo_vector"
+            else:
+                # fallback to text embedding
+                vec = self.file_processor.encode_text(query_text).tolist()
+                vector_field = None  # use default
+
+        # perform the search on the selected vector column
         return vector_manager.query_store(
             store_name=store.collection_name,
             query_vector=vec,
             top_k=top_k,
             filters=filters,
+            vector_field=vector_field,
         )

     async def _delete_vs_async(
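The auto-detect branch means a caller can pass a raw 3-element unit vector and land on `geo_vector` without naming the field. A sketch, assuming a configured `VectorStoreClient` instance and a 3-D geo store (the client variable and store id are placeholders):

```python
from projectdavid.clients.file_processor import latlon_to_unit_vec

geo_query = latlon_to_unit_vec(48.8584, 2.2945)  # Eiffel Tower
hits = client.vector_file_search_raw(            # `client`: a VectorStoreClient
    vector_store_id="vs_demo",                   # hypothetical store id
    query_text=geo_query,                        # 3-element list → geo_vector
    top_k=5,
)
```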
@@ -427,13 +474,65 @@
         vector_size: int = 384,
         distance_metric: str = "Cosine",
         config: Optional[Dict[str, Any]] = None,
+        vectors_config: Optional[Dict[str, qdrant.VectorParams]] = None,  # ← NEW
     ) -> ValidationInterface.VectorStoreRead:
-        """
+        """
+        Create a new store owned by this API key.
+
+        If `vectors_config` is provided, it should map each vector
+        field name to its Qdrant VectorParams (size + distance).
+        """
         return self._run_sync(
-            self._create_vs_async(
+            self._create_vs_async(
+                name,
+                vector_size,
+                distance_metric,
+                config,
+                vectors_config,
+            )
         )

-    def
+    def create_vector_vision_store(
+        self,
+        name: str,
+        *,
+        vector_size: int = 384,
+        distance_metric: str = "Cosine",
+        config: Optional[Dict[str, Any]] = None,
+        vectors_config: Optional[Dict[str, qdrant.VectorParams]] = None,  # ← NEW
+    ) -> ValidationInterface.VectorStoreRead:
+
+        if not vectors_config:
+            vectors_config = {
+                # Raw visual embeddings (OpenCLIP ViT-H/14 → 1024-D)
+                "image_vector": qdrant.VectorParams(
+                    size=1024, distance=qdrant.Distance.COSINE
+                ),
+                # Language embeddings of your BLIP-2 captions → 1024-D
+                "caption_vector": qdrant.VectorParams(
+                    size=1024, distance=qdrant.Distance.COSINE
+                ),
+                # Object-region embeddings (YOLO crop + Sentence-BERT) → 1024-D
+                "region_vector": qdrant.VectorParams(
+                    size=1024, distance=qdrant.Distance.COSINE
+                ),
+                # Geo-location unit vectors (RegioNet) → 3-D
+                "geo_vector": qdrant.VectorParams(
+                    size=3, distance=qdrant.Distance.COSINE
+                ),
+            }
+
+        return self._run_sync(
+            self._create_vs_async(
+                name,
+                vector_size,
+                distance_metric,
+                config,
+                vectors_config,
+            )
+        )
+
+    def create_vector_vision_store_for_user(
         self,
         owner_id: str,
         name: str,
@@ -441,16 +540,66 @@
         vector_size: int = 384,
         distance_metric: str = "Cosine",
         config: Optional[Dict[str, Any]] = None,
+        vectors_config: Optional[Dict[str, qdrant.VectorParams]] = None,  # ← NEW
     ) -> ValidationInterface.VectorStoreRead:
         """
-
+        Admin-only: create a store on behalf of another user.
+        Pass `vectors_config` to define a multi-vector schema.
+        """
+        if not vectors_config:
+
+            vectors_config = {
+                # Raw visual embeddings (OpenCLIP ViT-H/14 → 1024-D)
+                "image_vector": qdrant.VectorParams(
+                    size=1024, distance=qdrant.Distance.COSINE
+                ),
+                # Language embeddings of your BLIP-2 captions → 1024-D
+                "caption_vector": qdrant.VectorParams(
+                    size=1024, distance=qdrant.Distance.COSINE
+                ),
+                # Object-region embeddings (YOLO crop + Sentence-BERT) → 1024-D
+                "region_vector": qdrant.VectorParams(
+                    size=1024, distance=qdrant.Distance.COSINE
+                ),
+                # Geo-location unit vectors (RegioNet) → 3-D
+                "geo_vector": qdrant.VectorParams(
+                    size=3, distance=qdrant.Distance.COSINE
+                ),
+            }

-
-
+        return self._run_sync(
+            self._create_vs_for_user_async(
+                owner_id,
+                name,
+                vector_size,
+                distance_metric,
+                config,
+                vectors_config,
+            )
+        )
+
+    def create_vector_store_for_user(
+        self,
+        owner_id: str,
+        name: str,
+        *,
+        vector_size: int = 384,
+        distance_metric: str = "Cosine",
+        config: Optional[Dict[str, Any]] = None,
+        vectors_config: Optional[Dict[str, qdrant.VectorParams]] = None,  # ← NEW
+    ) -> ValidationInterface.VectorStoreRead:
+        """
+        Admin-only: create a store on behalf of another user.
+        Pass `vectors_config` to define a multi-vector schema.
         """
         return self._run_sync(
             self._create_vs_for_user_async(
-                owner_id,
+                owner_id,
+                name,
+                vector_size,
+                distance_metric,
+                config,
+                vectors_config,
             )
         )

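A sketch of creating a vision store with the default four-column schema; the URL and API key are placeholders for a running backend:

```python
from projectdavid.clients.vectors import VectorStoreClient

client = VectorStoreClient(base_url="http://localhost:9000", api_key="sk-...")
store = client.create_vector_vision_store("my-images")
# Default columns: image_vector (1024-D), caption_vector (1024-D),
#                  region_vector (1024-D), geo_vector (3-D), all cosine.
print(store)
```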
@@ -629,10 +778,16 @@
         top_k: int = 5,
         filters: Optional[Dict] = None,
         vector_store_host: Optional[str] = None,
+        vector_field: Optional[str] = None,  # ← NEW
     ) -> List[Dict[str, Any]]:
         return self._run_sync(
             self._search_vs_async(
-                vector_store_id,
+                vector_store_id,
+                query_text,
+                top_k,
+                filters,
+                vector_store_host,
+                vector_field,
             )
         )

@@ -796,3 +951,91 @@
         hits = self._normalise_hits(hits)

         return hits
+
+    def image_similarity_search(
+        self,
+        vector_store_id: str,
+        img: Image.Image,
+        k: int = 10,
+        vector_store_host: Optional[str] = None,
+    ) -> List[Dict[str, Any]]:
+        vec = self.file_processor.encode_image(img).tolist()
+        return self.vector_file_search_raw(
+            vector_store_id=vector_store_id,
+            query_text=vec,
+            top_k=k,
+            filters=None,
+            vector_store_host=vector_store_host,
+            vector_field="image_vector",
+        )
+
+    def search_images(
+        self,
+        vector_store_id: str,
+        query: Union[str, Image.Image, List[float]],
+        *,
+        modality: Optional[str] = None,
+        k: int = 10,
+        vector_store_host: Optional[str] = None,
+    ) -> List[Dict[str, Any]]:
+        """
+        Unified image search across multiple modalities, with appropriate reranking:
+
+        - If `query` is a str → caption search (reranked)
+        - If `query` is a PIL.Image.Image → visual search (no rerank)
+        - If `query` is a list[float] → raw vector search
+        - `modality` override: one of 'caption', 'image', 'region', 'geo'
+        """
+        # Map modality to (vector_field, encoder)
+        field_map = {
+            "caption": (
+                "caption_vector",
+                lambda q: self.file_processor.encode_clip_text(q).tolist(),
+            ),
+            "image": (
+                "image_vector",
+                lambda q: self.file_processor.encode_image(q).tolist(),
+            ),
+            "region": (
+                "region_vector",
+                lambda q: self.file_processor.encode_text(q).tolist(),
+            ),
+            "geo": ("geo_vector", lambda q: q),  # assume q is raw 3-D vector
+        }
+
+        # Auto-detect if not provided
+        if modality is None:
+            if isinstance(query, str):
+                modality = "caption"
+            elif isinstance(query, Image.Image):
+                modality = "image"
+            elif isinstance(query, list):
+                modality = "image"
+            else:
+                raise VectorStoreClientError(f"Unsupported query type: {type(query)}")
+
+        modality = modality.lower()
+        if modality not in field_map:
+            raise VectorStoreClientError(f"Unknown modality '{modality}'")
+
+        vector_field, encoder = field_map[modality]
+        vec = encoder(query)
+
+        # 1️⃣ ANN search
+        hits = self.vector_file_search_raw(
+            vector_store_id=vector_store_id,
+            query_text=vec,
+            top_k=k,
+            filters=None,
+            vector_store_host=vector_store_host,
+            vector_field=vector_field,
+        )
+
+        # 2️⃣ Rerank for text-based modalities
+        if modality in ("caption", "region"):
+            hits = reranker.rerank(
+                query if isinstance(query, str) else "", hits, top_k=min(len(hits), k)
+            )
+
+        # 3️⃣ Normalize and return
+        return self._normalise_hits(hits)
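Putting it together, `search_images` routes on the query type; a sketch continuing from the client above (file name and store id are placeholders):

```python
from PIL import Image

from projectdavid.clients.file_processor import latlon_to_unit_vec

# Caption search (str → caption_vector, reranked)
hits = client.search_images("vs_demo", "a dog on a beach", k=5)

# Visual search (PIL image → image_vector, no rerank)
hits = client.search_images("vs_demo", Image.open("query.jpg"), k=5)

# Geo search (explicit modality, raw 3-D unit vector)
hits = client.search_images(
    "vs_demo", latlon_to_unit_vec(35.6586, 139.7454), modality="geo"
)
```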
{projectdavid-1.32.21.dist-info → projectdavid-1.33.1.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: projectdavid
-Version: 1.
+Version: 1.33.1
 Summary: Python SDK for interacting with the Entities Assistant API.
 Author-email: Francis Neequaye Armah <francis.neequaye@projectdavid.co.uk>
 License: PolyForm Noncommercial License 1.0.0
@@ -29,6 +29,13 @@ Requires-Dist: sseclient-py
 Requires-Dist: requests
 Requires-Dist: python-docx
 Requires-Dist: python-pptx
+Requires-Dist: open_clip_torch>=2.24
+Requires-Dist: pillow>=10.2
+Requires-Dist: transformers>=4.41
+Requires-Dist: accelerate>=0.28
+Requires-Dist: sentencepiece>=0.2
+Requires-Dist: ultralytics>=8.2.21
+Requires-Dist: pytesseract>=0.3
 Provides-Extra: dev
 Requires-Dist: black>=23.3; extra == "dev"
 Requires-Dist: isort>=5.12; extra == "dev"
@@ -36,6 +43,17 @@ Requires-Dist: pytest>=7.2; extra == "dev"
 Requires-Dist: mypy>=1.0; extra == "dev"
 Requires-Dist: build; extra == "dev"
 Requires-Dist: twine; extra == "dev"
+Provides-Extra: vision
+Requires-Dist: torch>=2.2.1; extra == "vision"
+Requires-Dist: torchvision>=0.17.1; extra == "vision"
+Requires-Dist: torchaudio>=2.2.1; extra == "vision"
+Requires-Dist: open_clip_torch>=2.24; extra == "vision"
+Requires-Dist: pillow>=10.2; extra == "vision"
+Requires-Dist: transformers>=4.41; extra == "vision"
+Requires-Dist: accelerate>=0.28; extra == "vision"
+Requires-Dist: sentencepiece>=0.2; extra == "vision"
+Requires-Dist: ultralytics>=8.2.21; extra == "vision"
+Requires-Dist: pytesseract>=0.3; extra == "vision"
 Dynamic: license-file

 # Entity — by Project David
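Note on the metadata above: the new vision dependencies (open_clip_torch, pillow, transformers, accelerate, sentencepiece, ultralytics, pytesseract) are declared twice — once unconditionally, so a plain `pip install projectdavid` now pulls the full image stack, and again behind the new `vision` extra, which additionally pins torch, torchvision, and torchaudio (`pip install "projectdavid[vision]"`).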
{projectdavid-1.32.21.dist-info → projectdavid-1.33.1.dist-info}/RECORD CHANGED

@@ -9,7 +9,7 @@ projectdavid/clients/assistants_client.py,sha256=SsIGa5wPr7ga9WX0ywam3djUF-uWFdk
 projectdavid/clients/base_client.py,sha256=UWl6nr6sxD1_xC6iyptQDR1tnNdFCOrEx5cEUPCRqJE,3417
 projectdavid/clients/base_vector_store.py,sha256=jXivmqAW1bgYcLgIeW-hPxOiWZbs2hCsLy4oWzSvpNI,2061
 projectdavid/clients/event_handler.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-projectdavid/clients/file_processor.py,sha256=
+projectdavid/clients/file_processor.py,sha256=nFccQmiow3lkjv1-Pdgv_2WQAtSy0FRN7oJlTKt4fs4,21114
 projectdavid/clients/file_search.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 projectdavid/clients/files_client.py,sha256=XkIDzbQFGDrd88taf0Kouc_4YJOPIYEHiIyWYLKDofI,15581
 projectdavid/clients/inference_client.py,sha256=xz4ACPv5Tkis604QxO5mJX1inH_TGDfQP-31geETYpE,6609
@@ -19,8 +19,8 @@ projectdavid/clients/synchronous_inference_wrapper.py,sha256=mN5WAHmv0aRoeMIb7XP
 projectdavid/clients/threads_client.py,sha256=ekzU5w14zftmtmFkiec3NC90Of-_KVSUY1qH9cmfSFg,6771
 projectdavid/clients/tools_client.py,sha256=GkCVOmwpAoPqVt6aYmH0G1HIFha3iEwR9IIf9teR0j8,11487
 projectdavid/clients/users_client.py,sha256=eCuUb9qvyH1GUFhZu6TRL9zdoK-qzHSs8-Vmrk_0mmg,13729
-projectdavid/clients/vector_store_manager.py,sha256=
-projectdavid/clients/vectors.py,sha256=
+projectdavid/clients/vector_store_manager.py,sha256=q-ZgRQVX_S3nMrKYhmvkVrDjDRzM3ZFzUF55HBGRTe8,12861
+projectdavid/clients/vectors.py,sha256=hJeZS174evrOcZLVtYjnlq1dUFgWx3p-DpVbkfbb4k4,39882
 projectdavid/constants/platform.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 projectdavid/services/logging_service.py,sha256=jdoRL46E42Ar8JFTDOV-xVD67CulcHSN-xhcEqA5CXQ,2643
 projectdavid/synthesis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -34,8 +34,8 @@ projectdavid/utils/monitor_launcher.py,sha256=3YAgJdeuaUvq3JGvpA4ymqFsAnk29nH5q9
 projectdavid/utils/peek_gate.py,sha256=5whMRnDOQjATRpThWDJkvY9ScXuJ7Sd_-9rvGgXeTAQ,2532
 projectdavid/utils/run_monitor.py,sha256=F_WkqIP-qnWH-4llIbileWWLfRj2Q1Cg-ni23SR1rec,3786
 projectdavid/utils/vector_search_formatter.py,sha256=YTe3HPGec26qGY7uxY8_GS8lc4QaN6aNXMzkl29nZpI,1735
-projectdavid-1.
-projectdavid-1.
-projectdavid-1.
-projectdavid-1.
-projectdavid-1.
+projectdavid-1.33.1.dist-info/licenses/LICENSE,sha256=_8yjiEGttpS284BkfhXxfERqTRZW_tUaHiBB0GTJTMg,4563
+projectdavid-1.33.1.dist-info/METADATA,sha256=-tWFpgevsgdM0-J4I38xM8Kp4EWftSflMaugZ6jtyLU,11554
+projectdavid-1.33.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+projectdavid-1.33.1.dist-info/top_level.txt,sha256=kil8GU4s7qYRfNnzGnFHhZnSNRSxgNG-J4HLgQMmMtw,13
+projectdavid-1.33.1.dist-info/RECORD,,