projectdavid 1.32.21.tar.gz → 1.33.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of projectdavid might be problematic.

Files changed (68)
  1. {projectdavid-1.32.21 → projectdavid-1.33.1}/CHANGELOG.md +25 -0
  2. {projectdavid-1.32.21/src/projectdavid.egg-info → projectdavid-1.33.1}/PKG-INFO +19 -1
  3. {projectdavid-1.32.21 → projectdavid-1.33.1}/pyproject.toml +29 -2
  4. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/clients/file_processor.py +232 -46
  5. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/clients/vector_store_manager.py +50 -12
  6. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/clients/vectors.py +266 -23
  7. {projectdavid-1.32.21 → projectdavid-1.33.1/src/projectdavid.egg-info}/PKG-INFO +19 -1
  8. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid.egg-info/requires.txt +19 -0
  9. {projectdavid-1.32.21 → projectdavid-1.33.1}/LICENSE +0 -0
  10. {projectdavid-1.32.21 → projectdavid-1.33.1}/MANIFEST.in +0 -0
  11. {projectdavid-1.32.21 → projectdavid-1.33.1}/README.md +0 -0
  12. {projectdavid-1.32.21 → projectdavid-1.33.1}/docs/assistants.md +0 -0
  13. {projectdavid-1.32.21 → projectdavid-1.33.1}/docs/code_interpretation.md +0 -0
  14. {projectdavid-1.32.21 → projectdavid-1.33.1}/docs/database.md +0 -0
  15. {projectdavid-1.32.21 → projectdavid-1.33.1}/docs/database_assistant_example.md +0 -0
  16. {projectdavid-1.32.21 → projectdavid-1.33.1}/docs/docker_comtainers.md +0 -0
  17. {projectdavid-1.32.21 → projectdavid-1.33.1}/docs/file_search.md +0 -0
  18. {projectdavid-1.32.21 → projectdavid-1.33.1}/docs/files.md +0 -0
  19. {projectdavid-1.32.21 → projectdavid-1.33.1}/docs/function_call_definition.md +0 -0
  20. {projectdavid-1.32.21 → projectdavid-1.33.1}/docs/function_calls.md +0 -0
  21. {projectdavid-1.32.21 → projectdavid-1.33.1}/docs/handling_function_calls.md +0 -0
  22. {projectdavid-1.32.21 → projectdavid-1.33.1}/docs/inference.md +0 -0
  23. {projectdavid-1.32.21 → projectdavid-1.33.1}/docs/messages.md +0 -0
  24. {projectdavid-1.32.21 → projectdavid-1.33.1}/docs/runs.md +0 -0
  25. {projectdavid-1.32.21 → projectdavid-1.33.1}/docs/streams.md +0 -0
  26. {projectdavid-1.32.21 → projectdavid-1.33.1}/docs/threads.md +0 -0
  27. {projectdavid-1.32.21 → projectdavid-1.33.1}/docs/tools.md +0 -0
  28. {projectdavid-1.32.21 → projectdavid-1.33.1}/docs/users.md +0 -0
  29. {projectdavid-1.32.21 → projectdavid-1.33.1}/docs/vector_store.md +0 -0
  30. {projectdavid-1.32.21 → projectdavid-1.33.1}/docs/versioning.md +0 -0
  31. {projectdavid-1.32.21 → projectdavid-1.33.1}/setup.cfg +0 -0
  32. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/__init__.py +0 -0
  33. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/_version.py +0 -0
  34. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/clients/actions_client.py +0 -0
  35. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/clients/api_key_client.py +0 -0
  36. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/clients/assistants_client.py +0 -0
  37. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/clients/base_client.py +0 -0
  38. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/clients/base_vector_store.py +0 -0
  39. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/clients/event_handler.py +0 -0
  40. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/clients/file_search.py +0 -0
  41. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/clients/files_client.py +0 -0
  42. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/clients/inference_client.py +0 -0
  43. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/clients/messages_client.py +0 -0
  44. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/clients/runs.py +0 -0
  45. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/clients/synchronous_inference_wrapper.py +0 -0
  46. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/clients/threads_client.py +0 -0
  47. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/clients/tools_client.py +0 -0
  48. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/clients/users_client.py +0 -0
  49. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/constants/platform.py +0 -0
  50. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/entity.py +0 -0
  51. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/events.py +0 -0
  52. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/serializers.py +0 -0
  53. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/services/logging_service.py +0 -0
  54. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/synthesis/__init__.py +0 -0
  55. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/synthesis/llm_synthesizer.py +0 -0
  56. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/synthesis/prompt.py +0 -0
  57. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/synthesis/reranker.py +0 -0
  58. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/synthesis/retriever.py +0 -0
  59. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/utils/__init__.py +0 -0
  60. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/utils/function_call_suppressor.py +0 -0
  61. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/utils/monitor_launcher.py +0 -0
  62. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/utils/peek_gate.py +0 -0
  63. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/utils/run_monitor.py +0 -0
  64. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/utils/vector_search_formatter.py +0 -0
  65. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid.egg-info/SOURCES.txt +0 -0
  66. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid.egg-info/dependency_links.txt +0 -0
  67. {projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid.egg-info/top_level.txt +0 -0
  68. {projectdavid-1.32.21 → projectdavid-1.33.1}/tests/test_clients.py +0 -0
{projectdavid-1.32.21 → projectdavid-1.33.1}/CHANGELOG.md

@@ -1,3 +1,28 @@
+ ## [1.33.1](https://github.com/frankie336/projectdavid/compare/v1.33.0...v1.33.1) (2025-06-10)
+
+
+ ### Bug Fixes
+
+ * Add create_vector_vision_store_for_user ([392813b](https://github.com/frankie336/projectdavid/commit/392813bef20e12c2aca456e349b6d937e686f78c))
+
+ # [1.33.0](https://github.com/frankie336/projectdavid/compare/v1.32.21...v1.33.0) (2025-06-10)
+
+
+ ### Features
+
+ * Add support for multi-modal image search ([58e7e27](https://github.com/frankie336/projectdavid/commit/58e7e270be849e36bcd93e6a19942fa3e8abbd25))
+ * Add support for multi-modal image search-1 ([b8ebc7c](https://github.com/frankie336/projectdavid/commit/b8ebc7c4fb73cec0bff1b98ee45fa5b52e41a9b3))
+ * Add support for multi-modal image search-1 ([2362069](https://github.com/frankie336/projectdavid/commit/2362069e4b5390b4eb2b1007a413a6adb1a8bc7b))
+ * Add support for multi-modal image search-2 ([07f81fe](https://github.com/frankie336/projectdavid/commit/07f81fe0a475652bc6d316f3dc45e341452f43b7))
+ * Add support for multi-modal image search-3 ([29bce72](https://github.com/frankie336/projectdavid/commit/29bce72b12e3b2b5d2daeafe2367908e0cc3b402))
+ * Add support for multi-modal image search-3 ([3f8149e](https://github.com/frankie336/projectdavid/commit/3f8149e31371efa8727b96fa16d92fbe5474f727))
+ * Add support for multi-modal image search-4 ([b434d6d](https://github.com/frankie336/projectdavid/commit/b434d6d035324f444b46bd49dd15cbed528527a5))
+ * Add support for multi-modal image search-4 ([6acddf0](https://github.com/frankie336/projectdavid/commit/6acddf0c3b38ed6ca9e786ddb6d8ebf1a1328ac5))
+ * Add support for multi-modal image search-5 ([1dd9dd9](https://github.com/frankie336/projectdavid/commit/1dd9dd9d91556df8a0089255efad82bfe3f9a6b6))
+ * Add support for multi-modal image search-6 ([33a6069](https://github.com/frankie336/projectdavid/commit/33a6069b9f7a9e9007c156d511b3cb8abf859760))
+ * Add support for multi-modal image search-7 ([01d68e5](https://github.com/frankie336/projectdavid/commit/01d68e591c8dbc52c81b6bfcd522bb95d27c9ddd))
+ * Add support for multi-modal image search-8 ([8663b2a](https://github.com/frankie336/projectdavid/commit/8663b2ab7f0f035ae953281d86ba01a0db926839))
+
  ## [1.32.21](https://github.com/frankie336/projectdavid/compare/v1.32.20...v1.32.21) (2025-06-10)


{projectdavid-1.32.21/src/projectdavid.egg-info → projectdavid-1.33.1}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: projectdavid
- Version: 1.32.21
+ Version: 1.33.1
  Summary: Python SDK for interacting with the Entities Assistant API.
  Author-email: Francis Neequaye Armah <francis.neequaye@projectdavid.co.uk>
  License: PolyForm Noncommercial License 1.0.0
@@ -29,6 +29,13 @@ Requires-Dist: sseclient-py
  Requires-Dist: requests
  Requires-Dist: python-docx
  Requires-Dist: python-pptx
+ Requires-Dist: open_clip_torch>=2.24
+ Requires-Dist: pillow>=10.2
+ Requires-Dist: transformers>=4.41
+ Requires-Dist: accelerate>=0.28
+ Requires-Dist: sentencepiece>=0.2
+ Requires-Dist: ultralytics>=8.2.21
+ Requires-Dist: pytesseract>=0.3
  Provides-Extra: dev
  Requires-Dist: black>=23.3; extra == "dev"
  Requires-Dist: isort>=5.12; extra == "dev"
@@ -36,6 +43,17 @@ Requires-Dist: pytest>=7.2; extra == "dev"
  Requires-Dist: mypy>=1.0; extra == "dev"
  Requires-Dist: build; extra == "dev"
  Requires-Dist: twine; extra == "dev"
+ Provides-Extra: vision
+ Requires-Dist: torch>=2.2.1; extra == "vision"
+ Requires-Dist: torchvision>=0.17.1; extra == "vision"
+ Requires-Dist: torchaudio>=2.2.1; extra == "vision"
+ Requires-Dist: open_clip_torch>=2.24; extra == "vision"
+ Requires-Dist: pillow>=10.2; extra == "vision"
+ Requires-Dist: transformers>=4.41; extra == "vision"
+ Requires-Dist: accelerate>=0.28; extra == "vision"
+ Requires-Dist: sentencepiece>=0.2; extra == "vision"
+ Requires-Dist: ultralytics>=8.2.21; extra == "vision"
+ Requires-Dist: pytesseract>=0.3; extra == "vision"
  Dynamic: license-file

  # Entity — by Project David
{projectdavid-1.32.21 → projectdavid-1.33.1}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "projectdavid"
- version = "1.32.21"
+ version = "1.33.1"
  description = "Python SDK for interacting with the Entities Assistant API."
  readme = "README.md"
  authors = [
@@ -26,10 +26,18 @@ dependencies = [
      "validators>=0.29.0,<0.35.0",
      "sentence-transformers>=3.4.0,<5.0",
      "sseclient-py",
-     "requests",
+     "requests",
      "python-docx",
      "python-pptx",

+     # Vision / multimodal dependencies
+     "open_clip_torch>=2.24",
+     "pillow>=10.2",
+     "transformers>=4.41",
+     "accelerate>=0.28",
+     "sentencepiece>=0.2",
+     "ultralytics>=8.2.21",
+     "pytesseract>=0.3",
  ]

  classifiers = [
@@ -52,5 +60,24 @@ dev = [
      "twine"
  ]

+ vision = [
+     # Users must supply the correct torch wheel (cpu / cu121 / cu118) at install time
+     "torch>=2.2.1",
+     "torchvision>=0.17.1",
+     "torchaudio>=2.2.1",
+
+     # OpenCLIP + captioning stack
+     "open_clip_torch>=2.24",
+     "pillow>=10.2",
+     "transformers>=4.41",
+     "accelerate>=0.28",
+     "sentencepiece>=0.2",
+     "ultralytics>=8.2.21",
+     "pytesseract>=0.3",
+
+     # Geolocation package pending release (uncomment when available)
+     # "geoloc-regio-net>=0.2.0 ; extra == 'vision'",
+ ]
+
  [tool.isort]
  profile = "black"
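
A note on the new `vision` extra above: the torch / torchvision / torchaudio pins are deliberately generic, so (as the inline comment says) the hardware-specific PyTorch wheel has to be chosen at install time. A hypothetical install command, shown for illustration only — the extra name comes from this diff, and the index URLs are the standard public PyTorch wheel indexes:

    # CUDA 12.1 host
    pip install "projectdavid[vision]" --extra-index-url https://download.pytorch.org/whl/cu121
    # CPU-only host
    pip install "projectdavid[vision]" --extra-index-url https://download.pytorch.org/whl/cpu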
{projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/clients/file_processor.py

@@ -1,6 +1,8 @@
  import asyncio
  import csv
+ import hashlib
  import json
+ import math
  import re
  import textwrap
  from concurrent.futures import ThreadPoolExecutor
@@ -13,34 +15,124 @@ except ImportError: # 3.9–3.10
      from typing_extensions import LiteralString

  import numpy as np
+ import open_clip
  import pdfplumber
+ import torch
  from docx import Document
+ from PIL import Image
  from pptx import Presentation
+ from transformers import Blip2ForConditionalGeneration, Blip2Processor
+ from ultralytics import YOLO
+
+ # OCR fallback – optional
+ try:
+     import pytesseract  # noqa: F401  # pylint: disable=unused-import
+ except ImportError:
+     pytesseract = None
+
  from projectdavid_common import UtilsInterface
  from sentence_transformers import SentenceTransformer

  log = UtilsInterface.LoggingUtility()


+ def latlon_to_unit_vec(lat: float, lon: float) -> List[float]:
+     """Convert geographic lat/lon (deg) to a 3-D unit vector for Qdrant."""
+     lat_r = math.radians(lat)
+     lon_r = math.radians(lon)
+     return [
+         math.cos(lat_r) * math.cos(lon_r),
+         math.cos(lat_r) * math.sin(lon_r),
+         math.sin(lat_r),
+     ]
+
+
  class FileProcessor:
+     """Unified processor for text, tabular, office, JSON, **and image** files.
+
+     Each modality is embedded with its optimal model:
+       • Text    → paraphrase‑MiniLM‑L6‑v2 (384‑D)
+       • Image   → OpenCLIP ViT‑H/14 (1024‑D)
+       • Caption → OpenCLIP text head (1024‑D)
+
+     Rich captions are generated via BLIP‑2 Flan‑T5‑XL.
+     GPU usage is optional; pass `use_gpu=False` to stay on CPU.
+     """
+
      # ------------------------------------------------------------------ #
      # Construction
      # ------------------------------------------------------------------ #
-     def __init__(self, max_workers: int = 4, chunk_size: int = 512):
-         self.embedding_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
+     def __init__(
+         self,
+         *,
+         max_workers: int = 4,
+         chunk_size: int = 512,
+         use_gpu: bool = True,
+         use_ocr: bool = True,
+         use_detection: bool = False,
+         image_model_name: str = "ViT-H-14",
+         caption_model_name: str = "Salesforce/blip2-flan-t5-xl",
+     ):
+         # Device selection
+         if use_gpu and torch.cuda.is_available():
+             self.device = torch.device("cuda")
+             self.torch_dtype = torch.float16
+         else:
+             self.device = torch.device("cpu")
+             self.torch_dtype = torch.float32
+
+         # Feature flags
+         self.use_ocr = use_ocr and pytesseract is not None
+         self.use_detection = use_detection
+         if use_ocr and pytesseract is None:
+             log.warning("OCR requested but pytesseract not installed – skipping.")
+         if self.use_detection:
+             self.detector = YOLO("yolov8x.pt").to(self.device)
+
+         # Text embedder
          self.embedding_model_name = "paraphrase-MiniLM-L6-v2"
-         self._executor = ThreadPoolExecutor(max_workers=max_workers)
+         self.embedding_model = SentenceTransformer(self.embedding_model_name)
+         self.embedding_model.to(str(self.device))

-         # token limits
+         # Chunking parameters
          self.max_seq_length = self.embedding_model.get_max_seq_length()
          self.special_tokens_count = 2
          self.effective_max_length = self.max_seq_length - self.special_tokens_count
          self.chunk_size = min(chunk_size, self.effective_max_length * 4)

-         log.info("Initialized optimized FileProcessor")
+         # Image embedder
+         self.clip_model, _, self.clip_preprocess = (
+             open_clip.create_model_and_transforms(
+                 image_model_name,
+                 pretrained="laion2b_s32b_b79k",
+                 precision="fp16" if self.device.type == "cuda" else "fp32",
+             )
+         )
+         self.clip_model = self.clip_model.to(self.device).eval()
+         self.clip_tokenizer = open_clip.get_tokenizer(image_model_name)
+
+         # Caption generator
+         self.blip_processor = Blip2Processor.from_pretrained(caption_model_name)
+         self.blip_model = (
+             Blip2ForConditionalGeneration.from_pretrained(
+                 caption_model_name,
+                 torch_dtype=self.torch_dtype,
+             )
+             .to(self.device)
+             .eval()
+         )
+
+         # Executor & logging
+         self._executor = ThreadPoolExecutor(max_workers=max_workers)
+         log.info(
+             "FileProcessor ready (device=%s, OCR=%s, detection=%s)",
+             self.device,
+             self.use_ocr,
+             self.use_detection,
+         )

      # ------------------------------------------------------------------ #
-     # Generic validators
+     # Generic validators *
      # ------------------------------------------------------------------ #
      def validate_file(self, file_path: Path):
          """Ensure file exists and is under 100 MB."""
@@ -52,20 +144,10 @@ class FileProcessor:
              raise ValueError(f"{file_path.name} > {mb} MB limit")

      # ------------------------------------------------------------------ #
-     # File-type detection (simple extension map NO libmagic)
+     # Filetype detection (extension‑based – no libmagic)
      # ------------------------------------------------------------------ #
      def _detect_file_type(self, file_path: Path) -> str:
-         """
-         Return one of:
-
-           • 'pdf' • 'csv' • 'json'
-           • 'office' (.doc/.docx/.pptx)
-           • 'text' (code / markup / plain text)
-
-         Raises *ValueError* if the extension is not recognised.
-         """
          suffix = file_path.suffix.lower()
-
          if suffix == ".pdf":
              return "pdf"
          if suffix == ".csv":
@@ -74,7 +156,8 @@ class FileProcessor:
              return "json"
          if suffix in {".doc", ".docx", ".pptx"}:
              return "office"
-
+         if suffix in {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff"}:
+             return "image"
          text_exts = {
              ".txt",
              ".md",
@@ -96,29 +179,100 @@ class FileProcessor:
          }
          if suffix in text_exts:
              return "text"
-
          raise ValueError(f"Unsupported file type: {file_path.name} (ext={suffix})")

      # ------------------------------------------------------------------ #
-     # Public entry-point
+     # Dispatcher
      # ------------------------------------------------------------------ #
      async def process_file(self, file_path: Union[str, Path]) -> Dict[str, Any]:
-         """Validate → detect → dispatch to the appropriate processor."""
-         file_path = Path(file_path)
-         self.validate_file(file_path)
-         ftype = self._detect_file_type(file_path)
-
-         dispatch_map = {
-             "pdf": self._process_pdf,
-             "text": self._process_text,
-             "csv": self._process_csv,
-             "office": self._process_office,
-             "json": self._process_json,
+         path = Path(file_path)
+         self.validate_file(path)
+         ftype = self._detect_file_type(path)
+         return await getattr(self, f"_process_{ftype}")(path)
+
+     # ------------------------------------------------------------------ #
+     # Image processing (OpenCLIP + BLIP-2 + OCR + YOLO)
+     # ------------------------------------------------------------------ #
+     async def _process_image(self, file_path: Path) -> Dict[str, Any]:
+         loop = asyncio.get_event_loop()
+         img = await loop.run_in_executor(self._executor, Image.open, file_path)
+
+         # 1) Image vector
+         def enc_img():
+             with torch.no_grad():
+                 t = self.clip_preprocess(img).unsqueeze(0).to(self.device)
+                 v = self.clip_model.encode_image(t).squeeze()
+                 return (v / v.norm()).float().cpu().numpy()
+
+         image_vec = await loop.run_in_executor(self._executor, enc_img)
+
+         # 2) Caption
+         def gen_cap():
+             inp = self.blip_processor(images=img, return_tensors="pt").to(self.device)
+             with torch.no_grad():
+                 ids = self.blip_model.generate(**inp, max_new_tokens=50)
+             return self.blip_processor.decode(ids[0], skip_special_tokens=True)
+
+         caption = await loop.run_in_executor(self._executor, gen_cap)
+
+         # 3) OCR
+         if self.use_ocr:
+             text = await loop.run_in_executor(
+                 self._executor, pytesseract.image_to_string, img
+             )
+             if t := text.strip():
+                 caption += "\n" + t
+
+         # 4) Caption vector
+         def enc_txt():
+             with torch.no_grad():
+                 tok = self.clip_tokenizer(caption).unsqueeze(0).to(self.device)
+                 v = self.clip_model.encode_text(tok).squeeze()
+                 return (v / v.norm()).float().cpu().numpy()
+
+         caption_vec = await loop.run_in_executor(self._executor, enc_txt)
+
+         # 5) YOLO regions
+         region_vectors = []
+         if self.use_detection:
+             dets = self.detector(img)[0]
+             for box in dets.boxes:
+                 x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().tolist())
+                 crop = img.crop((x1, y1, x2, y2))
+                 vec = self.encode_image(crop)
+                 region_vectors.append(
+                     {
+                         "vector": vec.tolist(),
+                         "bbox": [x1, y1, x2, y2],
+                         "label": dets.names[int(box.cls)],
+                         "conf": float(box.conf),
+                     }
+                 )
+
+         # Metadata
+         sha = hashlib.sha256(file_path.read_bytes()).hexdigest()
+         w, h = img.size
+         meta = {
+             "source": str(file_path),
+             "type": "image",
+             "width": w,
+             "height": h,
+             "mime": f"image/{file_path.suffix.lstrip('.')}",
+             "sha256": sha,
+             "embedding_model": "openclip-vit-h-14",
+             "caption": caption,
          }
-         if ftype not in dispatch_map:
-             raise ValueError(f"Unsupported file type: {file_path.suffix}")

-         return await dispatch_map[ftype](file_path)
+         result = {
+             "content": None,
+             "metadata": meta,
+             "chunks": [caption],
+             "vectors": [image_vec.tolist()],
+             "caption_vector": caption_vec.tolist(),
+         }
+         if region_vectors:
+             result["region_vectors"] = region_vectors
+         return result

      # ------------------------------------------------------------------ #
      # PDF
@@ -126,7 +280,6 @@ class FileProcessor:
      async def _process_pdf(self, file_path: Path) -> Dict[str, Any]:
          page_chunks, doc_meta = await self._extract_text(file_path)
          all_chunks, line_data = [], []
-
          for page_text, page_num, line_nums in page_chunks:
              lines = page_text.split("\n")
              buf, buf_lines, length = [], [], 0
@@ -165,7 +318,7 @@ class FileProcessor:
          }

      # ------------------------------------------------------------------ #
-     # Plain-text / code / markup
+     # Plaintext / code / markup
      # ------------------------------------------------------------------ #
      async def _process_text(self, file_path: Path) -> Dict[str, Any]:
          text, extra_meta, _ = await self._extract_text(file_path)
@@ -198,7 +351,6 @@ class FileProcessor:
                  continue
              texts.append(txt)
              metas.append({k: v for k, v in row.items() if k != text_field and v})
-
          vectors = await asyncio.gather(*[self._encode_chunk_async(t) for t in texts])
          return {
              "content": None,
@@ -209,7 +361,7 @@ class FileProcessor:
          }

      # ------------------------------------------------------------------ #
-     # Office docs (.doc/.docx/.pptx)
+     # Office docs
      # ------------------------------------------------------------------ #
      async def _process_office(self, file_path: Path) -> Dict[str, Any]:
          loop = asyncio.get_event_loop()
@@ -217,11 +369,10 @@ class FileProcessor:
              text = await loop.run_in_executor(
                  self._executor, self._read_docx, file_path
              )
-         else:  # .pptx
+         else:
              text = await loop.run_in_executor(
                  self._executor, self._read_pptx, file_path
              )
-
          chunks = self._chunk_text(text)
          vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
          return {
@@ -267,11 +418,25 @@ class FileProcessor:
              return await loop.run_in_executor(
                  self._executor, self._extract_pdf_text, file_path
              )
-         else:
-             text = await loop.run_in_executor(
-                 self._executor, self._read_text_file, file_path
+         text = await loop.run_in_executor(
+             self._executor, self._read_text_file, file_path
+         )
+         return text, {}, []
+
+     # ------------------------------------------------------------------ #
+     # util: clip‑text encoder (public)
+     # ------------------------------------------------------------------ #
+     def encode_clip_text(self, text: Union[str, List[str]]) -> np.ndarray:
+         with torch.no_grad():
+             toks = (
+                 self.clip_tokenizer(text)
+                 if isinstance(text, str)
+                 else self.clip_tokenizer(text, truncate=True)
              )
-             return text, {}, []
+             tensor = toks.unsqueeze(0).to(self.device)
+             feat = self.clip_model.encode_text(tensor).squeeze()
+             feat = feat / feat.norm()
+             return feat.float().cpu().numpy()

      def _extract_pdf_text(self, file_path: Path):
          page_chunks, meta = [], {}
@@ -287,8 +452,8 @@ class FileProcessor:
                  lines = page.extract_text_lines()
                  sorted_lines = sorted(lines, key=lambda x: x["top"])
                  txts, nums = [], []
-                 for ln_idx, L in enumerate(sorted_lines, start=1):
-                     t = L.get("text", "").strip()
+                 for ln_idx, line in enumerate(sorted_lines, start=1):
+                     t = line.get("text", "").strip()
                      if t:
                          txts.append(t)
                          nums.append(ln_idx)
@@ -362,3 +527,24 @@ class FileProcessor:
              seg = tokens[i : i + self.effective_max_length]
              out.append(self.embedding_model.tokenizer.convert_tokens_to_string(seg))
          return out
+
+     # ------------------------------------------------------------------ #
+     # Retrieval helpers (optional use)
+     # ------------------------------------------------------------------ #
+     def encode_text(self, text: Union[str, List[str]]) -> np.ndarray:
+         """Embed raw text with the SentenceTransformer model."""
+         single = isinstance(text, str)
+         out = self.embedding_model.encode(
+             text,
+             convert_to_numpy=True,
+             normalize_embeddings=True,
+             show_progress_bar=False,
+         )
+         return out if not single else out[0]
+
+     def encode_image(self, img: Image.Image) -> np.ndarray:
+         with torch.no_grad():
+             tensor = self.clip_preprocess(img).unsqueeze(0).to(self.device)
+             feat = self.clip_model.encode_image(tensor).squeeze()
+             feat = feat / feat.norm()
+             return feat.float().cpu().numpy()
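
The file_processor.py changes above turn FileProcessor into a multi-modal embedder: images are routed to `_process_image`, which returns the BLIP-2 caption as the only text chunk, a 1024-D OpenCLIP image vector, and a separate 1024-D caption vector. A minimal usage sketch, assuming the module path implied by the file layout (`projectdavid.clients.file_processor`) and noting that the OpenCLIP and BLIP-2 weights are downloaded on first use:

    import asyncio

    from projectdavid.clients.file_processor import FileProcessor

    # Keyword-only constructor per the diff; CPU-only, no YOLO region detection.
    processor = FileProcessor(use_gpu=False, use_detection=False)

    result = asyncio.run(processor.process_file("holiday_photo.jpg"))

    print(result["metadata"]["caption"])   # BLIP-2 caption (plus OCR text when pytesseract is present)
    print(len(result["vectors"][0]))       # 1024-D OpenCLIP image embedding
    print(len(result["caption_vector"]))   # 1024-D OpenCLIP embedding of the caption text

Keeping the image vector and the caption vector separate is what allows both image-to-image search (queries built with `encode_image`) and text-to-image search (queries built with `encode_clip_text`) against the same store.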
{projectdavid-1.32.21 → projectdavid-1.33.1}/src/projectdavid/clients/vector_store_manager.py

@@ -50,11 +50,18 @@ class VectorStoreManager(BaseVectorStore):
      def create_store(
          self,
          collection_name: str,
+         *,
          vector_size: int = 384,
          distance: str = "COSINE",
+         vectors_config: Optional[Dict[str, qdrant.VectorParams]] = None,
      ) -> dict:
+         """
+         Create or recreate a Qdrant collection. By default creates a single-vector
+         collection with `vector_size`. To define multi-vector schema, pass
+         `vectors_config` mapping field names to VectorParams.
+         """
          try:
-             # quick existence check
+             # existence check
              if any(
                  col.name == collection_name
                  for col in self.client.get_collections().collections
@@ -65,16 +72,27 @@ class VectorStoreManager(BaseVectorStore):
              if dist not in qdrant.Distance.__members__:
                  raise ValueError(f"Invalid distance metric '{distance}'")

+             # choose schema
+             if vectors_config:
+                 config = vectors_config
+             else:
+                 config = {
+                     "_default": qdrant.VectorParams(
+                         size=vector_size, distance=qdrant.Distance[dist]
+                     )
+                 }
+
+             # recreate with full schema
              self.client.recreate_collection(
                  collection_name=collection_name,
-                 vectors_config=qdrant.VectorParams(
-                     size=vector_size, distance=qdrant.Distance[dist]
-                 ),
+                 vectors_config=config,
              )
+             # record metadata for each field
              self.active_stores[collection_name] = {
                  "created_at": int(time.time()),
                  "vector_size": vector_size,
                  "distance": dist,
+                 "fields": list(config.keys()),
              }
              log.info("Created Qdrant collection %s", collection_name)
              return {"collection_name": collection_name, "status": "created"}
@@ -103,8 +121,9 @@ class VectorStoreManager(BaseVectorStore):
                  "name": store_name,
                  "status": "active",
                  "vectors_count": info.points_count,
-                 "configuration": info.config.params["default"],
+                 "configuration": info.config.params,
                  "created_at": self.active_stores[store_name]["created_at"],
+                 "fields": self.active_stores[store_name].get("fields"),
              }
          except Exception as e:
              log.error("Store info failed: %s", e)
@@ -119,6 +138,8 @@ class VectorStoreManager(BaseVectorStore):
          texts: List[str],
          vectors: List[List[float]],
          metadata: List[dict],
+         *,
+         vector_name: Optional[str] = None,  # NEW
      ):
          if not vectors:
              raise ValueError("Empty vectors list")
@@ -136,7 +157,13 @@ class VectorStoreManager(BaseVectorStore):
              for txt, vec, meta in zip(texts, vectors, metadata)
          ]
          try:
-             self.client.upsert(collection_name=store_name, points=points, wait=True)
+             # pass vector_name if multi-column
+             self.client.upsert(
+                 collection_name=store_name,
+                 points=points,
+                 wait=True,
+                 vector_name=vector_name,  # ignored if None
+             )
              return {"status": "success", "points_inserted": len(points)}
          except Exception as e:
              log.error("Add‑to‑store failed: %s", e)
@@ -189,15 +216,25 @@ class VectorStoreManager(BaseVectorStore):
          query_vector: List[float],
          top_k: int = 5,
          filters: Optional[dict] = None,
+         *,
+         vector_field: Optional[str] = None,  # ← NEW
          score_threshold: float = 0.0,
          offset: int = 0,
          limit: Optional[int] = None,
      ) -> List[dict]:
-         """Run a similarity search that works with any 1.x qdrant‑client."""
+         """
+         Run a similarity search against *store_name*.
+
+         • Works with any Qdrant-client ≥ 1.0
+         • `vector_field` lets you target a non-default vector column
+           (e.g. ``\"caption_vector\"`` for image stores). Pass **None**
+           to use the collection’s default vector.
+         """

          limit = limit or top_k
          flt = self._dict_to_filter(filters) if filters else None

+         # ── shared kwargs ----------------------------------------------------
          common: Dict[str, Any] = dict(
              collection_name=store_name,
              query_vector=query_vector,
@@ -207,20 +244,21 @@ class VectorStoreManager(BaseVectorStore):
              with_payload=True,
              with_vectors=False,
          )
+         if vector_field:  # ← inject when requested
+             common["vector_name"] = vector_field

+         # ── call search (new client first, fallback to old) ------------------
          try:
-             # Newer clients (≥ 1.6) use `filter=`
-             res = self.client.search(**common, filter=flt)  # type: ignore[arg-type]
+             res = self.client.search(**common, filter=flt)  # ≥ 1.6
          except AssertionError as ae:
              if "Unknown arguments" not in str(ae):
                  raise
-             # Older clients use `query_filter=`
-             res = self.client.search(**common, query_filter=flt)  # type: ignore[arg-type]
-
+             res = self.client.search(**common, query_filter=flt)  # < 1.6
          except Exception as e:
              log.error("Query failed: %s", e)
              raise VectorStoreError(f"Query failed: {e}") from e

+         # ── normalise result -------------------------------------------------
          return [
              {
                  "id": p.id,