projectdavid 1.32.21__tar.gz → 1.33.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of projectdavid might be problematic.
- {projectdavid-1.32.21 → projectdavid-1.33.0}/CHANGELOG.md +18 -0
- {projectdavid-1.32.21/src/projectdavid.egg-info → projectdavid-1.33.0}/PKG-INFO +19 -1
- {projectdavid-1.32.21 → projectdavid-1.33.0}/pyproject.toml +29 -2
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/clients/file_processor.py +232 -46
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/clients/vector_store_manager.py +50 -12
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/clients/vectors.py +204 -23
- {projectdavid-1.32.21 → projectdavid-1.33.0/src/projectdavid.egg-info}/PKG-INFO +19 -1
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid.egg-info/requires.txt +19 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/LICENSE +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/MANIFEST.in +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/README.md +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/docs/assistants.md +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/docs/code_interpretation.md +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/docs/database.md +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/docs/database_assistant_example.md +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/docs/docker_comtainers.md +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/docs/file_search.md +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/docs/files.md +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/docs/function_call_definition.md +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/docs/function_calls.md +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/docs/handling_function_calls.md +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/docs/inference.md +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/docs/messages.md +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/docs/runs.md +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/docs/streams.md +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/docs/threads.md +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/docs/tools.md +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/docs/users.md +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/docs/vector_store.md +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/docs/versioning.md +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/setup.cfg +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/__init__.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/_version.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/clients/actions_client.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/clients/api_key_client.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/clients/assistants_client.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/clients/base_client.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/clients/base_vector_store.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/clients/event_handler.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/clients/file_search.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/clients/files_client.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/clients/inference_client.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/clients/messages_client.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/clients/runs.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/clients/synchronous_inference_wrapper.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/clients/threads_client.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/clients/tools_client.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/clients/users_client.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/constants/platform.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/entity.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/events.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/serializers.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/services/logging_service.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/synthesis/__init__.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/synthesis/llm_synthesizer.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/synthesis/prompt.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/synthesis/reranker.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/synthesis/retriever.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/utils/__init__.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/utils/function_call_suppressor.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/utils/monitor_launcher.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/utils/peek_gate.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/utils/run_monitor.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/utils/vector_search_formatter.py +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid.egg-info/SOURCES.txt +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid.egg-info/dependency_links.txt +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid.egg-info/top_level.txt +0 -0
- {projectdavid-1.32.21 → projectdavid-1.33.0}/tests/test_clients.py +0 -0
{projectdavid-1.32.21 → projectdavid-1.33.0}/CHANGELOG.md
RENAMED
@@ -1,3 +1,21 @@
+# [1.33.0](https://github.com/frankie336/projectdavid/compare/v1.32.21...v1.33.0) (2025-06-10)
+
+
+### Features
+
+* Add support for multi-modal image search ([58e7e27](https://github.com/frankie336/projectdavid/commit/58e7e270be849e36bcd93e6a19942fa3e8abbd25))
+* Add support for multi-modal image search-1 ([b8ebc7c](https://github.com/frankie336/projectdavid/commit/b8ebc7c4fb73cec0bff1b98ee45fa5b52e41a9b3))
+* Add support for multi-modal image search-1 ([2362069](https://github.com/frankie336/projectdavid/commit/2362069e4b5390b4eb2b1007a413a6adb1a8bc7b))
+* Add support for multi-modal image search-2 ([07f81fe](https://github.com/frankie336/projectdavid/commit/07f81fe0a475652bc6d316f3dc45e341452f43b7))
+* Add support for multi-modal image search-3 ([29bce72](https://github.com/frankie336/projectdavid/commit/29bce72b12e3b2b5d2daeafe2367908e0cc3b402))
+* Add support for multi-modal image search-3 ([3f8149e](https://github.com/frankie336/projectdavid/commit/3f8149e31371efa8727b96fa16d92fbe5474f727))
+* Add support for multi-modal image search-4 ([b434d6d](https://github.com/frankie336/projectdavid/commit/b434d6d035324f444b46bd49dd15cbed528527a5))
+* Add support for multi-modal image search-4 ([6acddf0](https://github.com/frankie336/projectdavid/commit/6acddf0c3b38ed6ca9e786ddb6d8ebf1a1328ac5))
+* Add support for multi-modal image search-5 ([1dd9dd9](https://github.com/frankie336/projectdavid/commit/1dd9dd9d91556df8a0089255efad82bfe3f9a6b6))
+* Add support for multi-modal image search-6 ([33a6069](https://github.com/frankie336/projectdavid/commit/33a6069b9f7a9e9007c156d511b3cb8abf859760))
+* Add support for multi-modal image search-7 ([01d68e5](https://github.com/frankie336/projectdavid/commit/01d68e591c8dbc52c81b6bfcd522bb95d27c9ddd))
+* Add support for multi-modal image search-8 ([8663b2a](https://github.com/frankie336/projectdavid/commit/8663b2ab7f0f035ae953281d86ba01a0db926839))
+
 ## [1.32.21](https://github.com/frankie336/projectdavid/compare/v1.32.20...v1.32.21) (2025-06-10)
 
 
{projectdavid-1.32.21/src/projectdavid.egg-info → projectdavid-1.33.0}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: projectdavid
-Version: 1.32.21
+Version: 1.33.0
 Summary: Python SDK for interacting with the Entities Assistant API.
 Author-email: Francis Neequaye Armah <francis.neequaye@projectdavid.co.uk>
 License: PolyForm Noncommercial License 1.0.0
@@ -29,6 +29,13 @@ Requires-Dist: sseclient-py
 Requires-Dist: requests
 Requires-Dist: python-docx
 Requires-Dist: python-pptx
+Requires-Dist: open_clip_torch>=2.24
+Requires-Dist: pillow>=10.2
+Requires-Dist: transformers>=4.41
+Requires-Dist: accelerate>=0.28
+Requires-Dist: sentencepiece>=0.2
+Requires-Dist: ultralytics>=8.2.21
+Requires-Dist: pytesseract>=0.3
 Provides-Extra: dev
 Requires-Dist: black>=23.3; extra == "dev"
 Requires-Dist: isort>=5.12; extra == "dev"
@@ -36,6 +43,17 @@ Requires-Dist: pytest>=7.2; extra == "dev"
 Requires-Dist: mypy>=1.0; extra == "dev"
 Requires-Dist: build; extra == "dev"
 Requires-Dist: twine; extra == "dev"
+Provides-Extra: vision
+Requires-Dist: torch>=2.2.1; extra == "vision"
+Requires-Dist: torchvision>=0.17.1; extra == "vision"
+Requires-Dist: torchaudio>=2.2.1; extra == "vision"
+Requires-Dist: open_clip_torch>=2.24; extra == "vision"
+Requires-Dist: pillow>=10.2; extra == "vision"
+Requires-Dist: transformers>=4.41; extra == "vision"
+Requires-Dist: accelerate>=0.28; extra == "vision"
+Requires-Dist: sentencepiece>=0.2; extra == "vision"
+Requires-Dist: ultralytics>=8.2.21; extra == "vision"
+Requires-Dist: pytesseract>=0.3; extra == "vision"
 Dynamic: license-file
 
 # Entity — by Project David
{projectdavid-1.32.21 → projectdavid-1.33.0}/pyproject.toml
RENAMED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "projectdavid"
-version = "1.32.21"
+version = "1.33.0"
 description = "Python SDK for interacting with the Entities Assistant API."
 readme = "README.md"
 authors = [
@@ -26,10 +26,18 @@ dependencies = [
     "validators>=0.29.0,<0.35.0",
     "sentence-transformers>=3.4.0,<5.0",
     "sseclient-py",
-
+    "requests",
     "python-docx",
     "python-pptx",
 
+    # Vision / multimodal dependencies
+    "open_clip_torch>=2.24",
+    "pillow>=10.2",
+    "transformers>=4.41",
+    "accelerate>=0.28",
+    "sentencepiece>=0.2",
+    "ultralytics>=8.2.21",
+    "pytesseract>=0.3",
 ]
 
 classifiers = [
@@ -52,5 +60,24 @@ dev = [
     "twine"
 ]
 
+vision = [
+    # Users must supply the correct torch wheel (cpu / cu121 / cu118) at install time
+    "torch>=2.2.1",
+    "torchvision>=0.17.1",
+    "torchaudio>=2.2.1",
+
+    # OpenCLIP + captioning stack
+    "open_clip_torch>=2.24",
+    "pillow>=10.2",
+    "transformers>=4.41",
+    "accelerate>=0.28",
+    "sentencepiece>=0.2",
+    "ultralytics>=8.2.21",
+    "pytesseract>=0.3",
+
+    # Geolocation package pending release (uncomment when available)
+    # "geoloc-regio-net>=0.2.0 ; extra == 'vision'",
+]
+
 [tool.isort]
 profile = "black"
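The `vision` extra above mirrors the same packages that 1.33.0 also adds to the base `dependencies` table, and the inline comment notes that the correct torch wheel (cpu / cu121 / cu118) has to be chosen at install time. Below is a minimal sketch, not part of the package, of how a consumer might confirm the optional stack actually resolved before using the new image features; the module names simply follow the dependency list (pillow imports as PIL, open_clip_torch as open_clip).

```python
# Illustrative check only - none of this is projectdavid API.
import importlib.util

VISION_MODULES = ["torch", "torchvision", "open_clip", "PIL", "transformers", "ultralytics"]

missing = [m for m in VISION_MODULES if importlib.util.find_spec(m) is None]
if missing:
    print("Vision features unavailable, missing:", ", ".join(missing))
else:
    import torch
    print("Vision stack importable; CUDA available:", torch.cuda.is_available())
```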
{projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/clients/file_processor.py
RENAMED
@@ -1,6 +1,8 @@
 import asyncio
 import csv
+import hashlib
 import json
+import math
 import re
 import textwrap
 from concurrent.futures import ThreadPoolExecutor
@@ -13,34 +15,124 @@ except ImportError:  # 3.9–3.10
     from typing_extensions import LiteralString
 
 import numpy as np
+import open_clip
 import pdfplumber
+import torch
 from docx import Document
+from PIL import Image
 from pptx import Presentation
+from transformers import Blip2ForConditionalGeneration, Blip2Processor
+from ultralytics import YOLO
+
+# OCR fallback – optional
+try:
+    import pytesseract  # noqa: F401  # pylint: disable=unused-import
+except ImportError:
+    pytesseract = None
+
 from projectdavid_common import UtilsInterface
 from sentence_transformers import SentenceTransformer
 
 log = UtilsInterface.LoggingUtility()
 
 
+def latlon_to_unit_vec(lat: float, lon: float) -> List[float]:
+    """Convert geographic lat/lon (deg) to a 3-D unit vector for Qdrant."""
+    lat_r = math.radians(lat)
+    lon_r = math.radians(lon)
+    return [
+        math.cos(lat_r) * math.cos(lon_r),
+        math.cos(lat_r) * math.sin(lon_r),
+        math.sin(lat_r),
+    ]
+
+
 class FileProcessor:
+    """Unified processor for text, tabular, office, JSON, **and image** files.
+
+    Each modality is embedded with its optimal model:
+      • Text   → paraphrase‑MiniLM‑L6‑v2 (384‑D)
+      • Image  → OpenCLIP ViT‑H/14 (1024‑D)
+      • Caption→ OpenCLIP text head (1024‑D)
+
+    Rich captions are generated via BLIP‑2 Flan‑T5‑XL.
+    GPU usage is optional; pass `use_gpu=False` to stay on CPU.
+    """
+
     # ------------------------------------------------------------------ #
     # Construction
     # ------------------------------------------------------------------ #
-    def __init__(
-        self
+    def __init__(
+        self,
+        *,
+        max_workers: int = 4,
+        chunk_size: int = 512,
+        use_gpu: bool = True,
+        use_ocr: bool = True,
+        use_detection: bool = False,
+        image_model_name: str = "ViT-H-14",
+        caption_model_name: str = "Salesforce/blip2-flan-t5-xl",
+    ):
+        # Device selection
+        if use_gpu and torch.cuda.is_available():
+            self.device = torch.device("cuda")
+            self.torch_dtype = torch.float16
+        else:
+            self.device = torch.device("cpu")
+            self.torch_dtype = torch.float32
+
+        # Feature flags
+        self.use_ocr = use_ocr and pytesseract is not None
+        self.use_detection = use_detection
+        if use_ocr and pytesseract is None:
+            log.warning("OCR requested but pytesseract not installed – skipping.")
+        if self.use_detection:
+            self.detector = YOLO("yolov8x.pt").to(self.device)
+
+        # Text embedder
         self.embedding_model_name = "paraphrase-MiniLM-L6-v2"
-        self.
+        self.embedding_model = SentenceTransformer(self.embedding_model_name)
+        self.embedding_model.to(str(self.device))
 
-        #
+        # Chunking parameters
         self.max_seq_length = self.embedding_model.get_max_seq_length()
         self.special_tokens_count = 2
         self.effective_max_length = self.max_seq_length - self.special_tokens_count
         self.chunk_size = min(chunk_size, self.effective_max_length * 4)
 
-
+        # Image embedder
+        self.clip_model, _, self.clip_preprocess = (
+            open_clip.create_model_and_transforms(
+                image_model_name,
+                pretrained="laion2b_s32b_b79k",
+                precision="fp16" if self.device.type == "cuda" else "fp32",
+            )
+        )
+        self.clip_model = self.clip_model.to(self.device).eval()
+        self.clip_tokenizer = open_clip.get_tokenizer(image_model_name)
+
+        # Caption generator
+        self.blip_processor = Blip2Processor.from_pretrained(caption_model_name)
+        self.blip_model = (
+            Blip2ForConditionalGeneration.from_pretrained(
+                caption_model_name,
+                torch_dtype=self.torch_dtype,
+            )
+            .to(self.device)
+            .eval()
+        )
+
+        # Executor & logging
+        self._executor = ThreadPoolExecutor(max_workers=max_workers)
+        log.info(
+            "FileProcessor ready (device=%s, OCR=%s, detection=%s)",
+            self.device,
+            self.use_ocr,
+            self.use_detection,
+        )
 
     # ------------------------------------------------------------------ #
-    # Generic validators
+    # Generic validators *
     # ------------------------------------------------------------------ #
     def validate_file(self, file_path: Path):
         """Ensure file exists and is under 100 MB."""
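`latlon_to_unit_vec` places a latitude/longitude pair on the unit sphere, so the dot product of two such vectors is the cosine of the great-circle angle between the points, which is exactly what a COSINE-distance Qdrant field measures. A small worked check; the helper is restated from the hunk above and the city coordinates are only illustrative:

```python
import math
from typing import List


def latlon_to_unit_vec(lat: float, lon: float) -> List[float]:
    """Same conversion as the helper above: lat/lon in degrees -> 3-D unit vector."""
    lat_r = math.radians(lat)
    lon_r = math.radians(lon)
    return [
        math.cos(lat_r) * math.cos(lon_r),
        math.cos(lat_r) * math.sin(lon_r),
        math.sin(lat_r),
    ]


london = latlon_to_unit_vec(51.5074, -0.1278)
paris = latlon_to_unit_vec(48.8566, 2.3522)

norm = math.sqrt(sum(c * c for c in london))           # ~1.0, so cosine distance applies directly
cos_angle = sum(a * b for a, b in zip(london, paris))  # cosine of the great-circle angle
print(round(norm, 6), round(math.degrees(math.acos(cos_angle)), 2), "degrees apart")
```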
@@ -52,20 +144,10 @@ class FileProcessor:
             raise ValueError(f"{file_path.name} > {mb} MB limit")
 
     # ------------------------------------------------------------------ #
-    # File
+    # File‑type detection (extension‑based – no libmagic)
     # ------------------------------------------------------------------ #
     def _detect_file_type(self, file_path: Path) -> str:
-        """
-        Return one of:
-
-          • 'pdf' • 'csv' • 'json'
-          • 'office' (.doc/.docx/.pptx)
-          • 'text' (code / markup / plain text)
-
-        Raises *ValueError* if the extension is not recognised.
-        """
         suffix = file_path.suffix.lower()
-
         if suffix == ".pdf":
             return "pdf"
         if suffix == ".csv":
@@ -74,7 +156,8 @@ class FileProcessor:
             return "json"
         if suffix in {".doc", ".docx", ".pptx"}:
             return "office"
-
+        if suffix in {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff"}:
+            return "image"
         text_exts = {
             ".txt",
             ".md",
@@ -96,29 +179,100 @@ class FileProcessor:
         }
         if suffix in text_exts:
             return "text"
-
         raise ValueError(f"Unsupported file type: {file_path.name} (ext={suffix})")
 
     # ------------------------------------------------------------------ #
-    #
+    # Dispatcher
     # ------------------------------------------------------------------ #
     async def process_file(self, file_path: Union[str, Path]) -> Dict[str, Any]:
-
-
-        self.
-
-
-
-
-
-
-
-
+        path = Path(file_path)
+        self.validate_file(path)
+        ftype = self._detect_file_type(path)
+        return await getattr(self, f"_process_{ftype}")(path)
+
+    # ------------------------------------------------------------------ #
+    # Image processing (OpenCLIP + BLIP-2 + OCR + YOLO)
+    # ------------------------------------------------------------------ #
+    async def _process_image(self, file_path: Path) -> Dict[str, Any]:
+        loop = asyncio.get_event_loop()
+        img = await loop.run_in_executor(self._executor, Image.open, file_path)
+
+        # 1) Image vector
+        def enc_img():
+            with torch.no_grad():
+                t = self.clip_preprocess(img).unsqueeze(0).to(self.device)
+                v = self.clip_model.encode_image(t).squeeze()
+                return (v / v.norm()).float().cpu().numpy()
+
+        image_vec = await loop.run_in_executor(self._executor, enc_img)
+
+        # 2) Caption
+        def gen_cap():
+            inp = self.blip_processor(images=img, return_tensors="pt").to(self.device)
+            with torch.no_grad():
+                ids = self.blip_model.generate(**inp, max_new_tokens=50)
+            return self.blip_processor.decode(ids[0], skip_special_tokens=True)
+
+        caption = await loop.run_in_executor(self._executor, gen_cap)
+
+        # 3) OCR
+        if self.use_ocr:
+            text = await loop.run_in_executor(
+                self._executor, pytesseract.image_to_string, img
+            )
+            if t := text.strip():
+                caption += "\n" + t
+
+        # 4) Caption vector
+        def enc_txt():
+            with torch.no_grad():
+                tok = self.clip_tokenizer(caption).unsqueeze(0).to(self.device)
+                v = self.clip_model.encode_text(tok).squeeze()
+                return (v / v.norm()).float().cpu().numpy()
+
+        caption_vec = await loop.run_in_executor(self._executor, enc_txt)
+
+        # 5) YOLO regions
+        region_vectors = []
+        if self.use_detection:
+            dets = self.detector(img)[0]
+            for box in dets.boxes:
+                x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().tolist())
+                crop = img.crop((x1, y1, x2, y2))
+                vec = self.encode_image(crop)
+                region_vectors.append(
+                    {
+                        "vector": vec.tolist(),
+                        "bbox": [x1, y1, x2, y2],
+                        "label": dets.names[int(box.cls)],
+                        "conf": float(box.conf),
+                    }
+                )
+
+        # Metadata
+        sha = hashlib.sha256(file_path.read_bytes()).hexdigest()
+        w, h = img.size
+        meta = {
+            "source": str(file_path),
+            "type": "image",
+            "width": w,
+            "height": h,
+            "mime": f"image/{file_path.suffix.lstrip('.')}",
+            "sha256": sha,
+            "embedding_model": "openclip-vit-h-14",
+            "caption": caption,
         }
-        if ftype not in dispatch_map:
-            raise ValueError(f"Unsupported file type: {file_path.suffix}")
 
-
+        result = {
+            "content": None,
+            "metadata": meta,
+            "chunks": [caption],
+            "vectors": [image_vec.tolist()],
+            "caption_vector": caption_vec.tolist(),
+        }
+        if region_vectors:
+            result["region_vectors"] = region_vectors
+        return result
 
     # ------------------------------------------------------------------ #
     # PDF
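The new `_process_image` path returns a different payload from the text processors: `vectors` holds a single 1024-D OpenCLIP image embedding, `chunks` holds the BLIP-2 caption (with any OCR text appended), `caption_vector` carries the CLIP text embedding of that caption, and `region_vectors` appears only when YOLO detection is enabled. A hedged usage sketch; the image path is hypothetical, and the first run downloads multi-gigabyte OpenCLIP and BLIP-2 weights:

```python
import asyncio

from projectdavid.clients.file_processor import FileProcessor


async def main() -> None:
    # Keyword-only flags mirror the new __init__; CPU-only and no OCR/YOLO to keep it light.
    fp = FileProcessor(use_gpu=False, use_ocr=False, use_detection=False)

    result = await fp.process_file("holiday_photo.jpg")  # hypothetical image file

    print(result["metadata"]["caption"])      # BLIP-2 caption
    print(len(result["vectors"][0]))          # 1024-D OpenCLIP image vector
    print(len(result["caption_vector"]))      # 1024-D OpenCLIP caption vector
    print(result.get("region_vectors", []))   # populated only when use_detection=True


asyncio.run(main())
```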
@@ -126,7 +280,6 @@ class FileProcessor:
     async def _process_pdf(self, file_path: Path) -> Dict[str, Any]:
         page_chunks, doc_meta = await self._extract_text(file_path)
         all_chunks, line_data = [], []
-
         for page_text, page_num, line_nums in page_chunks:
             lines = page_text.split("\n")
             buf, buf_lines, length = [], [], 0
@@ -165,7 +318,7 @@ class FileProcessor:
         }
 
     # ------------------------------------------------------------------ #
-    # Plain
+    # Plain‑text / code / markup
     # ------------------------------------------------------------------ #
     async def _process_text(self, file_path: Path) -> Dict[str, Any]:
         text, extra_meta, _ = await self._extract_text(file_path)
@@ -198,7 +351,6 @@ class FileProcessor:
                 continue
             texts.append(txt)
             metas.append({k: v for k, v in row.items() if k != text_field and v})
-
         vectors = await asyncio.gather(*[self._encode_chunk_async(t) for t in texts])
         return {
             "content": None,
@@ -209,7 +361,7 @@ class FileProcessor:
         }
 
     # ------------------------------------------------------------------ #
-    # Office docs
+    # Office docs
     # ------------------------------------------------------------------ #
     async def _process_office(self, file_path: Path) -> Dict[str, Any]:
         loop = asyncio.get_event_loop()
@@ -217,11 +369,10 @@ class FileProcessor:
             text = await loop.run_in_executor(
                 self._executor, self._read_docx, file_path
             )
-        else:
+        else:
             text = await loop.run_in_executor(
                 self._executor, self._read_pptx, file_path
             )
-
         chunks = self._chunk_text(text)
         vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
         return {
@@ -267,11 +418,25 @@ class FileProcessor:
             return await loop.run_in_executor(
                 self._executor, self._extract_pdf_text, file_path
             )
-
-
-
+        text = await loop.run_in_executor(
+            self._executor, self._read_text_file, file_path
+        )
+        return text, {}, []
+
+    # ------------------------------------------------------------------ #
+    # util: clip‑text encoder (public)
+    # ------------------------------------------------------------------ #
+    def encode_clip_text(self, text: Union[str, List[str]]) -> np.ndarray:
+        with torch.no_grad():
+            toks = (
+                self.clip_tokenizer(text)
+                if isinstance(text, str)
+                else self.clip_tokenizer(text, truncate=True)
             )
-
+            tensor = toks.unsqueeze(0).to(self.device)
+            feat = self.clip_model.encode_text(tensor).squeeze()
+            feat = feat / feat.norm()
+            return feat.float().cpu().numpy()
 
     def _extract_pdf_text(self, file_path: Path):
         page_chunks, meta = [], {}
@@ -287,8 +452,8 @@ class FileProcessor:
                 lines = page.extract_text_lines()
                 sorted_lines = sorted(lines, key=lambda x: x["top"])
                 txts, nums = [], []
-                for ln_idx,
-                    t =
+                for ln_idx, line in enumerate(sorted_lines, start=1):
+                    t = line.get("text", "").strip()
                     if t:
                         txts.append(t)
                         nums.append(ln_idx)
@@ -362,3 +527,24 @@ class FileProcessor:
             seg = tokens[i : i + self.effective_max_length]
             out.append(self.embedding_model.tokenizer.convert_tokens_to_string(seg))
         return out
+
+    # ------------------------------------------------------------------ #
+    # Retrieval helpers (optional use)
+    # ------------------------------------------------------------------ #
+    def encode_text(self, text: Union[str, List[str]]) -> np.ndarray:
+        """Embed raw text with the SentenceTransformer model."""
+        single = isinstance(text, str)
+        out = self.embedding_model.encode(
+            text,
+            convert_to_numpy=True,
+            normalize_embeddings=True,
+            show_progress_bar=False,
+        )
+        return out if not single else out[0]
+
+    def encode_image(self, img: Image.Image) -> np.ndarray:
+        with torch.no_grad():
+            tensor = self.clip_preprocess(img).unsqueeze(0).to(self.device)
+            feat = self.clip_model.encode_image(tensor).squeeze()
+            feat = feat / feat.norm()
+            return feat.float().cpu().numpy()
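Because text chunks and images now live in different embedding spaces (384-D MiniLM vs. 1024-D OpenCLIP), a text query aimed at image vectors has to go through `encode_clip_text` rather than `encode_text`. A short sketch reusing the `fp` instance from the earlier example; the query strings are arbitrary:

```python
# Assumes `fp` is the FileProcessor constructed in the sketch above.
text_vec = fp.encode_text("quarterly revenue summary")    # 384-D, matches text/PDF/office chunks
clip_vec = fp.encode_clip_text("a red bicycle at night")  # 1024-D, comparable to image vectors

print(text_vec.shape)  # (384,)
print(clip_vec.shape)  # (1024,)
```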
{projectdavid-1.32.21 → projectdavid-1.33.0}/src/projectdavid/clients/vector_store_manager.py
RENAMED
@@ -50,11 +50,18 @@ class VectorStoreManager(BaseVectorStore):
     def create_store(
         self,
         collection_name: str,
+        *,
         vector_size: int = 384,
         distance: str = "COSINE",
+        vectors_config: Optional[Dict[str, qdrant.VectorParams]] = None,
     ) -> dict:
+        """
+        Create or recreate a Qdrant collection. By default creates a single-vector
+        collection with `vector_size`. To define multi-vector schema, pass
+        `vectors_config` mapping field names to VectorParams.
+        """
         try:
-            #
+            # existence check
             if any(
                 col.name == collection_name
                 for col in self.client.get_collections().collections
@@ -65,16 +72,27 @@ class VectorStoreManager(BaseVectorStore):
             if dist not in qdrant.Distance.__members__:
                 raise ValueError(f"Invalid distance metric '{distance}'")
 
+            # choose schema
+            if vectors_config:
+                config = vectors_config
+            else:
+                config = {
+                    "_default": qdrant.VectorParams(
+                        size=vector_size, distance=qdrant.Distance[dist]
+                    )
+                }
+
+            # recreate with full schema
             self.client.recreate_collection(
                 collection_name=collection_name,
-                vectors_config=
-                    size=vector_size, distance=qdrant.Distance[dist]
-                ),
+                vectors_config=config,
             )
+            # record metadata for each field
             self.active_stores[collection_name] = {
                 "created_at": int(time.time()),
                 "vector_size": vector_size,
                 "distance": dist,
+                "fields": list(config.keys()),
             }
             log.info("Created Qdrant collection %s", collection_name)
             return {"collection_name": collection_name, "status": "created"}
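`create_store` now accepts a `vectors_config` mapping, so one collection can hold the differently sized vectors produced by `FileProcessor`. A hedged sketch built with qdrant-client's `VectorParams`; the field names (`image_vector`, `caption_vector`, `text`), the collection name, and the already-constructed manager instance `vsm` are illustrative assumptions, not names fixed by the SDK:

```python
from qdrant_client import models as qdrant

# Hypothetical multi-vector schema: 1024-D OpenCLIP image/caption fields plus
# a 384-D MiniLM text field, all using cosine distance.
schema = {
    "image_vector": qdrant.VectorParams(size=1024, distance=qdrant.Distance.COSINE),
    "caption_vector": qdrant.VectorParams(size=1024, distance=qdrant.Distance.COSINE),
    "text": qdrant.VectorParams(size=384, distance=qdrant.Distance.COSINE),
}

# `vsm` stands for an already-initialised VectorStoreManager.
vsm.create_store("image_store_demo", vectors_config=schema)
```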
@@ -103,8 +121,9 @@ class VectorStoreManager(BaseVectorStore):
                 "name": store_name,
                 "status": "active",
                 "vectors_count": info.points_count,
-                "configuration": info.config.params
+                "configuration": info.config.params,
                 "created_at": self.active_stores[store_name]["created_at"],
+                "fields": self.active_stores[store_name].get("fields"),
             }
         except Exception as e:
             log.error("Store info failed: %s", e)
@@ -119,6 +138,8 @@ class VectorStoreManager(BaseVectorStore):
         texts: List[str],
         vectors: List[List[float]],
         metadata: List[dict],
+        *,
+        vector_name: Optional[str] = None,  # NEW
     ):
         if not vectors:
             raise ValueError("Empty vectors list")
@@ -136,7 +157,13 @@ class VectorStoreManager(BaseVectorStore):
             for txt, vec, meta in zip(texts, vectors, metadata)
         ]
         try:
-
+            # pass vector_name if multi-column
+            self.client.upsert(
+                collection_name=store_name,
+                points=points,
+                wait=True,
+                vector_name=vector_name,  # ignored if None
+            )
             return {"status": "success", "points_inserted": len(points)}
         except Exception as e:
             log.error("Add‑to‑store failed: %s", e)
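For reference, this is how named vectors look at the raw qdrant-client level, which is what the new `vector_name` / multi-vector plumbing ultimately targets: each point carries a dict keyed by field name, and a search selects a field with a `(name, vector)` tuple. The sketch below uses a local in-memory client and toy 4-D vectors; it is independent of the SDK wrapper above, and the collection and field names are illustrative.

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(":memory:")  # in-process instance, illustration only

client.recreate_collection(
    collection_name="image_store_demo",
    vectors_config={
        "caption_vector": models.VectorParams(size=4, distance=models.Distance.COSINE),
    },
)

# Named vectors are supplied per point as a dict keyed by field name.
client.upsert(
    collection_name="image_store_demo",
    points=[
        models.PointStruct(
            id=1,
            vector={"caption_vector": [0.1, 0.2, 0.3, 0.4]},
            payload={"caption": "a toy example"},
        )
    ],
    wait=True,
)

# Searching a specific named field uses a (field_name, vector) tuple.
hits = client.search(
    collection_name="image_store_demo",
    query_vector=("caption_vector", [0.1, 0.2, 0.3, 0.4]),
    limit=1,
)
print(hits[0].payload)
```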
@@ -189,15 +216,25 @@ class VectorStoreManager(BaseVectorStore):
         query_vector: List[float],
         top_k: int = 5,
         filters: Optional[dict] = None,
+        *,
+        vector_field: Optional[str] = None,  # ← NEW
         score_threshold: float = 0.0,
         offset: int = 0,
         limit: Optional[int] = None,
     ) -> List[dict]:
-        """
+        """
+        Run a similarity search against *store_name*.
+
+        • Works with any Qdrant-client ≥ 1.0
+        • `vector_field` lets you target a non-default vector column
+          (e.g. ``"caption_vector"`` for image stores). Pass **None**
+          to use the collection’s default vector.
+        """
 
         limit = limit or top_k
         flt = self._dict_to_filter(filters) if filters else None
 
+        # ── shared kwargs ----------------------------------------------------
         common: Dict[str, Any] = dict(
             collection_name=store_name,
             query_vector=query_vector,
@@ -207,20 +244,21 @@ class VectorStoreManager(BaseVectorStore):
             with_payload=True,
             with_vectors=False,
         )
+        if vector_field:  # ← inject when requested
+            common["vector_name"] = vector_field
 
+        # ── call search (new client first, fallback to old) ------------------
         try:
-
-            res = self.client.search(**common, filter=flt)  # type: ignore[arg-type]
+            res = self.client.search(**common, filter=flt)  # ≥ 1.6
         except AssertionError as ae:
             if "Unknown arguments" not in str(ae):
                 raise
-
-            res = self.client.search(**common, query_filter=flt)  # type: ignore[arg-type]
-
+            res = self.client.search(**common, query_filter=flt)  # < 1.6
         except Exception as e:
             log.error("Query failed: %s", e)
             raise VectorStoreError(f"Query failed: {e}") from e
 
+        # ── normalise result -------------------------------------------------
         return [
             {
                 "id": p.id,