projectdavid 1.31.0__py3-none-any.whl → 1.38.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- projectdavid/clients/assistants_client.py +7 -13
- projectdavid/clients/file_processor.py +102 -107
- projectdavid/clients/messages_client.py +24 -39
- projectdavid/clients/runs.py +156 -211
- projectdavid/clients/synchronous_inference_wrapper.py +52 -24
- projectdavid/clients/threads_client.py +32 -12
- projectdavid/clients/vector_store_manager.py +110 -21
- projectdavid/clients/vectors.py +47 -30
- projectdavid/clients/vision-file_processor.py +462 -0
- projectdavid/clients/vision_vectors.py +1058 -0
- projectdavid/decorators.py +64 -0
- projectdavid/entity.py +24 -5
- projectdavid/synthesis/reranker.py +4 -2
- projectdavid/utils/function_call_suppressor.py +40 -0
- {projectdavid-1.31.0.dist-info → projectdavid-1.38.1.dist-info}/METADATA +6 -7
- {projectdavid-1.31.0.dist-info → projectdavid-1.38.1.dist-info}/RECORD +19 -15
- {projectdavid-1.31.0.dist-info → projectdavid-1.38.1.dist-info}/WHEEL +1 -1
- {projectdavid-1.31.0.dist-info → projectdavid-1.38.1.dist-info}/licenses/LICENSE +0 -0
- {projectdavid-1.31.0.dist-info → projectdavid-1.38.1.dist-info}/top_level.txt +0 -0
|
@@ -112,7 +112,6 @@ class AssistantsClient(BaseAPIClient):
|
|
|
112
112
|
description: str = "",
|
|
113
113
|
instructions: str = "",
|
|
114
114
|
tools: Optional[List[Dict[str, Any]]] = None,
|
|
115
|
-
platform_tools: Optional[List[Dict[str, Any]]] = None,
|
|
116
115
|
tool_resources: Optional[Dict[str, Dict[str, Any]]] = None,
|
|
117
116
|
meta_data: Optional[Dict[str, Any]] = None,
|
|
118
117
|
top_p: float = 1.0,
|
|
@@ -133,7 +132,6 @@ class AssistantsClient(BaseAPIClient):
|
|
|
133
132
|
"model": model,
|
|
134
133
|
"instructions": instructions,
|
|
135
134
|
"tools": tools,
|
|
136
|
-
"platform_tools": platform_tools,
|
|
137
135
|
"tool_resources": tool_resources,
|
|
138
136
|
"meta_data": meta_data,
|
|
139
137
|
"top_p": top_p,
|
|
@@ -263,14 +261,10 @@ class AssistantsClient(BaseAPIClient):
|
|
|
263
261
|
)
|
|
264
262
|
return {"message": "Assistant disassociated from user successfully"}
|
|
265
263
|
|
|
266
|
-
def
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
return [ent_validator.AssistantRead(**a) for a in raw_list]
|
|
274
|
-
except ValidationError as e:
|
|
275
|
-
logging_utility.error("Validation error: %s", e.json())
|
|
276
|
-
raise AssistantsClientError(f"Validation error: {e}") from e
|
|
264
|
+
def list(self) -> list[ent_validator.AssistantRead]:
|
|
265
|
+
"""Return every assistant owned by *this* API key."""
|
|
266
|
+
logging_utility.info("Listing assistants")
|
|
267
|
+
|
|
268
|
+
resp = self._request_with_retries("GET", "/v1/assistants")
|
|
269
|
+
raw = self._parse_response(resp)
|
|
270
|
+
return [ent_validator.AssistantRead(**a) for a in raw]
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
import csv
|
|
3
3
|
import json
|
|
4
|
-
import mimetypes
|
|
5
4
|
import re
|
|
6
5
|
import textwrap
|
|
7
6
|
from concurrent.futures import ThreadPoolExecutor
|
|
@@ -10,16 +9,14 @@ from typing import Any, Dict, List, Tuple, Union
|
|
|
10
9
|
|
|
11
10
|
try: # Python 3.11+
|
|
12
11
|
from typing import LiteralString
|
|
13
|
-
except ImportError: # 3.9
|
|
12
|
+
except ImportError: # 3.9–3.10
|
|
14
13
|
from typing_extensions import LiteralString
|
|
15
14
|
|
|
16
|
-
import magic
|
|
17
15
|
import numpy as np
|
|
18
16
|
import pdfplumber
|
|
19
17
|
from docx import Document
|
|
20
18
|
from pptx import Presentation
|
|
21
19
|
from projectdavid_common import UtilsInterface
|
|
22
|
-
from sentence_transformers import SentenceTransformer
|
|
23
20
|
|
|
24
21
|
log = UtilsInterface.LoggingUtility()
|
|
25
22
|
|
|
@@ -29,17 +26,85 @@ class FileProcessor:
|
|
|
29
26
|
# Construction
|
|
30
27
|
# ------------------------------------------------------------------ #
|
|
31
28
|
def __init__(self, max_workers: int = 4, chunk_size: int = 512):
|
|
32
|
-
self.embedding_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
|
|
33
29
|
self.embedding_model_name = "paraphrase-MiniLM-L6-v2"
|
|
30
|
+
self._embedding_model = None
|
|
34
31
|
self._executor = ThreadPoolExecutor(max_workers=max_workers)
|
|
35
32
|
|
|
36
|
-
#
|
|
37
|
-
self.
|
|
38
|
-
self.
|
|
39
|
-
self.
|
|
40
|
-
self.
|
|
33
|
+
# Lazy-initialized attributes
|
|
34
|
+
self._requested_chunk_size = chunk_size
|
|
35
|
+
self._max_seq_length = None
|
|
36
|
+
self._effective_max_length = None
|
|
37
|
+
self._chunk_size = None
|
|
41
38
|
|
|
42
|
-
log.info("Initialized
|
|
39
|
+
log.info("Initialized Lazy-Loaded FileProcessor")
|
|
40
|
+
|
|
41
|
+
def _ensure_model(self):
|
|
42
|
+
"""
|
|
43
|
+
Internal helper to load the model and calculate limits only once.
|
|
44
|
+
This prevents heavy imports (scipy, torch) until actually needed.
|
|
45
|
+
"""
|
|
46
|
+
if self._embedding_model is None:
|
|
47
|
+
try:
|
|
48
|
+
from sentence_transformers import SentenceTransformer
|
|
49
|
+
|
|
50
|
+
log.info(f"Lazy-loading model: {self.embedding_model_name}")
|
|
51
|
+
|
|
52
|
+
self._embedding_model = SentenceTransformer(self.embedding_model_name)
|
|
53
|
+
|
|
54
|
+
# Ported Limit Calculations
|
|
55
|
+
self._max_seq_length = self._embedding_model.get_max_seq_length()
|
|
56
|
+
special_tokens_count = 2
|
|
57
|
+
self._effective_max_length = self._max_seq_length - special_tokens_count
|
|
58
|
+
self._chunk_size = min(
|
|
59
|
+
self._requested_chunk_size, self._effective_max_length * 4
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
except ImportError:
|
|
63
|
+
log.error(
|
|
64
|
+
"sentence-transformers not found. Ensure 'pip install projectdavid[vision]' is installed."
|
|
65
|
+
)
|
|
66
|
+
raise ImportError(
|
|
67
|
+
"Model-based features require 'sentence-transformers'. Install with [vision] extra."
|
|
68
|
+
)
|
|
69
|
+
return self._embedding_model
|
|
70
|
+
|
|
71
|
+
# Properties to maintain access to derived attributes
|
|
72
|
+
@property
|
|
73
|
+
def chunk_size(self):
|
|
74
|
+
if self._chunk_size is None:
|
|
75
|
+
self._ensure_model()
|
|
76
|
+
return self._chunk_size
|
|
77
|
+
|
|
78
|
+
@property
|
|
79
|
+
def effective_max_length(self):
|
|
80
|
+
if self._effective_max_length is None:
|
|
81
|
+
self._ensure_model()
|
|
82
|
+
return self._effective_max_length
|
|
83
|
+
|
|
84
|
+
# ------------------------------------------------------------------ #
|
|
85
|
+
# Embeddings
|
|
86
|
+
# ------------------------------------------------------------------ #
|
|
87
|
+
def encode_text(self, text: str):
|
|
88
|
+
model = self._ensure_model()
|
|
89
|
+
return model.encode(
|
|
90
|
+
[text],
|
|
91
|
+
convert_to_numpy=True,
|
|
92
|
+
truncate="model_max_length",
|
|
93
|
+
normalize_embeddings=True,
|
|
94
|
+
)[0]
|
|
95
|
+
|
|
96
|
+
async def _encode_chunk_async(self, chunk: str) -> np.ndarray:
|
|
97
|
+
model = self._ensure_model()
|
|
98
|
+
return await asyncio.get_event_loop().run_in_executor(
|
|
99
|
+
self._executor,
|
|
100
|
+
lambda: model.encode(
|
|
101
|
+
[chunk],
|
|
102
|
+
convert_to_numpy=True,
|
|
103
|
+
truncate="model_max_length",
|
|
104
|
+
normalize_embeddings=True,
|
|
105
|
+
show_progress_bar=False,
|
|
106
|
+
)[0],
|
|
107
|
+
)
|
|
43
108
|
|
|
44
109
|
# ------------------------------------------------------------------ #
|
|
45
110
|
# Generic validators
|
|
@@ -54,54 +119,20 @@ class FileProcessor:
|
|
|
54
119
|
raise ValueError(f"{file_path.name} > {mb} MB limit")
|
|
55
120
|
|
|
56
121
|
# ------------------------------------------------------------------ #
|
|
57
|
-
# File-type detection
|
|
122
|
+
# File-type detection
|
|
58
123
|
# ------------------------------------------------------------------ #
|
|
59
124
|
def _detect_file_type(self, file_path: Path) -> str:
|
|
60
|
-
"""
|
|
61
|
-
Return a handler tag:
|
|
62
|
-
|
|
63
|
-
• 'pdf' • 'csv'
|
|
64
|
-
• 'json' • 'office'
|
|
65
|
-
• 'text'
|
|
66
|
-
|
|
67
|
-
Raises *ValueError* on anything unknown.
|
|
68
|
-
"""
|
|
69
|
-
# 1️⃣ Best-effort MIME sniff
|
|
70
|
-
mime_type: str | None = None
|
|
71
|
-
if magic is not None:
|
|
72
|
-
try:
|
|
73
|
-
mime_type = magic.from_file(str(file_path), mime=True)
|
|
74
|
-
except Exception:
|
|
75
|
-
mime_type = None
|
|
76
|
-
|
|
77
|
-
# 2️⃣ Fallback → mimetypes
|
|
78
|
-
if not mime_type:
|
|
79
|
-
mime_type, _ = mimetypes.guess_type(file_path.name)
|
|
80
|
-
|
|
81
125
|
suffix = file_path.suffix.lower()
|
|
126
|
+
if suffix == ".pdf":
|
|
127
|
+
return "pdf"
|
|
128
|
+
if suffix == ".csv":
|
|
129
|
+
return "csv"
|
|
130
|
+
if suffix == ".json":
|
|
131
|
+
return "json"
|
|
132
|
+
if suffix in {".doc", ".docx", ".pptx"}:
|
|
133
|
+
return "office"
|
|
82
134
|
|
|
83
|
-
|
|
84
|
-
CSV_MIMES = {"text/csv", "application/csv"}
|
|
85
|
-
JSON_MIMES = {"application/json"}
|
|
86
|
-
OFFICE_MIMES = {
|
|
87
|
-
"application/msword",
|
|
88
|
-
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
89
|
-
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
|
90
|
-
}
|
|
91
|
-
TEXT_MIMES = {
|
|
92
|
-
"text/plain",
|
|
93
|
-
"text/markdown",
|
|
94
|
-
"text/x-python",
|
|
95
|
-
"text/x-c",
|
|
96
|
-
"text/x-c++",
|
|
97
|
-
"text/x-java-source",
|
|
98
|
-
"text/x-script.python",
|
|
99
|
-
"text/html",
|
|
100
|
-
"text/css",
|
|
101
|
-
"application/typescript",
|
|
102
|
-
"text/javascript",
|
|
103
|
-
}
|
|
104
|
-
TEXT_EXTS = {
|
|
135
|
+
text_exts = {
|
|
105
136
|
".txt",
|
|
106
137
|
".md",
|
|
107
138
|
".rst",
|
|
@@ -120,32 +151,9 @@ class FileProcessor:
|
|
|
120
151
|
".html",
|
|
121
152
|
".css",
|
|
122
153
|
}
|
|
123
|
-
|
|
124
|
-
# --- PDF ---
|
|
125
|
-
if mime_type in PDF_MIMES or suffix == ".pdf":
|
|
126
|
-
return "pdf"
|
|
127
|
-
|
|
128
|
-
# --- CSV ---
|
|
129
|
-
if mime_type in CSV_MIMES or suffix == ".csv":
|
|
130
|
-
return "csv"
|
|
131
|
-
|
|
132
|
-
# --- JSON ---
|
|
133
|
-
if mime_type in JSON_MIMES or suffix == ".json":
|
|
134
|
-
return "json"
|
|
135
|
-
|
|
136
|
-
# --- Office documents ---
|
|
137
|
-
if mime_type in OFFICE_MIMES or suffix in {".doc", ".docx", ".pptx"}:
|
|
138
|
-
return "office"
|
|
139
|
-
|
|
140
|
-
# --- Generic text / code / markup ---
|
|
141
|
-
if mime_type in TEXT_MIMES or suffix in TEXT_EXTS:
|
|
154
|
+
if suffix in text_exts:
|
|
142
155
|
return "text"
|
|
143
|
-
|
|
144
|
-
# --- Unsupported ---
|
|
145
|
-
raise ValueError(
|
|
146
|
-
f"Unsupported file type for '{file_path.name}': "
|
|
147
|
-
f"MIME={mime_type or 'unknown'} extension={suffix}"
|
|
148
|
-
)
|
|
156
|
+
raise ValueError(f"Unsupported file type: {file_path.name} (ext={suffix})")
|
|
149
157
|
|
|
150
158
|
# ------------------------------------------------------------------ #
|
|
151
159
|
# Public entry-point
|
|
@@ -156,19 +164,17 @@ class FileProcessor:
|
|
|
156
164
|
self.validate_file(file_path)
|
|
157
165
|
ftype = self._detect_file_type(file_path)
|
|
158
166
|
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
return await self._process_json(file_path)
|
|
167
|
+
dispatch_map = {
|
|
168
|
+
"pdf": self._process_pdf,
|
|
169
|
+
"text": self._process_text,
|
|
170
|
+
"csv": self._process_csv,
|
|
171
|
+
"office": self._process_office,
|
|
172
|
+
"json": self._process_json,
|
|
173
|
+
}
|
|
174
|
+
if ftype not in dispatch_map:
|
|
175
|
+
raise ValueError(f"Unsupported file type: {file_path.suffix}")
|
|
169
176
|
|
|
170
|
-
|
|
171
|
-
raise ValueError(f"Unsupported file type: {file_path.suffix}")
|
|
177
|
+
return await dispatch_map[ftype](file_path)
|
|
172
178
|
|
|
173
179
|
# ------------------------------------------------------------------ #
|
|
174
180
|
# PDF
|
|
@@ -239,7 +245,7 @@ class FileProcessor:
|
|
|
239
245
|
async def _process_csv(
|
|
240
246
|
self, file_path: Path, text_field: str = "description"
|
|
241
247
|
) -> Dict[str, Any]:
|
|
242
|
-
|
|
248
|
+
texts, metas = [], []
|
|
243
249
|
with file_path.open(newline="", encoding="utf-8") as f:
|
|
244
250
|
reader = csv.DictReader(f)
|
|
245
251
|
for row in reader:
|
|
@@ -369,18 +375,6 @@ class FileProcessor:
|
|
|
369
375
|
pretty = json.dumps(obj, indent=2, ensure_ascii=False)
|
|
370
376
|
return "\n".join(textwrap.wrap(pretty, width=120))
|
|
371
377
|
|
|
372
|
-
async def _encode_chunk_async(self, chunk: str) -> np.ndarray:
|
|
373
|
-
return await asyncio.get_event_loop().run_in_executor(
|
|
374
|
-
self._executor,
|
|
375
|
-
lambda: self.embedding_model.encode(
|
|
376
|
-
[chunk],
|
|
377
|
-
convert_to_numpy=True,
|
|
378
|
-
truncate="model_max_length",
|
|
379
|
-
normalize_embeddings=True,
|
|
380
|
-
show_progress_bar=False,
|
|
381
|
-
)[0],
|
|
382
|
-
)
|
|
383
|
-
|
|
384
378
|
# ------------------------------------------------------------------ #
|
|
385
379
|
# Text chunking helpers
|
|
386
380
|
# ------------------------------------------------------------------ #
|
|
@@ -405,10 +399,11 @@ class FileProcessor:
|
|
|
405
399
|
return chunks
|
|
406
400
|
|
|
407
401
|
def _split_oversized_chunk(self, chunk: str, tokens: List[str] = None) -> List[str]:
|
|
402
|
+
model = self._ensure_model() # Ensure model is loaded to access tokenizer
|
|
408
403
|
if tokens is None:
|
|
409
|
-
tokens =
|
|
404
|
+
tokens = model.tokenizer.tokenize(chunk)
|
|
410
405
|
out = []
|
|
411
406
|
for i in range(0, len(tokens), self.effective_max_length):
|
|
412
407
|
seg = tokens[i : i + self.effective_max_length]
|
|
413
|
-
out.append(
|
|
408
|
+
out.append(model.tokenizer.convert_tokens_to_string(seg))
|
|
414
409
|
return out
|
|
@@ -153,18 +153,21 @@ class MessagesClient(BaseAPIClient):
|
|
|
153
153
|
raise
|
|
154
154
|
|
|
155
155
|
def list_messages(
|
|
156
|
-
self,
|
|
157
|
-
|
|
156
|
+
self,
|
|
157
|
+
thread_id: str,
|
|
158
|
+
limit: int = 20,
|
|
159
|
+
order: str = "asc",
|
|
160
|
+
) -> ent_validator.MessagesList:
|
|
158
161
|
"""
|
|
159
|
-
|
|
162
|
+
Fetch messages for a thread and return an OpenAI-style envelope.
|
|
160
163
|
|
|
161
164
|
Args:
|
|
162
|
-
thread_id (str):
|
|
163
|
-
limit (int):
|
|
164
|
-
order (str):
|
|
165
|
+
thread_id (str): Target thread ID.
|
|
166
|
+
limit (int): Max messages to fetch.
|
|
167
|
+
order (str): 'asc' or 'desc'.
|
|
165
168
|
|
|
166
169
|
Returns:
|
|
167
|
-
|
|
170
|
+
MessagesList: Wrapper containing .data[], .first_id, .last_id, .has_more …
|
|
168
171
|
"""
|
|
169
172
|
logging_utility.info(
|
|
170
173
|
"Listing messages for thread_id: %s, limit: %d, order: %s",
|
|
@@ -178,24 +181,19 @@ class MessagesClient(BaseAPIClient):
|
|
|
178
181
|
f"/v1/threads/{thread_id}/messages", params=params
|
|
179
182
|
)
|
|
180
183
|
response.raise_for_status()
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
return [message.dict() for message in validated_messages]
|
|
184
|
+
|
|
185
|
+
envelope = ent_validator.MessagesList(**response.json())
|
|
186
|
+
logging_utility.info("Retrieved %d messages", len(envelope.data))
|
|
187
|
+
return envelope
|
|
188
|
+
|
|
187
189
|
except ValidationError as e:
|
|
188
190
|
logging_utility.error("Validation error: %s", e.json())
|
|
189
|
-
raise ValueError(f"Validation error: {e}")
|
|
191
|
+
raise ValueError(f"Validation error: {e}") from e
|
|
190
192
|
except httpx.HTTPStatusError as e:
|
|
191
|
-
logging_utility.error(
|
|
192
|
-
"HTTP error occurred while listing messages: %s", str(e)
|
|
193
|
-
)
|
|
193
|
+
logging_utility.error("HTTP error while listing messages: %s", str(e))
|
|
194
194
|
raise
|
|
195
195
|
except Exception as e:
|
|
196
|
-
logging_utility.error(
|
|
197
|
-
"An error occurred while listing messages: %s", str(e)
|
|
198
|
-
)
|
|
196
|
+
logging_utility.error("Unexpected error while listing messages: %s", str(e))
|
|
199
197
|
raise
|
|
200
198
|
|
|
201
199
|
def get_formatted_messages(
|
|
@@ -294,32 +292,19 @@ class MessagesClient(BaseAPIClient):
|
|
|
294
292
|
logging_utility.error("An error occurred: %s", str(e))
|
|
295
293
|
raise RuntimeError(f"An error occurred: {str(e)}")
|
|
296
294
|
|
|
297
|
-
def delete_message(self, message_id: str) ->
|
|
298
|
-
"""
|
|
299
|
-
Delete a message by its ID.
|
|
300
|
-
|
|
301
|
-
Args:
|
|
302
|
-
message_id (str): The ID of the message.
|
|
303
|
-
|
|
304
|
-
Returns:
|
|
305
|
-
Dict[str, Any]: The deletion result.
|
|
306
|
-
"""
|
|
295
|
+
def delete_message(self, message_id: str) -> ent_validator.MessageDeleted:
|
|
296
|
+
"""Delete a message and return deletion envelope."""
|
|
307
297
|
logging_utility.info("Deleting message with id: %s", message_id)
|
|
308
298
|
try:
|
|
309
299
|
response = self.client.delete(f"/v1/messages/{message_id}")
|
|
310
300
|
response.raise_for_status()
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
return result
|
|
301
|
+
return ent_validator.MessageDeleted(**response.json())
|
|
302
|
+
|
|
314
303
|
except httpx.HTTPStatusError as e:
|
|
315
|
-
logging_utility.error(
|
|
316
|
-
"HTTP error occurred while deleting message: %s", str(e)
|
|
317
|
-
)
|
|
304
|
+
logging_utility.error("HTTP error while deleting message: %s", str(e))
|
|
318
305
|
raise
|
|
319
306
|
except Exception as e:
|
|
320
|
-
logging_utility.error(
|
|
321
|
-
"An error occurred while deleting message: %s", str(e)
|
|
322
|
-
)
|
|
307
|
+
logging_utility.error("Unexpected error while deleting message: %s", str(e))
|
|
323
308
|
raise
|
|
324
309
|
|
|
325
310
|
def save_assistant_message_chunk(
|