projectdavid 1.29.9-py3-none-any.whl → 1.38.1-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in the public registry, and is provided for informational purposes only.
- projectdavid/clients/assistants_client.py +7 -13
- projectdavid/clients/file_processor.py +216 -76
- projectdavid/clients/messages_client.py +24 -39
- projectdavid/clients/runs.py +156 -211
- projectdavid/clients/synchronous_inference_wrapper.py +52 -24
- projectdavid/clients/threads_client.py +32 -12
- projectdavid/clients/vector_store_manager.py +110 -21
- projectdavid/clients/vectors.py +250 -96
- projectdavid/clients/vision_file_processor.py +462 -0
- projectdavid/clients/vision_vectors.py +1058 -0
- projectdavid/decorators.py +64 -0
- projectdavid/entity.py +24 -5
- projectdavid/synthesis/reranker.py +4 -2
- projectdavid/utils/function_call_suppressor.py +40 -0
- {projectdavid-1.29.9.dist-info → projectdavid-1.38.1.dist-info}/METADATA +8 -6
- {projectdavid-1.29.9.dist-info → projectdavid-1.38.1.dist-info}/RECORD +19 -15
- {projectdavid-1.29.9.dist-info → projectdavid-1.38.1.dist-info}/WHEEL +1 -1
- {projectdavid-1.29.9.dist-info → projectdavid-1.38.1.dist-info}/licenses/LICENSE +0 -0
- {projectdavid-1.29.9.dist-info → projectdavid-1.38.1.dist-info}/top_level.txt +0 -0
projectdavid/clients/assistants_client.py

@@ -112,7 +112,6 @@ class AssistantsClient(BaseAPIClient):
         description: str = "",
         instructions: str = "",
         tools: Optional[List[Dict[str, Any]]] = None,
-        platform_tools: Optional[List[Dict[str, Any]]] = None,
         tool_resources: Optional[Dict[str, Dict[str, Any]]] = None,
         meta_data: Optional[Dict[str, Any]] = None,
         top_p: float = 1.0,
@@ -133,7 +132,6 @@ class AssistantsClient(BaseAPIClient):
             "model": model,
             "instructions": instructions,
             "tools": tools,
-            "platform_tools": platform_tools,
             "tool_resources": tool_resources,
             "meta_data": meta_data,
             "top_p": top_p,
@@ -263,14 +261,10 @@ class AssistantsClient(BaseAPIClient):
         )
         return {"message": "Assistant disassociated from user successfully"}

-    def …
-    …
-    …
-    …
-    …
-    …
-    …
-            return [ent_validator.AssistantRead(**a) for a in raw_list]
-        except ValidationError as e:
-            logging_utility.error("Validation error: %s", e.json())
-            raise AssistantsClientError(f"Validation error: {e}") from e
+    def list(self) -> list[ent_validator.AssistantRead]:
+        """Return every assistant owned by *this* API key."""
+        logging_utility.info("Listing assistants")
+
+        resp = self._request_with_retries("GET", "/v1/assistants")
+        raw = self._parse_response(resp)
+        return [ent_validator.AssistantRead(**a) for a in raw]
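Taken together, the assistants_client.py hunks remove the `platform_tools` parameter from assistant creation and replace the old try/except listing block with a lean `list()` that returns validated `AssistantRead` models. A hedged usage sketch — the `Entity` bootstrap and the creation method name are assumptions based on this diff, not confirmed API:

    # Hedged sketch; client bootstrap and method names are assumptions.
    from projectdavid import Entity

    client = Entity(base_url="https://api.example.com", api_key="YOUR_KEY")

    # 1.38.x: 'platform_tools' is no longer an accepted keyword here.
    assistant = client.assistants.create_assistant(
        model="gpt-4o-mini",
        instructions="Answer support questions.",
        tools=[{"type": "code_interpreter"}],
    )

    # New in this release: list() returns pydantic AssistantRead models.
    for a in client.assistants.list():
        print(a.id)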
projectdavid/clients/file_processor.py

@@ -1,80 +1,184 @@
 import asyncio
 import csv
+import json
 import re
+import textwrap
 from concurrent.futures import ThreadPoolExecutor
-from datetime import datetime
 from pathlib import Path
 from typing import Any, Dict, List, Tuple, Union

-try:
-    from typing import LiteralString
-except ImportError:
+try:  # Python 3.11+
+    from typing import LiteralString
+except ImportError:  # 3.9–3.10
     from typing_extensions import LiteralString

 import numpy as np
 import pdfplumber
-import …
+from docx import Document
+from pptx import Presentation
 from projectdavid_common import UtilsInterface
-from sentence_transformers import SentenceTransformer

 log = UtilsInterface.LoggingUtility()


 class FileProcessor:
+    # ------------------------------------------------------------------ #
+    # Construction
+    # ------------------------------------------------------------------ #
     def __init__(self, max_workers: int = 4, chunk_size: int = 512):
-        self.embedding_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
         self.embedding_model_name = "paraphrase-MiniLM-L6-v2"
+        self._embedding_model = None
         self._executor = ThreadPoolExecutor(max_workers=max_workers)

-        # …
-        self.…
-        self.…
-        self.…
+        # Lazy-initialized attributes
+        self._requested_chunk_size = chunk_size
+        self._max_seq_length = None
+        self._effective_max_length = None
+        self._chunk_size = None

-
-        self.chunk_size = min(chunk_size, self.effective_max_length * 4)
+        log.info("Initialized Lazy-Loaded FileProcessor")

-
+    def _ensure_model(self):
+        """
+        Internal helper to load the model and calculate limits only once.
+        This prevents heavy imports (scipy, torch) until actually needed.
+        """
+        if self._embedding_model is None:
+            try:
+                from sentence_transformers import SentenceTransformer
+
+                log.info(f"Lazy-loading model: {self.embedding_model_name}")
+
+                self._embedding_model = SentenceTransformer(self.embedding_model_name)
+
+                # Ported limit calculations
+                self._max_seq_length = self._embedding_model.get_max_seq_length()
+                special_tokens_count = 2
+                self._effective_max_length = self._max_seq_length - special_tokens_count
+                self._chunk_size = min(
+                    self._requested_chunk_size, self._effective_max_length * 4
+                )
+
+            except ImportError:
+                log.error(
+                    "sentence-transformers not found. Ensure 'pip install projectdavid[vision]' is installed."
+                )
+                raise ImportError(
+                    "Model-based features require 'sentence-transformers'. Install with [vision] extra."
+                )
+        return self._embedding_model
+
+    # Properties to maintain access to derived attributes
+    @property
+    def chunk_size(self):
+        if self._chunk_size is None:
+            self._ensure_model()
+        return self._chunk_size
+
+    @property
+    def effective_max_length(self):
+        if self._effective_max_length is None:
+            self._ensure_model()
+        return self._effective_max_length
+
+    # ------------------------------------------------------------------ #
+    # Embeddings
+    # ------------------------------------------------------------------ #
+    def encode_text(self, text: str):
+        model = self._ensure_model()
+        return model.encode(
+            [text],
+            convert_to_numpy=True,
+            truncate="model_max_length",
+            normalize_embeddings=True,
+        )[0]
+
+    async def _encode_chunk_async(self, chunk: str) -> np.ndarray:
+        model = self._ensure_model()
+        return await asyncio.get_event_loop().run_in_executor(
+            self._executor,
+            lambda: model.encode(
+                [chunk],
+                convert_to_numpy=True,
+                truncate="model_max_length",
+                normalize_embeddings=True,
+                show_progress_bar=False,
+            )[0],
+        )

+    # ------------------------------------------------------------------ #
+    # Generic validators
+    # ------------------------------------------------------------------ #
     def validate_file(self, file_path: Path):
-        """Ensure file exists and is under 100 …
+        """Ensure file exists and is under 100 MB."""
         max_size = 100 * 1024 * 1024
         if not file_path.exists():
             raise FileNotFoundError(f"File not found: {file_path}")
         if file_path.stat().st_size > max_size:
             mb = max_size // (1024 * 1024)
-            raise ValueError(f"{file_path.name} > {mb} …
+            raise ValueError(f"{file_path.name} > {mb} MB limit")

+    # ------------------------------------------------------------------ #
+    # File-type detection
+    # ------------------------------------------------------------------ #
     def _detect_file_type(self, file_path: Path) -> str:
-        """Return 'pdf', 'text', or 'csv'."""
         suffix = file_path.suffix.lower()
         if suffix == ".pdf":
             return "pdf"
         if suffix == ".csv":
             return "csv"
-        if suffix …
+        if suffix == ".json":
+            return "json"
+        if suffix in {".doc", ".docx", ".pptx"}:
+            return "office"
+
+        text_exts = {
+            ".txt",
+            ".md",
+            ".rst",
+            ".c",
+            ".cpp",
+            ".cs",
+            ".go",
+            ".java",
+            ".js",
+            ".ts",
+            ".php",
+            ".py",
+            ".rb",
+            ".sh",
+            ".tex",
+            ".html",
+            ".css",
+        }
+        if suffix in text_exts:
             return "text"
-        …
+        raise ValueError(f"Unsupported file type: {file_path.name} (ext={suffix})")

+    # ------------------------------------------------------------------ #
+    # Public entry-point
+    # ------------------------------------------------------------------ #
     async def process_file(self, file_path: Union[str, Path]) -> Dict[str, Any]:
-        """
-        Async entrypoint: validate, detect type, then dispatch to the
-        appropriate processor (_process_pdf, _process_text, or _process_csv).
-        """
+        """Validate → detect → dispatch to the appropriate processor."""
         file_path = Path(file_path)
         self.validate_file(file_path)
         ftype = self._detect_file_type(file_path)

-        …
-        …
-        …
-        …
-        …
-        …
-        …
+        dispatch_map = {
+            "pdf": self._process_pdf,
+            "text": self._process_text,
+            "csv": self._process_csv,
+            "office": self._process_office,
+            "json": self._process_json,
+        }
+        if ftype not in dispatch_map:
+            raise ValueError(f"Unsupported file type: {file_path.suffix}")

-        …
+        return await dispatch_map[ftype](file_path)

+    # ------------------------------------------------------------------ #
+    # PDF
+    # ------------------------------------------------------------------ #
     async def _process_pdf(self, file_path: Path) -> Dict[str, Any]:
         page_chunks, doc_meta = await self._extract_text(file_path)
         all_chunks, line_data = [], []
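The key change in this hunk is that `FileProcessor.__init__` no longer constructs a `SentenceTransformer` eagerly; the model, its max sequence length, and the derived chunk size are all computed on first use inside `_ensure_model()`, so importing or instantiating the class stays cheap. A minimal standalone sketch of the same lazy-initialization pattern (illustrative names, not part of the package):

    # Standalone sketch of the lazy-load pattern used above.
    class LazyEmbedder:
        def __init__(self, model_name: str = "paraphrase-MiniLM-L6-v2"):
            self.model_name = model_name
            self._model = None  # heavy object not created yet

        def _ensure_model(self):
            if self._model is None:
                # Heavy import happens here, on first call only.
                from sentence_transformers import SentenceTransformer
                self._model = SentenceTransformer(self.model_name)
            return self._model

        def encode(self, text: str):
            return self._ensure_model().encode([text], convert_to_numpy=True)[0]

    embedder = LazyEmbedder()       # instant: no torch/scipy import yet
    vec = embedder.encode("hello")  # first call pays the model-load cost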
@@ -82,7 +186,6 @@ class FileProcessor:
         for page_text, page_num, line_nums in page_chunks:
             lines = page_text.split("\n")
             buf, buf_lines, length = [], [], 0
-
             for line, ln in zip(lines, line_nums):
                 l = len(line) + 1
                 if length + l <= self.chunk_size:
@@ -94,12 +197,9 @@ class FileProcessor:
                     all_chunks.append("\n".join(buf))
                     line_data.append({"page": page_num, "lines": buf_lines})
                     buf, buf_lines, length = [], [], 0
-
-                    # split any oversized line
                     for piece in self._split_oversized_chunk(line):
                         all_chunks.append(piece)
                         line_data.append({"page": page_num, "lines": [ln]})
-
             if buf:
                 all_chunks.append("\n".join(buf))
                 line_data.append({"page": page_num, "lines": buf_lines})
@@ -107,7 +207,6 @@ class FileProcessor:
         vectors = await asyncio.gather(
            *[self._encode_chunk_async(c) for c in all_chunks]
         )
-
         return {
             "content": "\n\n".join(all_chunks),
             "metadata": {
@@ -121,6 +220,9 @@ class FileProcessor:
             "line_data": line_data,
         }

+    # ------------------------------------------------------------------ #
+    # Plain-text / code / markup
+    # ------------------------------------------------------------------ #
     async def _process_text(self, file_path: Path) -> Dict[str, Any]:
         text, extra_meta, _ = await self._extract_text(file_path)
         chunks = self._chunk_text(text)
@@ -137,17 +239,13 @@ class FileProcessor:
             "vectors": [v.tolist() for v in vectors],
         }

-    # …
-
+    # ------------------------------------------------------------------ #
+    # CSV
+    # ------------------------------------------------------------------ #
     async def _process_csv(
         self, file_path: Path, text_field: str = "description"
     ) -> Dict[str, Any]:
-        """
-        Read each row, embed the `text_field`, and collect per-row metadata
-        from all other columns.
-        """
-        # load rows synchronously
-        rows, texts, metas = [], [], []
+        texts, metas = [], []
         with file_path.open(newline="", encoding="utf-8") as f:
             reader = csv.DictReader(f)
             for row in reader:
@@ -155,27 +253,67 @@ class FileProcessor:
                 if not txt:
                     continue
                 texts.append(txt)
-
-                row_meta = {k: v for k, v in row.items() if k != text_field and v}
-                metas.append(row_meta)
+                metas.append({k: v for k, v in row.items() if k != text_field and v})

-        # embed in parallel
         vectors = await asyncio.gather(*[self._encode_chunk_async(t) for t in texts])
+        return {
+            "content": None,
+            "metadata": {"source": str(file_path), "rows": len(texts), "type": "csv"},
+            "chunks": texts,
+            "vectors": [v.tolist() for v in vectors],
+            "csv_row_metadata": metas,
+        }
+
+    # ------------------------------------------------------------------ #
+    # Office docs (.doc/.docx/.pptx)
+    # ------------------------------------------------------------------ #
+    async def _process_office(self, file_path: Path) -> Dict[str, Any]:
+        loop = asyncio.get_event_loop()
+        if file_path.suffix.lower() in {".doc", ".docx"}:
+            text = await loop.run_in_executor(
+                self._executor, self._read_docx, file_path
+            )
+        else:  # .pptx
+            text = await loop.run_in_executor(
+                self._executor, self._read_pptx, file_path
+            )

+        chunks = self._chunk_text(text)
+        vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
         return {
-            "content": …
+            "content": text,
             "metadata": {
                 "source": str(file_path),
-                "…
-                "type": "…
+                "chunks": len(chunks),
+                "type": "office",
             },
-            "chunks": …
+            "chunks": chunks,
             "vectors": [v.tolist() for v in vectors],
-            "csv_row_metadata": metas,
         }

-    # …
+    # ------------------------------------------------------------------ #
+    # JSON
+    # ------------------------------------------------------------------ #
+    async def _process_json(self, file_path: Path) -> Dict[str, Any]:
+        text = await asyncio.get_event_loop().run_in_executor(
+            self._executor, self._read_json, file_path
+        )
+        chunks = self._chunk_text(text)
+        vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
+        return {
+            "content": text,
+            "metadata": {
+                "source": str(file_path),
+                "chunks": len(chunks),
+                "type": "json",
+            },
+            "chunks": chunks,
+            "vectors": [v.tolist() for v in vectors],
+        }

+    # ------------------------------------------------------------------ #
+    # Shared helpers
+    # ------------------------------------------------------------------ #
     async def _extract_text(self, file_path: Path) -> Union[
         Tuple[List[Tuple[str, int, List[int]]], Dict[str, Any]],
         Tuple[str, Dict[str, Any], List[int]],
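With `_process_office` and `_process_json` added, every supported format funnels through the same `process_file` entry point and returns the same `{content, metadata, chunks, vectors}` shape (plus `csv_row_metadata` for CSV). A usage sketch, assuming the module path from the file list above and an illustrative file name:

    import asyncio
    from projectdavid.clients.file_processor import FileProcessor

    async def main():
        fp = FileProcessor(chunk_size=512)
        # Same entry point for every type; dispatch picks _process_office here.
        result = await fp.process_file("report.docx")
        print(result["metadata"])      # {'source': ..., 'chunks': N, 'type': 'office'}
        print(len(result["vectors"]))  # one embedding per chunk

    asyncio.run(main())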
@@ -203,10 +341,8 @@ class FileProcessor:
             )
             for i, page in enumerate(pdf.pages, start=1):
                 lines = page.extract_text_lines()
-                txts, nums = [], []
-                # sort by vertical position
                 sorted_lines = sorted(lines, key=lambda x: x["top"])
-
+                txts, nums = [], []
                 for ln_idx, L in enumerate(sorted_lines, start=1):
                     t = L.get("text", "").strip()
                     if t:
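The hunk above only moves the `txts, nums` initialization below the sort; extraction behavior is unchanged. For context, pdfplumber's `extract_text_lines()` yields one dict per detected line including its vertical offset, so sorting on `"top"` restores reading order — a small sketch (file name illustrative):

    import pdfplumber

    with pdfplumber.open("example.pdf") as pdf:
        lines = pdf.pages[0].extract_text_lines()
        for ln in sorted(lines, key=lambda x: x["top"]):  # top-to-bottom order
            print(f'{ln["top"]:7.1f}  {ln["text"]}')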
@@ -222,23 +358,29 @@ class FileProcessor:
         except UnicodeDecodeError:
             return file_path.read_text(encoding="latin-1")

-    …
-    …
-    …
-    …
-    …
-    …
-    …
-    …
-    …
-            )
-        )
-
+    def _read_docx(self, path: Path) -> str:
+        doc = Document(path)
+        return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
+
+    def _read_pptx(self, path: Path) -> str:
+        prs = Presentation(path)
+        slides = []
+        for slide in prs.slides:
+            chunks = [sh.text for sh in slide.shapes if hasattr(sh, "text")]
+            slides.append("\n".join(filter(None, chunks)))
+        return "\n\n".join(slides)
+
+    def _read_json(self, path: Path) -> str:
+        obj = json.loads(path.read_text(encoding="utf-8"))
+        pretty = json.dumps(obj, indent=2, ensure_ascii=False)
+        return "\n".join(textwrap.wrap(pretty, width=120))
+
+    # ------------------------------------------------------------------ #
+    # Text chunking helpers
+    # ------------------------------------------------------------------ #
     def _chunk_text(self, text: str) -> List[str]:
-        # split into sentences, then re-chunk to token limits
         sentences = re.split(r"(?<=[\.!?])\s+", text)
         chunks, buf, length = [], [], 0
-
         for sent in sentences:
             slen = len(sent) + 1
             if length + slen <= self.chunk_size:
@@ -248,22 +390,20 @@ class FileProcessor:
                 if buf:
                     chunks.append(" ".join(buf))
                     buf, length = [], 0
-                # sentence itself may be too big
                 while len(sent) > self.chunk_size:
                     part, sent = sent[: self.chunk_size], sent[self.chunk_size :]
                     chunks.append(part)
                 buf, length = [sent], len(sent)
-
         if buf:
             chunks.append(" ".join(buf))
-
         return chunks

     def _split_oversized_chunk(self, chunk: str, tokens: List[str] = None) -> List[str]:
+        model = self._ensure_model()  # Ensure model is loaded to access tokenizer
         if tokens is None:
-            tokens = …
+            tokens = model.tokenizer.tokenize(chunk)
         out = []
         for i in range(0, len(tokens), self.effective_max_length):
             seg = tokens[i : i + self.effective_max_length]
-            out.append(…
+            out.append(model.tokenizer.convert_tokens_to_string(seg))
         return out
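`_chunk_text` packs whole sentences greedily up to `chunk_size` characters and hard-splits any single sentence longer than the limit, while `_split_oversized_chunk` now re-splits on the lazily loaded model's tokenizer. A self-contained sketch of the sentence-packing step mirroring the hunks above (the in-limit branch is inferred, since the diff elides it):

    import re

    def chunk_text(text: str, chunk_size: int = 40) -> list[str]:
        # Greedy sentence packing, mirroring the logic in the hunks above.
        sentences = re.split(r"(?<=[\.!?])\s+", text)
        chunks, buf, length = [], [], 0
        for sent in sentences:
            slen = len(sent) + 1
            if length + slen <= chunk_size:
                buf.append(sent)          # inferred branch: sentence fits
                length += slen
            else:
                if buf:
                    chunks.append(" ".join(buf))
                    buf, length = [], 0
                while len(sent) > chunk_size:  # hard-split oversized sentence
                    part, sent = sent[:chunk_size], sent[chunk_size:]
                    chunks.append(part)
                buf, length = [sent], len(sent)
        if buf:
            chunks.append(" ".join(buf))
        return chunks

    print(chunk_text("One. Two sentences here. A very very very long sentence indeed!"))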
projectdavid/clients/messages_client.py

@@ -153,18 +153,21 @@ class MessagesClient(BaseAPIClient):
             raise

     def list_messages(
-        self,
-        …
+        self,
+        thread_id: str,
+        limit: int = 20,
+        order: str = "asc",
+    ) -> ent_validator.MessagesList:
         """
-        …
+        Fetch messages for a thread and return an OpenAI-style envelope.

         Args:
-            thread_id (str): …
-            limit (int): …
-            order (str): …
+            thread_id (str): Target thread ID.
+            limit (int): Max messages to fetch.
+            order (str): 'asc' or 'desc'.

         Returns:
-            …
+            MessagesList: Wrapper containing .data[], .first_id, .last_id, .has_more …
         """
         logging_utility.info(
             "Listing messages for thread_id: %s, limit: %d, order: %s",
@@ -178,24 +181,19 @@ class MessagesClient(BaseAPIClient):
                 f"/v1/threads/{thread_id}/messages", params=params
             )
             response.raise_for_status()
-            …
-            …
-            …
-            …
-            …
-            return [message.dict() for message in validated_messages]
+
+            envelope = ent_validator.MessagesList(**response.json())
+            logging_utility.info("Retrieved %d messages", len(envelope.data))
+            return envelope
+
         except ValidationError as e:
             logging_utility.error("Validation error: %s", e.json())
-            raise ValueError(f"Validation error: {e}")
+            raise ValueError(f"Validation error: {e}") from e
         except httpx.HTTPStatusError as e:
-            logging_utility.error(
-                "HTTP error occurred while listing messages: %s", str(e)
-            )
+            logging_utility.error("HTTP error while listing messages: %s", str(e))
             raise
         except Exception as e:
-            logging_utility.error(
-                "An error occurred while listing messages: %s", str(e)
-            )
+            logging_utility.error("Unexpected error while listing messages: %s", str(e))
             raise

     def get_formatted_messages(
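Behavior change worth noting: `list_messages` used to return `[message.dict() for ...]` and now returns a validated `MessagesList` envelope. A hedged consumer sketch — envelope fields follow the docstring above; the `Entity` bootstrap and per-message field names are assumptions:

    from projectdavid import Entity

    client = Entity(base_url="https://api.example.com", api_key="YOUR_KEY")

    page = client.messages.list_messages(
        thread_id="thread_abc123", limit=20, order="asc"
    )
    for msg in page.data:        # validated models, not raw dicts
        print(msg.id, msg.role)  # field names assumed from typical message schemas
    if page.has_more:            # pagination hints per the docstring above
        print("continue after:", page.last_id)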
@@ -294,32 +292,19 @@ class MessagesClient(BaseAPIClient):
             logging_utility.error("An error occurred: %s", str(e))
             raise RuntimeError(f"An error occurred: {str(e)}")

-    def delete_message(self, message_id: str) -> …
-        """
-        Delete a message by its ID.
-
-        Args:
-            message_id (str): The ID of the message.
-
-        Returns:
-            Dict[str, Any]: The deletion result.
-        """
+    def delete_message(self, message_id: str) -> ent_validator.MessageDeleted:
+        """Delete a message and return deletion envelope."""
         logging_utility.info("Deleting message with id: %s", message_id)
         try:
             response = self.client.delete(f"/v1/messages/{message_id}")
             response.raise_for_status()
-            …
-            …
-            return result
+            return ent_validator.MessageDeleted(**response.json())
+
         except httpx.HTTPStatusError as e:
-            logging_utility.error(
-                "HTTP error occurred while deleting message: %s", str(e)
-            )
+            logging_utility.error("HTTP error while deleting message: %s", str(e))
             raise
         except Exception as e:
-            logging_utility.error(
-                "An error occurred while deleting message: %s", str(e)
-            )
+            logging_utility.error("Unexpected error while deleting message: %s", str(e))
             raise

     def save_assistant_message_chunk(
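`delete_message` similarly moves from a raw dict to a typed `MessageDeleted` envelope. A short sketch under the same assumptions — the `deleted` field follows the usual OpenAI-style deletion object and is not confirmed by this diff:

    from projectdavid import Entity

    client = Entity(base_url="https://api.example.com", api_key="YOUR_KEY")

    result = client.messages.delete_message("msg_abc123")
    print(result.id, result.deleted)  # 'deleted' field name is an assumption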
|