projectdavid 1.30.4-py3-none-any.whl → 1.31.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- projectdavid/clients/file_processor.py +196 -50
- projectdavid/clients/vectors.py +61 -61
- {projectdavid-1.30.4.dist-info → projectdavid-1.31.0.dist-info}/METADATA +4 -1
- {projectdavid-1.30.4.dist-info → projectdavid-1.31.0.dist-info}/RECORD +7 -7
- {projectdavid-1.30.4.dist-info → projectdavid-1.31.0.dist-info}/WHEEL +0 -0
- {projectdavid-1.30.4.dist-info → projectdavid-1.31.0.dist-info}/licenses/LICENSE +0 -0
- {projectdavid-1.30.4.dist-info → projectdavid-1.31.0.dist-info}/top_level.txt +0 -0
projectdavid/clients/file_processor.py
CHANGED

@@ -1,19 +1,23 @@
 import asyncio
 import csv
+import json
+import mimetypes
 import re
+import textwrap
 from concurrent.futures import ThreadPoolExecutor
-from datetime import datetime
 from pathlib import Path
 from typing import Any, Dict, List, Tuple, Union
 
-try:
-    from typing import LiteralString
-except ImportError:
+try:  # Python 3.11+
+    from typing import LiteralString
+except ImportError:  # 3.9 - 3.10
     from typing_extensions import LiteralString
 
+import magic
 import numpy as np
 import pdfplumber
-import
+from docx import Document
+from pptx import Presentation
 from projectdavid_common import UtilsInterface
 from sentence_transformers import SentenceTransformer
 
@@ -21,46 +25,133 @@ log = UtilsInterface.LoggingUtility()
 
 
 class FileProcessor:
+    # ------------------------------------------------------------------ #
+    # Construction
+    # ------------------------------------------------------------------ #
     def __init__(self, max_workers: int = 4, chunk_size: int = 512):
         self.embedding_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
         self.embedding_model_name = "paraphrase-MiniLM-L6-v2"
         self._executor = ThreadPoolExecutor(max_workers=max_workers)
 
-        #
+        # token limits
         self.max_seq_length = self.embedding_model.get_max_seq_length()
         self.special_tokens_count = 2
         self.effective_max_length = self.max_seq_length - self.special_tokens_count
-
-        # chunk_size cannot exceed 4× model max
         self.chunk_size = min(chunk_size, self.effective_max_length * 4)
 
         log.info("Initialized optimized FileProcessor")
 
+    # ------------------------------------------------------------------ #
+    # Generic validators
+    # ------------------------------------------------------------------ #
     def validate_file(self, file_path: Path):
-        """Ensure file exists and is under 100
+        """Ensure file exists and is under 100 MB."""
         max_size = 100 * 1024 * 1024
         if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")
         if file_path.stat().st_size > max_size:
             mb = max_size // (1024 * 1024)
-            raise ValueError(f"{file_path.name} > {mb}
+            raise ValueError(f"{file_path.name} > {mb} MB limit")
 
+    # ------------------------------------------------------------------ #
+    # File-type detection (extension + MIME)
+    # ------------------------------------------------------------------ #
     def _detect_file_type(self, file_path: Path) -> str:
-        """
+        """
+        Return a handler tag:
+
+            • 'pdf'   • 'csv'
+            • 'json'  • 'office'
+            • 'text'
+
+        Raises *ValueError* on anything unknown.
+        """
+        # 1️⃣ Best-effort MIME sniff
+        mime_type: str | None = None
+        if magic is not None:
+            try:
+                mime_type = magic.from_file(str(file_path), mime=True)
+            except Exception:
+                mime_type = None
+
+        # 2️⃣ Fallback → mimetypes
+        if not mime_type:
+            mime_type, _ = mimetypes.guess_type(file_path.name)
+
         suffix = file_path.suffix.lower()
-
+
+        PDF_MIMES = {"application/pdf"}
+        CSV_MIMES = {"text/csv", "application/csv"}
+        JSON_MIMES = {"application/json"}
+        OFFICE_MIMES = {
+            "application/msword",
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+        }
+        TEXT_MIMES = {
+            "text/plain",
+            "text/markdown",
+            "text/x-python",
+            "text/x-c",
+            "text/x-c++",
+            "text/x-java-source",
+            "text/x-script.python",
+            "text/html",
+            "text/css",
+            "application/typescript",
+            "text/javascript",
+        }
+        TEXT_EXTS = {
+            ".txt",
+            ".md",
+            ".rst",
+            ".c",
+            ".cpp",
+            ".cs",
+            ".go",
+            ".java",
+            ".js",
+            ".ts",
+            ".php",
+            ".py",
+            ".rb",
+            ".sh",
+            ".tex",
+            ".html",
+            ".css",
+        }
+
+        # --- PDF ---
+        if mime_type in PDF_MIMES or suffix == ".pdf":
             return "pdf"
-
+
+        # --- CSV ---
+        if mime_type in CSV_MIMES or suffix == ".csv":
             return "csv"
-
+
+        # --- JSON ---
+        if mime_type in JSON_MIMES or suffix == ".json":
+            return "json"
+
+        # --- Office documents ---
+        if mime_type in OFFICE_MIMES or suffix in {".doc", ".docx", ".pptx"}:
+            return "office"
+
+        # --- Generic text / code / markup ---
+        if mime_type in TEXT_MIMES or suffix in TEXT_EXTS:
             return "text"
-        return "unknown"
 
+        # --- Unsupported ---
+        raise ValueError(
+            f"Unsupported file type for '{file_path.name}': "
+            f"MIME={mime_type or 'unknown'} extension={suffix}"
+        )
+
+    # ------------------------------------------------------------------ #
+    # Public entry-point
+    # ------------------------------------------------------------------ #
     async def process_file(self, file_path: Union[str, Path]) -> Dict[str, Any]:
-        """
-        Async entrypoint: validate, detect type, then dispatch to the
-        appropriate processor (_process_pdf, _process_text, or _process_csv).
-        """
+        """Validate → detect → dispatch to the appropriate processor."""
         file_path = Path(file_path)
         self.validate_file(file_path)
         ftype = self._detect_file_type(file_path)
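The new _detect_file_type prefers a content-based MIME sniff (python-magic) and only falls back to a name-based mimetypes guess and the raw file extension when sniffing is unavailable or inconclusive. A minimal standalone sketch of that fallback order, trimmed to two handler tags; the detect helper below is illustrative only, not part of the SDK:

import mimetypes
from pathlib import Path

try:
    import magic  # python-magic; also needs the native libmagic library
except ImportError:
    magic = None

def detect(path: Path) -> str:
    mime = None
    if magic is not None:
        try:
            mime = magic.from_file(str(path), mime=True)  # content-based sniff
        except Exception:
            mime = None
    if not mime:
        mime, _ = mimetypes.guess_type(path.name)         # name-based guess
    suffix = path.suffix.lower()
    if mime == "application/pdf" or suffix == ".pdf":
        return "pdf"
    if mime in {"text/csv", "application/csv"} or suffix == ".csv":
        return "csv"
    raise ValueError(f"Unsupported file type: {suffix}")

print(detect(Path("report.pdf")))  # -> "pdf"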
@@ -71,10 +162,17 @@ class FileProcessor:
             return await self._process_text(file_path)
         if ftype == "csv":
             return await self._process_csv(file_path)
-
+        if ftype == "office":
+            return await self._process_office(file_path)
+        if ftype == "json":
+            return await self._process_json(file_path)
 
-
+        # Safety net (should never hit)
+        raise ValueError(f"Unsupported file type: {file_path.suffix}")
 
+    # ------------------------------------------------------------------ #
+    # PDF
+    # ------------------------------------------------------------------ #
     async def _process_pdf(self, file_path: Path) -> Dict[str, Any]:
         page_chunks, doc_meta = await self._extract_text(file_path)
         all_chunks, line_data = [], []
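With the office and json branches wired into process_file, a single async entry point now covers Word, PowerPoint, and JSON uploads as well. A hedged usage sketch; the file name is a placeholder, and the embedding model is downloaded on first construction:

import asyncio
from projectdavid.clients.file_processor import FileProcessor

async def main():
    processor = FileProcessor()
    result = await processor.process_file("slides.pptx")  # dispatched to _process_office
    print(result["metadata"]["type"], result["metadata"]["chunks"])
    print(len(result["vectors"]), "embedding vectors")

asyncio.run(main())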
@@ -82,7 +180,6 @@ class FileProcessor:
         for page_text, page_num, line_nums in page_chunks:
             lines = page_text.split("\n")
             buf, buf_lines, length = [], [], 0
-
             for line, ln in zip(lines, line_nums):
                 l = len(line) + 1
                 if length + l <= self.chunk_size:
@@ -94,12 +191,9 @@ class FileProcessor:
                         all_chunks.append("\n".join(buf))
                         line_data.append({"page": page_num, "lines": buf_lines})
                         buf, buf_lines, length = [], [], 0
-
-                    # split any oversized line
                     for piece in self._split_oversized_chunk(line):
                         all_chunks.append(piece)
                         line_data.append({"page": page_num, "lines": [ln]})
-
             if buf:
                 all_chunks.append("\n".join(buf))
                 line_data.append({"page": page_num, "lines": buf_lines})
@@ -107,7 +201,6 @@ class FileProcessor:
         vectors = await asyncio.gather(
             *[self._encode_chunk_async(c) for c in all_chunks]
         )
-
         return {
             "content": "\n\n".join(all_chunks),
             "metadata": {
@@ -121,6 +214,9 @@ class FileProcessor:
                 "line_data": line_data,
         }
 
+    # ------------------------------------------------------------------ #
+    # Plain-text / code / markup
+    # ------------------------------------------------------------------ #
     async def _process_text(self, file_path: Path) -> Dict[str, Any]:
         text, extra_meta, _ = await self._extract_text(file_path)
         chunks = self._chunk_text(text)
@@ -137,15 +233,12 @@ class FileProcessor:
             "vectors": [v.tolist() for v in vectors],
         }
 
-    #
+    # ------------------------------------------------------------------ #
+    # CSV
+    # ------------------------------------------------------------------ #
     async def _process_csv(
         self, file_path: Path, text_field: str = "description"
     ) -> Dict[str, Any]:
-        """
-        Read each row, embed the `text_field`, and collect per-row metadata
-        from all other columns.
-        """
-        # load rows synchronously
        rows, texts, metas = [], [], []
         with file_path.open(newline="", encoding="utf-8") as f:
             reader = csv.DictReader(f)
@@ -154,27 +247,67 @@ class FileProcessor:
                 if not txt:
                     continue
                 texts.append(txt)
-
-                row_meta = {k: v for k, v in row.items() if k != text_field and v}
-                metas.append(row_meta)
+                metas.append({k: v for k, v in row.items() if k != text_field and v})
 
-        # embed in parallel
         vectors = await asyncio.gather(*[self._encode_chunk_async(t) for t in texts])
+        return {
+            "content": None,
+            "metadata": {"source": str(file_path), "rows": len(texts), "type": "csv"},
+            "chunks": texts,
+            "vectors": [v.tolist() for v in vectors],
+            "csv_row_metadata": metas,
+        }
 
+    # ------------------------------------------------------------------ #
+    # Office docs (.doc/.docx/.pptx)
+    # ------------------------------------------------------------------ #
+    async def _process_office(self, file_path: Path) -> Dict[str, Any]:
+        loop = asyncio.get_event_loop()
+        if file_path.suffix.lower() in {".doc", ".docx"}:
+            text = await loop.run_in_executor(
+                self._executor, self._read_docx, file_path
+            )
+        else:  # .pptx
+            text = await loop.run_in_executor(
+                self._executor, self._read_pptx, file_path
+            )
+
+        chunks = self._chunk_text(text)
+        vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
         return {
-            "content":
+            "content": text,
             "metadata": {
                 "source": str(file_path),
-                "
-                "type": "
+                "chunks": len(chunks),
+                "type": "office",
             },
-            "chunks":
+            "chunks": chunks,
             "vectors": [v.tolist() for v in vectors],
-            "csv_row_metadata": metas,
         }
 
-    #
+    # ------------------------------------------------------------------ #
+    # JSON
+    # ------------------------------------------------------------------ #
+    async def _process_json(self, file_path: Path) -> Dict[str, Any]:
+        text = await asyncio.get_event_loop().run_in_executor(
+            self._executor, self._read_json, file_path
+        )
+        chunks = self._chunk_text(text)
+        vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
+        return {
+            "content": text,
+            "metadata": {
+                "source": str(file_path),
+                "chunks": len(chunks),
+                "type": "json",
+            },
+            "chunks": chunks,
+            "vectors": [v.tolist() for v in vectors],
+        }
 
+    # ------------------------------------------------------------------ #
+    # Shared helpers
+    # ------------------------------------------------------------------ #
     async def _extract_text(self, file_path: Path) -> Union[
         Tuple[List[Tuple[str, int, List[int]]], Dict[str, Any]],
         Tuple[str, Dict[str, Any], List[int]],
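For CSV input the result keeps the embedded texts and the per-row metadata in parallel lists, so callers can stitch rows back together after embedding. A sketch under the assumption of a local products.csv that has a description column (the default text_field):

import asyncio
from projectdavid.clients.file_processor import FileProcessor

result = asyncio.run(FileProcessor().process_file("products.csv"))
for text, meta, vec in zip(result["chunks"], result["csv_row_metadata"], result["vectors"]):
    # meta carries every non-empty column except the embedded text_field
    print(text[:40], meta, len(vec))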
@@ -202,10 +335,8 @@ class FileProcessor:
             )
             for i, page in enumerate(pdf.pages, start=1):
                 lines = page.extract_text_lines()
-                txts, nums = [], []
-                # sort by vertical position
                 sorted_lines = sorted(lines, key=lambda x: x["top"])
-
+                txts, nums = [], []
                 for ln_idx, L in enumerate(sorted_lines, start=1):
                     t = L.get("text", "").strip()
                     if t:
@@ -221,6 +352,23 @@ class FileProcessor:
         except UnicodeDecodeError:
             return file_path.read_text(encoding="latin-1")
 
+    def _read_docx(self, path: Path) -> str:
+        doc = Document(path)
+        return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
+
+    def _read_pptx(self, path: Path) -> str:
+        prs = Presentation(path)
+        slides = []
+        for slide in prs.slides:
+            chunks = [sh.text for sh in slide.shapes if hasattr(sh, "text")]
+            slides.append("\n".join(filter(None, chunks)))
+        return "\n\n".join(slides)
+
+    def _read_json(self, path: Path) -> str:
+        obj = json.loads(path.read_text(encoding="utf-8"))
+        pretty = json.dumps(obj, indent=2, ensure_ascii=False)
+        return "\n".join(textwrap.wrap(pretty, width=120))
+
     async def _encode_chunk_async(self, chunk: str) -> np.ndarray:
         return await asyncio.get_event_loop().run_in_executor(
             self._executor,
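The new _read_docx, _read_pptx, and _read_json helpers are plain blocking readers; _process_office and _process_json push them onto the instance's ThreadPoolExecutor so the event loop stays responsive. A minimal sketch of that offloading pattern, with a placeholder reader and file name rather than SDK code:

import asyncio
from concurrent.futures import ThreadPoolExecutor

def blocking_read(path: str) -> str:
    with open(path, "r", encoding="utf-8") as fh:  # synchronous, disk-bound work
        return fh.read()

async def main():
    executor = ThreadPoolExecutor(max_workers=4)
    loop = asyncio.get_event_loop()
    text = await loop.run_in_executor(executor, blocking_read, "example.txt")
    print(len(text), "characters read without blocking the event loop")

asyncio.run(main())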
@@ -233,11 +381,12 @@ class FileProcessor:
             )[0],
         )
 
+    # ------------------------------------------------------------------ #
+    # Text chunking helpers
+    # ------------------------------------------------------------------ #
     def _chunk_text(self, text: str) -> List[str]:
-        # split into sentences, then re-chunk to token limits
         sentences = re.split(r"(?<=[\.!?])\s+", text)
         chunks, buf, length = [], [], 0
-
         for sent in sentences:
             slen = len(sent) + 1
             if length + slen <= self.chunk_size:
@@ -247,15 +396,12 @@ class FileProcessor:
                 if buf:
                     chunks.append(" ".join(buf))
                     buf, length = [], 0
-                # sentence itself may be too big
                 while len(sent) > self.chunk_size:
                     part, sent = sent[: self.chunk_size], sent[self.chunk_size :]
                     chunks.append(part)
                 buf, length = [sent], len(sent)
-
         if buf:
             chunks.append(" ".join(buf))
-
         return chunks
 
     def _split_oversized_chunk(self, chunk: str, tokens: List[str] = None) -> List[str]:
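The chunker itself is unchanged in spirit: sentences are split on terminal punctuation, packed greedily up to chunk_size characters, and any single sentence that exceeds the budget is hard-sliced. A simplified standalone sketch of that packing strategy (the 512-character default mirrors FileProcessor; the sample text is a placeholder):

import re

def chunk_text(text: str, chunk_size: int = 512) -> list:
    sentences = re.split(r"(?<=[\.!?])\s+", text)
    chunks, buf, length = [], [], 0
    for sent in sentences:
        slen = len(sent) + 1
        if length + slen <= chunk_size:
            buf.append(sent)
            length += slen
            continue
        if buf:                                # flush the current chunk
            chunks.append(" ".join(buf))
            buf, length = [], 0
        while len(sent) > chunk_size:          # hard-split an oversized sentence
            part, sent = sent[:chunk_size], sent[chunk_size:]
            chunks.append(part)
        buf, length = [sent], len(sent)
    if buf:
        chunks.append(" ".join(buf))
    return chunks

print(chunk_text("One. Two. " + "x" * 1200 + ". Three.", chunk_size=64))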
projectdavid/clients/vectors.py
CHANGED
@@ -561,6 +561,67 @@ class VectorStoreClient:
             raise FileNotFoundError(f"File not found: {p}")
         return self._run_sync(self._add_file_async(vector_store_id, p, user_metadata))
 
+    def delete_vector_store(
+        self,
+        vector_store_id: str,
+        permanent: bool = False,
+    ) -> Dict[str, Any]:
+        return self._run_sync(self._delete_vs_async(vector_store_id, permanent))
+
+    def delete_file_from_vector_store(
+        self,
+        vector_store_id: str,
+        file_path: str,
+    ) -> Dict[str, Any]:
+        return self._run_sync(self._delete_file_async(vector_store_id, file_path))
+
+    def list_store_files(
+        self,
+        vector_store_id: str,
+    ) -> List[ValidationInterface.VectorStoreFileRead]:
+        return self._run_sync(self._list_store_files_async(vector_store_id))
+
+    def update_vector_store_file_status(
+        self,
+        vector_store_id: str,
+        file_id: str,
+        status: ValidationInterface.StatusEnum,
+        error_message: Optional[str] = None,
+    ) -> ValidationInterface.VectorStoreFileRead:
+        return self._run_sync(
+            self._update_file_status_async(
+                vector_store_id, file_id, status, error_message
+            )
+        )
+
+    def get_vector_stores_for_assistant(
+        self,
+        assistant_id: str,
+    ) -> List[ValidationInterface.VectorStoreRead]:
+        return self._run_sync(self._get_assistant_vs_async(assistant_id))
+
+    def attach_vector_store_to_assistant(
+        self,
+        vector_store_id: str,
+        assistant_id: str,
+    ) -> bool:
+        return self._run_sync(self._attach_vs_async(vector_store_id, assistant_id))
+
+    def detach_vector_store_from_assistant(
+        self,
+        vector_store_id: str,
+        assistant_id: str,
+    ) -> bool:
+        return self._run_sync(self._detach_vs_async(vector_store_id, assistant_id))
+
+    def retrieve_vector_store_sync(
+        self,
+        vector_store_id: str,
+    ) -> ValidationInterface.VectorStoreRead:
+        resp = self._sync_api_client.get(f"/v1/vector-stores/{vector_store_id}")
+        resp.raise_for_status()
+        return ValidationInterface.VectorStoreRead.model_validate(resp.json())
+
     def vector_file_search_raw(
         self,
         vector_store_id: str,
@@ -627,67 +688,6 @@ class VectorStoreClient:
         # 4️⃣ Wrap everything into an OpenAI envelope
         return make_envelope(query_text, hits, answer_text)
 
-    def delete_vector_store(
-        self,
-        vector_store_id: str,
-        permanent: bool = False,
-    ) -> Dict[str, Any]:
-        return self._run_sync(self._delete_vs_async(vector_store_id, permanent))
-
-    def delete_file_from_vector_store(
-        self,
-        vector_store_id: str,
-        file_path: str,
-    ) -> Dict[str, Any]:
-        return self._run_sync(self._delete_file_async(vector_store_id, file_path))
-
-    def list_store_files(
-        self,
-        vector_store_id: str,
-    ) -> List[ValidationInterface.VectorStoreFileRead]:
-        return self._run_sync(self._list_store_files_async(vector_store_id))
-
-    def update_vector_store_file_status(
-        self,
-        vector_store_id: str,
-        file_id: str,
-        status: ValidationInterface.StatusEnum,
-        error_message: Optional[str] = None,
-    ) -> ValidationInterface.VectorStoreFileRead:
-        return self._run_sync(
-            self._update_file_status_async(
-                vector_store_id, file_id, status, error_message
-            )
-        )
-
-    def get_vector_stores_for_assistant(
-        self,
-        assistant_id: str,
-    ) -> List[ValidationInterface.VectorStoreRead]:
-        return self._run_sync(self._get_assistant_vs_async(assistant_id))
-
-    def attach_vector_store_to_assistant(
-        self,
-        vector_store_id: str,
-        assistant_id: str,
-    ) -> bool:
-        return self._run_sync(self._attach_vs_async(vector_store_id, assistant_id))
-
-    def detach_vector_store_from_assistant(
-        self,
-        vector_store_id: str,
-        assistant_id: str,
-    ) -> bool:
-        return self._run_sync(self._detach_vs_async(vector_store_id, assistant_id))
-
-    def retrieve_vector_store_sync(
-        self,
-        vector_store_id: str,
-    ) -> ValidationInterface.VectorStoreRead:
-        resp = self._sync_api_client.get(f"/v1/vector-stores/{vector_store_id}")
-        resp.raise_for_status()
-        return ValidationInterface.VectorStoreRead.model_validate(resp.json())
-
     # ────────────────────────────────────────────────────────────────
     # End‑to‑end: retrieve → (rerank) → synthesize → envelope
     # ────────────────────────────────────────────────────────────────
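The vectors.py change is a pure relocation: the synchronous wrapper methods deleted in the second hunk are the same ones inserted ahead of vector_file_search_raw in the first, so the public surface of VectorStoreClient is unchanged between 1.30.4 and 1.31.0. A small, hedged check that the relocated wrappers are still exposed; the method names come from the diff, and constructing a client is out of scope here:

from projectdavid.clients.vectors import VectorStoreClient

# Every relocated synchronous wrapper should still be an attribute of the class.
for name in (
    "delete_vector_store",
    "delete_file_from_vector_store",
    "list_store_files",
    "update_vector_store_file_status",
    "get_vector_stores_for_assistant",
    "attach_vector_store_to_assistant",
    "detach_vector_store_from_assistant",
    "retrieve_vector_store_sync",
):
    assert hasattr(VectorStoreClient, name), name
print("all sync wrappers present")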
{projectdavid-1.30.4.dist-info → projectdavid-1.31.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: projectdavid
-Version: 1.30.4
+Version: 1.31.0
 Summary: Python SDK for interacting with the Entities Assistant API.
 Author-email: Francis Neequaye Armah <francis.neequaye@projectdavid.co.uk>
 License: PolyForm Noncommercial License 1.0.0
@@ -27,6 +27,9 @@ Requires-Dist: validators<0.35.0,>=0.29.0
 Requires-Dist: sentence-transformers<5.0,>=3.4.0
 Requires-Dist: sseclient-py
 Requires-Dist: requests
+Requires-Dist: python-magic
+Requires-Dist: python-docx
+Requires-Dist: python-pptx
 Provides-Extra: dev
 Requires-Dist: black>=23.3; extra == "dev"
 Requires-Dist: isort>=5.12; extra == "dev"
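The three new runtime requirements back the new handlers in file_processor.py: python-magic for MIME sniffing, python-docx for Word documents, and python-pptx for PowerPoint decks. A quick import sanity check using the module names from the diff (note that python-magic additionally expects the native libmagic library on the host):

import magic                   # from python-magic (needs system libmagic)
from docx import Document      # from python-docx
from pptx import Presentation  # from python-pptx

print(magic.from_buffer(b"%PDF-1.7", mime=True))  # typically "application/pdf"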
{projectdavid-1.30.4.dist-info → projectdavid-1.31.0.dist-info}/RECORD
CHANGED

@@ -9,7 +9,7 @@ projectdavid/clients/assistants_client.py,sha256=SsIGa5wPr7ga9WX0ywam3djUF-uWFdk
 projectdavid/clients/base_client.py,sha256=UWl6nr6sxD1_xC6iyptQDR1tnNdFCOrEx5cEUPCRqJE,3417
 projectdavid/clients/base_vector_store.py,sha256=jXivmqAW1bgYcLgIeW-hPxOiWZbs2hCsLy4oWzSvpNI,2061
 projectdavid/clients/event_handler.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-projectdavid/clients/file_processor.py,sha256
+projectdavid/clients/file_processor.py,sha256=-dRibUVVfGXjPRP3P2kzJaRZYvagIUKgDmfmi96V45w,15586
 projectdavid/clients/file_search.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 projectdavid/clients/files_client.py,sha256=XkIDzbQFGDrd88taf0Kouc_4YJOPIYEHiIyWYLKDofI,15581
 projectdavid/clients/inference_client.py,sha256=xz4ACPv5Tkis604QxO5mJX1inH_TGDfQP-31geETYpE,6609
@@ -20,7 +20,7 @@ projectdavid/clients/threads_client.py,sha256=ekzU5w14zftmtmFkiec3NC90Of-_KVSUY1
 projectdavid/clients/tools_client.py,sha256=GkCVOmwpAoPqVt6aYmH0G1HIFha3iEwR9IIf9teR0j8,11487
 projectdavid/clients/users_client.py,sha256=eCuUb9qvyH1GUFhZu6TRL9zdoK-qzHSs8-Vmrk_0mmg,13729
 projectdavid/clients/vector_store_manager.py,sha256=lk-sWJjo6Z0EHZzjRoKiHPr0GpEXfE4bJBQzmKV8ezc,11372
-projectdavid/clients/vectors.py,sha256=
+projectdavid/clients/vectors.py,sha256=1UNnLN5nsMvVHXK4Yf7iTXGWZfgIjQ9eLQtCBe0Cqew,30986
 projectdavid/constants/platform.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 projectdavid/services/logging_service.py,sha256=jdoRL46E42Ar8JFTDOV-xVD67CulcHSN-xhcEqA5CXQ,2643
 projectdavid/synthesis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -32,8 +32,8 @@ projectdavid/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuF
 projectdavid/utils/monitor_launcher.py,sha256=3YAgJdeuaUvq3JGvpA4ymqFsAnk29nH5q93cwStP4hc,2836
 projectdavid/utils/run_monitor.py,sha256=F_WkqIP-qnWH-4llIbileWWLfRj2Q1Cg-ni23SR1rec,3786
 projectdavid/utils/vector_search_formatter.py,sha256=YTe3HPGec26qGY7uxY8_GS8lc4QaN6aNXMzkl29nZpI,1735
-projectdavid-1.
-projectdavid-1.
-projectdavid-1.
-projectdavid-1.
-projectdavid-1.
+projectdavid-1.31.0.dist-info/licenses/LICENSE,sha256=_8yjiEGttpS284BkfhXxfERqTRZW_tUaHiBB0GTJTMg,4563
+projectdavid-1.31.0.dist-info/METADATA,sha256=we616BQkChiuQYO_-UfQ1VL-7j-IfDfPI7OtheEMsUM,10809
+projectdavid-1.31.0.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
+projectdavid-1.31.0.dist-info/top_level.txt,sha256=kil8GU4s7qYRfNnzGnFHhZnSNRSxgNG-J4HLgQMmMtw,13
+projectdavid-1.31.0.dist-info/RECORD,,
{projectdavid-1.30.4.dist-info → projectdavid-1.31.0.dist-info}/WHEEL: file without changes
{projectdavid-1.30.4.dist-info → projectdavid-1.31.0.dist-info}/licenses/LICENSE: file without changes
{projectdavid-1.30.4.dist-info → projectdavid-1.31.0.dist-info}/top_level.txt: file without changes