projectdavid 1.31.0__py3-none-any.whl → 1.38.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -112,7 +112,6 @@ class AssistantsClient(BaseAPIClient):
         description: str = "",
         instructions: str = "",
         tools: Optional[List[Dict[str, Any]]] = None,
-        platform_tools: Optional[List[Dict[str, Any]]] = None,
         tool_resources: Optional[Dict[str, Dict[str, Any]]] = None,
         meta_data: Optional[Dict[str, Any]] = None,
         top_p: float = 1.0,
@@ -133,7 +132,6 @@ class AssistantsClient(BaseAPIClient):
             "model": model,
             "instructions": instructions,
             "tools": tools,
-            "platform_tools": platform_tools,
             "tool_resources": tool_resources,
             "meta_data": meta_data,
             "top_p": top_p,
@@ -263,14 +261,10 @@ class AssistantsClient(BaseAPIClient):
         )
         return {"message": "Assistant disassociated from user successfully"}

-    def list_assistants_by_user(
-        self, user_id: str
-    ) -> List[ent_validator.AssistantRead]:
-        logging_utility.info("Listing assistants for user id=%s", user_id)
-        try:
-            resp = self._request_with_retries("GET", f"/v1/users/{user_id}/assistants")
-            raw_list = self._parse_response(resp)
-            return [ent_validator.AssistantRead(**a) for a in raw_list]
-        except ValidationError as e:
-            logging_utility.error("Validation error: %s", e.json())
-            raise AssistantsClientError(f"Validation error: {e}") from e
+    def list(self) -> list[ent_validator.AssistantRead]:
+        """Return every assistant owned by *this* API key."""
+        logging_utility.info("Listing assistants")
+
+        resp = self._request_with_retries("GET", "/v1/assistants")
+        raw = self._parse_response(resp)
+        return [ent_validator.AssistantRead(**a) for a in raw]
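
Assistant listing is now scoped to the authenticated API key instead of an explicit user ID. A usage sketch, assuming the class is exposed as a `client.assistants` attribute (not shown in this diff):

```python
# 1.31.0: assistants = client.assistants.list_assistants_by_user(user_id)
# 1.38.1: the API key itself identifies the owner.
assistants = client.assistants.list()
for a in assistants:
    print(a.id)  # AssistantRead is a validated model; its field set lives in ent_validator
```

Note that `ValidationError` is no longer caught here, so schema mismatches now surface as raw pydantic errors rather than `AssistantsClientError`.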
@@ -1,7 +1,6 @@
 import asyncio
 import csv
 import json
-import mimetypes
 import re
 import textwrap
 from concurrent.futures import ThreadPoolExecutor
@@ -10,16 +9,14 @@ from typing import Any, Dict, List, Tuple, Union

 try:  # Python 3.11+
     from typing import LiteralString
-except ImportError:  # 3.9 - 3.10
+except ImportError:  # 3.9–3.10
     from typing_extensions import LiteralString

-import magic
 import numpy as np
 import pdfplumber
 from docx import Document
 from pptx import Presentation
 from projectdavid_common import UtilsInterface
-from sentence_transformers import SentenceTransformer

 log = UtilsInterface.LoggingUtility()
@@ -29,17 +26,85 @@ class FileProcessor:
     # Construction
     # ------------------------------------------------------------------ #
     def __init__(self, max_workers: int = 4, chunk_size: int = 512):
-        self.embedding_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
         self.embedding_model_name = "paraphrase-MiniLM-L6-v2"
+        self._embedding_model = None
         self._executor = ThreadPoolExecutor(max_workers=max_workers)

-        # token limits
-        self.max_seq_length = self.embedding_model.get_max_seq_length()
-        self.special_tokens_count = 2
-        self.effective_max_length = self.max_seq_length - self.special_tokens_count
-        self.chunk_size = min(chunk_size, self.effective_max_length * 4)
+        # Lazy-initialized attributes
+        self._requested_chunk_size = chunk_size
+        self._max_seq_length = None
+        self._effective_max_length = None
+        self._chunk_size = None

-        log.info("Initialized optimized FileProcessor")
+        log.info("Initialized Lazy-Loaded FileProcessor")
+
+    def _ensure_model(self):
+        """
+        Internal helper to load the model and calculate limits only once.
+        This prevents heavy imports (scipy, torch) until actually needed.
+        """
+        if self._embedding_model is None:
+            try:
+                from sentence_transformers import SentenceTransformer
+
+                log.info(f"Lazy-loading model: {self.embedding_model_name}")
+
+                self._embedding_model = SentenceTransformer(self.embedding_model_name)
+
+                # Ported Limit Calculations
+                self._max_seq_length = self._embedding_model.get_max_seq_length()
+                special_tokens_count = 2
+                self._effective_max_length = self._max_seq_length - special_tokens_count
+                self._chunk_size = min(
+                    self._requested_chunk_size, self._effective_max_length * 4
+                )
+
+            except ImportError:
+                log.error(
+                    "sentence-transformers not found. Ensure 'pip install projectdavid[vision]' is installed."
+                )
+                raise ImportError(
+                    "Model-based features require 'sentence-transformers'. Install with [vision] extra."
+                )
+        return self._embedding_model
+
+    # Properties to maintain access to derived attributes
+    @property
+    def chunk_size(self):
+        if self._chunk_size is None:
+            self._ensure_model()
+        return self._chunk_size
+
+    @property
+    def effective_max_length(self):
+        if self._effective_max_length is None:
+            self._ensure_model()
+        return self._effective_max_length
+
+    # ------------------------------------------------------------------ #
+    # Embeddings
+    # ------------------------------------------------------------------ #
+    def encode_text(self, text: str):
+        model = self._ensure_model()
+        return model.encode(
+            [text],
+            convert_to_numpy=True,
+            truncate="model_max_length",
+            normalize_embeddings=True,
+        )[0]
+
+    async def _encode_chunk_async(self, chunk: str) -> np.ndarray:
+        model = self._ensure_model()
+        return await asyncio.get_event_loop().run_in_executor(
+            self._executor,
+            lambda: model.encode(
+                [chunk],
+                convert_to_numpy=True,
+                truncate="model_max_length",
+                normalize_embeddings=True,
+                show_progress_bar=False,
+            )[0],
+        )

     # ------------------------------------------------------------------ #
     # Generic validators
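
The practical effect is that importing and constructing `FileProcessor` no longer drags in torch/sentence-transformers; the first embedding call does. A behavioural sketch (the import path is an assumption):

```python
from projectdavid.clients.file_processor import FileProcessor  # assumed module path

fp = FileProcessor()                  # cheap: no model download, no torch import
print(fp.embedding_model_name)        # plain attribute, model still not loaded

vec = fp.encode_text("hello world")   # first model-backed call loads it once
print(vec.shape)                      # (384,) for paraphrase-MiniLM-L6-v2
print(fp.chunk_size)                  # derived limits are computed by the same load
```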
@@ -54,54 +119,20 @@ class FileProcessor:
             raise ValueError(f"{file_path.name} > {mb} MB limit")

     # ------------------------------------------------------------------ #
-    # File-type detection (extension + MIME)
+    # File-type detection
     # ------------------------------------------------------------------ #
     def _detect_file_type(self, file_path: Path) -> str:
-        """
-        Return a handler tag:
-
-        • 'pdf'    • 'csv'
-        • 'json'   • 'office'
-        • 'text'
-
-        Raises *ValueError* on anything unknown.
-        """
-        # 1️⃣ Best-effort MIME sniff
-        mime_type: str | None = None
-        if magic is not None:
-            try:
-                mime_type = magic.from_file(str(file_path), mime=True)
-            except Exception:
-                mime_type = None
-
-        # 2️⃣ Fallback → mimetypes
-        if not mime_type:
-            mime_type, _ = mimetypes.guess_type(file_path.name)
-
         suffix = file_path.suffix.lower()
+        if suffix == ".pdf":
+            return "pdf"
+        if suffix == ".csv":
+            return "csv"
+        if suffix == ".json":
+            return "json"
+        if suffix in {".doc", ".docx", ".pptx"}:
+            return "office"

-        PDF_MIMES = {"application/pdf"}
-        CSV_MIMES = {"text/csv", "application/csv"}
-        JSON_MIMES = {"application/json"}
-        OFFICE_MIMES = {
-            "application/msword",
-            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
-        }
-        TEXT_MIMES = {
-            "text/plain",
-            "text/markdown",
-            "text/x-python",
-            "text/x-c",
-            "text/x-c++",
-            "text/x-java-source",
-            "text/x-script.python",
-            "text/html",
-            "text/css",
-            "application/typescript",
-            "text/javascript",
-        }
-        TEXT_EXTS = {
+        text_exts = {
             ".txt",
             ".md",
             ".rst",
@@ -120,32 +151,9 @@ class FileProcessor:
             ".html",
             ".css",
         }
-
-        # --- PDF ---
-        if mime_type in PDF_MIMES or suffix == ".pdf":
-            return "pdf"
-
-        # --- CSV ---
-        if mime_type in CSV_MIMES or suffix == ".csv":
-            return "csv"
-
-        # --- JSON ---
-        if mime_type in JSON_MIMES or suffix == ".json":
-            return "json"
-
-        # --- Office documents ---
-        if mime_type in OFFICE_MIMES or suffix in {".doc", ".docx", ".pptx"}:
-            return "office"
-
-        # --- Generic text / code / markup ---
-        if mime_type in TEXT_MIMES or suffix in TEXT_EXTS:
+        if suffix in text_exts:
             return "text"
-
-        # --- Unsupported ---
-        raise ValueError(
-            f"Unsupported file type for '{file_path.name}': "
-            f"MIME={mime_type or 'unknown'} extension={suffix}"
-        )
+        raise ValueError(f"Unsupported file type: {file_path.name} (ext={suffix})")

     # ------------------------------------------------------------------ #
     # Public entry-point
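
With the `magic` and `mimetypes` sniffing gone, detection is purely extension-based, which changes behaviour for misnamed files. Illustrative calls against the (internal) detector, with hypothetical file names:

```python
from pathlib import Path

fp = FileProcessor()
fp._detect_file_type(Path("report.pdf"))  # -> "pdf"
fp._detect_file_type(Path("notes.md"))    # -> "text"
fp._detect_file_type(Path("scan.dat"))    # ValueError in 1.38.1, even if the bytes
                                          # are a valid PDF that the old MIME sniff
                                          # would have rescued
```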
@@ -156,19 +164,17 @@ class FileProcessor:
         self.validate_file(file_path)
         ftype = self._detect_file_type(file_path)

-        if ftype == "pdf":
-            return await self._process_pdf(file_path)
-        if ftype == "text":
-            return await self._process_text(file_path)
-        if ftype == "csv":
-            return await self._process_csv(file_path)
-        if ftype == "office":
-            return await self._process_office(file_path)
-        if ftype == "json":
-            return await self._process_json(file_path)
+        dispatch_map = {
+            "pdf": self._process_pdf,
+            "text": self._process_text,
+            "csv": self._process_csv,
+            "office": self._process_office,
+            "json": self._process_json,
+        }
+        if ftype not in dispatch_map:
+            raise ValueError(f"Unsupported file type: {file_path.suffix}")

-        # Safety net (should never hit)
-        raise ValueError(f"Unsupported file type: {file_path.suffix}")
+        return await dispatch_map[ftype](file_path)

     # ------------------------------------------------------------------ #
     # PDF
@@ -239,7 +245,7 @@ class FileProcessor:
     async def _process_csv(
         self, file_path: Path, text_field: str = "description"
     ) -> Dict[str, Any]:
-        rows, texts, metas = [], [], []
+        texts, metas = [], []
        with file_path.open(newline="", encoding="utf-8") as f:
             reader = csv.DictReader(f)
             for row in reader:
@@ -369,18 +375,6 @@ class FileProcessor:
         pretty = json.dumps(obj, indent=2, ensure_ascii=False)
         return "\n".join(textwrap.wrap(pretty, width=120))

-    async def _encode_chunk_async(self, chunk: str) -> np.ndarray:
-        return await asyncio.get_event_loop().run_in_executor(
-            self._executor,
-            lambda: self.embedding_model.encode(
-                [chunk],
-                convert_to_numpy=True,
-                truncate="model_max_length",
-                normalize_embeddings=True,
-                show_progress_bar=False,
-            )[0],
-        )
-
     # ------------------------------------------------------------------ #
     # Text chunking helpers
     # ------------------------------------------------------------------ #
@@ -405,10 +399,11 @@ class FileProcessor:
         return chunks

     def _split_oversized_chunk(self, chunk: str, tokens: List[str] = None) -> List[str]:
+        model = self._ensure_model()  # Ensure model is loaded to access tokenizer
         if tokens is None:
-            tokens = self.embedding_model.tokenizer.tokenize(chunk)
+            tokens = model.tokenizer.tokenize(chunk)
         out = []
         for i in range(0, len(tokens), self.effective_max_length):
             seg = tokens[i : i + self.effective_max_length]
-            out.append(self.embedding_model.tokenizer.convert_tokens_to_string(seg))
+            out.append(model.tokenizer.convert_tokens_to_string(seg))
         return out
@@ -153,18 +153,21 @@ class MessagesClient(BaseAPIClient):
             raise

     def list_messages(
-        self, thread_id: str, limit: int = 20, order: str = "asc"
-    ) -> List[Dict[str, Any]]:
+        self,
+        thread_id: str,
+        limit: int = 20,
+        order: str = "asc",
+    ) -> ent_validator.MessagesList:
         """
-        List messages for a given thread.
+        Fetch messages for a thread and return an OpenAI-style envelope.

         Args:
-            thread_id (str): The thread ID.
-            limit (int): Maximum number of messages to retrieve.
-            order (str): Order of messages ('asc' or 'desc').
+            thread_id (str): Target thread ID.
+            limit (int): Max messages to fetch.
+            order (str): 'asc' or 'desc'.

         Returns:
-            List[Dict[str, Any]]: A list of messages as dictionaries.
+            MessagesList: Wrapper containing .data[], .first_id, .last_id, .has_more …
         """
         logging_utility.info(
             "Listing messages for thread_id: %s, limit: %d, order: %s",
@@ -178,24 +181,19 @@ class MessagesClient(BaseAPIClient):
                 f"/v1/threads/{thread_id}/messages", params=params
             )
             response.raise_for_status()
-            messages = response.json()
-            validated_messages = [
-                ent_validator.MessageRead(**message) for message in messages
-            ]
-            logging_utility.info("Retrieved %d messages", len(validated_messages))
-            return [message.dict() for message in validated_messages]
+
+            envelope = ent_validator.MessagesList(**response.json())
+            logging_utility.info("Retrieved %d messages", len(envelope.data))
+            return envelope
+
         except ValidationError as e:
             logging_utility.error("Validation error: %s", e.json())
-            raise ValueError(f"Validation error: {e}")
+            raise ValueError(f"Validation error: {e}") from e
         except httpx.HTTPStatusError as e:
-            logging_utility.error(
-                "HTTP error occurred while listing messages: %s", str(e)
-            )
+            logging_utility.error("HTTP error while listing messages: %s", str(e))
             raise
         except Exception as e:
-            logging_utility.error(
-                "An error occurred while listing messages: %s", str(e)
-            )
+            logging_utility.error("Unexpected error while listing messages: %s", str(e))
             raise

     def get_formatted_messages(
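
The return type changes from `List[Dict[str, Any]]` to a typed envelope, so existing callers must read `.data` instead of iterating the result directly. A consumption sketch (the `client.messages` attribute path is an assumption; the envelope fields are the ones named in the docstring above):

```python
page = client.messages.list_messages(thread_id="thread_abc123", limit=20, order="asc")

for msg in page.data:      # validated message models rather than raw dicts
    print(msg.id, msg.role)

if page.has_more:          # OpenAI-style cursor pagination hints
    print("next cursor:", page.last_id)
```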
@@ -294,32 +292,19 @@ class MessagesClient(BaseAPIClient):
             logging_utility.error("An error occurred: %s", str(e))
             raise RuntimeError(f"An error occurred: {str(e)}")

-    def delete_message(self, message_id: str) -> Dict[str, Any]:
-        """
-        Delete a message by its ID.
-
-        Args:
-            message_id (str): The ID of the message.
-
-        Returns:
-            Dict[str, Any]: The deletion result.
-        """
+    def delete_message(self, message_id: str) -> ent_validator.MessageDeleted:
+        """Delete a message and return deletion envelope."""
         logging_utility.info("Deleting message with id: %s", message_id)
         try:
             response = self.client.delete(f"/v1/messages/{message_id}")
             response.raise_for_status()
-            result = response.json()
-            logging_utility.info("Message deleted successfully")
-            return result
+            return ent_validator.MessageDeleted(**response.json())
+
         except httpx.HTTPStatusError as e:
-            logging_utility.error(
-                "HTTP error occurred while deleting message: %s", str(e)
-            )
+            logging_utility.error("HTTP error while deleting message: %s", str(e))
             raise
         except Exception as e:
-            logging_utility.error(
-                "An error occurred while deleting message: %s", str(e)
-            )
+            logging_utility.error("Unexpected error while deleting message: %s", str(e))
             raise

     def save_assistant_message_chunk(
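
`delete_message` likewise now returns a typed `MessageDeleted` envelope instead of the raw response JSON. A sketch, with field names assumed to follow the OpenAI deletion-object convention (`id`, `object`, `deleted`) since the model's schema is not shown in this diff:

```python
result = client.messages.delete_message("msg_abc123")  # id is illustrative
if result.deleted:                                     # assumed boolean field
    print(f"removed {result.id}")
```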