projectdavid 1.29.9__py3-none-any.whl → 1.38.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -112,7 +112,6 @@ class AssistantsClient(BaseAPIClient):
         description: str = "",
         instructions: str = "",
         tools: Optional[List[Dict[str, Any]]] = None,
-        platform_tools: Optional[List[Dict[str, Any]]] = None,
         tool_resources: Optional[Dict[str, Dict[str, Any]]] = None,
         meta_data: Optional[Dict[str, Any]] = None,
         top_p: float = 1.0,
@@ -133,7 +132,6 @@ class AssistantsClient(BaseAPIClient):
             "model": model,
             "instructions": instructions,
             "tools": tools,
-            "platform_tools": platform_tools,
             "tool_resources": tool_resources,
             "meta_data": meta_data,
             "top_p": top_p,
@@ -263,14 +261,10 @@ class AssistantsClient(BaseAPIClient):
         )
         return {"message": "Assistant disassociated from user successfully"}

-    def list_assistants_by_user(
-        self, user_id: str
-    ) -> List[ent_validator.AssistantRead]:
-        logging_utility.info("Listing assistants for user id=%s", user_id)
-        try:
-            resp = self._request_with_retries("GET", f"/v1/users/{user_id}/assistants")
-            raw_list = self._parse_response(resp)
-            return [ent_validator.AssistantRead(**a) for a in raw_list]
-        except ValidationError as e:
-            logging_utility.error("Validation error: %s", e.json())
-            raise AssistantsClientError(f"Validation error: {e}") from e
+    def list(self) -> list[ent_validator.AssistantRead]:
+        """Return every assistant owned by *this* API key."""
+        logging_utility.info("Listing assistants")
+
+        resp = self._request_with_retries("GET", "/v1/assistants")
+        raw = self._parse_response(resp)
+        return [ent_validator.AssistantRead(**a) for a in raw]
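
The per-user listing endpoint is gone; assistants are now listed per API key. A usage sketch, assuming an already-configured client (constructor arguments are illustrative, not part of this diff):

```python
# Hypothetical setup; real constructor arguments are not shown in this diff.
client = AssistantsClient(base_url="https://api.example.com", api_key="pd-...")

# Old (removed): client.list_assistants_by_user(user_id="user_123")
# New: assistants are scoped to the calling API key.
for assistant in client.list():
    print(assistant)  # validated AssistantRead models
```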
@@ -1,80 +1,184 @@
 import asyncio
 import csv
+import json
 import re
+import textwrap
 from concurrent.futures import ThreadPoolExecutor
-from datetime import datetime
 from pathlib import Path
 from typing import Any, Dict, List, Tuple, Union

-try:
-    from typing import LiteralString  # Python 3.11+
-except ImportError:
+try:  # Python 3.11+
+    from typing import LiteralString
+except ImportError:  # 3.9–3.10
     from typing_extensions import LiteralString

 import numpy as np
 import pdfplumber
-import validators
+from docx import Document
+from pptx import Presentation
 from projectdavid_common import UtilsInterface
-from sentence_transformers import SentenceTransformer

 log = UtilsInterface.LoggingUtility()


 class FileProcessor:
+    # ------------------------------------------------------------------ #
+    # Construction
+    # ------------------------------------------------------------------ #
     def __init__(self, max_workers: int = 4, chunk_size: int = 512):
-        self.embedding_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
         self.embedding_model_name = "paraphrase-MiniLM-L6-v2"
+        self._embedding_model = None
         self._executor = ThreadPoolExecutor(max_workers=max_workers)

-        # compute token limits
-        self.max_seq_length = self.embedding_model.get_max_seq_length()
-        self.special_tokens_count = 2
-        self.effective_max_length = self.max_seq_length - self.special_tokens_count
+        # Lazy-initialized attributes
+        self._requested_chunk_size = chunk_size
+        self._max_seq_length = None
+        self._effective_max_length = None
+        self._chunk_size = None

-        # chunk_size cannot exceed 4× model max
-        self.chunk_size = min(chunk_size, self.effective_max_length * 4)
+        log.info("Initialized Lazy-Loaded FileProcessor")

-        log.info("Initialized optimized FileProcessor")
+    def _ensure_model(self):
+        """
+        Internal helper to load the model and calculate limits only once.
+        This prevents heavy imports (scipy, torch) until actually needed.
+        """
+        if self._embedding_model is None:
+            try:
+                from sentence_transformers import SentenceTransformer
+
+                log.info(f"Lazy-loading model: {self.embedding_model_name}")
+
+                self._embedding_model = SentenceTransformer(self.embedding_model_name)
+
+                # Ported limit calculations
+                self._max_seq_length = self._embedding_model.get_max_seq_length()
+                special_tokens_count = 2
+                self._effective_max_length = self._max_seq_length - special_tokens_count
+                self._chunk_size = min(
+                    self._requested_chunk_size, self._effective_max_length * 4
+                )
+
+            except ImportError:
+                log.error(
+                    "sentence-transformers not found. Ensure 'pip install projectdavid[vision]' is installed."
+                )
+                raise ImportError(
+                    "Model-based features require 'sentence-transformers'. Install with [vision] extra."
+                )
+        return self._embedding_model
+
+    # Properties to maintain access to derived attributes
+    @property
+    def chunk_size(self):
+        if self._chunk_size is None:
+            self._ensure_model()
+        return self._chunk_size
+
+    @property
+    def effective_max_length(self):
+        if self._effective_max_length is None:
+            self._ensure_model()
+        return self._effective_max_length
+
+    # ------------------------------------------------------------------ #
+    # Embeddings
+    # ------------------------------------------------------------------ #
+    def encode_text(self, text: str):
+        model = self._ensure_model()
+        return model.encode(
+            [text],
+            convert_to_numpy=True,
+            truncate="model_max_length",
+            normalize_embeddings=True,
+        )[0]
+
+    async def _encode_chunk_async(self, chunk: str) -> np.ndarray:
+        model = self._ensure_model()
+        return await asyncio.get_event_loop().run_in_executor(
+            self._executor,
+            lambda: model.encode(
+                [chunk],
+                convert_to_numpy=True,
+                truncate="model_max_length",
+                normalize_embeddings=True,
+                show_progress_bar=False,
+            )[0],
+        )

+    # ------------------------------------------------------------------ #
+    # Generic validators
+    # ------------------------------------------------------------------ #
     def validate_file(self, file_path: Path):
-        """Ensure file exists and is under 100 MB."""
+        """Ensure file exists and is under 100 MB."""
         max_size = 100 * 1024 * 1024
         if not file_path.exists():
             raise FileNotFoundError(f"File not found: {file_path}")
         if file_path.stat().st_size > max_size:
             mb = max_size // (1024 * 1024)
-            raise ValueError(f"{file_path.name} > {mb} MB limit")
+            raise ValueError(f"{file_path.name} > {mb} MB limit")

+    # ------------------------------------------------------------------ #
+    # File-type detection
+    # ------------------------------------------------------------------ #
     def _detect_file_type(self, file_path: Path) -> str:
-        """Return 'pdf', 'text', or 'csv'."""
         suffix = file_path.suffix.lower()
         if suffix == ".pdf":
             return "pdf"
         if suffix == ".csv":
             return "csv"
-        if suffix in {".txt", ".md", ".rst"}:
+        if suffix == ".json":
+            return "json"
+        if suffix in {".doc", ".docx", ".pptx"}:
+            return "office"
+
+        text_exts = {
+            ".txt",
+            ".md",
+            ".rst",
+            ".c",
+            ".cpp",
+            ".cs",
+            ".go",
+            ".java",
+            ".js",
+            ".ts",
+            ".php",
+            ".py",
+            ".rb",
+            ".sh",
+            ".tex",
+            ".html",
+            ".css",
+        }
+        if suffix in text_exts:
             return "text"
-        return "unknown"
+        raise ValueError(f"Unsupported file type: {file_path.name} (ext={suffix})")

+    # ------------------------------------------------------------------ #
+    # Public entry-point
+    # ------------------------------------------------------------------ #
     async def process_file(self, file_path: Union[str, Path]) -> Dict[str, Any]:
-        """
-        Async entrypoint: validate, detect type, then dispatch to the
-        appropriate processor (_process_pdf, _process_text, or _process_csv).
-        """
+        """Validate → detect → dispatch to the appropriate processor."""
         file_path = Path(file_path)
         self.validate_file(file_path)
         ftype = self._detect_file_type(file_path)

-        if ftype == "pdf":
-            return await self._process_pdf(file_path)
-        if ftype == "text":
-            return await self._process_text(file_path)
-        if ftype == "csv":
-            return await self._process_csv(file_path)
-        raise ValueError(f"Unsupported extension: {file_path.suffix}")
+        dispatch_map = {
+            "pdf": self._process_pdf,
+            "text": self._process_text,
+            "csv": self._process_csv,
+            "office": self._process_office,
+            "json": self._process_json,
+        }
+        if ftype not in dispatch_map:
+            raise ValueError(f"Unsupported file type: {file_path.suffix}")

-    # ——— PDF / TEXT pipelines unchanged ——— #
+        return await dispatch_map[ftype](file_path)

+    # ------------------------------------------------------------------ #
+    # PDF
+    # ------------------------------------------------------------------ #
     async def _process_pdf(self, file_path: Path) -> Dict[str, Any]:
         page_chunks, doc_meta = await self._extract_text(file_path)
         all_chunks, line_data = [], []
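
The `_ensure_model` / property pattern above defers the `sentence_transformers` import (and its torch/scipy dependency chain) until the first call that actually needs the model. A minimal standalone sketch of the same pattern (class and names illustrative, not part of the package):

```python
class LazyEncoder:
    """Defers a heavy dependency until the first call that needs it."""

    def __init__(self, model_name: str = "paraphrase-MiniLM-L6-v2"):
        self.model_name = model_name
        self._model = None  # nothing heavy imported yet

    def _ensure(self):
        if self._model is None:
            # Import inside the method so importing this module stays cheap.
            from sentence_transformers import SentenceTransformer
            self._model = SentenceTransformer(self.model_name)
        return self._model

    def encode(self, text: str):
        return self._ensure().encode([text], convert_to_numpy=True)[0]


encoder = LazyEncoder()        # instant: no torch import yet
vec = encoder.encode("hello")  # first call pays the model-load cost
```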
@@ -82,7 +186,6 @@ class FileProcessor:
         for page_text, page_num, line_nums in page_chunks:
             lines = page_text.split("\n")
             buf, buf_lines, length = [], [], 0
-
             for line, ln in zip(lines, line_nums):
                 l = len(line) + 1
                 if length + l <= self.chunk_size:
@@ -94,12 +197,9 @@ class FileProcessor:
                     all_chunks.append("\n".join(buf))
                     line_data.append({"page": page_num, "lines": buf_lines})
                     buf, buf_lines, length = [], [], 0
-
-                    # split any oversized line
                     for piece in self._split_oversized_chunk(line):
                         all_chunks.append(piece)
                         line_data.append({"page": page_num, "lines": [ln]})
-
             if buf:
                 all_chunks.append("\n".join(buf))
                 line_data.append({"page": page_num, "lines": buf_lines})
@@ -107,7 +207,6 @@ class FileProcessor:
         vectors = await asyncio.gather(
            *[self._encode_chunk_async(c) for c in all_chunks]
         )
-
         return {
             "content": "\n\n".join(all_chunks),
             "metadata": {
@@ -121,6 +220,9 @@ class FileProcessor:
             "line_data": line_data,
         }

+    # ------------------------------------------------------------------ #
+    # Plain-text / code / markup
+    # ------------------------------------------------------------------ #
     async def _process_text(self, file_path: Path) -> Dict[str, Any]:
         text, extra_meta, _ = await self._extract_text(file_path)
         chunks = self._chunk_text(text)
@@ -137,17 +239,13 @@ class FileProcessor:
             "vectors": [v.tolist() for v in vectors],
         }

-    # ——— NEW: CSV pipeline ——— #
-
+    # ------------------------------------------------------------------ #
+    # CSV
+    # ------------------------------------------------------------------ #
     async def _process_csv(
         self, file_path: Path, text_field: str = "description"
     ) -> Dict[str, Any]:
-        """
-        Read each row, embed the `text_field`, and collect per-row metadata
-        from all other columns.
-        """
-        # load rows synchronously
-        rows, texts, metas = [], [], []
+        texts, metas = [], []
         with file_path.open(newline="", encoding="utf-8") as f:
             reader = csv.DictReader(f)
             for row in reader:
@@ -155,27 +253,67 @@ class FileProcessor:
                 if not txt:
                     continue
                 texts.append(txt)
-                # all other columns become metadata
-                row_meta = {k: v for k, v in row.items() if k != text_field and v}
-                metas.append(row_meta)
+                metas.append({k: v for k, v in row.items() if k != text_field and v})

-        # embed in parallel
         vectors = await asyncio.gather(*[self._encode_chunk_async(t) for t in texts])
+        return {
+            "content": None,
+            "metadata": {"source": str(file_path), "rows": len(texts), "type": "csv"},
+            "chunks": texts,
+            "vectors": [v.tolist() for v in vectors],
+            "csv_row_metadata": metas,
+        }
+
+    # ------------------------------------------------------------------ #
+    # Office docs (.doc/.docx/.pptx)
+    # ------------------------------------------------------------------ #
+    async def _process_office(self, file_path: Path) -> Dict[str, Any]:
+        loop = asyncio.get_event_loop()
+        if file_path.suffix.lower() in {".doc", ".docx"}:
+            text = await loop.run_in_executor(
+                self._executor, self._read_docx, file_path
+            )
+        else:  # .pptx
+            text = await loop.run_in_executor(
+                self._executor, self._read_pptx, file_path
+            )

+        chunks = self._chunk_text(text)
+        vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
         return {
-            "content": None,  # CSVs may not have monolithic text
+            "content": text,
             "metadata": {
                 "source": str(file_path),
-                "rows": len(texts),
-                "type": "csv",
+                "chunks": len(chunks),
+                "type": "office",
             },
-            "chunks": texts,
+            "chunks": chunks,
             "vectors": [v.tolist() for v in vectors],
-            "csv_row_metadata": metas,
         }

-    # ——— shared helpers ——— #
+    # ------------------------------------------------------------------ #
+    # JSON
+    # ------------------------------------------------------------------ #
+    async def _process_json(self, file_path: Path) -> Dict[str, Any]:
+        text = await asyncio.get_event_loop().run_in_executor(
+            self._executor, self._read_json, file_path
+        )
+        chunks = self._chunk_text(text)
+        vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
+        return {
+            "content": text,
+            "metadata": {
+                "source": str(file_path),
+                "chunks": len(chunks),
+                "type": "json",
+            },
+            "chunks": chunks,
+            "vectors": [v.tolist() for v in vectors],
+        }

+    # ------------------------------------------------------------------ #
+    # Shared helpers
+    # ------------------------------------------------------------------ #
     async def _extract_text(self, file_path: Path) -> Union[
         Tuple[List[Tuple[str, int, List[int]]], Dict[str, Any]],
         Tuple[str, Dict[str, Any], List[int]],
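
Each processor returns a dict with `content`, `metadata`, `chunks`, and `vectors`; CSV additionally carries `csv_row_metadata` and leaves `content` as `None`. A usage sketch (file name illustrative):

```python
import asyncio

fp = FileProcessor(max_workers=4, chunk_size=512)
result = asyncio.run(fp.process_file("notes.md"))

print(result["metadata"])      # keys vary by type: source/type plus chunks or rows
print(len(result["chunks"]))   # text chunks, aligned one-to-one with...
print(len(result["vectors"]))  # ...their embedding vectors
```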
@@ -203,10 +341,8 @@ class FileProcessor:
                 )
                 for i, page in enumerate(pdf.pages, start=1):
                     lines = page.extract_text_lines()
-                    txts, nums = [], []
-                    # sort by vertical position
                     sorted_lines = sorted(lines, key=lambda x: x["top"])
-                    # enumerate to get a reliable line number
+                    txts, nums = [], []
                     for ln_idx, L in enumerate(sorted_lines, start=1):
                         t = L.get("text", "").strip()
                         if t:
@@ -222,23 +358,29 @@ class FileProcessor:
         except UnicodeDecodeError:
             return file_path.read_text(encoding="latin-1")

-    async def _encode_chunk_async(self, chunk: str) -> np.ndarray:
-        return await asyncio.get_event_loop().run_in_executor(
-            self._executor,
-            lambda: self.embedding_model.encode(
-                [chunk],
-                convert_to_numpy=True,
-                truncate="model_max_length",
-                normalize_embeddings=True,
-                show_progress_bar=False,
-            )[0],
-        )
-
+    def _read_docx(self, path: Path) -> str:
+        doc = Document(path)
+        return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
+
+    def _read_pptx(self, path: Path) -> str:
+        prs = Presentation(path)
+        slides = []
+        for slide in prs.slides:
+            chunks = [sh.text for sh in slide.shapes if hasattr(sh, "text")]
+            slides.append("\n".join(filter(None, chunks)))
+        return "\n\n".join(slides)
+
+    def _read_json(self, path: Path) -> str:
+        obj = json.loads(path.read_text(encoding="utf-8"))
+        pretty = json.dumps(obj, indent=2, ensure_ascii=False)
+        return "\n".join(textwrap.wrap(pretty, width=120))
+
+    # ------------------------------------------------------------------ #
+    # Text chunking helpers
+    # ------------------------------------------------------------------ #
     def _chunk_text(self, text: str) -> List[str]:
-        # split into sentences, then re-chunk to token limits
         sentences = re.split(r"(?<=[\.!?])\s+", text)
         chunks, buf, length = [], [], 0
-
         for sent in sentences:
             slen = len(sent) + 1
             if length + slen <= self.chunk_size:
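
One subtlety in `_read_json` above: `textwrap.wrap` treats the pretty-printed JSON as prose, replacing each newline with a space before wrapping, so the two-space indentation survives only as extra spaces inside the re-flowed lines. A small demonstration:

```python
import json
import textwrap

obj = {"title": "example", "tags": ["a", "b"]}
pretty = json.dumps(obj, indent=2, ensure_ascii=False)

# wrap() re-flows the text: newlines become spaces, and lines are
# broken at the 40-column budget rather than at JSON structure.
wrapped = "\n".join(textwrap.wrap(pretty, width=40))
print(wrapped)
```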
@@ -248,22 +390,20 @@ class FileProcessor:
                 if buf:
                     chunks.append(" ".join(buf))
                     buf, length = [], 0
-                # sentence itself may be too big
                 while len(sent) > self.chunk_size:
                     part, sent = sent[: self.chunk_size], sent[self.chunk_size :]
                     chunks.append(part)
                 buf, length = [sent], len(sent)
-
         if buf:
             chunks.append(" ".join(buf))
-
         return chunks

     def _split_oversized_chunk(self, chunk: str, tokens: List[str] = None) -> List[str]:
+        model = self._ensure_model()  # Ensure model is loaded to access tokenizer
         if tokens is None:
-            tokens = self.embedding_model.tokenizer.tokenize(chunk)
+            tokens = model.tokenizer.tokenize(chunk)
         out = []
         for i in range(0, len(tokens), self.effective_max_length):
             seg = tokens[i : i + self.effective_max_length]
-            out.append(self.embedding_model.tokenizer.convert_tokens_to_string(seg))
+            out.append(model.tokenizer.convert_tokens_to_string(seg))
         return out
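
`_chunk_text` greedily packs whole sentences up to `chunk_size` characters and hard-splits any single sentence that exceeds the budget on its own. A standalone sketch of the same greedy algorithm, simplified to a pure character budget (no model or tokenizer):

```python
import re


def chunk_text(text: str, chunk_size: int = 40) -> list:
    sentences = re.split(r"(?<=[\.!?])\s+", text)
    chunks, buf, length = [], [], 0
    for sent in sentences:
        slen = len(sent) + 1
        if length + slen <= chunk_size:
            buf.append(sent)
            length += slen
        else:
            if buf:
                chunks.append(" ".join(buf))
                buf, length = [], 0
            # A single sentence longer than the budget is hard-split.
            while len(sent) > chunk_size:
                part, sent = sent[:chunk_size], sent[chunk_size:]
                chunks.append(part)
            buf, length = [sent], len(sent)
    if buf:
        chunks.append(" ".join(buf))
    return chunks


print(chunk_text("One. Two is longer. Three!", chunk_size=20))
# ['One. Two is longer.', 'Three!']
```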
@@ -153,18 +153,21 @@ class MessagesClient(BaseAPIClient):
             raise

     def list_messages(
-        self, thread_id: str, limit: int = 20, order: str = "asc"
-    ) -> List[Dict[str, Any]]:
+        self,
+        thread_id: str,
+        limit: int = 20,
+        order: str = "asc",
+    ) -> ent_validator.MessagesList:
         """
-        List messages for a given thread.
+        Fetch messages for a thread and return an OpenAI-style envelope.

         Args:
-            thread_id (str): The thread ID.
-            limit (int): Maximum number of messages to retrieve.
-            order (str): Order of messages ('asc' or 'desc').
+            thread_id (str): Target thread ID.
+            limit (int): Max messages to fetch.
+            order (str): 'asc' or 'desc'.

         Returns:
-            List[Dict[str, Any]]: A list of messages as dictionaries.
+            MessagesList: Wrapper containing .data[], .first_id, .last_id, .has_more …
         """
         logging_utility.info(
             "Listing messages for thread_id: %s, limit: %d, order: %s",
@@ -178,24 +181,19 @@
                 f"/v1/threads/{thread_id}/messages", params=params
             )
             response.raise_for_status()
-            messages = response.json()
-            validated_messages = [
-                ent_validator.MessageRead(**message) for message in messages
-            ]
-            logging_utility.info("Retrieved %d messages", len(validated_messages))
-            return [message.dict() for message in validated_messages]
+
+            envelope = ent_validator.MessagesList(**response.json())
+            logging_utility.info("Retrieved %d messages", len(envelope.data))
+            return envelope
+
         except ValidationError as e:
             logging_utility.error("Validation error: %s", e.json())
-            raise ValueError(f"Validation error: {e}")
+            raise ValueError(f"Validation error: {e}") from e
         except httpx.HTTPStatusError as e:
-            logging_utility.error(
-                "HTTP error occurred while listing messages: %s", str(e)
-            )
+            logging_utility.error("HTTP error while listing messages: %s", str(e))
             raise
         except Exception as e:
-            logging_utility.error(
-                "An error occurred while listing messages: %s", str(e)
-            )
+            logging_utility.error("Unexpected error while listing messages: %s", str(e))
             raise

     def get_formatted_messages(
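
`list_messages` now returns a typed `MessagesList` envelope instead of a list of dicts. A consumption sketch, assuming a configured `MessagesClient` instance; the envelope field names come from the docstring above, and the per-message attributes are assumptions based on the removed `MessageRead` usage:

```python
envelope = client.list_messages(thread_id="thread_abc123", limit=20, order="asc")

for msg in envelope.data:      # validated message models, not raw dicts
    print(msg.id, msg.role)    # attribute names assumed, not shown in this diff

if envelope.has_more:          # cursor fields enable pagination
    print("continue after", envelope.last_id)
```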
@@ -294,32 +292,19 @@
             logging_utility.error("An error occurred: %s", str(e))
             raise RuntimeError(f"An error occurred: {str(e)}")

-    def delete_message(self, message_id: str) -> Dict[str, Any]:
-        """
-        Delete a message by its ID.
-
-        Args:
-            message_id (str): The ID of the message.
-
-        Returns:
-            Dict[str, Any]: The deletion result.
-        """
+    def delete_message(self, message_id: str) -> ent_validator.MessageDeleted:
+        """Delete a message and return deletion envelope."""
         logging_utility.info("Deleting message with id: %s", message_id)
         try:
             response = self.client.delete(f"/v1/messages/{message_id}")
             response.raise_for_status()
-            result = response.json()
-            logging_utility.info("Message deleted successfully")
-            return result
+            return ent_validator.MessageDeleted(**response.json())
+
         except httpx.HTTPStatusError as e:
-            logging_utility.error(
-                "HTTP error occurred while deleting message: %s", str(e)
-            )
+            logging_utility.error("HTTP error while deleting message: %s", str(e))
             raise
         except Exception as e:
-            logging_utility.error(
-                "An error occurred while deleting message: %s", str(e)
-            )
+            logging_utility.error("Unexpected error while deleting message: %s", str(e))
             raise

     def save_assistant_message_chunk(
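
`delete_message` likewise swaps its raw-dict return for a typed `MessageDeleted` envelope. A sketch; the `deleted` flag follows the OpenAI-style deletion convention and is an assumption, since the model definition is not part of this diff:

```python
result = client.delete_message("msg_abc123")

# MessageDeleted is a Pydantic model, not a plain dict; the exact fields
# live in projectdavid's ent_validator schemas (assumed here).
print(result.id, result.deleted)
```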