projectdavid 1.33.12__tar.gz → 1.33.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of projectdavid might be problematic. Click here for more details.

Files changed (70) hide show
  1. {projectdavid-1.33.12 → projectdavid-1.33.14}/CHANGELOG.md +14 -0
  2. {projectdavid-1.33.12/src/projectdavid.egg-info → projectdavid-1.33.14}/PKG-INFO +1 -1
  3. {projectdavid-1.33.12 → projectdavid-1.33.14}/docs/vector_store.md +7 -6
  4. {projectdavid-1.33.12 → projectdavid-1.33.14}/pyproject.toml +1 -1
  5. projectdavid-1.33.14/src/projectdavid/clients/file_processor.py +364 -0
  6. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/clients/synchronous_inference_wrapper.py +44 -9
  7. projectdavid-1.33.12/src/projectdavid/clients/file_processor.py → projectdavid-1.33.14/src/projectdavid/clients/vision-file_processor.py +12 -124
  8. {projectdavid-1.33.12 → projectdavid-1.33.14/src/projectdavid.egg-info}/PKG-INFO +1 -1
  9. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid.egg-info/SOURCES.txt +1 -0
  10. {projectdavid-1.33.12 → projectdavid-1.33.14}/LICENSE +0 -0
  11. {projectdavid-1.33.12 → projectdavid-1.33.14}/MANIFEST.in +0 -0
  12. {projectdavid-1.33.12 → projectdavid-1.33.14}/README.md +0 -0
  13. {projectdavid-1.33.12 → projectdavid-1.33.14}/docs/assistants.md +0 -0
  14. {projectdavid-1.33.12 → projectdavid-1.33.14}/docs/code_interpretation.md +0 -0
  15. {projectdavid-1.33.12 → projectdavid-1.33.14}/docs/database.md +0 -0
  16. {projectdavid-1.33.12 → projectdavid-1.33.14}/docs/database_assistant_example.md +0 -0
  17. {projectdavid-1.33.12 → projectdavid-1.33.14}/docs/docker_comtainers.md +0 -0
  18. {projectdavid-1.33.12 → projectdavid-1.33.14}/docs/file_search.md +0 -0
  19. {projectdavid-1.33.12 → projectdavid-1.33.14}/docs/files.md +0 -0
  20. {projectdavid-1.33.12 → projectdavid-1.33.14}/docs/function_call_definition.md +0 -0
  21. {projectdavid-1.33.12 → projectdavid-1.33.14}/docs/function_calls.md +0 -0
  22. {projectdavid-1.33.12 → projectdavid-1.33.14}/docs/handling_function_calls.md +0 -0
  23. {projectdavid-1.33.12 → projectdavid-1.33.14}/docs/inference.md +0 -0
  24. {projectdavid-1.33.12 → projectdavid-1.33.14}/docs/messages.md +0 -0
  25. {projectdavid-1.33.12 → projectdavid-1.33.14}/docs/runs.md +0 -0
  26. {projectdavid-1.33.12 → projectdavid-1.33.14}/docs/streams.md +0 -0
  27. {projectdavid-1.33.12 → projectdavid-1.33.14}/docs/threads.md +0 -0
  28. {projectdavid-1.33.12 → projectdavid-1.33.14}/docs/tools.md +0 -0
  29. {projectdavid-1.33.12 → projectdavid-1.33.14}/docs/users.md +0 -0
  30. {projectdavid-1.33.12 → projectdavid-1.33.14}/docs/versioning.md +0 -0
  31. {projectdavid-1.33.12 → projectdavid-1.33.14}/setup.cfg +0 -0
  32. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/__init__.py +0 -0
  33. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/_version.py +0 -0
  34. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/clients/actions_client.py +0 -0
  35. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/clients/api_key_client.py +0 -0
  36. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/clients/assistants_client.py +0 -0
  37. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/clients/base_client.py +0 -0
  38. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/clients/base_vector_store.py +0 -0
  39. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/clients/event_handler.py +0 -0
  40. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/clients/file_search.py +0 -0
  41. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/clients/files_client.py +0 -0
  42. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/clients/inference_client.py +0 -0
  43. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/clients/messages_client.py +0 -0
  44. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/clients/runs.py +0 -0
  45. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/clients/threads_client.py +0 -0
  46. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/clients/tools_client.py +0 -0
  47. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/clients/users_client.py +0 -0
  48. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/clients/vector_store_manager.py +0 -0
  49. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/clients/vectors.py +0 -0
  50. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/constants/platform.py +0 -0
  51. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/decorators.py +0 -0
  52. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/entity.py +0 -0
  53. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/events.py +0 -0
  54. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/serializers.py +0 -0
  55. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/services/logging_service.py +0 -0
  56. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/synthesis/__init__.py +0 -0
  57. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/synthesis/llm_synthesizer.py +0 -0
  58. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/synthesis/prompt.py +0 -0
  59. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/synthesis/reranker.py +0 -0
  60. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/synthesis/retriever.py +0 -0
  61. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/utils/__init__.py +0 -0
  62. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/utils/function_call_suppressor.py +0 -0
  63. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/utils/monitor_launcher.py +0 -0
  64. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/utils/peek_gate.py +0 -0
  65. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/utils/run_monitor.py +0 -0
  66. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid/utils/vector_search_formatter.py +0 -0
  67. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid.egg-info/dependency_links.txt +0 -0
  68. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid.egg-info/requires.txt +0 -0
  69. {projectdavid-1.33.12 → projectdavid-1.33.14}/src/projectdavid.egg-info/top_level.txt +0 -0
  70. {projectdavid-1.33.12 → projectdavid-1.33.14}/tests/test_clients.py +0 -0
@@ -1,3 +1,17 @@
1
+ ## [1.33.14](https://github.com/frankie336/projectdavid/compare/v1.33.13...v1.33.14) (2025-06-16)
2
+
3
+
4
+ ### Bug Fixes
5
+
6
+ * Back out from vision support - resource issue. Revisit in grand plan ([3199ba7](https://github.com/frankie336/projectdavid/commit/3199ba7a18b3cfcc0f7306cd8748105f593a1836))
7
+
8
+ ## [1.33.13](https://github.com/frankie336/projectdavid/compare/v1.33.12...v1.33.13) (2025-06-13)
9
+
10
+
11
+ ### Bug Fixes
12
+
13
+ * restore code_interpreter_stream passthrough.14 ([df2a75f](https://github.com/frankie336/projectdavid/commit/df2a75f47a55d07d42af3a9949ef9bed4496a602))
14
+
1
15
  ## [1.33.12](https://github.com/frankie336/projectdavid/compare/v1.33.11...v1.33.12) (2025-06-13)
2
16
 
3
17
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: projectdavid
3
- Version: 1.33.12
3
+ Version: 1.33.14
4
4
  Summary: Python SDK for interacting with the Entities Assistant API.
5
5
  Author-email: Francis Neequaye Armah <francis.neequaye@projectdavid.co.uk>
6
6
  License: PolyForm Noncommercial License 1.0.0
@@ -24,9 +24,8 @@ print(test_user)
24
24
 
25
25
  # create a vector store
26
26
  store = client.vectors.create_vector_store(
27
- name='Test Vector Store1',
28
- user_id=test_user.id,
29
- )
27
+ name='Test Vector Store1')
28
+
30
29
  print(store)
31
30
  ```
32
31
 
@@ -81,14 +80,16 @@ At this point, your file has been vectorized to your store.
81
80
 
82
81
  ---
83
82
 
83
+ ### Searches
84
84
 
85
- ## Supporting image vectors
86
85
 
87
- Entities now ingests and vectorizes a wide range of image formats for semantic search. You can leverage these image embeddings to extend text-only models into powerful multi-modal workflows; enriching chatbots, document search, recommendation engines, and more.
88
86
 
89
87
  ---
90
88
 
91
- ### Making Searches Against Files in a Store
89
+
90
+
91
+
92
+
92
93
 
93
94
  - The assistant will self-select appropriate vector store
94
95
  searches using its latent logic when responding to a prompt.
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "projectdavid"
7
- version = "1.33.12"
7
+ version = "1.33.14"
8
8
  description = "Python SDK for interacting with the Entities Assistant API."
9
9
  readme = "README.md"
10
10
  authors = [
@@ -0,0 +1,364 @@
1
+ import asyncio
2
+ import csv
3
+ import json
4
+ import re
5
+ import textwrap
6
+ from concurrent.futures import ThreadPoolExecutor
7
+ from pathlib import Path
8
+ from typing import Any, Dict, List, Tuple, Union
9
+
10
+ try: # Python 3.11+
11
+ from typing import LiteralString
12
+ except ImportError: # 3.9–3.10
13
+ from typing_extensions import LiteralString
14
+
15
+ import numpy as np
16
+ import pdfplumber
17
+ from docx import Document
18
+ from pptx import Presentation
19
+ from projectdavid_common import UtilsInterface
20
+ from sentence_transformers import SentenceTransformer
21
+
22
+ log = UtilsInterface.LoggingUtility()
23
+
24
+
25
class FileProcessor:
    """Async multi-format file → text-chunk → embedding pipeline.

    Supported inputs: PDF, CSV, JSON, Office documents (.doc/.docx/.pptx)
    and a range of plain-text / code / markup files.  All blocking work
    (file parsing and model inference) is pushed onto a private thread
    pool so the async entry point never blocks the event loop.

    Every processor returns a dict with at least:
        content   – full extracted text (or None for row-based CSV)
        metadata  – source path, chunk/row counts, type tag
        chunks    – list[str] of embedded text chunks
        vectors   – list[list[float]] of normalized embeddings
    """

    # ------------------------------------------------------------------ #
    # Construction
    # ------------------------------------------------------------------ #
    def __init__(self, max_workers: int = 4, chunk_size: int = 512):
        """
        Args:
            max_workers: size of the private thread pool for blocking work.
            chunk_size:  soft character budget per text chunk; capped at
                         roughly 4 chars per available model token.
        """
        self.embedding_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
        self.embedding_model_name = "paraphrase-MiniLM-L6-v2"
        self._executor = ThreadPoolExecutor(max_workers=max_workers)

        # Token limits: reserve room for the tokenizer's special tokens
        # ([CLS]/[SEP]), then cap the character budget with a ~4 chars/token
        # heuristic so chunks rarely exceed the model window.
        self.max_seq_length = self.embedding_model.get_max_seq_length()
        self.special_tokens_count = 2
        self.effective_max_length = self.max_seq_length - self.special_tokens_count
        self.chunk_size = min(chunk_size, self.effective_max_length * 4)

        log.info("Initialized optimized FileProcessor")

    # ------------------------------------------------------------------ #
    # Generic validators
    # ------------------------------------------------------------------ #
    def validate_file(self, file_path: Path) -> None:
        """Ensure the file exists and is under the 100 MB size limit.

        Raises:
            FileNotFoundError: if *file_path* does not exist.
            ValueError: if the file exceeds the size limit.
        """
        max_size = 100 * 1024 * 1024
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")
        if file_path.stat().st_size > max_size:
            mb = max_size // (1024 * 1024)
            raise ValueError(f"{file_path.name} > {mb} MB limit")

    # ------------------------------------------------------------------ #
    # File-type detection (simple extension map – NO libmagic)
    # ------------------------------------------------------------------ #
    def _detect_file_type(self, file_path: Path) -> str:
        """
        Return one of:

        • 'pdf'    • 'csv'    • 'json'
        • 'office' (.doc/.docx/.pptx)
        • 'text'   (code / markup / plain text)

        Raises *ValueError* if the extension is not recognised.
        """
        suffix = file_path.suffix.lower()

        if suffix == ".pdf":
            return "pdf"
        if suffix == ".csv":
            return "csv"
        if suffix == ".json":
            return "json"
        if suffix in {".doc", ".docx", ".pptx"}:
            return "office"

        text_exts = {
            ".txt",
            ".md",
            ".rst",
            ".c",
            ".cpp",
            ".cs",
            ".go",
            ".java",
            ".js",
            ".ts",
            ".php",
            ".py",
            ".rb",
            ".sh",
            ".tex",
            ".html",
            ".css",
        }
        if suffix in text_exts:
            return "text"

        raise ValueError(f"Unsupported file type: {file_path.name} (ext={suffix})")

    # ------------------------------------------------------------------ #
    # Public entry-point
    # ------------------------------------------------------------------ #
    async def process_file(self, file_path: Union[str, Path]) -> Dict[str, Any]:
        """Validate → detect → dispatch to the appropriate processor."""
        file_path = Path(file_path)
        self.validate_file(file_path)
        ftype = self._detect_file_type(file_path)

        dispatch_map = {
            "pdf": self._process_pdf,
            "text": self._process_text,
            "csv": self._process_csv,
            "office": self._process_office,
            "json": self._process_json,
        }
        # Defensive: _detect_file_type already raises for unknown types,
        # so this only fires if the two maps drift apart.
        if ftype not in dispatch_map:
            raise ValueError(f"Unsupported file type: {file_path.suffix}")

        return await dispatch_map[ftype](file_path)

    # ------------------------------------------------------------------ #
    # PDF
    # ------------------------------------------------------------------ #
    async def _process_pdf(self, file_path: Path) -> Dict[str, Any]:
        """Extract per-page text, chunk it line-wise, embed each chunk.

        Also returns ``line_data`` mapping every chunk back to its page
        number and source line numbers for citation purposes.
        """
        page_chunks, doc_meta = await self._extract_text(file_path)
        all_chunks: List[str] = []
        line_data: List[Dict[str, Any]] = []

        for page_text, page_num, line_nums in page_chunks:
            lines = page_text.split("\n")
            buf: List[str] = []
            buf_lines: List[int] = []
            length = 0
            for line, ln in zip(lines, line_nums):
                line_len = len(line) + 1  # +1 for the joining newline
                if length + line_len <= self.chunk_size:
                    buf.append(line)
                    buf_lines.append(ln)
                    length += line_len
                    continue
                # Current buffer is full – flush it first.
                if buf:
                    all_chunks.append("\n".join(buf))
                    line_data.append({"page": page_num, "lines": buf_lines})
                if line_len <= self.chunk_size:
                    # FIX: a line that merely overflowed the *current* buffer
                    # starts a fresh buffer instead of being token-split
                    # (the old code fragmented short lines unnecessarily).
                    buf, buf_lines, length = [line], [ln], line_len
                else:
                    # Genuinely oversized single line → token-level split.
                    buf, buf_lines, length = [], [], 0
                    for piece in self._split_oversized_chunk(line):
                        all_chunks.append(piece)
                        line_data.append({"page": page_num, "lines": [ln]})
            if buf:
                all_chunks.append("\n".join(buf))
                line_data.append({"page": page_num, "lines": buf_lines})

        vectors = await asyncio.gather(
            *[self._encode_chunk_async(c) for c in all_chunks]
        )
        return {
            "content": "\n\n".join(all_chunks),
            "metadata": {
                **doc_meta,
                "source": str(file_path),
                "chunks": len(all_chunks),
                "type": "pdf",
            },
            "chunks": all_chunks,
            "vectors": [v.tolist() for v in vectors],
            "line_data": line_data,
        }

    # ------------------------------------------------------------------ #
    # Plain-text / code / markup
    # ------------------------------------------------------------------ #
    async def _process_text(self, file_path: Path) -> Dict[str, Any]:
        """Read a text/code/markup file, sentence-chunk it, embed chunks."""
        text, extra_meta, _ = await self._extract_text(file_path)
        chunks = self._chunk_text(text)
        vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
        return {
            "content": text,
            "metadata": {
                **extra_meta,
                "source": str(file_path),
                "chunks": len(chunks),
                "type": "text",
            },
            "chunks": chunks,
            "vectors": [v.tolist() for v in vectors],
        }

    # ------------------------------------------------------------------ #
    # CSV
    # ------------------------------------------------------------------ #
    async def _process_csv(
        self, file_path: Path, text_field: str = "description"
    ) -> Dict[str, Any]:
        """Embed one vector per row, taking text from *text_field*.

        Rows with an empty *text_field* are skipped.  The remaining
        non-empty columns of each kept row are preserved per-row in
        ``csv_row_metadata`` (parallel to ``chunks``/``vectors``).
        """
        texts: List[str] = []
        metas: List[Dict[str, str]] = []
        with file_path.open(newline="", encoding="utf-8") as f:
            reader = csv.DictReader(f)
            for row in reader:
                txt = (row.get(text_field) or "").strip()
                if not txt:
                    continue
                texts.append(txt)
                metas.append({k: v for k, v in row.items() if k != text_field and v})

        vectors = await asyncio.gather(*[self._encode_chunk_async(t) for t in texts])
        return {
            "content": None,
            "metadata": {"source": str(file_path), "rows": len(texts), "type": "csv"},
            "chunks": texts,
            "vectors": [v.tolist() for v in vectors],
            "csv_row_metadata": metas,
        }

    # ------------------------------------------------------------------ #
    # Office docs (.doc/.docx/.pptx)
    # ------------------------------------------------------------------ #
    async def _process_office(self, file_path: Path) -> Dict[str, Any]:
        """Extract text from Word/PowerPoint files, chunk and embed it."""
        loop = asyncio.get_running_loop()
        if file_path.suffix.lower() in {".doc", ".docx"}:
            text = await loop.run_in_executor(
                self._executor, self._read_docx, file_path
            )
        else:  # .pptx
            text = await loop.run_in_executor(
                self._executor, self._read_pptx, file_path
            )

        chunks = self._chunk_text(text)
        vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
        return {
            "content": text,
            "metadata": {
                "source": str(file_path),
                "chunks": len(chunks),
                "type": "office",
            },
            "chunks": chunks,
            "vectors": [v.tolist() for v in vectors],
        }

    # ------------------------------------------------------------------ #
    # JSON
    # ------------------------------------------------------------------ #
    async def _process_json(self, file_path: Path) -> Dict[str, Any]:
        """Pretty-print a JSON document, chunk the text, embed chunks."""
        text = await asyncio.get_running_loop().run_in_executor(
            self._executor, self._read_json, file_path
        )
        chunks = self._chunk_text(text)
        vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
        return {
            "content": text,
            "metadata": {
                "source": str(file_path),
                "chunks": len(chunks),
                "type": "json",
            },
            "chunks": chunks,
            "vectors": [v.tolist() for v in vectors],
        }

    # ------------------------------------------------------------------ #
    # Shared helpers
    # ------------------------------------------------------------------ #
    async def _extract_text(self, file_path: Path) -> Union[
        Tuple[List[Tuple[str, int, List[int]]], Dict[str, Any]],
        Tuple[str, Dict[str, Any], List[int]],
    ]:
        """Dispatch blocking extraction to the pool.

        Returns a 2-tuple ``(page_chunks, meta)`` for PDFs, or a 3-tuple
        ``(text, meta, line_nums)`` for everything else — callers must
        unpack according to the file type they passed in.
        """
        loop = asyncio.get_running_loop()
        if file_path.suffix.lower() == ".pdf":
            return await loop.run_in_executor(
                self._executor, self._extract_pdf_text, file_path
            )
        else:
            text = await loop.run_in_executor(
                self._executor, self._read_text_file, file_path
            )
            return text, {}, []

    def _extract_pdf_text(self, file_path: Path):
        """Blocking pdfplumber extraction.

        Returns ``(page_chunks, meta)`` where each page chunk is
        ``(joined_text, page_number, [1-based line numbers])``.
        """
        page_chunks, meta = [], {}
        with pdfplumber.open(file_path) as pdf:
            meta.update(
                {
                    "author": pdf.metadata.get("Author", ""),
                    "title": pdf.metadata.get("Title", file_path.stem),
                    "page_count": len(pdf.pages),
                }
            )
            for i, page in enumerate(pdf.pages, start=1):
                lines = page.extract_text_lines()
                # Sort by vertical position so reading order is preserved.
                sorted_lines = sorted(lines, key=lambda x: x["top"])
                txts, nums = [], []
                for ln_idx, L in enumerate(sorted_lines, start=1):
                    t = L.get("text", "").strip()
                    if t:
                        txts.append(t)
                        nums.append(ln_idx)
                if txts:
                    page_chunks.append(("\n".join(txts), i, nums))
        return page_chunks, meta

    def _read_text_file(self, file_path: Path) -> str:
        """Read text, falling back to latin-1 for non-UTF-8 files."""
        try:
            return file_path.read_text(encoding="utf-8")
        except UnicodeDecodeError:
            return file_path.read_text(encoding="latin-1")

    def _read_docx(self, path: Path) -> str:
        """Join all non-empty Word paragraphs with newlines."""
        doc = Document(path)
        return "\n".join(p.text for p in doc.paragraphs if p.text.strip())

    def _read_pptx(self, path: Path) -> str:
        """Join text from every shape on every slide; slides separated by blank lines."""
        prs = Presentation(path)
        slides = []
        for slide in prs.slides:
            chunks = [sh.text for sh in slide.shapes if hasattr(sh, "text")]
            slides.append("\n".join(filter(None, chunks)))
        return "\n\n".join(slides)

    def _read_json(self, path: Path) -> str:
        """Pretty-print JSON, then hard-wrap lines at 120 characters."""
        obj = json.loads(path.read_text(encoding="utf-8"))
        pretty = json.dumps(obj, indent=2, ensure_ascii=False)
        return "\n".join(textwrap.wrap(pretty, width=120))

    async def _encode_chunk_async(self, chunk: str) -> np.ndarray:
        """Embed one chunk on the thread pool; returns a normalized vector."""
        # NOTE(review): 'truncate' is not a documented SentenceTransformer.encode
        # kwarg — verify it is accepted by the pinned sentence-transformers
        # version (the model truncates to max_seq_length regardless).
        return await asyncio.get_running_loop().run_in_executor(
            self._executor,
            lambda: self.embedding_model.encode(
                [chunk],
                convert_to_numpy=True,
                truncate="model_max_length",
                normalize_embeddings=True,
                show_progress_bar=False,
            )[0],
        )

    # ------------------------------------------------------------------ #
    # Text chunking helpers
    # ------------------------------------------------------------------ #
    def _chunk_text(self, text: str) -> List[str]:
        """Greedy sentence-packing into chunks of ≤ ``self.chunk_size`` chars.

        Sentences are split on whitespace following '.', '!' or '?'.
        A sentence longer than the budget is hard-sliced; its tail seeds
        the next chunk.
        """
        sentences = re.split(r"(?<=[\.!?])\s+", text)
        chunks: List[str] = []
        buf: List[str] = []
        length = 0
        for sent in sentences:
            slen = len(sent) + 1  # +1 for the joining space
            if length + slen <= self.chunk_size:
                buf.append(sent)
                length += slen
            else:
                if buf:
                    chunks.append(" ".join(buf))
                buf, length = [], 0
                while len(sent) > self.chunk_size:
                    part, sent = sent[: self.chunk_size], sent[self.chunk_size :]
                    chunks.append(part)
                buf, length = [sent], len(sent)
        if buf:
            chunks.append(" ".join(buf))
        return chunks

    def _split_oversized_chunk(
        self, chunk: str, tokens: Optional[List[str]] = None
    ) -> List[str]:
        """Split *chunk* into segments of ≤ effective_max_length tokens.

        *tokens* may be supplied to avoid re-tokenizing; otherwise the
        embedder's tokenizer is used.
        """
        if tokens is None:
            tokens = self.embedding_model.tokenizer.tokenize(chunk)
        out = []
        for i in range(0, len(tokens), self.effective_max_length):
            seg = tokens[i : i + self.effective_max_length]
            out.append(self.embedding_model.tokenizer.convert_tokens_to_string(seg))
        return out
@@ -11,9 +11,15 @@ LOG = UtilsInterface.LoggingUtility()
11
11
 
12
12
 
13
13
  class SynchronousInferenceStream:
14
+ # ------------------------------------------------------------ #
15
+ # GLOBAL EVENT LOOP (single hidden thread for sync wrapper)
16
+ # ------------------------------------------------------------ #
14
17
  _GLOBAL_LOOP = asyncio.new_event_loop()
15
18
  asyncio.set_event_loop(_GLOBAL_LOOP)
16
19
 
20
+ # ------------------------------------------------------------ #
21
+ # Init / setup
22
+ # ------------------------------------------------------------ #
17
23
  def __init__(self, inference) -> None:
18
24
  self.inference_client = inference
19
25
  self.user_id: Optional[str] = None
@@ -32,6 +38,7 @@ class SynchronousInferenceStream:
32
38
  run_id: str,
33
39
  api_key: str,
34
40
  ) -> None:
41
+ """Populate IDs once, so callers only provide provider/model."""
35
42
  self.user_id = user_id
36
43
  self.thread_id = thread_id
37
44
  self.assistant_id = assistant_id
@@ -39,7 +46,10 @@ class SynchronousInferenceStream:
39
46
  self.run_id = run_id
40
47
  self.api_key = api_key
41
48
 
42
- def stream_chunks(
49
+ # ------------------------------------------------------------ #
50
+ # Core sync-to-async streaming wrapper
51
+ # ------------------------------------------------------------ #
52
+ def stream_chunks( # noqa: PLR0915
43
53
  self,
44
54
  provider: str,
45
55
  model: str,
@@ -48,9 +58,15 @@ class SynchronousInferenceStream:
48
58
  timeout_per_chunk: float = 280.0,
49
59
  suppress_fc: bool = True,
50
60
  ) -> Generator[dict, None, None]:
61
+ """
62
+ Sync generator that mirrors async `inference_client.stream_inference_response`
63
+ but (optionally) removes raw <fc> … </fc> output *and* JSON
64
+ `{"type": "function_call" …}` objects from the stream.
65
+ """
51
66
 
52
67
  resolved_api_key = api_key or self.api_key
53
68
 
69
+ # ---------- async inner generator -------------------------------- #
54
70
  async def _stream_chunks_async():
55
71
  async for chk in self.inference_client.stream_inference_response(
56
72
  provider=provider,
@@ -65,6 +81,7 @@ class SynchronousInferenceStream:
65
81
 
66
82
  agen = _stream_chunks_async().__aiter__()
67
83
 
84
+ # ---------- FC-suppressor plumbing -------------------------------- #
68
85
  if suppress_fc:
69
86
  _suppressor = FunctionCallSuppressor()
70
87
  _peek_gate = PeekGate(_suppressor)
@@ -72,11 +89,15 @@ class SynchronousInferenceStream:
72
89
  def _filter_text(txt: str) -> str:
73
90
  return _peek_gate.feed(txt)
74
91
 
92
+ LOG.debug("[SyncStream] Function-call suppression ACTIVE")
75
93
  else:
76
94
 
77
95
  def _filter_text(txt: str) -> str:
78
96
  return txt
79
97
 
98
+ LOG.debug("[SyncStream] Function-call suppression DISABLED")
99
+
100
+ # ---------- helper to flush residual buffered text ---------------- #
80
101
  def _drain_filters() -> Optional[dict]:
81
102
  if not suppress_fc:
82
103
  return None
@@ -97,18 +118,17 @@ class SynchronousInferenceStream:
97
118
  }
98
119
  return None
99
120
 
121
+ # ---------- main sync loop ---------------------------------------- #
100
122
  while True:
101
123
  try:
102
124
  chunk = self._GLOBAL_LOOP.run_until_complete(
103
125
  asyncio.wait_for(agen.__anext__(), timeout=timeout_per_chunk)
104
126
  )
105
127
 
106
- # Always attach run_id
128
+ # Always attach run_id for front-end helpers
107
129
  chunk["run_id"] = self.run_id
108
130
 
109
- # ------------------------------------------------------
110
- # allow status chunks to bypass suppression suppression
111
- # -------------------------------------------------------
131
+ # ----- bypass filters for status / code-exec related -------- #
112
132
  if chunk.get("type") == "status":
113
133
  yield chunk
114
134
  continue
@@ -124,9 +144,19 @@ class SynchronousInferenceStream:
124
144
  yield chunk
125
145
  continue
126
146
 
147
+ # ----- NEW: swallow raw JSON function_call objects ---------- #
148
+ if suppress_fc and chunk.get("type") == "function_call":
149
+ LOG.debug(
150
+ "[SyncStream] Swallowing JSON function_call chunk: %s",
151
+ chunk.get("name") or "<unnamed>",
152
+ )
153
+ continue
154
+
155
+ # ----- text-level suppression ------------------------------- #
127
156
  if isinstance(chunk.get("content"), str):
128
157
  chunk["content"] = _filter_text(chunk["content"])
129
158
  if chunk["content"] == "":
159
+ # Entire segment was inside <fc> … </fc>
130
160
  continue
131
161
 
132
162
  yield chunk
@@ -134,21 +164,26 @@ class SynchronousInferenceStream:
134
164
  except StopAsyncIteration:
135
165
  if tail := _drain_filters():
136
166
  yield tail
137
- LOG.info("Stream completed normally.")
167
+ LOG.info("[SyncStream] Stream completed normally.")
138
168
  break
139
169
 
140
170
  except asyncio.TimeoutError:
141
171
  if tail := _drain_filters():
142
172
  yield tail
143
- LOG.error("[TimeoutError] Chunk wait expired aborting stream.")
173
+ LOG.error("[SyncStream] Timeout waiting for next chunk.")
144
174
  break
145
175
 
146
- except Exception as exc:
176
+ except Exception as exc: # noqa: BLE001
147
177
  if tail := _drain_filters():
148
178
  yield tail
149
- LOG.error("Unexpected streaming error: %s", exc, exc_info=True)
179
+ LOG.error(
180
+ "[SyncStream] Unexpected streaming error: %s", exc, exc_info=True
181
+ )
150
182
  break
151
183
 
184
+ # ------------------------------------------------------------ #
185
+ # House-keeping
186
+ # ------------------------------------------------------------ #
152
187
  @classmethod
153
188
  def shutdown_loop(cls) -> None:
154
189
  if cls._GLOBAL_LOOP and not cls._GLOBAL_LOOP.is_closed():
@@ -21,17 +21,19 @@ import torch
21
21
  from docx import Document
22
22
  from PIL import Image
23
23
  from pptx import Presentation
24
- from transformers import Blip2ForConditionalGeneration, Blip2Processor
25
- from ultralytics import YOLO
24
+ from projectdavid_common import UtilsInterface
25
+ from sentence_transformers import SentenceTransformer
26
+
27
+ # from transformers import Blip2ForConditionalGeneration, Blip2Processor
28
+
29
+ # from ultralytics import YOLO
26
30
 
27
31
  # OCR fallback – optional
28
- try:
29
- import pytesseract # noqa: F401 # pylint: disable=unused-import
30
- except ImportError:
31
- pytesseract = None
32
+ # try:
33
+ # import pytesseract # noqa: F401 # pylint: disable=unused-import
34
+ # except ImportError:
35
+ # pytesseract = None
32
36
 
33
- from projectdavid_common import UtilsInterface
34
- from sentence_transformers import SentenceTransformer
35
37
 
36
38
  log = UtilsInterface.LoggingUtility()
37
39
 
@@ -81,14 +83,6 @@ class FileProcessor:
81
83
  self.device = torch.device("cpu")
82
84
  self.torch_dtype = torch.float32
83
85
 
84
- # Feature flags
85
- self.use_ocr = use_ocr and pytesseract is not None
86
- self.use_detection = use_detection
87
- if use_ocr and pytesseract is None:
88
- log.warning("OCR requested but pytesseract not installed – skipping.")
89
- if self.use_detection:
90
- self.detector = YOLO("yolov8x.pt").to(self.device)
91
-
92
86
  # Text embedder
93
87
  self.embedding_model_name = "paraphrase-MiniLM-L6-v2"
94
88
  self.embedding_model = SentenceTransformer(self.embedding_model_name)
@@ -100,35 +94,13 @@ class FileProcessor:
100
94
  self.effective_max_length = self.max_seq_length - self.special_tokens_count
101
95
  self.chunk_size = min(chunk_size, self.effective_max_length * 4)
102
96
 
103
- # Image embedder
104
- self.clip_model, _, self.clip_preprocess = (
105
- open_clip.create_model_and_transforms(
106
- image_model_name,
107
- pretrained="laion2b_s32b_b79k",
108
- precision="fp16" if self.device.type == "cuda" else "fp32",
109
- )
110
- )
111
- self.clip_model = self.clip_model.to(self.device).eval()
112
- self.clip_tokenizer = open_clip.get_tokenizer(image_model_name)
113
-
114
- # Caption generator
115
- self.blip_processor = Blip2Processor.from_pretrained(caption_model_name)
116
- self.blip_model = (
117
- Blip2ForConditionalGeneration.from_pretrained(
118
- caption_model_name,
119
- torch_dtype=self.torch_dtype,
120
- )
121
- .to(self.device)
122
- .eval()
123
- )
124
-
125
97
  # Executor & logging
126
98
  self._executor = ThreadPoolExecutor(max_workers=max_workers)
127
99
  log.info(
128
100
  "FileProcessor ready (device=%s, OCR=%s, detection=%s)",
129
101
  self.device,
130
- self.use_ocr,
131
- self.use_detection,
102
+ # self.use_ocr,
103
+ # self.use_detection,
132
104
  )
133
105
 
134
106
  # ------------------------------------------------------------------ #
@@ -190,90 +162,6 @@ class FileProcessor:
190
162
  ftype = self._detect_file_type(path)
191
163
  return await getattr(self, f"_process_{ftype}")(path)
192
164
 
193
- # ------------------------------------------------------------------ #
194
- # Image processing (OpenCLIP + BLIP-2 + OCR + YOLO)
195
- # ------------------------------------------------------------------ #
196
- async def _process_image(self, file_path: Path) -> Dict[str, Any]:
197
- loop = asyncio.get_event_loop()
198
- img = await loop.run_in_executor(self._executor, Image.open, file_path)
199
-
200
- # 1) Image vector
201
- def enc_img():
202
- with torch.no_grad():
203
- t = self.clip_preprocess(img).unsqueeze(0).to(self.device)
204
- v = self.clip_model.encode_image(t).squeeze()
205
- return (v / v.norm()).float().cpu().numpy()
206
-
207
- image_vec = await loop.run_in_executor(self._executor, enc_img)
208
-
209
- # 2) Caption
210
- def gen_cap():
211
- inp = self.blip_processor(images=img, return_tensors="pt").to(self.device)
212
- with torch.no_grad():
213
- ids = self.blip_model.generate(**inp, max_new_tokens=50)
214
- return self.blip_processor.decode(ids[0], skip_special_tokens=True)
215
-
216
- caption = await loop.run_in_executor(self._executor, gen_cap)
217
-
218
- # 3) OCR
219
- if self.use_ocr:
220
- text = await loop.run_in_executor(
221
- self._executor, pytesseract.image_to_string, img
222
- )
223
- if t := text.strip():
224
- caption += "\n" + t
225
-
226
- # 4) Caption vector
227
- def enc_txt():
228
- with torch.no_grad():
229
- tok = self.clip_tokenizer(caption).unsqueeze(0).to(self.device)
230
- v = self.clip_model.encode_text(tok).squeeze()
231
- return (v / v.norm()).float().cpu().numpy()
232
-
233
- caption_vec = await loop.run_in_executor(self._executor, enc_txt)
234
-
235
- # 5) YOLO regions
236
- region_vectors = []
237
- if self.use_detection:
238
- dets = self.detector(img)[0]
239
- for box in dets.boxes:
240
- x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().tolist())
241
- crop = img.crop((x1, y1, x2, y2))
242
- vec = self.encode_image(crop)
243
- region_vectors.append(
244
- {
245
- "vector": vec.tolist(),
246
- "bbox": [x1, y1, x2, y2],
247
- "label": dets.names[int(box.cls)],
248
- "conf": float(box.conf),
249
- }
250
- )
251
-
252
- # Metadata
253
- sha = hashlib.sha256(file_path.read_bytes()).hexdigest()
254
- w, h = img.size
255
- meta = {
256
- "source": str(file_path),
257
- "type": "image",
258
- "width": w,
259
- "height": h,
260
- "mime": f"image/{file_path.suffix.lstrip('.')}",
261
- "sha256": sha,
262
- "embedding_model": "openclip-vit-h-14",
263
- "caption": caption,
264
- }
265
-
266
- result = {
267
- "content": None,
268
- "metadata": meta,
269
- "chunks": [caption],
270
- "vectors": [image_vec.tolist()],
271
- "caption_vector": caption_vec.tolist(),
272
- }
273
- if region_vectors:
274
- result["region_vectors"] = region_vectors
275
- return result
276
-
277
165
  # ------------------------------------------------------------------ #
278
166
  # PDF
279
167
  # ------------------------------------------------------------------ #
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: projectdavid
3
- Version: 1.33.12
3
+ Version: 1.33.14
4
4
  Summary: Python SDK for interacting with the Entities Assistant API.
5
5
  Author-email: Francis Neequaye Armah <francis.neequaye@projectdavid.co.uk>
6
6
  License: PolyForm Noncommercial License 1.0.0
@@ -51,6 +51,7 @@ src/projectdavid/clients/tools_client.py
51
51
  src/projectdavid/clients/users_client.py
52
52
  src/projectdavid/clients/vector_store_manager.py
53
53
  src/projectdavid/clients/vectors.py
54
+ src/projectdavid/clients/vision-file_processor.py
54
55
  src/projectdavid/constants/platform.py
55
56
  src/projectdavid/services/logging_service.py
56
57
  src/projectdavid/synthesis/__init__.py
File without changes
File without changes
File without changes