projectdavid 1.33.13.tar.gz → 1.33.15.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of projectdavid might be problematic.
Files changed (71)
  1. {projectdavid-1.33.13 → projectdavid-1.33.15}/CHANGELOG.md +14 -0
  2. {projectdavid-1.33.13/src/projectdavid.egg-info → projectdavid-1.33.15}/PKG-INFO +1 -1
  3. {projectdavid-1.33.13 → projectdavid-1.33.15}/docs/vector_store.md +7 -6
  4. {projectdavid-1.33.13 → projectdavid-1.33.15}/pyproject.toml +1 -1
  5. projectdavid-1.33.15/src/projectdavid/clients/file_processor.py +364 -0
  6. projectdavid-1.33.15/src/projectdavid/clients/vectors.py +1025 -0
  7. projectdavid-1.33.13/src/projectdavid/clients/file_processor.py → projectdavid-1.33.15/src/projectdavid/clients/vision-file_processor.py +12 -124
  8. {projectdavid-1.33.13 → projectdavid-1.33.15/src/projectdavid.egg-info}/PKG-INFO +1 -1
  9. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid.egg-info/SOURCES.txt +2 -0
  10. {projectdavid-1.33.13 → projectdavid-1.33.15}/LICENSE +0 -0
  11. {projectdavid-1.33.13 → projectdavid-1.33.15}/MANIFEST.in +0 -0
  12. {projectdavid-1.33.13 → projectdavid-1.33.15}/README.md +0 -0
  13. {projectdavid-1.33.13 → projectdavid-1.33.15}/docs/assistants.md +0 -0
  14. {projectdavid-1.33.13 → projectdavid-1.33.15}/docs/code_interpretation.md +0 -0
  15. {projectdavid-1.33.13 → projectdavid-1.33.15}/docs/database.md +0 -0
  16. {projectdavid-1.33.13 → projectdavid-1.33.15}/docs/database_assistant_example.md +0 -0
  17. {projectdavid-1.33.13 → projectdavid-1.33.15}/docs/docker_comtainers.md +0 -0
  18. {projectdavid-1.33.13 → projectdavid-1.33.15}/docs/file_search.md +0 -0
  19. {projectdavid-1.33.13 → projectdavid-1.33.15}/docs/files.md +0 -0
  20. {projectdavid-1.33.13 → projectdavid-1.33.15}/docs/function_call_definition.md +0 -0
  21. {projectdavid-1.33.13 → projectdavid-1.33.15}/docs/function_calls.md +0 -0
  22. {projectdavid-1.33.13 → projectdavid-1.33.15}/docs/handling_function_calls.md +0 -0
  23. {projectdavid-1.33.13 → projectdavid-1.33.15}/docs/inference.md +0 -0
  24. {projectdavid-1.33.13 → projectdavid-1.33.15}/docs/messages.md +0 -0
  25. {projectdavid-1.33.13 → projectdavid-1.33.15}/docs/runs.md +0 -0
  26. {projectdavid-1.33.13 → projectdavid-1.33.15}/docs/streams.md +0 -0
  27. {projectdavid-1.33.13 → projectdavid-1.33.15}/docs/threads.md +0 -0
  28. {projectdavid-1.33.13 → projectdavid-1.33.15}/docs/tools.md +0 -0
  29. {projectdavid-1.33.13 → projectdavid-1.33.15}/docs/users.md +0 -0
  30. {projectdavid-1.33.13 → projectdavid-1.33.15}/docs/versioning.md +0 -0
  31. {projectdavid-1.33.13 → projectdavid-1.33.15}/setup.cfg +0 -0
  32. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/__init__.py +0 -0
  33. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/_version.py +0 -0
  34. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/clients/actions_client.py +0 -0
  35. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/clients/api_key_client.py +0 -0
  36. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/clients/assistants_client.py +0 -0
  37. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/clients/base_client.py +0 -0
  38. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/clients/base_vector_store.py +0 -0
  39. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/clients/event_handler.py +0 -0
  40. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/clients/file_search.py +0 -0
  41. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/clients/files_client.py +0 -0
  42. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/clients/inference_client.py +0 -0
  43. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/clients/messages_client.py +0 -0
  44. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/clients/runs.py +0 -0
  45. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/clients/synchronous_inference_wrapper.py +0 -0
  46. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/clients/threads_client.py +0 -0
  47. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/clients/tools_client.py +0 -0
  48. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/clients/users_client.py +0 -0
  49. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/clients/vector_store_manager.py +0 -0
  50. /projectdavid-1.33.13/src/projectdavid/clients/vectors.py → /projectdavid-1.33.15/src/projectdavid/clients/vision_vectors.py +0 -0
  51. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/constants/platform.py +0 -0
  52. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/decorators.py +0 -0
  53. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/entity.py +0 -0
  54. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/events.py +0 -0
  55. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/serializers.py +0 -0
  56. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/services/logging_service.py +0 -0
  57. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/synthesis/__init__.py +0 -0
  58. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/synthesis/llm_synthesizer.py +0 -0
  59. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/synthesis/prompt.py +0 -0
  60. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/synthesis/reranker.py +0 -0
  61. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/synthesis/retriever.py +0 -0
  62. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/utils/__init__.py +0 -0
  63. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/utils/function_call_suppressor.py +0 -0
  64. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/utils/monitor_launcher.py +0 -0
  65. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/utils/peek_gate.py +0 -0
  66. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/utils/run_monitor.py +0 -0
  67. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid/utils/vector_search_formatter.py +0 -0
  68. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid.egg-info/dependency_links.txt +0 -0
  69. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid.egg-info/requires.txt +0 -0
  70. {projectdavid-1.33.13 → projectdavid-1.33.15}/src/projectdavid.egg-info/top_level.txt +0 -0
  71. {projectdavid-1.33.13 → projectdavid-1.33.15}/tests/test_clients.py +0 -0
CHANGELOG.md
@@ -1,3 +1,17 @@
+ ## [1.33.15](https://github.com/frankie336/projectdavid/compare/v1.33.14...v1.33.15) (2025-06-16)
+
+
+ ### Bug Fixes
+
+ * Back out from vision support - resource issue. Revisit in grand plan-2 ([a735034](https://github.com/frankie336/projectdavid/commit/a735034879ce50ce1dc2a508ce304796105f5830))
+
+ ## [1.33.14](https://github.com/frankie336/projectdavid/compare/v1.33.13...v1.33.14) (2025-06-16)
+
+
+ ### Bug Fixes
+
+ * Back out from vision support - resource issue. Revisit in grand plan ([3199ba7](https://github.com/frankie336/projectdavid/commit/3199ba7a18b3cfcc0f7306cd8748105f593a1836))
+
  ## [1.33.13](https://github.com/frankie336/projectdavid/compare/v1.33.12...v1.33.13) (2025-06-13)
 
 
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: projectdavid
- Version: 1.33.13
+ Version: 1.33.15
  Summary: Python SDK for interacting with the Entities Assistant API.
  Author-email: Francis Neequaye Armah <francis.neequaye@projectdavid.co.uk>
  License: PolyForm Noncommercial License 1.0.0
docs/vector_store.md
@@ -24,9 +24,8 @@ print(test_user)
 
  # create a vector store
  store = client.vectors.create_vector_store(
-     name='Test Vector Store1',
-     user_id=test_user.id,
- )
+     name='Test Vector Store1')
+
  print(store)
  ```
 
@@ -81,14 +80,16 @@ At this point, your file has been vectorized to your store.
 
  ---
 
+ ### Searches
 
- ## Supporting image vectors
 
- Entities now ingests and vectorizes a wide range of image formats for semantic search. You can leverage these image embeddings to extend text-only models into powerful multi-modal workflows; enriching chatbots, document search, recommendation engines, and more.
 
  ---
 
- ### Making Searches Against Files in a Store
+
+
+
+
 
  - The assistant will self-select appropriate vector store
  searches using its latent logic when responding to a prompt.
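
For context, the hunk above drops the `user_id=` argument from the documented `create_vector_store` call; the store is presumably scoped to the authenticated caller. A minimal sketch of the updated call, assuming the `Entity` client and environment-based API-key setup shown earlier in docs/vector_store.md:

```python
from projectdavid import Entity

# Assumes credentials/base URL come from the environment,
# as in the docs' earlier setup steps.
client = Entity()

# 1.33.15 form: no user_id keyword is passed.
store = client.vectors.create_vector_store(name="Test Vector Store1")
print(store)
```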
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
  [project]
  name = "projectdavid"
- version = "1.33.13"
+ version = "1.33.15"
  description = "Python SDK for interacting with the Entities Assistant API."
  readme = "README.md"
  authors = [
src/projectdavid/clients/file_processor.py (new file)
@@ -0,0 +1,364 @@
+ import asyncio
+ import csv
+ import json
+ import re
+ import textwrap
+ from concurrent.futures import ThreadPoolExecutor
+ from pathlib import Path
+ from typing import Any, Dict, List, Tuple, Union
+
+ try:  # Python 3.11+
+     from typing import LiteralString
+ except ImportError:  # 3.9–3.10
+     from typing_extensions import LiteralString
+
+ import numpy as np
+ import pdfplumber
+ from docx import Document
+ from pptx import Presentation
+ from projectdavid_common import UtilsInterface
+ from sentence_transformers import SentenceTransformer
+
+ log = UtilsInterface.LoggingUtility()
+
+
+ class FileProcessor:
+     # ------------------------------------------------------------------ #
+     # Construction
+     # ------------------------------------------------------------------ #
+     def __init__(self, max_workers: int = 4, chunk_size: int = 512):
+         self.embedding_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
+         self.embedding_model_name = "paraphrase-MiniLM-L6-v2"
+         self._executor = ThreadPoolExecutor(max_workers=max_workers)
+
+         # token limits
+         self.max_seq_length = self.embedding_model.get_max_seq_length()
+         self.special_tokens_count = 2
+         self.effective_max_length = self.max_seq_length - self.special_tokens_count
+         self.chunk_size = min(chunk_size, self.effective_max_length * 4)
+
+         log.info("Initialized optimized FileProcessor")
+
+     # ------------------------------------------------------------------ #
+     # Generic validators
+     # ------------------------------------------------------------------ #
+     def validate_file(self, file_path: Path):
+         """Ensure file exists and is under 100 MB."""
+         max_size = 100 * 1024 * 1024
+         if not file_path.exists():
+             raise FileNotFoundError(f"File not found: {file_path}")
+         if file_path.stat().st_size > max_size:
+             mb = max_size // (1024 * 1024)
+             raise ValueError(f"{file_path.name} > {mb} MB limit")
+
+     # ------------------------------------------------------------------ #
+     # File-type detection (simple extension map – NO libmagic)
+     # ------------------------------------------------------------------ #
+     def _detect_file_type(self, file_path: Path) -> str:
+         """
+         Return one of:
+
+         • 'pdf'    • 'csv'    • 'json'
+         • 'office' (.doc/.docx/.pptx)
+         • 'text'   (code / markup / plain text)
+
+         Raises *ValueError* if the extension is not recognised.
+         """
+         suffix = file_path.suffix.lower()
+
+         if suffix == ".pdf":
+             return "pdf"
+         if suffix == ".csv":
+             return "csv"
+         if suffix == ".json":
+             return "json"
+         if suffix in {".doc", ".docx", ".pptx"}:
+             return "office"
+
+         text_exts = {
+             ".txt",
+             ".md",
+             ".rst",
+             ".c",
+             ".cpp",
+             ".cs",
+             ".go",
+             ".java",
+             ".js",
+             ".ts",
+             ".php",
+             ".py",
+             ".rb",
+             ".sh",
+             ".tex",
+             ".html",
+             ".css",
+         }
+         if suffix in text_exts:
+             return "text"
+
+         raise ValueError(f"Unsupported file type: {file_path.name} (ext={suffix})")
+
+     # ------------------------------------------------------------------ #
+     # Public entry-point
+     # ------------------------------------------------------------------ #
+     async def process_file(self, file_path: Union[str, Path]) -> Dict[str, Any]:
+         """Validate → detect → dispatch to the appropriate processor."""
+         file_path = Path(file_path)
+         self.validate_file(file_path)
+         ftype = self._detect_file_type(file_path)
+
+         dispatch_map = {
+             "pdf": self._process_pdf,
+             "text": self._process_text,
+             "csv": self._process_csv,
+             "office": self._process_office,
+             "json": self._process_json,
+         }
+         if ftype not in dispatch_map:
+             raise ValueError(f"Unsupported file type: {file_path.suffix}")
+
+         return await dispatch_map[ftype](file_path)
+
+     # ------------------------------------------------------------------ #
+     # PDF
+     # ------------------------------------------------------------------ #
+     async def _process_pdf(self, file_path: Path) -> Dict[str, Any]:
+         page_chunks, doc_meta = await self._extract_text(file_path)
+         all_chunks, line_data = [], []
+
+         for page_text, page_num, line_nums in page_chunks:
+             lines = page_text.split("\n")
+             buf, buf_lines, length = [], [], 0
+             for line, ln in zip(lines, line_nums):
+                 l = len(line) + 1
+                 if length + l <= self.chunk_size:
+                     buf.append(line)
+                     buf_lines.append(ln)
+                     length += l
+                 else:
+                     if buf:
+                         all_chunks.append("\n".join(buf))
+                         line_data.append({"page": page_num, "lines": buf_lines})
+                         buf, buf_lines, length = [], [], 0
+                     for piece in self._split_oversized_chunk(line):
+                         all_chunks.append(piece)
+                         line_data.append({"page": page_num, "lines": [ln]})
+             if buf:
+                 all_chunks.append("\n".join(buf))
+                 line_data.append({"page": page_num, "lines": buf_lines})
+
+         vectors = await asyncio.gather(
+             *[self._encode_chunk_async(c) for c in all_chunks]
+         )
+         return {
+             "content": "\n\n".join(all_chunks),
+             "metadata": {
+                 **doc_meta,
+                 "source": str(file_path),
+                 "chunks": len(all_chunks),
+                 "type": "pdf",
+             },
+             "chunks": all_chunks,
+             "vectors": [v.tolist() for v in vectors],
+             "line_data": line_data,
+         }
+
+     # ------------------------------------------------------------------ #
+     # Plain-text / code / markup
+     # ------------------------------------------------------------------ #
+     async def _process_text(self, file_path: Path) -> Dict[str, Any]:
+         text, extra_meta, _ = await self._extract_text(file_path)
+         chunks = self._chunk_text(text)
+         vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
+         return {
+             "content": text,
+             "metadata": {
+                 **extra_meta,
+                 "source": str(file_path),
+                 "chunks": len(chunks),
+                 "type": "text",
+             },
+             "chunks": chunks,
+             "vectors": [v.tolist() for v in vectors],
+         }
+
+     # ------------------------------------------------------------------ #
+     # CSV
+     # ------------------------------------------------------------------ #
+     async def _process_csv(
+         self, file_path: Path, text_field: str = "description"
+     ) -> Dict[str, Any]:
+         rows, texts, metas = [], [], []
+         with file_path.open(newline="", encoding="utf-8") as f:
+             reader = csv.DictReader(f)
+             for row in reader:
+                 txt = row.get(text_field, "").strip()
+                 if not txt:
+                     continue
+                 texts.append(txt)
+                 metas.append({k: v for k, v in row.items() if k != text_field and v})
+
+         vectors = await asyncio.gather(*[self._encode_chunk_async(t) for t in texts])
+         return {
+             "content": None,
+             "metadata": {"source": str(file_path), "rows": len(texts), "type": "csv"},
+             "chunks": texts,
+             "vectors": [v.tolist() for v in vectors],
+             "csv_row_metadata": metas,
+         }
+
+     # ------------------------------------------------------------------ #
+     # Office docs (.doc/.docx/.pptx)
+     # ------------------------------------------------------------------ #
+     async def _process_office(self, file_path: Path) -> Dict[str, Any]:
+         loop = asyncio.get_event_loop()
+         if file_path.suffix.lower() in {".doc", ".docx"}:
+             text = await loop.run_in_executor(
+                 self._executor, self._read_docx, file_path
+             )
+         else:  # .pptx
+             text = await loop.run_in_executor(
+                 self._executor, self._read_pptx, file_path
+             )
+
+         chunks = self._chunk_text(text)
+         vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
+         return {
+             "content": text,
+             "metadata": {
+                 "source": str(file_path),
+                 "chunks": len(chunks),
+                 "type": "office",
+             },
+             "chunks": chunks,
+             "vectors": [v.tolist() for v in vectors],
+         }
+
+     # ------------------------------------------------------------------ #
+     # JSON
+     # ------------------------------------------------------------------ #
+     async def _process_json(self, file_path: Path) -> Dict[str, Any]:
+         text = await asyncio.get_event_loop().run_in_executor(
+             self._executor, self._read_json, file_path
+         )
+         chunks = self._chunk_text(text)
+         vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
+         return {
+             "content": text,
+             "metadata": {
+                 "source": str(file_path),
+                 "chunks": len(chunks),
+                 "type": "json",
+             },
+             "chunks": chunks,
+             "vectors": [v.tolist() for v in vectors],
+         }
+
+     # ------------------------------------------------------------------ #
+     # Shared helpers
+     # ------------------------------------------------------------------ #
+     async def _extract_text(self, file_path: Path) -> Union[
+         Tuple[List[Tuple[str, int, List[int]]], Dict[str, Any]],
+         Tuple[str, Dict[str, Any], List[int]],
+     ]:
+         loop = asyncio.get_event_loop()
+         if file_path.suffix.lower() == ".pdf":
+             return await loop.run_in_executor(
+                 self._executor, self._extract_pdf_text, file_path
+             )
+         else:
+             text = await loop.run_in_executor(
+                 self._executor, self._read_text_file, file_path
+             )
+             return text, {}, []
+
+     def _extract_pdf_text(self, file_path: Path):
+         page_chunks, meta = [], {}
+         with pdfplumber.open(file_path) as pdf:
+             meta.update(
+                 {
+                     "author": pdf.metadata.get("Author", ""),
+                     "title": pdf.metadata.get("Title", file_path.stem),
+                     "page_count": len(pdf.pages),
+                 }
+             )
+             for i, page in enumerate(pdf.pages, start=1):
+                 lines = page.extract_text_lines()
+                 sorted_lines = sorted(lines, key=lambda x: x["top"])
+                 txts, nums = [], []
+                 for ln_idx, L in enumerate(sorted_lines, start=1):
+                     t = L.get("text", "").strip()
+                     if t:
+                         txts.append(t)
+                         nums.append(ln_idx)
+                 if txts:
+                     page_chunks.append(("\n".join(txts), i, nums))
+         return page_chunks, meta
+
+     def _read_text_file(self, file_path: Path) -> str:
+         try:
+             return file_path.read_text(encoding="utf-8")
+         except UnicodeDecodeError:
+             return file_path.read_text(encoding="latin-1")
+
+     def _read_docx(self, path: Path) -> str:
+         doc = Document(path)
+         return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
+
+     def _read_pptx(self, path: Path) -> str:
+         prs = Presentation(path)
+         slides = []
+         for slide in prs.slides:
+             chunks = [sh.text for sh in slide.shapes if hasattr(sh, "text")]
+             slides.append("\n".join(filter(None, chunks)))
+         return "\n\n".join(slides)
+
+     def _read_json(self, path: Path) -> str:
+         obj = json.loads(path.read_text(encoding="utf-8"))
+         pretty = json.dumps(obj, indent=2, ensure_ascii=False)
+         return "\n".join(textwrap.wrap(pretty, width=120))
+
+     async def _encode_chunk_async(self, chunk: str) -> np.ndarray:
+         return await asyncio.get_event_loop().run_in_executor(
+             self._executor,
+             lambda: self.embedding_model.encode(
+                 [chunk],
+                 convert_to_numpy=True,
+                 truncate="model_max_length",
+                 normalize_embeddings=True,
+                 show_progress_bar=False,
+             )[0],
+         )
+
+     # ------------------------------------------------------------------ #
+     # Text chunking helpers
+     # ------------------------------------------------------------------ #
+     def _chunk_text(self, text: str) -> List[str]:
+         sentences = re.split(r"(?<=[\.!?])\s+", text)
+         chunks, buf, length = [], [], 0
+         for sent in sentences:
+             slen = len(sent) + 1
+             if length + slen <= self.chunk_size:
+                 buf.append(sent)
+                 length += slen
+             else:
+                 if buf:
+                     chunks.append(" ".join(buf))
+                     buf, length = [], 0
+                 while len(sent) > self.chunk_size:
+                     part, sent = sent[: self.chunk_size], sent[self.chunk_size :]
+                     chunks.append(part)
+                 buf, length = [sent], len(sent)
+         if buf:
+             chunks.append(" ".join(buf))
+         return chunks
+
+     def _split_oversized_chunk(self, chunk: str, tokens: List[str] = None) -> List[str]:
+         if tokens is None:
+             tokens = self.embedding_model.tokenizer.tokenize(chunk)
+         out = []
+         for i in range(0, len(tokens), self.effective_max_length):
+             seg = tokens[i : i + self.effective_max_length]
+             out.append(self.embedding_model.tokenizer.convert_tokens_to_string(seg))
+         return out
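
The new `file_processor.py` is self-contained, so its intended use can be sketched straight from the diff. Below is a minimal, hypothetical driver (the file name and printed fields are illustrative, not part of the release) exercising the validate → detect → dispatch flow of `process_file`:

```python
import asyncio

from projectdavid.clients.file_processor import FileProcessor


async def main() -> None:
    processor = FileProcessor(max_workers=4, chunk_size=512)

    # "report.pdf" is a placeholder; any supported extension works:
    # .pdf, .csv, .json, .doc/.docx/.pptx, or a recognised text/code type.
    result = await processor.process_file("report.pdf")

    # Every processor returns the same payload shape: content, metadata,
    # chunks, vectors (PDFs add line_data; CSVs add csv_row_metadata).
    print(result["metadata"])
    print(len(result["chunks"]), "chunks")
    print(len(result["vectors"][0]), "dimensions per embedding")


asyncio.run(main())
```

Note the chunk budget: `chunk_size` is character-based and capped at roughly four characters per available model token (`effective_max_length * 4`), with `_split_oversized_chunk` falling back to tokenizer-level splitting when a single line or sentence exceeds it.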