projectdavid-1.31.0-py3-none-any.whl → projectdavid-1.38.1-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- projectdavid/clients/assistants_client.py +7 -13
- projectdavid/clients/file_processor.py +102 -107
- projectdavid/clients/messages_client.py +24 -39
- projectdavid/clients/runs.py +156 -211
- projectdavid/clients/synchronous_inference_wrapper.py +52 -24
- projectdavid/clients/threads_client.py +32 -12
- projectdavid/clients/vector_store_manager.py +110 -21
- projectdavid/clients/vectors.py +47 -30
- projectdavid/clients/vision-file_processor.py +462 -0
- projectdavid/clients/vision_vectors.py +1058 -0
- projectdavid/decorators.py +64 -0
- projectdavid/entity.py +24 -5
- projectdavid/synthesis/reranker.py +4 -2
- projectdavid/utils/function_call_suppressor.py +40 -0
- {projectdavid-1.31.0.dist-info → projectdavid-1.38.1.dist-info}/METADATA +6 -7
- {projectdavid-1.31.0.dist-info → projectdavid-1.38.1.dist-info}/RECORD +19 -15
- {projectdavid-1.31.0.dist-info → projectdavid-1.38.1.dist-info}/WHEEL +1 -1
- {projectdavid-1.31.0.dist-info → projectdavid-1.38.1.dist-info}/licenses/LICENSE +0 -0
- {projectdavid-1.31.0.dist-info → projectdavid-1.38.1.dist-info}/top_level.txt +0 -0
projectdavid/clients/vision-file_processor.py (new file)
@@ -0,0 +1,462 @@
import asyncio
import csv
import hashlib
import json
import math
import re
import textwrap
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Any, Dict, List, Tuple, Union

try:  # Python 3.11+
    from typing import LiteralString
except ImportError:  # 3.9–3.10
    from typing_extensions import LiteralString

import numpy as np
import pdfplumber
from docx import Document
from PIL import Image
from pptx import Presentation
from projectdavid_common import UtilsInterface

log = UtilsInterface.LoggingUtility()


def latlon_to_unit_vec(lat: float, lon: float) -> List[float]:
    """Convert geographic lat/lon (deg) to a 3-D unit vector for Qdrant."""
    lat_r = math.radians(lat)
    lon_r = math.radians(lon)
    return [
        math.cos(lat_r) * math.cos(lon_r),
        math.cos(lat_r) * math.sin(lon_r),
        math.sin(lat_r),
    ]


class FileProcessor:
    # ------------------------------------------------------------------ #
    # Construction
    # ------------------------------------------------------------------ #
    def __init__(
        self,
        *,
        max_workers: int = 4,
        chunk_size: int = 512,
        use_gpu: bool = True,
        use_ocr: bool = True,
        use_detection: bool = False,
        image_model_name: str = "ViT-H-14",
        caption_model_name: str = "Salesforce/blip2-flan-t5-xl",
    ):
        # Configuration
        self._use_gpu = use_gpu
        self._max_workers = max_workers
        self._requested_chunk_size = chunk_size
        self._image_model_name = image_model_name
        self._executor = ThreadPoolExecutor(max_workers=max_workers)

        # Lazy ML Stack Attributes
        self._device = None
        self._torch_dtype = None
        self._embedding_model = None
        self._clip_model = None
        self._clip_preprocess = None
        self._clip_tokenizer = None

        # Lazy Token Limits
        self._effective_max_length = None
        self._chunk_size = None

        log.info("Initialized Multimodal Lazy-Loaded FileProcessor")

    def _ensure_ml_stack(self):
        """Lazy-loads Torch, CLIP, and SentenceTransformers only when needed."""
        if self._embedding_model is None:
            try:
                import open_clip
                import torch
                from sentence_transformers import SentenceTransformer

                # 1. Setup Device
                if self._use_gpu and torch.cuda.is_available():
                    self._device = torch.device("cuda")
                    self._torch_dtype = torch.float16
                else:
                    self._device = torch.device("cpu")
                    self._torch_dtype = torch.float32

                # 2. Setup Text Embedder
                self.embedding_model_name = "paraphrase-MiniLM-L6-v2"
                self._embedding_model = SentenceTransformer(self.embedding_model_name)
                self._embedding_model.to(str(self._device))

                # 3. Setup CLIP
                # Note: We use the provided image_model_name (default ViT-H-14)
                self._clip_model, _, self._clip_preprocess = (
                    open_clip.create_model_and_transforms(
                        self._image_model_name, pretrained="laion2b_s32b_b79k"
                    )
                )
                self._clip_model.to(self._device)
                self._clip_tokenizer = open_clip.get_tokenizer(self._image_model_name)

                # 4. Calculate limits
                max_seq_length = self._embedding_model.get_max_seq_length()
                special_tokens_count = 2
                self._effective_max_length = max_seq_length - special_tokens_count
                self._chunk_size = min(
                    self._requested_chunk_size, self._effective_max_length * 4
                )

                log.info("ML Stack loaded (device=%s)", self._device)

            except ImportError as e:
                log.error(f"ML Stack failed to load: {e}")
                raise ImportError(
                    "This feature requires heavy ML binaries. "
                    "Please install the vector stack: pip install projectdavid[vector]"
                )
        return self._embedding_model

    @property
    def chunk_size(self):
        if self._chunk_size is None:
            self._ensure_ml_stack()
        return self._chunk_size

    @property
    def effective_max_length(self):
        if self._effective_max_length is None:
            self._ensure_ml_stack()
        return self._effective_max_length

    # ------------------------------------------------------------------ #
    # Public Embedders
    # ------------------------------------------------------------------ #
    def encode_text(self, text: Union[str, List[str]]) -> np.ndarray:
        model = self._ensure_ml_stack()
        single = isinstance(text, str)
        out = model.encode(
            text,
            convert_to_numpy=True,
            normalize_embeddings=True,
            show_progress_bar=False,
        )
        return out if not single else out[0]

    def encode_image(self, img: Image.Image) -> np.ndarray:
        import torch

        self._ensure_ml_stack()
        with torch.no_grad():
            tensor = self._clip_preprocess(img).unsqueeze(0).to(self._device)
            feat = self._clip_model.encode_image(tensor).squeeze()
            feat = feat / feat.norm()
        return feat.float().cpu().numpy()

    def encode_clip_text(self, text: Union[str, List[str]]) -> np.ndarray:
        import torch

        self._ensure_ml_stack()
        with torch.no_grad():
            toks = (
                self._clip_tokenizer(text)
                if isinstance(text, str)
                else self._clip_tokenizer(text)  # Adjusted for clip tokenizer behavior
            )
            tensor = toks.to(self._device)
            feat = self._clip_model.encode_text(tensor).squeeze()
            if feat.dim() > 1:  # Handle batch
                feat = feat / feat.norm(dim=-1, keepdim=True)
            else:
                feat = feat / feat.norm()
        return feat.float().cpu().numpy()

    # ------------------------------------------------------------------ #
    # Generic validators / Type Detection
    # ------------------------------------------------------------------ #
    def validate_file(self, file_path: Path):
        max_size = 100 * 1024 * 1024
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")
        if file_path.stat().st_size > max_size:
            raise ValueError(f"{file_path.name} exceeds 100MB limit")

    def _detect_file_type(self, file_path: Path) -> str:
        suffix = file_path.suffix.lower()
        if suffix == ".pdf":
            return "pdf"
        if suffix == ".csv":
            return "csv"
        if suffix == ".json":
            return "json"
        if suffix in {".doc", ".docx", ".pptx"}:
            return "office"
        if suffix in {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff"}:
            return "image"

        text_exts = {
            ".txt",
            ".md",
            ".rst",
            ".c",
            ".cpp",
            ".cs",
            ".go",
            ".java",
            ".js",
            ".ts",
            ".py",
            ".html",
            ".css",
        }
        if suffix in text_exts:
            return "text"
        raise ValueError(f"Unsupported file type: {file_path.name}")

    async def process_file(self, file_path: Union[str, Path]) -> Dict[str, Any]:
        path = Path(file_path)
        self.validate_file(path)
        ftype = self._detect_file_type(path)
        return await getattr(self, f"_process_{ftype}")(path)

    # ------------------------------------------------------------------ #
    # Processors (PDF, Text, CSV, Office, JSON, Image)
    # ------------------------------------------------------------------ #
    async def _process_pdf(self, file_path: Path) -> Dict[str, Any]:
        page_chunks, doc_meta = await self._extract_text(file_path)
        all_chunks, line_data = [], []
        for page_text, page_num, line_nums in page_chunks:
            lines = page_text.split("\n")
            buf, buf_lines, length = [], [], 0
            for line, ln in zip(lines, line_nums):
                l = len(line) + 1
                if length + l <= self.chunk_size:
                    buf.append(line)
                    buf_lines.append(ln)
                    length += l
                else:
                    if buf:
                        all_chunks.append("\n".join(buf))
                        line_data.append({"page": page_num, "lines": buf_lines})
                        buf, buf_lines, length = [], [], 0
                    for piece in self._split_oversized_chunk(line):
                        all_chunks.append(piece)
                        line_data.append({"page": page_num, "lines": [ln]})
            if buf:
                all_chunks.append("\n".join(buf))
                line_data.append({"page": page_num, "lines": buf_lines})

        vectors = await asyncio.gather(
            *[self._encode_chunk_async(c) for c in all_chunks]
        )
        return {
            "content": "\n\n".join(all_chunks),
            "metadata": {
                **doc_meta,
                "source": str(file_path),
                "chunks": len(all_chunks),
                "type": "pdf",
            },
            "chunks": all_chunks,
            "vectors": [v.tolist() for v in vectors],
            "line_data": line_data,
        }

    async def _process_text(self, file_path: Path) -> Dict[str, Any]:
        text, extra_meta, _ = await self._extract_text(file_path)
        chunks = self._chunk_text(text)
        vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
        return {
            "content": text,
            "metadata": {
                **extra_meta,
                "source": str(file_path),
                "chunks": len(chunks),
                "type": "text",
            },
            "chunks": chunks,
            "vectors": [v.tolist() for v in vectors],
        }

    async def _process_csv(
        self, file_path: Path, text_field: str = "description"
    ) -> Dict[str, Any]:
        texts, metas = [], []
        with file_path.open(newline="", encoding="utf-8") as f:
            reader = csv.DictReader(f)
            for row in reader:
                txt = row.get(text_field, "").strip()
                if not txt:
                    continue
                texts.append(txt)
                metas.append({k: v for k, v in row.items() if k != text_field and v})
        vectors = await asyncio.gather(*[self._encode_chunk_async(t) for t in texts])
        return {
            "content": None,
            "metadata": {"source": str(file_path), "rows": len(texts), "type": "csv"},
            "chunks": texts,
            "vectors": [v.tolist() for v in vectors],
            "csv_row_metadata": metas,
        }

    async def _process_office(self, file_path: Path) -> Dict[str, Any]:
        loop = asyncio.get_event_loop()
        method = (
            self._read_docx
            if file_path.suffix.lower() in {".doc", ".docx"}
            else self._read_pptx
        )
        text = await loop.run_in_executor(self._executor, method, file_path)
        chunks = self._chunk_text(text)
        vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
        return {
            "content": text,
            "metadata": {
                "source": str(file_path),
                "chunks": len(chunks),
                "type": "office",
            },
            "chunks": chunks,
            "vectors": [v.tolist() for v in vectors],
        }

    async def _process_json(self, file_path: Path) -> Dict[str, Any]:
        text = await asyncio.get_event_loop().run_in_executor(
            self._executor, self._read_json, file_path
        )
        chunks = self._chunk_text(text)
        vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
        return {
            "content": text,
            "metadata": {
                "source": str(file_path),
                "chunks": len(chunks),
                "type": "json",
            },
            "chunks": chunks,
            "vectors": [v.tolist() for v in vectors],
        }

    async def _process_image(self, file_path: Path) -> Dict[str, Any]:
        """Handles image embedding via CLIP."""
        img = Image.open(file_path).convert("RGB")
        vector = await asyncio.get_event_loop().run_in_executor(
            self._executor, self.encode_image, img
        )
        return {
            "content": None,
            "metadata": {"source": str(file_path), "type": "image"},
            "chunks": [],
            "vectors": [vector.tolist()],
        }

    # ------------------------------------------------------------------ #
    # Extraction/Read Helpers
    # ------------------------------------------------------------------ #
    async def _extract_text(self, file_path: Path):
        loop = asyncio.get_event_loop()
        if file_path.suffix.lower() == ".pdf":
            return await loop.run_in_executor(
                self._executor, self._extract_pdf_text, file_path
            )
        text = await loop.run_in_executor(
            self._executor, self._read_text_file, file_path
        )
        return text, {}, []

    def _extract_pdf_text(self, file_path: Path):
        page_chunks, meta = [], {}
        with pdfplumber.open(file_path) as pdf:
            meta.update(
                {
                    "author": pdf.metadata.get("Author", ""),
                    "title": pdf.metadata.get("Title", file_path.stem),
                    "page_count": len(pdf.pages),
                }
            )
            for i, page in enumerate(pdf.pages, start=1):
                lines = page.extract_text_lines()
                sorted_lines = sorted(lines, key=lambda x: x["top"])
                txts, nums = [], []
                for ln_idx, line in enumerate(sorted_lines, start=1):
                    t = line.get("text", "").strip()
                    if t:
                        txts.append(t)
                        nums.append(ln_idx)
                if txts:
                    page_chunks.append(("\n".join(txts), i, nums))
        return page_chunks, meta

    def _read_text_file(self, file_path: Path) -> str:
        try:
            return file_path.read_text(encoding="utf-8")
        except UnicodeDecodeError:
            return file_path.read_text(encoding="latin-1")

    def _read_docx(self, path: Path) -> str:
        return "\n".join(p.text for p in Document(path).paragraphs if p.text.strip())

    def _read_pptx(self, path: Path) -> str:
        slides = []
        for slide in Presentation(path).slides:
            slides.append(
                "\n".join(
                    filter(
                        None, [sh.text for sh in slide.shapes if hasattr(sh, "text")]
                    )
                )
            )
        return "\n\n".join(slides)

    def _read_json(self, path: Path) -> str:
        obj = json.loads(path.read_text(encoding="utf-8"))
        return "\n".join(
            textwrap.wrap(json.dumps(obj, indent=2, ensure_ascii=False), width=120)
        )

    async def _encode_chunk_async(self, chunk: str) -> np.ndarray:
        model = self._ensure_ml_stack()
        return await asyncio.get_event_loop().run_in_executor(
            self._executor,
            lambda: model.encode(
                [chunk],
                convert_to_numpy=True,
                normalize_embeddings=True,
                show_progress_bar=False,
            )[0],
        )

    # ------------------------------------------------------------------ #
    # Chunking Logic
    # ------------------------------------------------------------------ #
    def _chunk_text(self, text: str) -> List[str]:
        sentences = re.split(r"(?<=[\.!?])\s+", text)
        chunks, buf, length = [], [], 0
        for sent in sentences:
            slen = len(sent) + 1
            if length + slen <= self.chunk_size:
                buf.append(sent)
                length += slen
            else:
                if buf:
                    chunks.append(" ".join(buf))
                while len(sent) > self.chunk_size:
                    part, sent = sent[: self.chunk_size], sent[self.chunk_size :]
                    chunks.append(part)
                buf, length = [sent], len(sent)
        if buf:
            chunks.append(" ".join(buf))
        return chunks

    def _split_oversized_chunk(self, chunk: str, tokens: List[str] = None) -> List[str]:
        model = self._ensure_ml_stack()
        if tokens is None:
            tokens = model.tokenizer.tokenize(chunk)
        out = []
        for i in range(0, len(tokens), self.effective_max_length):
            seg = tokens[i : i + self.effective_max_length]
            out.append(model.tokenizer.convert_tokens_to_string(seg))
        return out
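
For orientation, here is a minimal usage sketch of the FileProcessor added above. It is a sketch only: it assumes the optional ML stack is installed (pip install projectdavid[vector]), that the class defined in this file is importable in your environment, and the file paths are hypothetical.

    import asyncio

    async def main() -> None:
        # use_gpu=False keeps the sketch runnable on CPU-only machines
        processor = FileProcessor(chunk_size=512, use_gpu=False)

        # Text-like inputs (PDF, Office, JSON, CSV, plain text) are chunked and
        # embedded with the SentenceTransformer model loaded lazily on first use.
        doc = await processor.process_file("report.pdf")  # hypothetical path
        print(doc["metadata"]["chunks"], len(doc["vectors"]))

        # Image inputs skip chunking and return a single CLIP embedding.
        img = await processor.process_file("diagram.png")  # hypothetical path
        print(len(img["vectors"][0]))

    asyncio.run(main())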
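
The latlon_to_unit_vec helper places each lat/lon pair on the unit sphere, so the dot product of two such vectors is the cosine of the central angle between the locations; cosine similarity in Qdrant therefore ranks points by great-circle proximity. A quick check of that property, using the function defined above with illustrative coordinates (roughly London and Paris) and a mean Earth radius of 6371 km:

    import math

    a = latlon_to_unit_vec(51.5074, -0.1278)  # illustrative: London
    b = latlon_to_unit_vec(48.8566, 2.3522)   # illustrative: Paris

    cos_angle = sum(x * y for x, y in zip(a, b))
    distance_km = math.acos(max(-1.0, min(1.0, cos_angle))) * 6371
    print(round(distance_km))  # roughly 340 km, the approximate great-circle distance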