projectdavid-1.31.0-py3-none-any.whl → projectdavid-1.38.1-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
@@ -0,0 +1,462 @@
+import asyncio
+import csv
+import hashlib
+import json
+import math
+import re
+import textwrap
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+try:  # Python 3.11+
+    from typing import LiteralString
+except ImportError:  # 3.9–3.10
+    from typing_extensions import LiteralString
+
+import numpy as np
+import pdfplumber
+from docx import Document
+from PIL import Image
+from pptx import Presentation
+from projectdavid_common import UtilsInterface
+
+log = UtilsInterface.LoggingUtility()
+
+
+def latlon_to_unit_vec(lat: float, lon: float) -> List[float]:
+    """Convert geographic lat/lon (deg) to a 3-D unit vector for Qdrant."""
+    lat_r = math.radians(lat)
+    lon_r = math.radians(lon)
+    return [
+        math.cos(lat_r) * math.cos(lon_r),
+        math.cos(lat_r) * math.sin(lon_r),
+        math.sin(lat_r),
+    ]
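
Note: a quick sanity check of the conversion (coordinates are illustrative). The output is unit-norm by construction, so cosine similarity between stored vectors corresponds to angular proximity on the globe:

    vec = latlon_to_unit_vec(51.5074, -0.1278)  # roughly London
    assert abs(sum(c * c for c in vec) - 1.0) < 1e-9  # always unit length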
+
+
+class FileProcessor:
+    # ------------------------------------------------------------------ #
+    # Construction
+    # ------------------------------------------------------------------ #
+    def __init__(
+        self,
+        *,
+        max_workers: int = 4,
+        chunk_size: int = 512,
+        use_gpu: bool = True,
+        use_ocr: bool = True,
+        use_detection: bool = False,
+        image_model_name: str = "ViT-H-14",
+        caption_model_name: str = "Salesforce/blip2-flan-t5-xl",
+    ):
+        # Configuration
+        self._use_gpu = use_gpu
+        self._max_workers = max_workers
+        self._requested_chunk_size = chunk_size
+        self._image_model_name = image_model_name
+        self._executor = ThreadPoolExecutor(max_workers=max_workers)
+
+        # Lazy ML stack attributes
+        self._device = None
+        self._torch_dtype = None
+        self._embedding_model = None
+        self._clip_model = None
+        self._clip_preprocess = None
+        self._clip_tokenizer = None
+
+        # Lazy token limits
+        self._effective_max_length = None
+        self._chunk_size = None
+
+        log.info("Initialized multimodal lazy-loaded FileProcessor")
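
Note: construction is deliberately cheap; none of the heavy ML imports run until an embedding is first requested. A minimal sketch (argument values are illustrative):

    fp = FileProcessor(max_workers=2, use_gpu=False)
    # torch / open_clip / sentence-transformers load lazily on first encode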
+
+    def _ensure_ml_stack(self):
+        """Lazy-loads Torch, CLIP, and SentenceTransformers only when needed."""
+        if self._embedding_model is None:
+            try:
+                import open_clip
+                import torch
+                from sentence_transformers import SentenceTransformer
+
+                # 1. Setup device
+                if self._use_gpu and torch.cuda.is_available():
+                    self._device = torch.device("cuda")
+                    self._torch_dtype = torch.float16
+                else:
+                    self._device = torch.device("cpu")
+                    self._torch_dtype = torch.float32
+
+                # 2. Setup text embedder
+                self.embedding_model_name = "paraphrase-MiniLM-L6-v2"
+                self._embedding_model = SentenceTransformer(self.embedding_model_name)
+                self._embedding_model.to(str(self._device))
+
+                # 3. Setup CLIP
+                # Note: we use the provided image_model_name (default ViT-H-14)
+                self._clip_model, _, self._clip_preprocess = (
+                    open_clip.create_model_and_transforms(
+                        self._image_model_name, pretrained="laion2b_s32b_b79k"
+                    )
+                )
+                self._clip_model.to(self._device)
+                self._clip_tokenizer = open_clip.get_tokenizer(self._image_model_name)
+
+                # 4. Calculate limits (reserve 2 slots for the [CLS]/[SEP] specials)
+                max_seq_length = self._embedding_model.get_max_seq_length()
+                special_tokens_count = 2
+                self._effective_max_length = max_seq_length - special_tokens_count
+                self._chunk_size = min(
+                    self._requested_chunk_size, self._effective_max_length * 4
+                )
+
+                log.info("ML stack loaded (device=%s)", self._device)
+
+            except ImportError as e:
+                log.error("ML stack failed to load: %s", e)
+                raise ImportError(
+                    "This feature requires heavy ML binaries. "
+                    "Please install the vector stack: pip install projectdavid[vector]"
+                ) from e
+        return self._embedding_model
+
+    @property
+    def chunk_size(self):
+        if self._chunk_size is None:
+            self._ensure_ml_stack()
+        return self._chunk_size
+
+    @property
+    def effective_max_length(self):
+        if self._effective_max_length is None:
+            self._ensure_ml_stack()
+        return self._effective_max_length
+
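
Note: a worked example of the limit arithmetic in _ensure_ml_stack (step 4), assuming paraphrase-MiniLM-L6-v2 reports its usual max_seq_length of 128:

    effective_max_length = 128 - 2        # 126 tokens after the two specials
    chunk_size = min(512, 126 * 4)        # 504 chars, ~4 chars/token heuristic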
+    # ------------------------------------------------------------------ #
+    # Public Embedders
+    # ------------------------------------------------------------------ #
+    def encode_text(self, text: Union[str, List[str]]) -> np.ndarray:
+        model = self._ensure_ml_stack()
+        single = isinstance(text, str)
+        # always pass a batch: encode() unwraps bare strings itself, so
+        # indexing its output for a str input would return a scalar
+        out = model.encode(
+            [text] if single else text,
+            convert_to_numpy=True,
+            normalize_embeddings=True,
+            show_progress_bar=False,
+        )
+        return out[0] if single else out
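
Note: a usage sketch. A single string yields one 1-D vector and a list yields a 2-D batch; with MiniLM-L6 the dimensionality is 384:

    v = fp.encode_text("hello world")      # shape (384,)
    m = fp.encode_text(["a", "b"])         # shape (2, 384)
    print(float(np.dot(v, v)))             # ~1.0; embeddings are L2-normalized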
+
+    def encode_image(self, img: Image.Image) -> np.ndarray:
+        import torch
+
+        self._ensure_ml_stack()
+        with torch.no_grad():
+            tensor = self._clip_preprocess(img).unsqueeze(0).to(self._device)
+            feat = self._clip_model.encode_image(tensor).squeeze()
+            feat = feat / feat.norm()
+        return feat.float().cpu().numpy()
+
+    def encode_clip_text(self, text: Union[str, List[str]]) -> np.ndarray:
+        import torch
+
+        self._ensure_ml_stack()
+        with torch.no_grad():
+            # open_clip's tokenizer accepts either a str or a list of str
+            toks = self._clip_tokenizer(text)
+            tensor = toks.to(self._device)
+            feat = self._clip_model.encode_text(tensor).squeeze()
+            if feat.dim() > 1:  # batched input
+                feat = feat / feat.norm(dim=-1, keepdim=True)
+            else:
+                feat = feat / feat.norm()
+        return feat.float().cpu().numpy()
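
Note: both CLIP encoders project into a shared space, so image-text relevance is just a dot product of the normalized vectors. A sketch ("photo.jpg" and the caption are hypothetical):

    img_vec = fp.encode_image(Image.open("photo.jpg").convert("RGB"))
    txt_vec = fp.encode_clip_text("a photo of a dog")
    score = float(np.dot(img_vec, txt_vec))  # higher = better match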
+
+    # ------------------------------------------------------------------ #
+    # Generic validators / Type Detection
+    # ------------------------------------------------------------------ #
+    def validate_file(self, file_path: Path):
+        max_size = 100 * 1024 * 1024
+        if not file_path.exists():
+            raise FileNotFoundError(f"File not found: {file_path}")
+        if file_path.stat().st_size > max_size:
+            raise ValueError(f"{file_path.name} exceeds 100MB limit")
+
+    def _detect_file_type(self, file_path: Path) -> str:
+        suffix = file_path.suffix.lower()
+        if suffix == ".pdf":
+            return "pdf"
+        if suffix == ".csv":
+            return "csv"
+        if suffix == ".json":
+            return "json"
+        if suffix in {".doc", ".docx", ".pptx"}:
+            return "office"
+        if suffix in {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff"}:
+            return "image"
+
+        text_exts = {
+            ".txt",
+            ".md",
+            ".rst",
+            ".c",
+            ".cpp",
+            ".cs",
+            ".go",
+            ".java",
+            ".js",
+            ".ts",
+            ".py",
+            ".html",
+            ".css",
+        }
+        if suffix in text_exts:
+            return "text"
+        raise ValueError(f"Unsupported file type: {file_path.name}")
+
+    async def process_file(self, file_path: Union[str, Path]) -> Dict[str, Any]:
+        path = Path(file_path)
+        self.validate_file(path)
+        ftype = self._detect_file_type(path)
+        return await getattr(self, f"_process_{ftype}")(path)
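
Note: process_file is the single async entry point; it validates, sniffs the extension, and dispatches to the matching _process_* coroutine. A minimal driver ("report.pdf" is hypothetical):

    import asyncio

    async def main():
        result = await fp.process_file("report.pdf")
        print(result["metadata"]["type"], len(result["vectors"]))

    asyncio.run(main())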
+
+    # ------------------------------------------------------------------ #
+    # Processors (PDF, Text, CSV, Office, JSON, Image)
+    # ------------------------------------------------------------------ #
+    async def _process_pdf(self, file_path: Path) -> Dict[str, Any]:
+        page_chunks, doc_meta = await self._extract_text(file_path)
+        all_chunks, line_data = [], []
+        for page_text, page_num, line_nums in page_chunks:
+            lines = page_text.split("\n")
+            buf, buf_lines, length = [], [], 0
+            for line, ln in zip(lines, line_nums):
+                l = len(line) + 1
+                if length + l <= self.chunk_size:
+                    buf.append(line)
+                    buf_lines.append(ln)
+                    length += l
+                else:
+                    if buf:
+                        all_chunks.append("\n".join(buf))
+                        line_data.append({"page": page_num, "lines": buf_lines})
+                    if l <= self.chunk_size:
+                        # the line merely overflowed the buffer: seed the next one
+                        buf, buf_lines, length = [line], [ln], l
+                    else:
+                        # genuinely oversized line: hard-split by tokens
+                        buf, buf_lines, length = [], [], 0
+                        for piece in self._split_oversized_chunk(line):
+                            all_chunks.append(piece)
+                            line_data.append({"page": page_num, "lines": [ln]})
+            if buf:
+                all_chunks.append("\n".join(buf))
+                line_data.append({"page": page_num, "lines": buf_lines})
+
+        vectors = await asyncio.gather(
+            *[self._encode_chunk_async(c) for c in all_chunks]
+        )
+        return {
+            "content": "\n\n".join(all_chunks),
+            "metadata": {
+                **doc_meta,
+                "source": str(file_path),
+                "chunks": len(all_chunks),
+                "type": "pdf",
+            },
+            "chunks": all_chunks,
+            "vectors": [v.tolist() for v in vectors],
+            "line_data": line_data,
+        }
+
+    async def _process_text(self, file_path: Path) -> Dict[str, Any]:
+        text, extra_meta, _ = await self._extract_text(file_path)
+        chunks = self._chunk_text(text)
+        vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
+        return {
+            "content": text,
+            "metadata": {
+                **extra_meta,
+                "source": str(file_path),
+                "chunks": len(chunks),
+                "type": "text",
+            },
+            "chunks": chunks,
+            "vectors": [v.tolist() for v in vectors],
+        }
+
+    async def _process_csv(
+        self, file_path: Path, text_field: str = "description"
+    ) -> Dict[str, Any]:
+        texts, metas = [], []
+        with file_path.open(newline="", encoding="utf-8") as f:
+            reader = csv.DictReader(f)
+            for row in reader:
+                # tolerate short rows, where DictReader yields None values
+                txt = (row.get(text_field) or "").strip()
+                if not txt:
+                    continue
+                texts.append(txt)
+                metas.append({k: v for k, v in row.items() if k != text_field and v})
+        vectors = await asyncio.gather(*[self._encode_chunk_async(t) for t in texts])
+        return {
+            "content": None,
+            "metadata": {"source": str(file_path), "rows": len(texts), "type": "csv"},
+            "chunks": texts,
+            "vectors": [v.tolist() for v in vectors],
+            "csv_row_metadata": metas,
+        }
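
Note: only the text_field column (default "description") is embedded; the remaining non-empty cells ride along as per-row payload. A sketch of the mapping (row values are illustrative):

    # row: {"id": "7", "description": "red chair", "color": ""}
    # -> chunk "red chair", csv_row_metadata {"id": "7"} (empty "color" dropped)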
+
+    async def _process_office(self, file_path: Path) -> Dict[str, Any]:
+        loop = asyncio.get_running_loop()
+        # NB: python-docx only parses .docx; a legacy binary .doc will raise here
+        method = (
+            self._read_docx
+            if file_path.suffix.lower() in {".doc", ".docx"}
+            else self._read_pptx
+        )
+        text = await loop.run_in_executor(self._executor, method, file_path)
+        chunks = self._chunk_text(text)
+        vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
+        return {
+            "content": text,
+            "metadata": {
+                "source": str(file_path),
+                "chunks": len(chunks),
+                "type": "office",
+            },
+            "chunks": chunks,
+            "vectors": [v.tolist() for v in vectors],
+        }
+
+    async def _process_json(self, file_path: Path) -> Dict[str, Any]:
+        text = await asyncio.get_running_loop().run_in_executor(
+            self._executor, self._read_json, file_path
+        )
+        chunks = self._chunk_text(text)
+        vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
+        return {
+            "content": text,
+            "metadata": {
+                "source": str(file_path),
+                "chunks": len(chunks),
+                "type": "json",
+            },
+            "chunks": chunks,
+            "vectors": [v.tolist() for v in vectors],
+        }
+
+    async def _process_image(self, file_path: Path) -> Dict[str, Any]:
+        """Handles image embedding via CLIP."""
+        img = Image.open(file_path).convert("RGB")
+        vector = await asyncio.get_running_loop().run_in_executor(
+            self._executor, self.encode_image, img
+        )
+        return {
+            "content": None,
+            "metadata": {"source": str(file_path), "type": "image"},
+            "chunks": [],
+            "vectors": [vector.tolist()],
+        }
+
+    # ------------------------------------------------------------------ #
+    # Extraction/Read Helpers
+    # ------------------------------------------------------------------ #
+    async def _extract_text(self, file_path: Path):
+        # NB: the PDF branch returns (page_chunks, meta); all other branches
+        # return (text, meta, line_nums)
+        loop = asyncio.get_running_loop()
+        if file_path.suffix.lower() == ".pdf":
+            return await loop.run_in_executor(
+                self._executor, self._extract_pdf_text, file_path
+            )
+        text = await loop.run_in_executor(
+            self._executor, self._read_text_file, file_path
+        )
+        return text, {}, []
+
+    def _extract_pdf_text(self, file_path: Path):
+        page_chunks, meta = [], {}
+        with pdfplumber.open(file_path) as pdf:
+            meta.update(
+                {
+                    "author": pdf.metadata.get("Author", ""),
+                    "title": pdf.metadata.get("Title", file_path.stem),
+                    "page_count": len(pdf.pages),
+                }
+            )
+            for i, page in enumerate(pdf.pages, start=1):
+                lines = page.extract_text_lines()
+                sorted_lines = sorted(lines, key=lambda x: x["top"])
+                txts, nums = [], []
+                for ln_idx, line in enumerate(sorted_lines, start=1):
+                    t = line.get("text", "").strip()
+                    if t:
+                        txts.append(t)
+                        nums.append(ln_idx)
+                if txts:
+                    page_chunks.append(("\n".join(txts), i, nums))
+        return page_chunks, meta
+
+    def _read_text_file(self, file_path: Path) -> str:
+        try:
+            return file_path.read_text(encoding="utf-8")
+        except UnicodeDecodeError:
+            return file_path.read_text(encoding="latin-1")
+
+    def _read_docx(self, path: Path) -> str:
+        return "\n".join(p.text for p in Document(path).paragraphs if p.text.strip())
+
+    def _read_pptx(self, path: Path) -> str:
+        slides = []
+        for slide in Presentation(path).slides:
+            slides.append(
+                "\n".join(
+                    filter(
+                        None, [sh.text for sh in slide.shapes if hasattr(sh, "text")]
+                    )
+                )
+            )
+        return "\n\n".join(slides)
+
+    def _read_json(self, path: Path) -> str:
+        obj = json.loads(path.read_text(encoding="utf-8"))
+        return "\n".join(
+            textwrap.wrap(json.dumps(obj, indent=2, ensure_ascii=False), width=120)
+        )
+
+    async def _encode_chunk_async(self, chunk: str) -> np.ndarray:
+        model = self._ensure_ml_stack()
+        return await asyncio.get_running_loop().run_in_executor(
+            self._executor,
+            lambda: model.encode(
+                [chunk],
+                convert_to_numpy=True,
+                normalize_embeddings=True,
+                show_progress_bar=False,
+            )[0],
+        )
+
+    # ------------------------------------------------------------------ #
+    # Chunking Logic
+    # ------------------------------------------------------------------ #
+    def _chunk_text(self, text: str) -> List[str]:
+        sentences = re.split(r"(?<=[\.!?])\s+", text)
+        chunks, buf, length = [], [], 0
+        for sent in sentences:
+            slen = len(sent) + 1
+            if length + slen <= self.chunk_size:
+                buf.append(sent)
+                length += slen
+            else:
+                if buf:
+                    chunks.append(" ".join(buf))
+                while len(sent) > self.chunk_size:
+                    part, sent = sent[: self.chunk_size], sent[self.chunk_size :]
+                    chunks.append(part)
+                buf, length = [sent], len(sent)
+        if buf:
+            chunks.append(" ".join(buf))
+        return chunks
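
Note: _chunk_text packs whole sentences greedily up to chunk_size characters and only hard-slices a sentence that is itself oversized. A trace with an illustrative chunk_size of 16:

    # "Hi there. How are you? Fine."
    # -> ["Hi there.", "How are you?", "Fine."]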
+
+    def _split_oversized_chunk(
+        self, chunk: str, tokens: Optional[List[str]] = None
+    ) -> List[str]:
+        model = self._ensure_ml_stack()
+        if tokens is None:
+            tokens = model.tokenizer.tokenize(chunk)
+        out = []
+        for i in range(0, len(tokens), self.effective_max_length):
+            seg = tokens[i : i + self.effective_max_length]
+            out.append(model.tokenizer.convert_tokens_to_string(seg))
+        return out
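
Note: a sketch of the token-level splitter above, assuming the 126-token window from the limit math earlier; detokenized pieces can never exceed the encoder's sequence limit:

    pieces = fp._split_oversized_chunk("word " * 400)  # ~400 wordpiece tokens
    # -> roughly four strings of <= 126 tokens each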