projectdavid-1.33.12-py3-none-any.whl → projectdavid-1.33.14-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of projectdavid might be problematic.

projectdavid/clients/file_processor.py
@@ -1,8 +1,6 @@
  import asyncio
  import csv
- import hashlib
  import json
- import math
  import re
  import textwrap
  from concurrent.futures import ThreadPoolExecutor
@@ -15,124 +13,34 @@ except ImportError:  # 3.9–3.10
      from typing_extensions import LiteralString

  import numpy as np
- import open_clip
  import pdfplumber
- import torch
  from docx import Document
- from PIL import Image
  from pptx import Presentation
- from transformers import Blip2ForConditionalGeneration, Blip2Processor
- from ultralytics import YOLO
-
- # OCR fallback – optional
- try:
-     import pytesseract  # noqa: F401  # pylint: disable=unused-import
- except ImportError:
-     pytesseract = None
-
  from projectdavid_common import UtilsInterface
  from sentence_transformers import SentenceTransformer

  log = UtilsInterface.LoggingUtility()


- def latlon_to_unit_vec(lat: float, lon: float) -> List[float]:
-     """Convert geographic lat/lon (deg) to a 3-D unit vector for Qdrant."""
-     lat_r = math.radians(lat)
-     lon_r = math.radians(lon)
-     return [
-         math.cos(lat_r) * math.cos(lon_r),
-         math.cos(lat_r) * math.sin(lon_r),
-         math.sin(lat_r),
-     ]
-
-
  class FileProcessor:
-     """Unified processor for text, tabular, office, JSON, **and image** files.
-
-     Each modality is embedded with its optimal model:
-       • Text    → paraphrase‑MiniLM‑L6‑v2 (384‑D)
-       • Image   → OpenCLIP ViT‑H/14 (1024‑D)
-       • Caption→ OpenCLIP text head (1024‑D)
-
-     Rich captions are generated via BLIP‑2 Flan‑T5‑XL.
-     GPU usage is optional; pass `use_gpu=False` to stay on CPU.
-     """
-
      # ------------------------------------------------------------------ #
      # Construction
      # ------------------------------------------------------------------ #
-     def __init__(
-         self,
-         *,
-         max_workers: int = 4,
-         chunk_size: int = 512,
-         use_gpu: bool = True,
-         use_ocr: bool = True,
-         use_detection: bool = False,
-         image_model_name: str = "ViT-H-14",
-         caption_model_name: str = "Salesforce/blip2-flan-t5-xl",
-     ):
-         # Device selection
-         if use_gpu and torch.cuda.is_available():
-             self.device = torch.device("cuda")
-             self.torch_dtype = torch.float16
-         else:
-             self.device = torch.device("cpu")
-             self.torch_dtype = torch.float32
-
-         # Feature flags
-         self.use_ocr = use_ocr and pytesseract is not None
-         self.use_detection = use_detection
-         if use_ocr and pytesseract is None:
-             log.warning("OCR requested but pytesseract not installed – skipping.")
-         if self.use_detection:
-             self.detector = YOLO("yolov8x.pt").to(self.device)
-
-         # Text embedder
+     def __init__(self, max_workers: int = 4, chunk_size: int = 512):
+         self.embedding_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
          self.embedding_model_name = "paraphrase-MiniLM-L6-v2"
-         self.embedding_model = SentenceTransformer(self.embedding_model_name)
-         self.embedding_model.to(str(self.device))
+         self._executor = ThreadPoolExecutor(max_workers=max_workers)

-         # Chunking parameters
+         # token limits
          self.max_seq_length = self.embedding_model.get_max_seq_length()
          self.special_tokens_count = 2
          self.effective_max_length = self.max_seq_length - self.special_tokens_count
          self.chunk_size = min(chunk_size, self.effective_max_length * 4)

-         # Image embedder
-         self.clip_model, _, self.clip_preprocess = (
-             open_clip.create_model_and_transforms(
-                 image_model_name,
-                 pretrained="laion2b_s32b_b79k",
-                 precision="fp16" if self.device.type == "cuda" else "fp32",
-             )
-         )
-         self.clip_model = self.clip_model.to(self.device).eval()
-         self.clip_tokenizer = open_clip.get_tokenizer(image_model_name)
-
-         # Caption generator
-         self.blip_processor = Blip2Processor.from_pretrained(caption_model_name)
-         self.blip_model = (
-             Blip2ForConditionalGeneration.from_pretrained(
-                 caption_model_name,
-                 torch_dtype=self.torch_dtype,
-             )
-             .to(self.device)
-             .eval()
-         )
-
-         # Executor & logging
-         self._executor = ThreadPoolExecutor(max_workers=max_workers)
-         log.info(
-             "FileProcessor ready (device=%s, OCR=%s, detection=%s)",
-             self.device,
-             self.use_ocr,
-             self.use_detection,
-         )
+         log.info("Initialized optimized FileProcessor")

      # ------------------------------------------------------------------ #
-     # Generic validators *
+     # Generic validators
      # ------------------------------------------------------------------ #
      def validate_file(self, file_path: Path):
          """Ensure file exists and is under 100 MB."""
@@ -144,10 +52,20 @@ class FileProcessor:
          raise ValueError(f"{file_path.name} > {mb} MB limit")

      # ------------------------------------------------------------------ #
-     # Filetype detection (extension‑based – no libmagic)
+     # File-type detection (simple extension map – NO libmagic)
      # ------------------------------------------------------------------ #
      def _detect_file_type(self, file_path: Path) -> str:
+         """
+         Return one of:
+
+           • 'pdf'    • 'csv'    • 'json'
+           • 'office' (.doc/.docx/.pptx)
+           • 'text'   (code / markup / plain text)
+
+         Raises *ValueError* if the extension is not recognised.
+         """
          suffix = file_path.suffix.lower()
+
          if suffix == ".pdf":
              return "pdf"
          if suffix == ".csv":
@@ -156,8 +74,7 @@ class FileProcessor:
              return "json"
          if suffix in {".doc", ".docx", ".pptx"}:
              return "office"
-         if suffix in {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff"}:
-             return "image"
+
          text_exts = {
              ".txt",
              ".md",
@@ -179,100 +96,29 @@ class FileProcessor:
          }
          if suffix in text_exts:
              return "text"
+
          raise ValueError(f"Unsupported file type: {file_path.name} (ext={suffix})")

      # ------------------------------------------------------------------ #
-     # Dispatcher
+     # Public entry-point
      # ------------------------------------------------------------------ #
      async def process_file(self, file_path: Union[str, Path]) -> Dict[str, Any]:
-         path = Path(file_path)
-         self.validate_file(path)
-         ftype = self._detect_file_type(path)
-         return await getattr(self, f"_process_{ftype}")(path)
-
-     # ------------------------------------------------------------------ #
-     # Image processing (OpenCLIP + BLIP-2 + OCR + YOLO)
-     # ------------------------------------------------------------------ #
-     async def _process_image(self, file_path: Path) -> Dict[str, Any]:
-         loop = asyncio.get_event_loop()
-         img = await loop.run_in_executor(self._executor, Image.open, file_path)
-
-         # 1) Image vector
-         def enc_img():
-             with torch.no_grad():
-                 t = self.clip_preprocess(img).unsqueeze(0).to(self.device)
-                 v = self.clip_model.encode_image(t).squeeze()
-                 return (v / v.norm()).float().cpu().numpy()
-
-         image_vec = await loop.run_in_executor(self._executor, enc_img)
-
-         # 2) Caption
-         def gen_cap():
-             inp = self.blip_processor(images=img, return_tensors="pt").to(self.device)
-             with torch.no_grad():
-                 ids = self.blip_model.generate(**inp, max_new_tokens=50)
-             return self.blip_processor.decode(ids[0], skip_special_tokens=True)
-
-         caption = await loop.run_in_executor(self._executor, gen_cap)
-
-         # 3) OCR
-         if self.use_ocr:
-             text = await loop.run_in_executor(
-                 self._executor, pytesseract.image_to_string, img
-             )
-             if t := text.strip():
-                 caption += "\n" + t
-
-         # 4) Caption vector
-         def enc_txt():
-             with torch.no_grad():
-                 tok = self.clip_tokenizer(caption).unsqueeze(0).to(self.device)
-                 v = self.clip_model.encode_text(tok).squeeze()
-                 return (v / v.norm()).float().cpu().numpy()
-
-         caption_vec = await loop.run_in_executor(self._executor, enc_txt)
-
-         # 5) YOLO regions
-         region_vectors = []
-         if self.use_detection:
-             dets = self.detector(img)[0]
-             for box in dets.boxes:
-                 x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().tolist())
-                 crop = img.crop((x1, y1, x2, y2))
-                 vec = self.encode_image(crop)
-                 region_vectors.append(
-                     {
-                         "vector": vec.tolist(),
-                         "bbox": [x1, y1, x2, y2],
-                         "label": dets.names[int(box.cls)],
-                         "conf": float(box.conf),
-                     }
-                 )
-
-         # Metadata
-         sha = hashlib.sha256(file_path.read_bytes()).hexdigest()
-         w, h = img.size
-         meta = {
-             "source": str(file_path),
-             "type": "image",
-             "width": w,
-             "height": h,
-             "mime": f"image/{file_path.suffix.lstrip('.')}",
-             "sha256": sha,
-             "embedding_model": "openclip-vit-h-14",
-             "caption": caption,
+         """Validate → detect → dispatch to the appropriate processor."""
+         file_path = Path(file_path)
+         self.validate_file(file_path)
+         ftype = self._detect_file_type(file_path)
+
+         dispatch_map = {
+             "pdf": self._process_pdf,
+             "text": self._process_text,
+             "csv": self._process_csv,
+             "office": self._process_office,
+             "json": self._process_json,
          }
+         if ftype not in dispatch_map:
+             raise ValueError(f"Unsupported file type: {file_path.suffix}")

-         result = {
-             "content": None,
-             "metadata": meta,
-             "chunks": [caption],
-             "vectors": [image_vec.tolist()],
-             "caption_vector": caption_vec.tolist(),
-         }
-         if region_vectors:
-             result["region_vectors"] = region_vectors
-         return result
+         return await dispatch_map[ftype](file_path)

      # ------------------------------------------------------------------ #
      # PDF
@@ -280,6 +126,7 @@ class FileProcessor:
      async def _process_pdf(self, file_path: Path) -> Dict[str, Any]:
          page_chunks, doc_meta = await self._extract_text(file_path)
          all_chunks, line_data = [], []
+
          for page_text, page_num, line_nums in page_chunks:
              lines = page_text.split("\n")
              buf, buf_lines, length = [], [], 0
@@ -318,7 +165,7 @@ class FileProcessor:
          }

      # ------------------------------------------------------------------ #
-     # Plaintext / code / markup
+     # Plain-text / code / markup
      # ------------------------------------------------------------------ #
      async def _process_text(self, file_path: Path) -> Dict[str, Any]:
          text, extra_meta, _ = await self._extract_text(file_path)
@@ -351,6 +198,7 @@ class FileProcessor:
                      continue
                  texts.append(txt)
                  metas.append({k: v for k, v in row.items() if k != text_field and v})
+
          vectors = await asyncio.gather(*[self._encode_chunk_async(t) for t in texts])
          return {
              "content": None,
@@ -361,7 +209,7 @@ class FileProcessor:
          }

      # ------------------------------------------------------------------ #
-     # Office docs
+     # Office docs (.doc/.docx/.pptx)
      # ------------------------------------------------------------------ #
      async def _process_office(self, file_path: Path) -> Dict[str, Any]:
          loop = asyncio.get_event_loop()
@@ -369,10 +217,11 @@ class FileProcessor:
              text = await loop.run_in_executor(
                  self._executor, self._read_docx, file_path
              )
-         else:
+         else:  # .pptx
              text = await loop.run_in_executor(
                  self._executor, self._read_pptx, file_path
              )
+
          chunks = self._chunk_text(text)
          vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
          return {
@@ -418,25 +267,11 @@ class FileProcessor:
              return await loop.run_in_executor(
                  self._executor, self._extract_pdf_text, file_path
              )
-         text = await loop.run_in_executor(
-             self._executor, self._read_text_file, file_path
-         )
-         return text, {}, []
-
-     # ------------------------------------------------------------------ #
-     # util: clip‑text encoder (public)
-     # ------------------------------------------------------------------ #
-     def encode_clip_text(self, text: Union[str, List[str]]) -> np.ndarray:
-         with torch.no_grad():
-             toks = (
-                 self.clip_tokenizer(text)
-                 if isinstance(text, str)
-                 else self.clip_tokenizer(text, truncate=True)
+         else:
+             text = await loop.run_in_executor(
+                 self._executor, self._read_text_file, file_path
              )
-             tensor = toks.unsqueeze(0).to(self.device)
-             feat = self.clip_model.encode_text(tensor).squeeze()
-             feat = feat / feat.norm()
-             return feat.float().cpu().numpy()
+         return text, {}, []

      def _extract_pdf_text(self, file_path: Path):
          page_chunks, meta = [], {}
@@ -452,8 +287,8 @@ class FileProcessor:
                  lines = page.extract_text_lines()
                  sorted_lines = sorted(lines, key=lambda x: x["top"])
                  txts, nums = [], []
-                 for ln_idx, line in enumerate(sorted_lines, start=1):
-                     t = line.get("text", "").strip()
+                 for ln_idx, L in enumerate(sorted_lines, start=1):
+                     t = L.get("text", "").strip()
                      if t:
                          txts.append(t)
                          nums.append(ln_idx)
@@ -527,24 +362,3 @@ class FileProcessor:
              seg = tokens[i : i + self.effective_max_length]
              out.append(self.embedding_model.tokenizer.convert_tokens_to_string(seg))
          return out
-
-     # ------------------------------------------------------------------ #
-     # Retrieval helpers (optional use)
-     # ------------------------------------------------------------------ #
-     def encode_text(self, text: Union[str, List[str]]) -> np.ndarray:
-         """Embed raw text with the SentenceTransformer model."""
-         single = isinstance(text, str)
-         out = self.embedding_model.encode(
-             text,
-             convert_to_numpy=True,
-             normalize_embeddings=True,
-             show_progress_bar=False,
-         )
-         return out if not single else out[0]
-
-     def encode_image(self, img: Image.Image) -> np.ndarray:
-         with torch.no_grad():
-             tensor = self.clip_preprocess(img).unsqueeze(0).to(self.device)
-             feat = self.clip_model.encode_image(tensor).squeeze()
-             feat = feat / feat.norm()
-             return feat.float().cpu().numpy()
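
Net effect on file_processor.py: the 1.33.14 build strips every vision dependency (open_clip, torch, PIL, BLIP‑2, YOLO, pytesseract) and the whole image pipeline, leaving a text-only SentenceTransformer processor with an explicit dispatch map. A minimal usage sketch of the slimmed-down class (the import path is taken from the RECORD listing below; the file name, printed fields, and event-loop handling are illustrative, not an official example):

    import asyncio
    from projectdavid.clients.file_processor import FileProcessor

    async def main() -> None:
        fp = FileProcessor(max_workers=4, chunk_size=512)
        # validate_file -> _detect_file_type -> dispatch_map lookup, per the diff above
        result = await fp.process_file("notes.md")
        print(result["metadata"])  # e.g. {'source': 'notes.md', 'chunks': 3, 'type': 'text'}
        # one 384-D vector per chunk (paraphrase-MiniLM-L6-v2)
        print(len(result["chunks"]), len(result["vectors"]))

    asyncio.run(main())

Note that unsupported extensions now fail twice over: `_detect_file_type` raises for unknown suffixes, and `process_file` re-checks membership in `dispatch_map` instead of the old `getattr(self, f"_process_{ftype}")` dispatch.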

projectdavid/clients/synchronous_inference_wrapper.py
@@ -11,9 +11,15 @@ LOG = UtilsInterface.LoggingUtility()


  class SynchronousInferenceStream:
+     # ------------------------------------------------------------ #
+     # GLOBAL EVENT LOOP (single hidden thread for sync wrapper)
+     # ------------------------------------------------------------ #
      _GLOBAL_LOOP = asyncio.new_event_loop()
      asyncio.set_event_loop(_GLOBAL_LOOP)

+     # ------------------------------------------------------------ #
+     # Init / setup
+     # ------------------------------------------------------------ #
      def __init__(self, inference) -> None:
          self.inference_client = inference
          self.user_id: Optional[str] = None
@@ -32,6 +38,7 @@ class SynchronousInferenceStream:
          run_id: str,
          api_key: str,
      ) -> None:
+         """Populate IDs once, so callers only provide provider/model."""
          self.user_id = user_id
          self.thread_id = thread_id
          self.assistant_id = assistant_id
@@ -39,7 +46,10 @@ class SynchronousInferenceStream:
          self.run_id = run_id
          self.api_key = api_key

-     def stream_chunks(
+     # ------------------------------------------------------------ #
+     # Core sync-to-async streaming wrapper
+     # ------------------------------------------------------------ #
+     def stream_chunks(  # noqa: PLR0915
          self,
          provider: str,
          model: str,
@@ -48,9 +58,15 @@ class SynchronousInferenceStream:
          timeout_per_chunk: float = 280.0,
          suppress_fc: bool = True,
      ) -> Generator[dict, None, None]:
+         """
+         Sync generator that mirrors async `inference_client.stream_inference_response`
+         but (optionally) removes raw <fc> … </fc> output *and* JSON
+         `{"type": "function_call" …}` objects from the stream.
+         """

          resolved_api_key = api_key or self.api_key

+         # ---------- async inner generator -------------------------------- #
          async def _stream_chunks_async():
              async for chk in self.inference_client.stream_inference_response(
                  provider=provider,
@@ -65,6 +81,7 @@ class SynchronousInferenceStream:

          agen = _stream_chunks_async().__aiter__()

+         # ---------- FC-suppressor plumbing -------------------------------- #
          if suppress_fc:
              _suppressor = FunctionCallSuppressor()
              _peek_gate = PeekGate(_suppressor)
@@ -72,11 +89,15 @@ class SynchronousInferenceStream:
              def _filter_text(txt: str) -> str:
                  return _peek_gate.feed(txt)

+             LOG.debug("[SyncStream] Function-call suppression ACTIVE")
          else:

              def _filter_text(txt: str) -> str:
                  return txt

+             LOG.debug("[SyncStream] Function-call suppression DISABLED")
+
+         # ---------- helper to flush residual buffered text ---------------- #
          def _drain_filters() -> Optional[dict]:
              if not suppress_fc:
                  return None
@@ -97,18 +118,17 @@ class SynchronousInferenceStream:
                  }
              return None

+         # ---------- main sync loop ---------------------------------------- #
          while True:
              try:
                  chunk = self._GLOBAL_LOOP.run_until_complete(
                      asyncio.wait_for(agen.__anext__(), timeout=timeout_per_chunk)
                  )

-                 # Always attach run_id
+                 # Always attach run_id for front-end helpers
                  chunk["run_id"] = self.run_id

-                 # ------------------------------------------------------
-                 # allow status chunks to bypass suppression suppression
-                 # -------------------------------------------------------
+                 # ----- bypass filters for status / code-exec related -------- #
                  if chunk.get("type") == "status":
                      yield chunk
                      continue
@@ -124,9 +144,19 @@ class SynchronousInferenceStream:
                      yield chunk
                      continue

+                 # ----- NEW: swallow raw JSON function_call objects ---------- #
+                 if suppress_fc and chunk.get("type") == "function_call":
+                     LOG.debug(
+                         "[SyncStream] Swallowing JSON function_call chunk: %s",
+                         chunk.get("name") or "<unnamed>",
+                     )
+                     continue
+
+                 # ----- text-level suppression ------------------------------- #
                  if isinstance(chunk.get("content"), str):
                      chunk["content"] = _filter_text(chunk["content"])
                      if chunk["content"] == "":
+                         # Entire segment was inside <fc> … </fc>
                          continue

                  yield chunk
@@ -134,21 +164,26 @@ class SynchronousInferenceStream:
              except StopAsyncIteration:
                  if tail := _drain_filters():
                      yield tail
-                 LOG.info("Stream completed normally.")
+                 LOG.info("[SyncStream] Stream completed normally.")
                  break

              except asyncio.TimeoutError:
                  if tail := _drain_filters():
                      yield tail
-                 LOG.error("[TimeoutError] Chunk wait expired – aborting stream.")
+                 LOG.error("[SyncStream] Timeout waiting for next chunk.")
                  break

-             except Exception as exc:
+             except Exception as exc:  # noqa: BLE001
                  if tail := _drain_filters():
                      yield tail
-                 LOG.error("Unexpected streaming error: %s", exc, exc_info=True)
+                 LOG.error(
+                     "[SyncStream] Unexpected streaming error: %s", exc, exc_info=True
+                 )
                  break

+     # ------------------------------------------------------------ #
+     # House-keeping
+     # ------------------------------------------------------------ #
      @classmethod
      def shutdown_loop(cls) -> None:
          if cls._GLOBAL_LOOP and not cls._GLOBAL_LOOP.is_closed():
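
The wrapper's pattern: a class-level event loop plus `run_until_complete` per chunk turns the async stream into a plain iterator. A hedged sketch of the calling side (only the constructor, `stream_chunks`, and `shutdown_loop` are visible in this diff, so the client object, the provider/model strings, and the prior ID-setup call are placeholders):

    from projectdavid.clients.synchronous_inference_wrapper import SynchronousInferenceStream

    stream = SynchronousInferenceStream(inference_client)  # assumed async SDK client
    # ...user/thread/assistant/message/run IDs populated via the setup method above...

    for chunk in stream.stream_chunks(
        provider="provider-name",  # placeholder
        model="model-name",        # placeholder
        suppress_fc=True,          # drop <fc>…</fc> text and JSON function_call chunks
    ):
        if chunk.get("type") == "status":
            print("[status]", chunk)  # status chunks bypass suppression, per the diff
        elif isinstance(chunk.get("content"), str):
            print(chunk["content"], end="", flush=True)

    SynchronousInferenceStream.shutdown_loop()  # close the shared loop when fully done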

projectdavid/clients/vision-file_processor.py
@@ -0,0 +1,438 @@
+ import asyncio
+ import csv
+ import hashlib
+ import json
+ import math
+ import re
+ import textwrap
+ from concurrent.futures import ThreadPoolExecutor
+ from pathlib import Path
+ from typing import Any, Dict, List, Tuple, Union
+
+ try:  # Python 3.11+
+     from typing import LiteralString
+ except ImportError:  # 3.9–3.10
+     from typing_extensions import LiteralString
+
+ import numpy as np
+ import open_clip
+ import pdfplumber
+ import torch
+ from docx import Document
+ from PIL import Image
+ from pptx import Presentation
+ from projectdavid_common import UtilsInterface
+ from sentence_transformers import SentenceTransformer
+
+ # from transformers import Blip2ForConditionalGeneration, Blip2Processor
+
+ # from ultralytics import YOLO
+
+ # OCR fallback – optional
+ # try:
+ #     import pytesseract  # noqa: F401  # pylint: disable=unused-import
+ # except ImportError:
+ #     pytesseract = None
+
+
+ log = UtilsInterface.LoggingUtility()
+
+
+ def latlon_to_unit_vec(lat: float, lon: float) -> List[float]:
+     """Convert geographic lat/lon (deg) to a 3-D unit vector for Qdrant."""
+     lat_r = math.radians(lat)
+     lon_r = math.radians(lon)
+     return [
+         math.cos(lat_r) * math.cos(lon_r),
+         math.cos(lat_r) * math.sin(lon_r),
+         math.sin(lat_r),
+     ]
+
+
+ class FileProcessor:
+     """Unified processor for text, tabular, office, JSON, **and image** files.
+
+     Each modality is embedded with its optimal model:
+       • Text    → paraphrase‑MiniLM‑L6‑v2 (384‑D)
+       • Image   → OpenCLIP ViT‑H/14 (1024‑D)
+       • Caption→ OpenCLIP text head (1024‑D)
+
+     Rich captions are generated via BLIP‑2 Flan‑T5‑XL.
+     GPU usage is optional; pass `use_gpu=False` to stay on CPU.
+     """
+
+     # ------------------------------------------------------------------ #
+     # Construction
+     # ------------------------------------------------------------------ #
+     def __init__(
+         self,
+         *,
+         max_workers: int = 4,
+         chunk_size: int = 512,
+         use_gpu: bool = True,
+         use_ocr: bool = True,
+         use_detection: bool = False,
+         image_model_name: str = "ViT-H-14",
+         caption_model_name: str = "Salesforce/blip2-flan-t5-xl",
+     ):
+         # Device selection
+         if use_gpu and torch.cuda.is_available():
+             self.device = torch.device("cuda")
+             self.torch_dtype = torch.float16
+         else:
+             self.device = torch.device("cpu")
+             self.torch_dtype = torch.float32
+
+         # Text embedder
+         self.embedding_model_name = "paraphrase-MiniLM-L6-v2"
+         self.embedding_model = SentenceTransformer(self.embedding_model_name)
+         self.embedding_model.to(str(self.device))
+
+         # Chunking parameters
+         self.max_seq_length = self.embedding_model.get_max_seq_length()
+         self.special_tokens_count = 2
+         self.effective_max_length = self.max_seq_length - self.special_tokens_count
+         self.chunk_size = min(chunk_size, self.effective_max_length * 4)
+
+         # Executor & logging
+         self._executor = ThreadPoolExecutor(max_workers=max_workers)
+         log.info(
+             "FileProcessor ready (device=%s, OCR=%s, detection=%s)",
+             self.device,
+             # self.use_ocr,
+             # self.use_detection,
+         )
+
+     # ------------------------------------------------------------------ #
+     # Generic validators *
+     # ------------------------------------------------------------------ #
+     def validate_file(self, file_path: Path):
+         """Ensure file exists and is under 100 MB."""
+         max_size = 100 * 1024 * 1024
+         if not file_path.exists():
+             raise FileNotFoundError(f"File not found: {file_path}")
+         if file_path.stat().st_size > max_size:
+             mb = max_size // (1024 * 1024)
+             raise ValueError(f"{file_path.name} > {mb} MB limit")
+
+     # ------------------------------------------------------------------ #
+     # File‑type detection (extension‑based – no libmagic)
+     # ------------------------------------------------------------------ #
+     def _detect_file_type(self, file_path: Path) -> str:
+         suffix = file_path.suffix.lower()
+         if suffix == ".pdf":
+             return "pdf"
+         if suffix == ".csv":
+             return "csv"
+         if suffix == ".json":
+             return "json"
+         if suffix in {".doc", ".docx", ".pptx"}:
+             return "office"
+         if suffix in {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff"}:
+             return "image"
+         text_exts = {
+             ".txt",
+             ".md",
+             ".rst",
+             ".c",
+             ".cpp",
+             ".cs",
+             ".go",
+             ".java",
+             ".js",
+             ".ts",
+             ".php",
+             ".py",
+             ".rb",
+             ".sh",
+             ".tex",
+             ".html",
+             ".css",
+         }
+         if suffix in text_exts:
+             return "text"
+         raise ValueError(f"Unsupported file type: {file_path.name} (ext={suffix})")
+
+     # ------------------------------------------------------------------ #
+     # Dispatcher
+     # ------------------------------------------------------------------ #
+     async def process_file(self, file_path: Union[str, Path]) -> Dict[str, Any]:
+         path = Path(file_path)
+         self.validate_file(path)
+         ftype = self._detect_file_type(path)
+         return await getattr(self, f"_process_{ftype}")(path)
+
+     # ------------------------------------------------------------------ #
+     # PDF
+     # ------------------------------------------------------------------ #
+     async def _process_pdf(self, file_path: Path) -> Dict[str, Any]:
+         page_chunks, doc_meta = await self._extract_text(file_path)
+         all_chunks, line_data = [], []
+         for page_text, page_num, line_nums in page_chunks:
+             lines = page_text.split("\n")
+             buf, buf_lines, length = [], [], 0
+             for line, ln in zip(lines, line_nums):
+                 l = len(line) + 1
+                 if length + l <= self.chunk_size:
+                     buf.append(line)
+                     buf_lines.append(ln)
+                     length += l
+                 else:
+                     if buf:
+                         all_chunks.append("\n".join(buf))
+                         line_data.append({"page": page_num, "lines": buf_lines})
+                         buf, buf_lines, length = [], [], 0
+                     for piece in self._split_oversized_chunk(line):
+                         all_chunks.append(piece)
+                         line_data.append({"page": page_num, "lines": [ln]})
+             if buf:
+                 all_chunks.append("\n".join(buf))
+                 line_data.append({"page": page_num, "lines": buf_lines})
+
+         vectors = await asyncio.gather(
+             *[self._encode_chunk_async(c) for c in all_chunks]
+         )
+         return {
+             "content": "\n\n".join(all_chunks),
+             "metadata": {
+                 **doc_meta,
+                 "source": str(file_path),
+                 "chunks": len(all_chunks),
+                 "type": "pdf",
+             },
+             "chunks": all_chunks,
+             "vectors": [v.tolist() for v in vectors],
+             "line_data": line_data,
+         }
+
+     # ------------------------------------------------------------------ #
+     # Plain‑text / code / markup
+     # ------------------------------------------------------------------ #
+     async def _process_text(self, file_path: Path) -> Dict[str, Any]:
+         text, extra_meta, _ = await self._extract_text(file_path)
+         chunks = self._chunk_text(text)
+         vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
+         return {
+             "content": text,
+             "metadata": {
+                 **extra_meta,
+                 "source": str(file_path),
+                 "chunks": len(chunks),
+                 "type": "text",
+             },
+             "chunks": chunks,
+             "vectors": [v.tolist() for v in vectors],
+         }
+
+     # ------------------------------------------------------------------ #
+     # CSV
+     # ------------------------------------------------------------------ #
+     async def _process_csv(
+         self, file_path: Path, text_field: str = "description"
+     ) -> Dict[str, Any]:
+         rows, texts, metas = [], [], []
+         with file_path.open(newline="", encoding="utf-8") as f:
+             reader = csv.DictReader(f)
+             for row in reader:
+                 txt = row.get(text_field, "").strip()
+                 if not txt:
+                     continue
+                 texts.append(txt)
+                 metas.append({k: v for k, v in row.items() if k != text_field and v})
+         vectors = await asyncio.gather(*[self._encode_chunk_async(t) for t in texts])
+         return {
+             "content": None,
+             "metadata": {"source": str(file_path), "rows": len(texts), "type": "csv"},
+             "chunks": texts,
+             "vectors": [v.tolist() for v in vectors],
+             "csv_row_metadata": metas,
+         }
+
+     # ------------------------------------------------------------------ #
+     # Office docs
+     # ------------------------------------------------------------------ #
+     async def _process_office(self, file_path: Path) -> Dict[str, Any]:
+         loop = asyncio.get_event_loop()
+         if file_path.suffix.lower() in {".doc", ".docx"}:
+             text = await loop.run_in_executor(
+                 self._executor, self._read_docx, file_path
+             )
+         else:
+             text = await loop.run_in_executor(
+                 self._executor, self._read_pptx, file_path
+             )
+         chunks = self._chunk_text(text)
+         vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
+         return {
+             "content": text,
+             "metadata": {
+                 "source": str(file_path),
+                 "chunks": len(chunks),
+                 "type": "office",
+             },
+             "chunks": chunks,
+             "vectors": [v.tolist() for v in vectors],
+         }
+
+     # ------------------------------------------------------------------ #
+     # JSON
+     # ------------------------------------------------------------------ #
+     async def _process_json(self, file_path: Path) -> Dict[str, Any]:
+         text = await asyncio.get_event_loop().run_in_executor(
+             self._executor, self._read_json, file_path
+         )
+         chunks = self._chunk_text(text)
+         vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
+         return {
+             "content": text,
+             "metadata": {
+                 "source": str(file_path),
+                 "chunks": len(chunks),
+                 "type": "json",
+             },
+             "chunks": chunks,
+             "vectors": [v.tolist() for v in vectors],
+         }
+
+     # ------------------------------------------------------------------ #
+     # Shared helpers
+     # ------------------------------------------------------------------ #
+     async def _extract_text(self, file_path: Path) -> Union[
+         Tuple[List[Tuple[str, int, List[int]]], Dict[str, Any]],
+         Tuple[str, Dict[str, Any], List[int]],
+     ]:
+         loop = asyncio.get_event_loop()
+         if file_path.suffix.lower() == ".pdf":
+             return await loop.run_in_executor(
+                 self._executor, self._extract_pdf_text, file_path
+             )
+         text = await loop.run_in_executor(
+             self._executor, self._read_text_file, file_path
+         )
+         return text, {}, []
+
+     # ------------------------------------------------------------------ #
+     # util: clip‑text encoder (public)
+     # ------------------------------------------------------------------ #
+     def encode_clip_text(self, text: Union[str, List[str]]) -> np.ndarray:
+         with torch.no_grad():
+             toks = (
+                 self.clip_tokenizer(text)
+                 if isinstance(text, str)
+                 else self.clip_tokenizer(text, truncate=True)
+             )
+             tensor = toks.unsqueeze(0).to(self.device)
+             feat = self.clip_model.encode_text(tensor).squeeze()
+             feat = feat / feat.norm()
+             return feat.float().cpu().numpy()
+
+     def _extract_pdf_text(self, file_path: Path):
+         page_chunks, meta = [], {}
+         with pdfplumber.open(file_path) as pdf:
+             meta.update(
+                 {
+                     "author": pdf.metadata.get("Author", ""),
+                     "title": pdf.metadata.get("Title", file_path.stem),
+                     "page_count": len(pdf.pages),
+                 }
+             )
+             for i, page in enumerate(pdf.pages, start=1):
+                 lines = page.extract_text_lines()
+                 sorted_lines = sorted(lines, key=lambda x: x["top"])
+                 txts, nums = [], []
+                 for ln_idx, line in enumerate(sorted_lines, start=1):
+                     t = line.get("text", "").strip()
+                     if t:
+                         txts.append(t)
+                         nums.append(ln_idx)
+                 if txts:
+                     page_chunks.append(("\n".join(txts), i, nums))
+         return page_chunks, meta
+
+     def _read_text_file(self, file_path: Path) -> str:
+         try:
+             return file_path.read_text(encoding="utf-8")
+         except UnicodeDecodeError:
+             return file_path.read_text(encoding="latin-1")
+
+     def _read_docx(self, path: Path) -> str:
+         doc = Document(path)
+         return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
+
+     def _read_pptx(self, path: Path) -> str:
+         prs = Presentation(path)
+         slides = []
+         for slide in prs.slides:
+             chunks = [sh.text for sh in slide.shapes if hasattr(sh, "text")]
+             slides.append("\n".join(filter(None, chunks)))
+         return "\n\n".join(slides)
+
+     def _read_json(self, path: Path) -> str:
+         obj = json.loads(path.read_text(encoding="utf-8"))
+         pretty = json.dumps(obj, indent=2, ensure_ascii=False)
+         return "\n".join(textwrap.wrap(pretty, width=120))
+
+     async def _encode_chunk_async(self, chunk: str) -> np.ndarray:
+         return await asyncio.get_event_loop().run_in_executor(
+             self._executor,
+             lambda: self.embedding_model.encode(
+                 [chunk],
+                 convert_to_numpy=True,
+                 truncate="model_max_length",
+                 normalize_embeddings=True,
+                 show_progress_bar=False,
+             )[0],
+         )
+
+     # ------------------------------------------------------------------ #
+     # Text chunking helpers
+     # ------------------------------------------------------------------ #
+     def _chunk_text(self, text: str) -> List[str]:
+         sentences = re.split(r"(?<=[\.!?])\s+", text)
+         chunks, buf, length = [], [], 0
+         for sent in sentences:
+             slen = len(sent) + 1
+             if length + slen <= self.chunk_size:
+                 buf.append(sent)
+                 length += slen
+             else:
+                 if buf:
+                     chunks.append(" ".join(buf))
+                     buf, length = [], 0
+                 while len(sent) > self.chunk_size:
+                     part, sent = sent[: self.chunk_size], sent[self.chunk_size :]
+                     chunks.append(part)
+                 buf, length = [sent], len(sent)
+         if buf:
+             chunks.append(" ".join(buf))
+         return chunks
+
+     def _split_oversized_chunk(self, chunk: str, tokens: List[str] = None) -> List[str]:
+         if tokens is None:
+             tokens = self.embedding_model.tokenizer.tokenize(chunk)
+         out = []
+         for i in range(0, len(tokens), self.effective_max_length):
+             seg = tokens[i : i + self.effective_max_length]
+             out.append(self.embedding_model.tokenizer.convert_tokens_to_string(seg))
+         return out
+
+     # ------------------------------------------------------------------ #
+     # Retrieval helpers (optional use)
+     # ------------------------------------------------------------------ #
+     def encode_text(self, text: Union[str, List[str]]) -> np.ndarray:
+         """Embed raw text with the SentenceTransformer model."""
+         single = isinstance(text, str)
+         out = self.embedding_model.encode(
+             text,
+             convert_to_numpy=True,
+             normalize_embeddings=True,
+             show_progress_bar=False,
+         )
+         return out if not single else out[0]
+
+     def encode_image(self, img: Image.Image) -> np.ndarray:
+         with torch.no_grad():
+             tensor = self.clip_preprocess(img).unsqueeze(0).to(self.device)
+             feat = self.clip_model.encode_image(tensor).squeeze()
+             feat = feat / feat.norm()
+             return feat.float().cpu().numpy()
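
One piece worth noting in this retained vision module: `latlon_to_unit_vec` maps geographic coordinates onto the 3-D unit sphere, so Qdrant's dot-product or cosine metrics over these vectors track geographic closeness. A standalone self-check (the function body is restated from the listing above so the snippet runs on its own):

    import math

    def latlon_to_unit_vec(lat: float, lon: float) -> list[float]:
        lat_r, lon_r = math.radians(lat), math.radians(lon)
        return [
            math.cos(lat_r) * math.cos(lon_r),  # x: equator / prime-meridian axis
            math.cos(lat_r) * math.sin(lon_r),  # y
            math.sin(lat_r),                    # z: pole axis
        ]

    assert latlon_to_unit_vec(0.0, 0.0) == [1.0, 0.0, 0.0]  # equator, lon 0 -> x-axis
    v = latlon_to_unit_vec(51.5, -0.12)                     # roughly London
    assert abs(sum(c * c for c in v) - 1.0) < 1e-12         # always unit length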

projectdavid-1.33.14.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: projectdavid
- Version: 1.33.12
+ Version: 1.33.14
  Summary: Python SDK for interacting with the Entities Assistant API.
  Author-email: Francis Neequaye Armah <francis.neequaye@projectdavid.co.uk>
  License: PolyForm Noncommercial License 1.0.0

projectdavid-1.33.14.dist-info/RECORD
@@ -10,18 +10,19 @@ projectdavid/clients/assistants_client.py,sha256=SsIGa5wPr7ga9WX0ywam3djUF-uWFdk
  projectdavid/clients/base_client.py,sha256=UWl6nr6sxD1_xC6iyptQDR1tnNdFCOrEx5cEUPCRqJE,3417
  projectdavid/clients/base_vector_store.py,sha256=jXivmqAW1bgYcLgIeW-hPxOiWZbs2hCsLy4oWzSvpNI,2061
  projectdavid/clients/event_handler.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- projectdavid/clients/file_processor.py,sha256=nFccQmiow3lkjv1-Pdgv_2WQAtSy0FRN7oJlTKt4fs4,21114
+ projectdavid/clients/file_processor.py,sha256=t-Uw-kBP_VmlguMxO9PWY6ANuMAY0PstQDW37wLPF0Q,13980
  projectdavid/clients/file_search.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  projectdavid/clients/files_client.py,sha256=XkIDzbQFGDrd88taf0Kouc_4YJOPIYEHiIyWYLKDofI,15581
  projectdavid/clients/inference_client.py,sha256=xz4ACPv5Tkis604QxO5mJX1inH_TGDfQP-31geETYpE,6609
  projectdavid/clients/messages_client.py,sha256=467xeIt3VYs6cG8-bl-eDRi_auWOPmfd5tSJDmQSJUI,17232
  projectdavid/clients/runs.py,sha256=-fXOq5L9w2efDPmZkNxb0s2yjl6oN0XN4_aLXqaeceo,25270
- projectdavid/clients/synchronous_inference_wrapper.py,sha256=mN5WAHmv0aRoeMIb7XPgv3cuqrMPzu378UsZ02jEvRY,5090
+ projectdavid/clients/synchronous_inference_wrapper.py,sha256=qh94rtNlLqgIxiA_ZbQ1ncOwQTi9aBj5os3sMExLh4E,7070
  projectdavid/clients/threads_client.py,sha256=ekzU5w14zftmtmFkiec3NC90Of-_KVSUY1qH9cmfSFg,6771
  projectdavid/clients/tools_client.py,sha256=GkCVOmwpAoPqVt6aYmH0G1HIFha3iEwR9IIf9teR0j8,11487
  projectdavid/clients/users_client.py,sha256=eCuUb9qvyH1GUFhZu6TRL9zdoK-qzHSs8-Vmrk_0mmg,13729
  projectdavid/clients/vector_store_manager.py,sha256=q-ZgRQVX_S3nMrKYhmvkVrDjDRzM3ZFzUF55HBGRTe8,12861
  projectdavid/clients/vectors.py,sha256=cysPVbUzW3byB82MTqG2X1Iz5ZAe82WTS1JfQcoqVhE,40229
+ projectdavid/clients/vision-file_processor.py,sha256=19ft9IUeY5x9_22vC4JqndiFlpDYyUn6z1ygv-EV2NE,16852
  projectdavid/constants/platform.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  projectdavid/services/logging_service.py,sha256=jdoRL46E42Ar8JFTDOV-xVD67CulcHSN-xhcEqA5CXQ,2643
  projectdavid/synthesis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -35,8 +36,8 @@ projectdavid/utils/monitor_launcher.py,sha256=3YAgJdeuaUvq3JGvpA4ymqFsAnk29nH5q9
  projectdavid/utils/peek_gate.py,sha256=5whMRnDOQjATRpThWDJkvY9ScXuJ7Sd_-9rvGgXeTAQ,2532
  projectdavid/utils/run_monitor.py,sha256=F_WkqIP-qnWH-4llIbileWWLfRj2Q1Cg-ni23SR1rec,3786
  projectdavid/utils/vector_search_formatter.py,sha256=YTe3HPGec26qGY7uxY8_GS8lc4QaN6aNXMzkl29nZpI,1735
- projectdavid-1.33.12.dist-info/licenses/LICENSE,sha256=_8yjiEGttpS284BkfhXxfERqTRZW_tUaHiBB0GTJTMg,4563
- projectdavid-1.33.12.dist-info/METADATA,sha256=NmKw2v_K20Uq-H0yEjxkDpAGdMi4wQdIjLrtP-Sthr8,11555
- projectdavid-1.33.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- projectdavid-1.33.12.dist-info/top_level.txt,sha256=kil8GU4s7qYRfNnzGnFHhZnSNRSxgNG-J4HLgQMmMtw,13
- projectdavid-1.33.12.dist-info/RECORD,,
+ projectdavid-1.33.14.dist-info/licenses/LICENSE,sha256=_8yjiEGttpS284BkfhXxfERqTRZW_tUaHiBB0GTJTMg,4563
+ projectdavid-1.33.14.dist-info/METADATA,sha256=jFWdJGL8LYBQNEoEqBZ6DhLJ-HnVgLsvQ06K7PAkpRA,11555
+ projectdavid-1.33.14.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ projectdavid-1.33.14.dist-info/top_level.txt,sha256=kil8GU4s7qYRfNnzGnFHhZnSNRSxgNG-J4HLgQMmMtw,13
+ projectdavid-1.33.14.dist-info/RECORD,,
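
The `sha256=` values in RECORD follow the wheel convention (PEP 376 / PEP 427): urlsafe base64 of the raw digest with the `=` padding stripped, which is why they do not look like hex digests. A short sketch for re-checking one entry against an unpacked wheel (paths illustrative):

    import base64
    import hashlib
    from pathlib import Path

    def record_hash(path: Path) -> str:
        # sha256 of the file bytes, urlsafe-b64 encoded, padding removed
        digest = hashlib.sha256(path.read_bytes()).digest()
        return base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

    # For the 1.33.14 wheel this should print
    # t-Uw-kBP_VmlguMxO9PWY6ANuMAY0PstQDW37wLPF0Q
    print(record_hash(Path("projectdavid/clients/file_processor.py")))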