projectdavid 1.33.13__py3-none-any.whl → 1.33.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of projectdavid might be problematic.
- projectdavid/clients/file_processor.py +46 -232
- projectdavid/clients/vision-file_processor.py +438 -0
- {projectdavid-1.33.13.dist-info → projectdavid-1.33.14.dist-info}/METADATA +1 -1
- {projectdavid-1.33.13.dist-info → projectdavid-1.33.14.dist-info}/RECORD +7 -6
- {projectdavid-1.33.13.dist-info → projectdavid-1.33.14.dist-info}/WHEEL +0 -0
- {projectdavid-1.33.13.dist-info → projectdavid-1.33.14.dist-info}/licenses/LICENSE +0 -0
- {projectdavid-1.33.13.dist-info → projectdavid-1.33.14.dist-info}/top_level.txt +0 -0
projectdavid/clients/file_processor.py
@@ -1,8 +1,6 @@
 import asyncio
 import csv
-import hashlib
 import json
-import math
 import re
 import textwrap
 from concurrent.futures import ThreadPoolExecutor
@@ -15,124 +13,34 @@ except ImportError:  # 3.9–3.10
     from typing_extensions import LiteralString
 
 import numpy as np
-import open_clip
 import pdfplumber
-import torch
 from docx import Document
-from PIL import Image
 from pptx import Presentation
-from transformers import Blip2ForConditionalGeneration, Blip2Processor
-from ultralytics import YOLO
-
-# OCR fallback – optional
-try:
-    import pytesseract  # noqa: F401  # pylint: disable=unused-import
-except ImportError:
-    pytesseract = None
-
 from projectdavid_common import UtilsInterface
 from sentence_transformers import SentenceTransformer
 
 log = UtilsInterface.LoggingUtility()
 
 
-def latlon_to_unit_vec(lat: float, lon: float) -> List[float]:
-    """Convert geographic lat/lon (deg) to a 3-D unit vector for Qdrant."""
-    lat_r = math.radians(lat)
-    lon_r = math.radians(lon)
-    return [
-        math.cos(lat_r) * math.cos(lon_r),
-        math.cos(lat_r) * math.sin(lon_r),
-        math.sin(lat_r),
-    ]
-
-
 class FileProcessor:
-    """Unified processor for text, tabular, office, JSON, **and image** files.
-
-    Each modality is embedded with its optimal model:
-      • Text → paraphrase‑MiniLM‑L6‑v2 (384‑D)
-      • Image → OpenCLIP ViT‑H/14 (1024‑D)
-      • Caption→ OpenCLIP text head (1024‑D)
-
-    Rich captions are generated via BLIP‑2 Flan‑T5‑XL.
-    GPU usage is optional; pass `use_gpu=False` to stay on CPU.
-    """
-
     # ------------------------------------------------------------------ #
     # Construction
     # ------------------------------------------------------------------ #
-    def __init__(
-        self
-        *,
-        max_workers: int = 4,
-        chunk_size: int = 512,
-        use_gpu: bool = True,
-        use_ocr: bool = True,
-        use_detection: bool = False,
-        image_model_name: str = "ViT-H-14",
-        caption_model_name: str = "Salesforce/blip2-flan-t5-xl",
-    ):
-        # Device selection
-        if use_gpu and torch.cuda.is_available():
-            self.device = torch.device("cuda")
-            self.torch_dtype = torch.float16
-        else:
-            self.device = torch.device("cpu")
-            self.torch_dtype = torch.float32
-
-        # Feature flags
-        self.use_ocr = use_ocr and pytesseract is not None
-        self.use_detection = use_detection
-        if use_ocr and pytesseract is None:
-            log.warning("OCR requested but pytesseract not installed – skipping.")
-        if self.use_detection:
-            self.detector = YOLO("yolov8x.pt").to(self.device)
-
-        # Text embedder
+    def __init__(self, max_workers: int = 4, chunk_size: int = 512):
+        self.embedding_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
         self.embedding_model_name = "paraphrase-MiniLM-L6-v2"
-        self.
-        self.embedding_model.to(str(self.device))
+        self._executor = ThreadPoolExecutor(max_workers=max_workers)
 
-        #
+        # token limits
         self.max_seq_length = self.embedding_model.get_max_seq_length()
         self.special_tokens_count = 2
         self.effective_max_length = self.max_seq_length - self.special_tokens_count
         self.chunk_size = min(chunk_size, self.effective_max_length * 4)
 
-
-        self.clip_model, _, self.clip_preprocess = (
-            open_clip.create_model_and_transforms(
-                image_model_name,
-                pretrained="laion2b_s32b_b79k",
-                precision="fp16" if self.device.type == "cuda" else "fp32",
-            )
-        )
-        self.clip_model = self.clip_model.to(self.device).eval()
-        self.clip_tokenizer = open_clip.get_tokenizer(image_model_name)
-
-        # Caption generator
-        self.blip_processor = Blip2Processor.from_pretrained(caption_model_name)
-        self.blip_model = (
-            Blip2ForConditionalGeneration.from_pretrained(
-                caption_model_name,
-                torch_dtype=self.torch_dtype,
-            )
-            .to(self.device)
-            .eval()
-        )
-
-        # Executor & logging
-        self._executor = ThreadPoolExecutor(max_workers=max_workers)
-        log.info(
-            "FileProcessor ready (device=%s, OCR=%s, detection=%s)",
-            self.device,
-            self.use_ocr,
-            self.use_detection,
-        )
+        log.info("Initialized optimized FileProcessor")
 
     # ------------------------------------------------------------------ #
-    # Generic validators
+    # Generic validators
     # ------------------------------------------------------------------ #
     def validate_file(self, file_path: Path):
         """Ensure file exists and is under 100 MB."""
@@ -144,10 +52,20 @@ class FileProcessor:
             raise ValueError(f"{file_path.name} > {mb} MB limit")
 
     # ------------------------------------------------------------------ #
-    # File
+    # File-type detection (simple extension map – NO libmagic)
     # ------------------------------------------------------------------ #
    def _detect_file_type(self, file_path: Path) -> str:
+        """
+        Return one of:
+
+          • 'pdf'  • 'csv'  • 'json'
+          • 'office'  (.doc/.docx/.pptx)
+          • 'text'  (code / markup / plain text)
+
+        Raises *ValueError* if the extension is not recognised.
+        """
         suffix = file_path.suffix.lower()
+
         if suffix == ".pdf":
             return "pdf"
         if suffix == ".csv":
@@ -156,8 +74,7 @@ class FileProcessor:
             return "json"
         if suffix in {".doc", ".docx", ".pptx"}:
             return "office"
-
-        return "image"
+
         text_exts = {
             ".txt",
             ".md",
@@ -179,100 +96,29 @@ class FileProcessor:
         }
         if suffix in text_exts:
             return "text"
+
         raise ValueError(f"Unsupported file type: {file_path.name} (ext={suffix})")
 
     # ------------------------------------------------------------------ #
-    #
+    # Public entry-point
     # ------------------------------------------------------------------ #
     async def process_file(self, file_path: Union[str, Path]) -> Dict[str, Any]:
-
-
-
-
-
-
-
-
-
-
-
-
-        # 1) Image vector
-        def enc_img():
-            with torch.no_grad():
-                t = self.clip_preprocess(img).unsqueeze(0).to(self.device)
-                v = self.clip_model.encode_image(t).squeeze()
-                return (v / v.norm()).float().cpu().numpy()
-
-        image_vec = await loop.run_in_executor(self._executor, enc_img)
-
-        # 2) Caption
-        def gen_cap():
-            inp = self.blip_processor(images=img, return_tensors="pt").to(self.device)
-            with torch.no_grad():
-                ids = self.blip_model.generate(**inp, max_new_tokens=50)
-            return self.blip_processor.decode(ids[0], skip_special_tokens=True)
-
-        caption = await loop.run_in_executor(self._executor, gen_cap)
-
-        # 3) OCR
-        if self.use_ocr:
-            text = await loop.run_in_executor(
-                self._executor, pytesseract.image_to_string, img
-            )
-            if t := text.strip():
-                caption += "\n" + t
-
-        # 4) Caption vector
-        def enc_txt():
-            with torch.no_grad():
-                tok = self.clip_tokenizer(caption).unsqueeze(0).to(self.device)
-                v = self.clip_model.encode_text(tok).squeeze()
-                return (v / v.norm()).float().cpu().numpy()
-
-        caption_vec = await loop.run_in_executor(self._executor, enc_txt)
-
-        # 5) YOLO regions
-        region_vectors = []
-        if self.use_detection:
-            dets = self.detector(img)[0]
-            for box in dets.boxes:
-                x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().tolist())
-                crop = img.crop((x1, y1, x2, y2))
-                vec = self.encode_image(crop)
-                region_vectors.append(
-                    {
-                        "vector": vec.tolist(),
-                        "bbox": [x1, y1, x2, y2],
-                        "label": dets.names[int(box.cls)],
-                        "conf": float(box.conf),
-                    }
-                )
-
-        # Metadata
-        sha = hashlib.sha256(file_path.read_bytes()).hexdigest()
-        w, h = img.size
-        meta = {
-            "source": str(file_path),
-            "type": "image",
-            "width": w,
-            "height": h,
-            "mime": f"image/{file_path.suffix.lstrip('.')}",
-            "sha256": sha,
-            "embedding_model": "openclip-vit-h-14",
-            "caption": caption,
+        """Validate → detect → dispatch to the appropriate processor."""
+        file_path = Path(file_path)
+        self.validate_file(file_path)
+        ftype = self._detect_file_type(file_path)
+
+        dispatch_map = {
+            "pdf": self._process_pdf,
+            "text": self._process_text,
+            "csv": self._process_csv,
+            "office": self._process_office,
+            "json": self._process_json,
         }
+        if ftype not in dispatch_map:
+            raise ValueError(f"Unsupported file type: {file_path.suffix}")
 
-
-        "content": None,
-        "metadata": meta,
-        "chunks": [caption],
-        "vectors": [image_vec.tolist()],
-        "caption_vector": caption_vec.tolist(),
-        }
-        if region_vectors:
-            result["region_vectors"] = region_vectors
-        return result
+        return await dispatch_map[ftype](file_path)
 
     # ------------------------------------------------------------------ #
     # PDF
@@ -280,6 +126,7 @@ class FileProcessor:
     async def _process_pdf(self, file_path: Path) -> Dict[str, Any]:
         page_chunks, doc_meta = await self._extract_text(file_path)
         all_chunks, line_data = [], []
+
         for page_text, page_num, line_nums in page_chunks:
             lines = page_text.split("\n")
             buf, buf_lines, length = [], [], 0
@@ -318,7 +165,7 @@ class FileProcessor:
         }
 
     # ------------------------------------------------------------------ #
-    # Plain
+    # Plain-text / code / markup
     # ------------------------------------------------------------------ #
     async def _process_text(self, file_path: Path) -> Dict[str, Any]:
         text, extra_meta, _ = await self._extract_text(file_path)
@@ -351,6 +198,7 @@ class FileProcessor:
                     continue
                 texts.append(txt)
                 metas.append({k: v for k, v in row.items() if k != text_field and v})
+
         vectors = await asyncio.gather(*[self._encode_chunk_async(t) for t in texts])
         return {
             "content": None,
@@ -361,7 +209,7 @@ class FileProcessor:
         }
 
     # ------------------------------------------------------------------ #
-    # Office docs
+    # Office docs (.doc/.docx/.pptx)
     # ------------------------------------------------------------------ #
     async def _process_office(self, file_path: Path) -> Dict[str, Any]:
         loop = asyncio.get_event_loop()
@@ -369,10 +217,11 @@ class FileProcessor:
             text = await loop.run_in_executor(
                 self._executor, self._read_docx, file_path
             )
-        else:
+        else:  # .pptx
            text = await loop.run_in_executor(
                self._executor, self._read_pptx, file_path
            )
+
         chunks = self._chunk_text(text)
         vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
         return {
@@ -418,25 +267,11 @@ class FileProcessor:
             return await loop.run_in_executor(
                 self._executor, self._extract_pdf_text, file_path
             )
-
-
-
-        return text, {}, []
-
-    # ------------------------------------------------------------------ #
-    # util: clip‑text encoder (public)
-    # ------------------------------------------------------------------ #
-    def encode_clip_text(self, text: Union[str, List[str]]) -> np.ndarray:
-        with torch.no_grad():
-            toks = (
-                self.clip_tokenizer(text)
-                if isinstance(text, str)
-                else self.clip_tokenizer(text, truncate=True)
+        else:
+            text = await loop.run_in_executor(
+                self._executor, self._read_text_file, file_path
             )
-
-            feat = self.clip_model.encode_text(tensor).squeeze()
-            feat = feat / feat.norm()
-            return feat.float().cpu().numpy()
+        return text, {}, []
 
     def _extract_pdf_text(self, file_path: Path):
         page_chunks, meta = [], {}
@@ -452,8 +287,8 @@ class FileProcessor:
                 lines = page.extract_text_lines()
                 sorted_lines = sorted(lines, key=lambda x: x["top"])
                 txts, nums = [], []
-                for ln_idx,
-                t =
+                for ln_idx, L in enumerate(sorted_lines, start=1):
+                    t = L.get("text", "").strip()
                     if t:
                         txts.append(t)
                         nums.append(ln_idx)
@@ -527,24 +362,3 @@ class FileProcessor:
             seg = tokens[i : i + self.effective_max_length]
             out.append(self.embedding_model.tokenizer.convert_tokens_to_string(seg))
         return out
-
-    # ------------------------------------------------------------------ #
-    # Retrieval helpers (optional use)
-    # ------------------------------------------------------------------ #
-    def encode_text(self, text: Union[str, List[str]]) -> np.ndarray:
-        """Embed raw text with the SentenceTransformer model."""
-        single = isinstance(text, str)
-        out = self.embedding_model.encode(
-            text,
-            convert_to_numpy=True,
-            normalize_embeddings=True,
-            show_progress_bar=False,
-        )
-        return out if not single else out[0]
-
-    def encode_image(self, img: Image.Image) -> np.ndarray:
-        with torch.no_grad():
-            tensor = self.clip_preprocess(img).unsqueeze(0).to(self.device)
-            feat = self.clip_model.encode_image(tensor).squeeze()
-            feat = feat / feat.norm()
-            return feat.float().cpu().numpy()
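Aside: the shape of the change above is easier to see in isolation. 1.33.14 strips the CLIP/BLIP-2/YOLO image pipeline out of file_processor.py and reduces process_file to a dispatch table keyed on the detected file type. Below is a minimal, self-contained sketch of that pattern; the class and handler names are illustrative only, not the package's code.

import asyncio
from pathlib import Path

class MiniDispatcher:
    # Hypothetical stand-ins for the package's _process_* coroutines.
    async def _process_pdf(self, path: Path) -> dict:
        return {"type": "pdf", "source": str(path)}

    async def _process_text(self, path: Path) -> dict:
        return {"type": "text", "source": str(path)}

    async def process_file(self, file_path) -> dict:
        path = Path(file_path)
        # Map the extension to a handler, then await exactly one of them.
        dispatch_map = {".pdf": self._process_pdf, ".txt": self._process_text}
        handler = dispatch_map.get(path.suffix.lower())
        if handler is None:
            raise ValueError(f"Unsupported file type: {path.suffix}")
        return await handler(path)

print(asyncio.run(MiniDispatcher().process_file("notes.txt")))

As in the new process_file, unsupported extensions fail fast before any I/O or embedding work is attempted.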
projectdavid/clients/vision-file_processor.py
@@ -0,0 +1,438 @@
+import asyncio
+import csv
+import hashlib
+import json
+import math
+import re
+import textwrap
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+from typing import Any, Dict, List, Tuple, Union
+
+try:  # Python 3.11+
+    from typing import LiteralString
+except ImportError:  # 3.9–3.10
+    from typing_extensions import LiteralString
+
+import numpy as np
+import open_clip
+import pdfplumber
+import torch
+from docx import Document
+from PIL import Image
+from pptx import Presentation
+from projectdavid_common import UtilsInterface
+from sentence_transformers import SentenceTransformer
+
+# from transformers import Blip2ForConditionalGeneration, Blip2Processor
+
+# from ultralytics import YOLO
+
+# OCR fallback – optional
+# try:
+#     import pytesseract  # noqa: F401  # pylint: disable=unused-import
+# except ImportError:
+#     pytesseract = None
+
+
+log = UtilsInterface.LoggingUtility()
+
+
+def latlon_to_unit_vec(lat: float, lon: float) -> List[float]:
+    """Convert geographic lat/lon (deg) to a 3-D unit vector for Qdrant."""
+    lat_r = math.radians(lat)
+    lon_r = math.radians(lon)
+    return [
+        math.cos(lat_r) * math.cos(lon_r),
+        math.cos(lat_r) * math.sin(lon_r),
+        math.sin(lat_r),
+    ]
+
+
+class FileProcessor:
+    """Unified processor for text, tabular, office, JSON, **and image** files.
+
+    Each modality is embedded with its optimal model:
+      • Text → paraphrase‑MiniLM‑L6‑v2 (384‑D)
+      • Image → OpenCLIP ViT‑H/14 (1024‑D)
+      • Caption→ OpenCLIP text head (1024‑D)
+
+    Rich captions are generated via BLIP‑2 Flan‑T5‑XL.
+    GPU usage is optional; pass `use_gpu=False` to stay on CPU.
+    """
+
+    # ------------------------------------------------------------------ #
+    # Construction
+    # ------------------------------------------------------------------ #
+    def __init__(
+        self,
+        *,
+        max_workers: int = 4,
+        chunk_size: int = 512,
+        use_gpu: bool = True,
+        use_ocr: bool = True,
+        use_detection: bool = False,
+        image_model_name: str = "ViT-H-14",
+        caption_model_name: str = "Salesforce/blip2-flan-t5-xl",
+    ):
+        # Device selection
+        if use_gpu and torch.cuda.is_available():
+            self.device = torch.device("cuda")
+            self.torch_dtype = torch.float16
+        else:
+            self.device = torch.device("cpu")
+            self.torch_dtype = torch.float32
+
+        # Text embedder
+        self.embedding_model_name = "paraphrase-MiniLM-L6-v2"
+        self.embedding_model = SentenceTransformer(self.embedding_model_name)
+        self.embedding_model.to(str(self.device))
+
+        # Chunking parameters
+        self.max_seq_length = self.embedding_model.get_max_seq_length()
+        self.special_tokens_count = 2
+        self.effective_max_length = self.max_seq_length - self.special_tokens_count
+        self.chunk_size = min(chunk_size, self.effective_max_length * 4)
+
+        # Executor & logging
+        self._executor = ThreadPoolExecutor(max_workers=max_workers)
+        log.info(
+            "FileProcessor ready (device=%s, OCR=%s, detection=%s)",
+            self.device,
+            # self.use_ocr,
+            # self.use_detection,
+        )
+
+    # ------------------------------------------------------------------ #
+    # Generic validators *
+    # ------------------------------------------------------------------ #
+    def validate_file(self, file_path: Path):
+        """Ensure file exists and is under 100 MB."""
+        max_size = 100 * 1024 * 1024
+        if not file_path.exists():
+            raise FileNotFoundError(f"File not found: {file_path}")
+        if file_path.stat().st_size > max_size:
+            mb = max_size // (1024 * 1024)
+            raise ValueError(f"{file_path.name} > {mb} MB limit")
+
+    # ------------------------------------------------------------------ #
+    # File‑type detection (extension‑based – no libmagic)
+    # ------------------------------------------------------------------ #
+    def _detect_file_type(self, file_path: Path) -> str:
+        suffix = file_path.suffix.lower()
+        if suffix == ".pdf":
+            return "pdf"
+        if suffix == ".csv":
+            return "csv"
+        if suffix == ".json":
+            return "json"
+        if suffix in {".doc", ".docx", ".pptx"}:
+            return "office"
+        if suffix in {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff"}:
+            return "image"
+        text_exts = {
+            ".txt",
+            ".md",
+            ".rst",
+            ".c",
+            ".cpp",
+            ".cs",
+            ".go",
+            ".java",
+            ".js",
+            ".ts",
+            ".php",
+            ".py",
+            ".rb",
+            ".sh",
+            ".tex",
+            ".html",
+            ".css",
+        }
+        if suffix in text_exts:
+            return "text"
+        raise ValueError(f"Unsupported file type: {file_path.name} (ext={suffix})")
+
+    # ------------------------------------------------------------------ #
+    # Dispatcher
+    # ------------------------------------------------------------------ #
+    async def process_file(self, file_path: Union[str, Path]) -> Dict[str, Any]:
+        path = Path(file_path)
+        self.validate_file(path)
+        ftype = self._detect_file_type(path)
+        return await getattr(self, f"_process_{ftype}")(path)
+
+    # ------------------------------------------------------------------ #
+    # PDF
+    # ------------------------------------------------------------------ #
+    async def _process_pdf(self, file_path: Path) -> Dict[str, Any]:
+        page_chunks, doc_meta = await self._extract_text(file_path)
+        all_chunks, line_data = [], []
+        for page_text, page_num, line_nums in page_chunks:
+            lines = page_text.split("\n")
+            buf, buf_lines, length = [], [], 0
+            for line, ln in zip(lines, line_nums):
+                l = len(line) + 1
+                if length + l <= self.chunk_size:
+                    buf.append(line)
+                    buf_lines.append(ln)
+                    length += l
+                else:
+                    if buf:
+                        all_chunks.append("\n".join(buf))
+                        line_data.append({"page": page_num, "lines": buf_lines})
+                        buf, buf_lines, length = [], [], 0
+                    for piece in self._split_oversized_chunk(line):
+                        all_chunks.append(piece)
+                        line_data.append({"page": page_num, "lines": [ln]})
+            if buf:
+                all_chunks.append("\n".join(buf))
+                line_data.append({"page": page_num, "lines": buf_lines})
+
+        vectors = await asyncio.gather(
+            *[self._encode_chunk_async(c) for c in all_chunks]
+        )
+        return {
+            "content": "\n\n".join(all_chunks),
+            "metadata": {
+                **doc_meta,
+                "source": str(file_path),
+                "chunks": len(all_chunks),
+                "type": "pdf",
+            },
+            "chunks": all_chunks,
+            "vectors": [v.tolist() for v in vectors],
+            "line_data": line_data,
+        }
+
+    # ------------------------------------------------------------------ #
+    # Plain‑text / code / markup
+    # ------------------------------------------------------------------ #
+    async def _process_text(self, file_path: Path) -> Dict[str, Any]:
+        text, extra_meta, _ = await self._extract_text(file_path)
+        chunks = self._chunk_text(text)
+        vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
+        return {
+            "content": text,
+            "metadata": {
+                **extra_meta,
+                "source": str(file_path),
+                "chunks": len(chunks),
+                "type": "text",
+            },
+            "chunks": chunks,
+            "vectors": [v.tolist() for v in vectors],
+        }
+
+    # ------------------------------------------------------------------ #
+    # CSV
+    # ------------------------------------------------------------------ #
+    async def _process_csv(
+        self, file_path: Path, text_field: str = "description"
+    ) -> Dict[str, Any]:
+        rows, texts, metas = [], [], []
+        with file_path.open(newline="", encoding="utf-8") as f:
+            reader = csv.DictReader(f)
+            for row in reader:
+                txt = row.get(text_field, "").strip()
+                if not txt:
+                    continue
+                texts.append(txt)
+                metas.append({k: v for k, v in row.items() if k != text_field and v})
+        vectors = await asyncio.gather(*[self._encode_chunk_async(t) for t in texts])
+        return {
+            "content": None,
+            "metadata": {"source": str(file_path), "rows": len(texts), "type": "csv"},
+            "chunks": texts,
+            "vectors": [v.tolist() for v in vectors],
+            "csv_row_metadata": metas,
+        }
+
+    # ------------------------------------------------------------------ #
+    # Office docs
+    # ------------------------------------------------------------------ #
+    async def _process_office(self, file_path: Path) -> Dict[str, Any]:
+        loop = asyncio.get_event_loop()
+        if file_path.suffix.lower() in {".doc", ".docx"}:
+            text = await loop.run_in_executor(
+                self._executor, self._read_docx, file_path
+            )
+        else:
+            text = await loop.run_in_executor(
+                self._executor, self._read_pptx, file_path
+            )
+        chunks = self._chunk_text(text)
+        vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
+        return {
+            "content": text,
+            "metadata": {
+                "source": str(file_path),
+                "chunks": len(chunks),
+                "type": "office",
+            },
+            "chunks": chunks,
+            "vectors": [v.tolist() for v in vectors],
+        }
+
+    # ------------------------------------------------------------------ #
+    # JSON
+    # ------------------------------------------------------------------ #
+    async def _process_json(self, file_path: Path) -> Dict[str, Any]:
+        text = await asyncio.get_event_loop().run_in_executor(
+            self._executor, self._read_json, file_path
+        )
+        chunks = self._chunk_text(text)
+        vectors = await asyncio.gather(*[self._encode_chunk_async(c) for c in chunks])
+        return {
+            "content": text,
+            "metadata": {
+                "source": str(file_path),
+                "chunks": len(chunks),
+                "type": "json",
+            },
+            "chunks": chunks,
+            "vectors": [v.tolist() for v in vectors],
+        }
+
+    # ------------------------------------------------------------------ #
+    # Shared helpers
+    # ------------------------------------------------------------------ #
+    async def _extract_text(self, file_path: Path) -> Union[
+        Tuple[List[Tuple[str, int, List[int]]], Dict[str, Any]],
+        Tuple[str, Dict[str, Any], List[int]],
+    ]:
+        loop = asyncio.get_event_loop()
+        if file_path.suffix.lower() == ".pdf":
+            return await loop.run_in_executor(
+                self._executor, self._extract_pdf_text, file_path
+            )
+        text = await loop.run_in_executor(
+            self._executor, self._read_text_file, file_path
+        )
+        return text, {}, []
+
+    # ------------------------------------------------------------------ #
+    # util: clip‑text encoder (public)
+    # ------------------------------------------------------------------ #
+    def encode_clip_text(self, text: Union[str, List[str]]) -> np.ndarray:
+        with torch.no_grad():
+            toks = (
+                self.clip_tokenizer(text)
+                if isinstance(text, str)
+                else self.clip_tokenizer(text, truncate=True)
+            )
+            tensor = toks.unsqueeze(0).to(self.device)
+            feat = self.clip_model.encode_text(tensor).squeeze()
+            feat = feat / feat.norm()
+            return feat.float().cpu().numpy()
+
+    def _extract_pdf_text(self, file_path: Path):
+        page_chunks, meta = [], {}
+        with pdfplumber.open(file_path) as pdf:
+            meta.update(
+                {
+                    "author": pdf.metadata.get("Author", ""),
+                    "title": pdf.metadata.get("Title", file_path.stem),
+                    "page_count": len(pdf.pages),
+                }
+            )
+            for i, page in enumerate(pdf.pages, start=1):
+                lines = page.extract_text_lines()
+                sorted_lines = sorted(lines, key=lambda x: x["top"])
+                txts, nums = [], []
+                for ln_idx, line in enumerate(sorted_lines, start=1):
+                    t = line.get("text", "").strip()
+                    if t:
+                        txts.append(t)
+                        nums.append(ln_idx)
+                if txts:
+                    page_chunks.append(("\n".join(txts), i, nums))
+        return page_chunks, meta
+
+    def _read_text_file(self, file_path: Path) -> str:
+        try:
+            return file_path.read_text(encoding="utf-8")
+        except UnicodeDecodeError:
+            return file_path.read_text(encoding="latin-1")
+
+    def _read_docx(self, path: Path) -> str:
+        doc = Document(path)
+        return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
+
+    def _read_pptx(self, path: Path) -> str:
+        prs = Presentation(path)
+        slides = []
+        for slide in prs.slides:
+            chunks = [sh.text for sh in slide.shapes if hasattr(sh, "text")]
+            slides.append("\n".join(filter(None, chunks)))
+        return "\n\n".join(slides)
+
+    def _read_json(self, path: Path) -> str:
+        obj = json.loads(path.read_text(encoding="utf-8"))
+        pretty = json.dumps(obj, indent=2, ensure_ascii=False)
+        return "\n".join(textwrap.wrap(pretty, width=120))
+
+    async def _encode_chunk_async(self, chunk: str) -> np.ndarray:
+        return await asyncio.get_event_loop().run_in_executor(
+            self._executor,
+            lambda: self.embedding_model.encode(
+                [chunk],
+                convert_to_numpy=True,
+                truncate="model_max_length",
+                normalize_embeddings=True,
+                show_progress_bar=False,
+            )[0],
+        )
+
+    # ------------------------------------------------------------------ #
+    # Text chunking helpers
+    # ------------------------------------------------------------------ #
+    def _chunk_text(self, text: str) -> List[str]:
+        sentences = re.split(r"(?<=[\.!?])\s+", text)
+        chunks, buf, length = [], [], 0
+        for sent in sentences:
+            slen = len(sent) + 1
+            if length + slen <= self.chunk_size:
+                buf.append(sent)
+                length += slen
+            else:
+                if buf:
+                    chunks.append(" ".join(buf))
+                    buf, length = [], 0
+                while len(sent) > self.chunk_size:
+                    part, sent = sent[: self.chunk_size], sent[self.chunk_size :]
+                    chunks.append(part)
+                buf, length = [sent], len(sent)
+        if buf:
+            chunks.append(" ".join(buf))
+        return chunks
+
+    def _split_oversized_chunk(self, chunk: str, tokens: List[str] = None) -> List[str]:
+        if tokens is None:
+            tokens = self.embedding_model.tokenizer.tokenize(chunk)
+        out = []
+        for i in range(0, len(tokens), self.effective_max_length):
+            seg = tokens[i : i + self.effective_max_length]
+            out.append(self.embedding_model.tokenizer.convert_tokens_to_string(seg))
+        return out
+
+    # ------------------------------------------------------------------ #
+    # Retrieval helpers (optional use)
+    # ------------------------------------------------------------------ #
+    def encode_text(self, text: Union[str, List[str]]) -> np.ndarray:
+        """Embed raw text with the SentenceTransformer model."""
+        single = isinstance(text, str)
+        out = self.embedding_model.encode(
+            text,
+            convert_to_numpy=True,
+            normalize_embeddings=True,
+            show_progress_bar=False,
+        )
+        return out if not single else out[0]
+
+    def encode_image(self, img: Image.Image) -> np.ndarray:
+        with torch.no_grad():
+            tensor = self.clip_preprocess(img).unsqueeze(0).to(self.device)
+            feat = self.clip_model.encode_image(tensor).squeeze()
+            feat = feat / feat.norm()
+            return feat.float().cpu().numpy()
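The latlon_to_unit_vec helper carried over into this new module is plain spherical geometry: since cos²(lat)·cos²(lon) + cos²(lat)·sin²(lon) + sin²(lat) = 1, the returned vector always has unit norm. A standalone check (the example coordinates are ours, not the package's):

import math

def latlon_to_unit_vec(lat: float, lon: float) -> list:
    # Same formula as the helper above: (cos·cos, cos·sin, sin).
    lat_r, lon_r = math.radians(lat), math.radians(lon)
    return [
        math.cos(lat_r) * math.cos(lon_r),
        math.cos(lat_r) * math.sin(lon_r),
        math.sin(lat_r),
    ]

v = latlon_to_unit_vec(51.5074, -0.1278)  # example: central London
assert abs(sum(c * c for c in v) - 1.0) < 1e-12  # unit length by the identity above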
{projectdavid-1.33.13.dist-info → projectdavid-1.33.14.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: projectdavid
-Version: 1.33.13
+Version: 1.33.14
 Summary: Python SDK for interacting with the Entities Assistant API.
 Author-email: Francis Neequaye Armah <francis.neequaye@projectdavid.co.uk>
 License: PolyForm Noncommercial License 1.0.0
{projectdavid-1.33.13.dist-info → projectdavid-1.33.14.dist-info}/RECORD
@@ -10,7 +10,7 @@ projectdavid/clients/assistants_client.py,sha256=SsIGa5wPr7ga9WX0ywam3djUF-uWFdk
 projectdavid/clients/base_client.py,sha256=UWl6nr6sxD1_xC6iyptQDR1tnNdFCOrEx5cEUPCRqJE,3417
 projectdavid/clients/base_vector_store.py,sha256=jXivmqAW1bgYcLgIeW-hPxOiWZbs2hCsLy4oWzSvpNI,2061
 projectdavid/clients/event_handler.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-projectdavid/clients/file_processor.py,sha256=
+projectdavid/clients/file_processor.py,sha256=t-Uw-kBP_VmlguMxO9PWY6ANuMAY0PstQDW37wLPF0Q,13980
 projectdavid/clients/file_search.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 projectdavid/clients/files_client.py,sha256=XkIDzbQFGDrd88taf0Kouc_4YJOPIYEHiIyWYLKDofI,15581
 projectdavid/clients/inference_client.py,sha256=xz4ACPv5Tkis604QxO5mJX1inH_TGDfQP-31geETYpE,6609
@@ -22,6 +22,7 @@ projectdavid/clients/tools_client.py,sha256=GkCVOmwpAoPqVt6aYmH0G1HIFha3iEwR9IIf
 projectdavid/clients/users_client.py,sha256=eCuUb9qvyH1GUFhZu6TRL9zdoK-qzHSs8-Vmrk_0mmg,13729
 projectdavid/clients/vector_store_manager.py,sha256=q-ZgRQVX_S3nMrKYhmvkVrDjDRzM3ZFzUF55HBGRTe8,12861
 projectdavid/clients/vectors.py,sha256=cysPVbUzW3byB82MTqG2X1Iz5ZAe82WTS1JfQcoqVhE,40229
+projectdavid/clients/vision-file_processor.py,sha256=19ft9IUeY5x9_22vC4JqndiFlpDYyUn6z1ygv-EV2NE,16852
 projectdavid/constants/platform.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 projectdavid/services/logging_service.py,sha256=jdoRL46E42Ar8JFTDOV-xVD67CulcHSN-xhcEqA5CXQ,2643
 projectdavid/synthesis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -35,8 +36,8 @@ projectdavid/utils/monitor_launcher.py,sha256=3YAgJdeuaUvq3JGvpA4ymqFsAnk29nH5q9
 projectdavid/utils/peek_gate.py,sha256=5whMRnDOQjATRpThWDJkvY9ScXuJ7Sd_-9rvGgXeTAQ,2532
 projectdavid/utils/run_monitor.py,sha256=F_WkqIP-qnWH-4llIbileWWLfRj2Q1Cg-ni23SR1rec,3786
 projectdavid/utils/vector_search_formatter.py,sha256=YTe3HPGec26qGY7uxY8_GS8lc4QaN6aNXMzkl29nZpI,1735
-projectdavid-1.33.
-projectdavid-1.33.
-projectdavid-1.33.
-projectdavid-1.33.
-projectdavid-1.33.
+projectdavid-1.33.14.dist-info/licenses/LICENSE,sha256=_8yjiEGttpS284BkfhXxfERqTRZW_tUaHiBB0GTJTMg,4563
+projectdavid-1.33.14.dist-info/METADATA,sha256=jFWdJGL8LYBQNEoEqBZ6DhLJ-HnVgLsvQ06K7PAkpRA,11555
+projectdavid-1.33.14.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+projectdavid-1.33.14.dist-info/top_level.txt,sha256=kil8GU4s7qYRfNnzGnFHhZnSNRSxgNG-J4HLgQMmMtw,13
+projectdavid-1.33.14.dist-info/RECORD,,
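For readers unfamiliar with RECORD files: per the wheel spec, each row above is path,sha256=<digest>,<size>, where the digest is the urlsafe-base64 SHA-256 of the file with trailing = padding stripped. A small sketch of recomputing such a row to verify a packaged file (hypothetical helper, not part of projectdavid):

import base64
import hashlib
from pathlib import Path

def record_entry(path: str) -> str:
    # RECORD digests use the urlsafe base64 alphabet with '=' padding removed.
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"{path},sha256={digest.decode()},{len(data)}"

print(record_entry("projectdavid/clients/file_processor.py"))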
File without changes: {projectdavid-1.33.13.dist-info → projectdavid-1.33.14.dist-info}/WHEEL
File without changes: {projectdavid-1.33.13.dist-info → projectdavid-1.33.14.dist-info}/licenses/LICENSE
File without changes: {projectdavid-1.33.13.dist-info → projectdavid-1.33.14.dist-info}/top_level.txt