projectdavid 1.29.9__py3-none-any.whl → 1.38.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -61,13 +61,16 @@ class VectorStoreClient:
61
61
  • create_vector_store() no longer takes user_id; ownership from token.
62
62
  """
63
63
 
64
- # Construction / cleanup
64
+ # ------------------------------------------------------------------ #
65
+ # Construction / cleanup
66
+ # ------------------------------------------------------------------ #
65
67
  def __init__(
66
68
  self,
67
69
  base_url: Optional[str] = None,
68
70
  api_key: Optional[str] = None,
69
71
  *,
70
72
  vector_store_host: str = "localhost",
73
+ file_processor_kwargs: Optional[dict] = None, # 🔶 add arg
71
74
  ):
72
75
  self.base_url = (base_url or os.getenv("BASE_URL", "")).rstrip("/")
73
76
  self.api_key = api_key or os.getenv("API_KEY")
@@ -84,9 +87,13 @@ class VectorStoreClient:
84
87
  base_url=self.base_url, headers=self._base_headers, timeout=30.0
85
88
  )
86
89
 
87
- # Local helpers
90
+ # Local helpers ---------------------------------------------------
88
91
  self.vector_manager = VectorStoreManager(vector_store_host=vector_store_host)
89
92
  self.identifier_service = UtilsInterface.IdentifierService()
93
+
94
+ # 🔶 forward kwargs into the upgraded FileProcessor
95
+ # self.file_processor = FileProcessor(**(file_processor_kwargs or {}))
96
+ # Using Stripped down version for now until we move forward with multi-modal stores
90
97
  self.file_processor = FileProcessor()
91
98
 
92
99
  log.info("VectorStoreClient → %s", self.base_url)
@@ -240,32 +247,31 @@ class VectorStoreClient:
240
247
  ) -> ValidationInterface.VectorStoreFileRead:
241
248
  processed = await self.file_processor.process_file(p)
242
249
  texts, vectors = processed["chunks"], processed["vectors"]
243
- line_data = processed.get("line_data") or [] # ← NEW
244
-
245
- base_md = meta or {}
246
- base_md.update({"source": str(p), "file_name": p.name})
250
+ line_data = processed.get("line_data") or []
247
251
 
252
+ base_md = (meta or {}) | {"source": str(p), "file_name": p.name}
248
253
  file_record_id = f"vsf_{uuid.uuid4()}"
249
254
 
250
- # Build per‑chunk payload, now including page/lines if present
251
255
  chunk_md = []
252
- for i in range(len(texts)):
253
- payload = {
254
- **base_md,
255
- "chunk_index": i,
256
- "file_id": file_record_id,
257
- }
258
- if i < len(line_data): # ← NEW
259
- payload.update(line_data[i]) # {'page': …, 'lines': …}
256
+ for i, txt in enumerate(texts):
257
+ payload = {**base_md, "chunk_index": i, "file_id": file_record_id}
258
+ if i < len(line_data):
259
+ payload.update(line_data[i]) # {'page':…, 'lines':…}
260
260
  chunk_md.append(payload)
261
261
 
262
+ # 🔑 1. look up the backend store to get its *collection* name
263
+ store = self.retrieve_vector_store_sync(vector_store_id)
264
+ collection_name = store.collection_name
265
+
266
+ # 🔑 2. upsert via VectorStoreManager (auto-detects vector field)
262
267
  self.vector_manager.add_to_store(
263
- store_name=vector_store_id,
268
+ store_name=collection_name,
264
269
  texts=texts,
265
270
  vectors=vectors,
266
271
  metadata=chunk_md,
267
272
  )
268
273
 
274
+ # 3. register the file with the API
269
275
  resp = await self._request(
270
276
  "POST",
271
277
  f"/v1/vector-stores/{vector_store_id}/files",
@@ -287,26 +293,36 @@ class VectorStoreClient:
287
293
  filters: Optional[Dict] = None,
288
294
  vector_store_host: Optional[str] = None,
289
295
  ) -> List[Dict[str, Any]]:
290
- # Use the provided vector_store_host if specified, otherwise fall back to the default
291
- if vector_store_host:
292
- vector_manager = VectorStoreManager(vector_store_host=vector_store_host)
293
- else:
294
- vector_manager = self.vector_manager
296
+
297
+ # pick local vs. override host
298
+ vector_manager = (
299
+ VectorStoreManager(vector_store_host=vector_store_host)
300
+ if vector_store_host
301
+ else self.vector_manager
302
+ )
295
303
 
296
304
  store = self.retrieve_vector_store_sync(vector_store_id)
297
- vec = self.file_processor.embedding_model.encode(query_text).tolist()
305
+
306
+ # 🔶 choose encoder by vector_size
307
+ if store.vector_size == 1024: # images collection
308
+ vec = self.file_processor.encode_clip_text(query_text).tolist()
309
+ vector_field = "caption_vector" # field name in Qdrant
310
+ else: # 384-D text collection
311
+ vec = self.file_processor.encode_text(query_text).tolist()
312
+ vector_field = None # default field
298
313
 
299
314
  return vector_manager.query_store(
300
315
  store_name=store.collection_name,
301
316
  query_vector=vec,
302
317
  top_k=top_k,
303
318
  filters=filters,
319
+ vector_field=vector_field,
304
320
  )
305
321
 
306
- async def _delete_vs_async(
307
- self, vector_store_id: str, permanent: bool
308
- ) -> Dict[str, Any]:
309
- qres = self.vector_manager.delete_store(vector_store_id)
322
+ async def _delete_vs_async(self, vector_store_id: str, permanent: bool):
323
+ # collection deletion must use the *collection* name
324
+ store = self.retrieve_vector_store_sync(vector_store_id)
325
+ qres = self.vector_manager.delete_store(store.collection_name)
310
326
  await self._request(
311
327
  "DELETE",
312
328
  f"/v1/vector-stores/{vector_store_id}",
@@ -319,10 +335,11 @@ class VectorStoreClient:
319
335
  "qdrant_result": qres,
320
336
  }
321
337
 
322
- async def _delete_file_async(
323
- self, vector_store_id: str, file_path: str
324
- ) -> Dict[str, Any]:
325
- fres = self.vector_manager.delete_file_from_store(vector_store_id, file_path)
338
+ async def _delete_file_async(self, vector_store_id: str, file_path: str):
339
+ store = self.retrieve_vector_store_sync(vector_store_id)
340
+ fres = self.vector_manager.delete_file_from_store(
341
+ store.collection_name, file_path
342
+ )
326
343
  await self._request(
327
344
  "DELETE",
328
345
  f"/v1/vector-stores/{vector_store_id}/files",
@@ -454,19 +471,101 @@ class VectorStoreClient:
454
471
  )
455
472
  )
456
473
 
474
+ # ───────────────────────────────────────────────────────────────
475
+ # Convenience: ensure a per-user “file_search” store exists
476
+ # ───────────────────────────────────────────────────────────────
477
+ # unchanged … (get_or_create_file_search_store)
478
+
457
479
  def list_my_vector_stores(self) -> List[ValidationInterface.VectorStoreRead]:
458
- """List all non-deleted stores owned by the caller."""
480
+ """List all non-deleted stores owned by *this* API-key’s user."""
459
481
  return self._run_sync(self._list_my_vs_async())
460
482
 
483
+ # ───────────────────────────────────────────────────────────────
484
+ # NEW: real per-user listing (admin-only)
485
+ # ───────────────────────────────────────────────────────────────
486
+ async def _list_vs_by_user_async(self, user_id: str):
487
+ resp = await self._request(
488
+ "GET",
489
+ "/v1/vector-stores/admin/by-user",
490
+ params={"owner_id": user_id},
491
+ )
492
+ return [ValidationInterface.VectorStoreRead.model_validate(r) for r in resp]
493
+
461
494
  def get_stores_by_user(
462
- self, _user_id: str
495
+ self,
496
+ _user_id: str,
463
497
  ) -> List[ValidationInterface.VectorStoreRead]: # noqa: ARG002
498
+ """
499
+ ⚠️ **Deprecated** – prefer impersonating the user’s API-key or using
500
+ the newer RBAC endpoints, but keep working for legacy code.
501
+ """
464
502
  warnings.warn(
465
- "`get_stores_by_user()` is deprecated; use `list_my_vector_stores()`.",
503
+ "`get_stores_by_user()` is deprecated; use `list_my_vector_stores()` or "
504
+ "`VectorStoreClient(list_my_vector_stores)` with an impersonated key.",
466
505
  DeprecationWarning,
467
506
  stacklevel=2,
468
507
  )
469
- return self.list_my_vector_stores()
508
+ return self._run_sync(self._list_vs_by_user_async(_user_id))
509
+
510
+ # ───────────────────────────────────────────────────────────────
511
+ # Convenience: ensure a per-user “file_search” store exists
512
+ # ───────────────────────────────────────────────────────────────
513
+ def get_or_create_file_search_store(self, user_id: Optional[str] = None) -> str:
514
+ """
515
+ Return the *oldest* vector-store named **file_search** for ``user_id``;
516
+ create one if none exist.
517
+
518
+ Parameters
519
+ ----------
520
+ user_id : Optional[str]
521
+ • If **None** → operate on *this* API-key’s stores
522
+ • If not None → *admin-only* – look up / create on behalf of ``user_id``
523
+
524
+ Returns
525
+ -------
526
+ str
527
+ The vector-store **id**.
528
+ """
529
+
530
+ # 1️⃣ Fetch candidate stores
531
+ if user_id is None:
532
+ # Normal user context – only see caller-owned stores
533
+ stores = self.list_my_vector_stores()
534
+ else:
535
+ # Admin context – may inspect another user’s stores
536
+ stores = self.get_stores_by_user(_user_id=user_id)
537
+
538
+ file_search_stores = [s for s in stores if s.name == "file_search"]
539
+
540
+ if file_search_stores:
541
+ # 2️⃣ Pick the *earliest* (oldest created_at) to keep things stable
542
+ chosen = min(
543
+ file_search_stores,
544
+ key=lambda s: (s.created_at or 0),
545
+ )
546
+ log.info(
547
+ "Re-using existing 'file_search' store %s for user %s",
548
+ chosen.id,
549
+ user_id or "<self>",
550
+ )
551
+ return chosen.id
552
+
553
+ # 3️⃣ Nothing found → create a fresh store
554
+ if user_id is None:
555
+ new_store = self.create_vector_store(name="file_search")
556
+ else:
557
+ # Requires admin API-key
558
+ new_store = self.create_vector_store_for_user(
559
+ owner_id=user_id,
560
+ name="file_search",
561
+ )
562
+
563
+ log.info(
564
+ "Created new 'file_search' store %s for user %s",
565
+ new_store.id,
566
+ user_id or "<self>",
567
+ )
568
+ return new_store.id
470
569
 
471
570
  def add_file_to_vector_store(
472
571
  self,
@@ -479,6 +578,67 @@ class VectorStoreClient:
479
578
  raise FileNotFoundError(f"File not found: {p}")
480
579
  return self._run_sync(self._add_file_async(vector_store_id, p, user_metadata))
481
580
 
581
+ def delete_vector_store(
582
+ self,
583
+ vector_store_id: str,
584
+ permanent: bool = False,
585
+ ) -> Dict[str, Any]:
586
+ return self._run_sync(self._delete_vs_async(vector_store_id, permanent))
587
+
588
+ def delete_file_from_vector_store(
589
+ self,
590
+ vector_store_id: str,
591
+ file_path: str,
592
+ ) -> Dict[str, Any]:
593
+ return self._run_sync(self._delete_file_async(vector_store_id, file_path))
594
+
595
+ def list_store_files(
596
+ self,
597
+ vector_store_id: str,
598
+ ) -> List[ValidationInterface.VectorStoreFileRead]:
599
+ return self._run_sync(self._list_store_files_async(vector_store_id))
600
+
601
+ def update_vector_store_file_status(
602
+ self,
603
+ vector_store_id: str,
604
+ file_id: str,
605
+ status: ValidationInterface.StatusEnum,
606
+ error_message: Optional[str] = None,
607
+ ) -> ValidationInterface.VectorStoreFileRead:
608
+ return self._run_sync(
609
+ self._update_file_status_async(
610
+ vector_store_id, file_id, status, error_message
611
+ )
612
+ )
613
+
614
+ def get_vector_stores_for_assistant(
615
+ self,
616
+ assistant_id: str,
617
+ ) -> List[ValidationInterface.VectorStoreRead]:
618
+ return self._run_sync(self._get_assistant_vs_async(assistant_id))
619
+
620
+ def attach_vector_store_to_assistant(
621
+ self,
622
+ vector_store_id: str,
623
+ assistant_id: str,
624
+ ) -> bool:
625
+ return self._run_sync(self._attach_vs_async(vector_store_id, assistant_id))
626
+
627
+ def detach_vector_store_from_assistant(
628
+ self,
629
+ vector_store_id: str,
630
+ assistant_id: str,
631
+ ) -> bool:
632
+ return self._run_sync(self._detach_vs_async(vector_store_id, assistant_id))
633
+
634
+ def retrieve_vector_store_sync(
635
+ self,
636
+ vector_store_id: str,
637
+ ) -> ValidationInterface.VectorStoreRead:
638
+ resp = self._sync_api_client.get(f"/v1/vector-stores/{vector_store_id}")
639
+ resp.raise_for_status()
640
+ return ValidationInterface.VectorStoreRead.model_validate(resp.json())
641
+
482
642
  def vector_file_search_raw(
483
643
  self,
484
644
  vector_store_id: str,
@@ -545,71 +705,10 @@ class VectorStoreClient:
545
705
  # 4️⃣ Wrap everything into an OpenAI envelope
546
706
  return make_envelope(query_text, hits, answer_text)
547
707
 
548
- def delete_vector_store(
549
- self,
550
- vector_store_id: str,
551
- permanent: bool = False,
552
- ) -> Dict[str, Any]:
553
- return self._run_sync(self._delete_vs_async(vector_store_id, permanent))
554
-
555
- def delete_file_from_vector_store(
556
- self,
557
- vector_store_id: str,
558
- file_path: str,
559
- ) -> Dict[str, Any]:
560
- return self._run_sync(self._delete_file_async(vector_store_id, file_path))
561
-
562
- def list_store_files(
563
- self,
564
- vector_store_id: str,
565
- ) -> List[ValidationInterface.VectorStoreFileRead]:
566
- return self._run_sync(self._list_store_files_async(vector_store_id))
567
-
568
- def update_vector_store_file_status(
569
- self,
570
- vector_store_id: str,
571
- file_id: str,
572
- status: ValidationInterface.StatusEnum,
573
- error_message: Optional[str] = None,
574
- ) -> ValidationInterface.VectorStoreFileRead:
575
- return self._run_sync(
576
- self._update_file_status_async(
577
- vector_store_id, file_id, status, error_message
578
- )
579
- )
580
-
581
- def get_vector_stores_for_assistant(
582
- self,
583
- assistant_id: str,
584
- ) -> List[ValidationInterface.VectorStoreRead]:
585
- return self._run_sync(self._get_assistant_vs_async(assistant_id))
586
-
587
- def attach_vector_store_to_assistant(
588
- self,
589
- vector_store_id: str,
590
- assistant_id: str,
591
- ) -> bool:
592
- return self._run_sync(self._attach_vs_async(vector_store_id, assistant_id))
593
-
594
- def detach_vector_store_from_assistant(
595
- self,
596
- vector_store_id: str,
597
- assistant_id: str,
598
- ) -> bool:
599
- return self._run_sync(self._detach_vs_async(vector_store_id, assistant_id))
600
-
601
- def retrieve_vector_store_sync(
602
- self,
603
- vector_store_id: str,
604
- ) -> ValidationInterface.VectorStoreRead:
605
- resp = self._sync_api_client.get(f"/v1/vector-stores/{vector_store_id}")
606
- resp.raise_for_status()
607
- return ValidationInterface.VectorStoreRead.model_validate(resp.json())
608
-
609
708
  # ────────────────────────────────────────────────────────────────
610
709
  # End‑to‑end: retrieve → (rerank) → synthesize → envelope
611
710
  # ────────────────────────────────────────────────────────────────
612
- def file_search(
711
+ def attended_file_search(
613
712
  self,
614
713
  vector_store_id: str,
615
714
  query_text: str,
@@ -659,3 +758,58 @@ class VectorStoreClient:
659
758
  base_url=self.base_url, # Same backend
660
759
  provider_api_key=os.getenv("HYPERBOLIC_API_KEY"), # Hyperbolic key
661
760
  )
761
+
762
+ # ────────────────────────────────────────────────────────────────
763
+ # End‑to‑end: retrieve → (rerank) → synthesize → envelope
764
+ # ────────────────────────────────────────────────────────────────
765
+ def unattended_file_search(
766
+ self,
767
+ vector_store_id: str,
768
+ query_text: str,
769
+ k: int = 20,
770
+ vector_store_host: Optional[str] = None,
771
+ ) -> Dict[str, Any]:
772
+ """
773
+ Perform a search over the file vector store and return normalized retrieval hits.
774
+
775
+ This method executes a bare search pipeline: it retrieves vector-based candidates
776
+ using semantic similarity, optionally applies reranking (e.g., cross-encoder or LLM-based),
777
+ and normalizes the result schema. It does not perform synthesis or construct an OpenAI-style envelope.
778
+
779
+ Use this when you want direct access to retrieved content for custom downstream handling,
780
+ logging, inspection, or separate orchestration logic.
781
+
782
+ Parameters
783
+ ----------
784
+ vector_store_id : str
785
+ The ID of the vector store to search within.
786
+ query_text : str
787
+ The user query in natural language.
788
+ k : int, optional
789
+ The number of top hits to retrieve (default is 20).
790
+ vector_store_host : Optional[str], optional
791
+ Optional override for the vector store host (e.g., when calling remote Qdrant).
792
+
793
+ Returns
794
+ -------
795
+ Dict[str, Any]
796
+ A normalized list of retrieval results (each with metadata and score),
797
+ without abstraction, synthesis, or formatting.
798
+ """
799
+
800
+ # 1️⃣ Retrieve initial candidates (now with optional vector_store_host passthrough)
801
+ hits = retriever.retrieve(
802
+ self,
803
+ vector_store_id=vector_store_id,
804
+ query=query_text,
805
+ k=k,
806
+ vector_store_host=vector_store_host,
807
+ )
808
+
809
+ # 2️⃣ Optional cross-encoder / LLM rerank
810
+ hits = reranker.rerank(query_text, hits, top_k=min(len(hits), 10))
811
+
812
+ # 3️⃣ Normalize schema (guarantee 'meta_data')
813
+ hits = self._normalise_hits(hits)
814
+
815
+ return hits