sunholo 0.118.10__py3-none-any.whl → 0.119.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sunholo/discovery_engine/chunker_handler.py

@@ -65,8 +65,9 @@ def do_discovery_engine(message_data:str, metadata:dict, config:ConfigManager=No
         return None
     for corp in corpuses:
         try:
-            response = corp.import_documents(
-                gcs_uri=message_data
+            response = corp.import_document_with_metadata(
+                gcs_uri=message_data,
+                metadata=metadata
             )
             log.info(f"Imported file to corpus: {response} with metadata: {metadata}")
         except Exception as err:
@@ -81,8 +82,9 @@ def do_discovery_engine(message_data:str, metadata:dict, config:ConfigManager=No
             continue
         if new_corp:
             log.info(f"Found new DiscoveryEngine {config.vector_name=} - {new_corp=}")
-            response = corp.import_documents(
-                gcs_uri=message_data
+            response = corp.import_document_with_metadata(
+                gcs_uri=message_data,
+                metadata=metadata
             )

             continue
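With this release `do_discovery_engine` forwards the Pub/Sub metadata to Discovery Engine instead of importing the bare GCS URI. A minimal sketch of the new call site, assuming `corp` is a configured `DiscoveryEngineClient`; the bucket path and metadata values are hypothetical:

```python
# Sketch only: "corp" is a DiscoveryEngineClient as in do_discovery_engine();
# the bucket path and metadata dict below are hypothetical examples.
operation_name = corp.import_document_with_metadata(
    gcs_uri="gs://my-bucket/docs/report.pdf",
    metadata={"source": "pubsub", "vector_name": "my_vac"},
)
```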
sunholo/discovery_engine/create_new.py

@@ -1,6 +1,7 @@
 from .discovery_engine_client import DiscoveryEngineClient
 from ..utils import ConfigManager
 from ..utils.gcp_project import get_gcp_project
+from ..custom_logging import log

 def create_new_discovery_engine(config:ConfigManager):

@@ -12,9 +13,13 @@ def create_new_discovery_engine(config:ConfigManager):
     chunk_size = chunker_config["chunk_size"]

     gcp_config = config.vacConfig("gcp_config")
-    project_id = gcp_config.get("project_id") or get_gcp_project()
+    if not gcp_config:
+        log.info("Found no gcp_config in configuration so using get_gcp_project()")
+        project_id = get_gcp_project()
+    else:
+        project_id = gcp_config.get("project_id") or get_gcp_project()
     if not project_id:
-        raise ValueError("Could not find project_id in gcp_config")
+        raise ValueError("Could not find project_id in gcp_config or global")

     #location = gcp_config.get('location')

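The guard matters because `config.vacConfig("gcp_config")` can return `None`, and the old line then raised `AttributeError` on `.get()` before the `ValueError` could fire. The resolution order is now config value first, then environment lookup. A standalone sketch of the same pattern (the helper name is hypothetical):

```python
# Hypothetical helper illustrating the fallback order used above.
from typing import Callable, Optional

def resolve_project_id(gcp_config: Optional[dict],
                       env_lookup: Callable[[], Optional[str]]) -> str:
    # A missing gcp_config previously meant None.get(...) -> AttributeError;
    # now it falls back to the environment lookup (get_gcp_project above).
    project_id = (gcp_config or {}).get("project_id") or env_lookup()
    if not project_id:
        raise ValueError("Could not find project_id in gcp_config or global")
    return project_id
```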
sunholo/discovery_engine/discovery_engine_client.py

@@ -1,6 +1,6 @@
 try:
     from google.api_core.client_options import ClientOptions
-    from google.cloud import discoveryengine_v1alpha as discoveryengine
+    from google.cloud import discoveryengine
     from google.api_core.retry import Retry, if_exception_type
     from google.api_core.exceptions import ResourceExhausted, AlreadyExists
     from google.cloud.discoveryengine_v1alpha import SearchResponse, Chunk
@@ -13,6 +13,9 @@ except ImportError:
 from ..custom_logging import log
 from typing import Optional, List
 import asyncio
+import json
+import uuid
+from ..utils.mime import guess_mime_type

 class DiscoveryEngineClient:
     """
@@ -431,7 +434,37 @@ class DiscoveryEngineClient:

         return operation.operation.name

+    def _import_document_request(self,
+                                 request
+                                 ) -> str:
+        """
+        Handles the common logic for making an ImportDocumentsRequest, including retrying.

+        Args:
+            request (discoveryengine.ImportDocumentsRequest): The prepared request object.
+
+        Returns:
+            str: The operation name.
+        """
+        @self.my_retry()
+        def import_documents_with_retry(doc_client, request):
+            return doc_client.import_documents(request=request)
+
+        try:
+            operation = import_documents_with_retry(self.doc_client, request)
+        except ResourceExhausted as e:
+            log.error(f"DiscoveryEngine Operation failed after retries due to quota exceeded: {e}")
+            raise e
+        except AlreadyExists as e:
+            # Extract relevant info from the request to log
+            gcs_uri = request.gcs_source.input_uris if request.gcs_source else None
+            bigquery_table = request.bigquery_source.table_id if request.bigquery_source else None
+            log.warning(f"DiscoveryEngine - Already exists: {e} - {gcs_uri=} {bigquery_table=}")
+        except Exception as e:
+            log.error(f"An unexpected DiscoveryEngine error occurred: {e}")
+            raise e
+
+        return operation.operation.name

     def import_documents(self,
                          gcs_uri: Optional[str] = None,
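`_import_document_request` centralises the retry-and-log logic that `import_documents` previously inlined. The `my_retry()` factory it decorates with is defined elsewhere in the class; given the `Retry` and `if_exception_type` imports at the top of the file, a plausible shape is sketched below (the timing values are hypothetical, not the package's actual settings):

```python
from google.api_core.retry import Retry, if_exception_type
from google.api_core.exceptions import ResourceExhausted

# Sketch of a retry factory matching the @self.my_retry() usage above.
# All numbers are hypothetical; only ResourceExhausted is retried, so
# AlreadyExists and other errors surface immediately to the except blocks.
def my_retry() -> Retry:
    return Retry(
        predicate=if_exception_type(ResourceExhausted),  # retry quota errors only
        initial=1.0,      # first backoff, seconds
        maximum=60.0,     # backoff ceiling
        multiplier=2.0,   # exponential growth
        timeout=300.0,    # give up after 5 minutes in total
    )
```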
@@ -479,23 +512,115 @@ class DiscoveryEngineClient:
         )

         # Make the request
-        @self.my_retry()
-        def import_documents_with_retry(doc_client, request):
-            return doc_client.import_documents(request=request)
-
-        try:
-            operation = import_documents_with_retry(self.doc_client, request)
-        except ResourceExhausted as e:
-            log.error(f"DiscoveryEngine Operation failed after retries due to quota exceeded: {e}")
+        return self._import_document_request(request)

-            raise e
-        except AlreadyExists as e:
-            log.warning(f"DiscoveryEngine - Already exists: {e} - {gcs_uri=} {bigquery_table=}")

-        except Exception as e:
-            log.error(f"An unexpected DiscoveryEngine error occurred: {e}")
+    def import_documents_with_metadata(self, gcs_uri: str, data_schema="content", branch="default_branch"):
+        """
+        Supply a JSONLD GCS location to import all the GS URIs within and their metadata
+        """
+        parent = self.doc_client.branch_path(
+            self.project_id,
+            self.location,
+            self.data_store_id,
+            branch
+        )

-            raise e
+        # 1. Prepare your documents with metadata:
+        documents_with_metadata = []
+        with open(gcs_uri, 'r') as f: # Assuming one JSON object per line in your GCS file
+            for line in f:
+                try:
+                    document_data = json.loads(line) # Load the JSON from the line
+                    # Check if it has the required fields, if not create them
+                    if "id" not in document_data:
+                        document_data["id"] = str(uuid.uuid4())
+                    if "structData" not in document_data:
+                        document_data["structData"] = {}
+                    if "content" not in document_data:
+                        document_data["content"] = {}
+                    # Create the Document object with your metadata
+                    document = discoveryengine.Document(
+                        name = f"{parent}/documents/{document_data['id']}", # important!
+                        id=document_data["id"],
+                        struct_data=document_data.get("structData", {}), # Your metadata here
+                        content = discoveryengine.Content(
+                            mime_type = document_data.get("content", {}).get("mimeType", "text/plain"),
+                            uri = document_data.get("content", {}).get("uri", ""),
+                        )
+                    )
+
+                    if "jsonData" in document_data:
+                        document.json_data = document_data["jsonData"]
+
+                    documents_with_metadata.append(document)
+
+                except json.JSONDecodeError as e:
+                    log.error(f"Error decoding JSON in line: {line.strip()}. Error: {e}")
+                    continue # Skip to the next line if there's an error
+
+        # 2. Use InlineSource to import:
+        request = discoveryengine.ImportDocumentsRequest(
+            parent=parent,
+            inline_source=discoveryengine.ImportDocumentsRequest.InlineSource(
+                documents=documents_with_metadata, # Pass the list of Document objects
+            ),
+            reconciliation_mode=discoveryengine.ImportDocumentsRequest.ReconciliationMode.INCREMENTAL,
+        )

-        return operation.operation.name
+        return self._import_document_request(request)
+
+
+    def import_document_with_metadata(self, gcs_uri: str, metadata: dict, branch="default_branch"):
+        """
+        Imports a single document with metadata.
+
+        Args:
+            gcs_uri: The GCS URI of the document to import.
+            metadata: A dictionary containing the metadata for the document.
+            branch: The branch to import the document into.

+        Returns:
+            str: The operation name.
+        """
+        try:
+            # 1. Generate a unique document ID
+            document_id = str(uuid.uuid4())
+
+            # 2. Create a Document object
+            parent = self.doc_client.branch_path(
+                self.project_id, self.location, self.data_store_id, branch
+            )
+            document = discoveryengine.Document(
+                name=f"{parent}/documents/{document_id}",
+                id=document_id,
+                struct_data=metadata,
+                content=discoveryengine.Document.Content(
+                    uri=gcs_uri,
+                    mime_type=self.get_mime_type(gcs_uri)
+                )
+            )
+
+            # 3. Use InlineSource for import
+            request = discoveryengine.ImportDocumentsRequest(
+                parent=parent,
+                inline_source=discoveryengine.ImportDocumentsRequest.InlineSource(
+                    documents=[document],
+                ),
+                reconciliation_mode=discoveryengine.ImportDocumentsRequest.ReconciliationMode.INCREMENTAL
+            )
+
+            # 4. Make the import request (using the common method)
+            return self._import_document_request(request)
+
+        except Exception as e:
+            log.error(f"Error importing document with metadata: {e}")
+
+    def get_mime_type(self, uri:str):
+        return guess_mime_type(uri)
+
+    def search_with_filters(self, query, folder=None, date=None,
+                            num_previous_chunks=3, num_next_chunks=3,
+                            page_size=10, parse_chunks_to_string=True,
+                            serving_config="default_serving_config"):
+        pass
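Taken together, `import_document_with_metadata` covers single-file ingestion with the metadata dict stored as `struct_data`, `import_documents_with_metadata` covers bulk JSON-lines input (note it reads its `gcs_uri` argument with `open()`, so as released it expects a locally readable path), and `search_with_filters` is only a stub in this release. A hedged usage sketch; the constructor arguments and all values here are hypothetical:

```python
# Usage sketch only; DiscoveryEngineClient's constructor signature is assumed.
client = DiscoveryEngineClient(
    data_store_id="my-datastore",
    project_id="my-project",
)

# Single document: the metadata dict becomes struct_data on the Document,
# available for filtering at search time.
op_name = client.import_document_with_metadata(
    gcs_uri="gs://my-bucket/docs/report.pdf",
    metadata={"folder": "reports", "date": "2025-01-01"},
)

# Bulk: one JSON object per line, e.g.
# {"id": "doc-1", "structData": {"folder": "reports"},
#  "content": {"uri": "gs://my-bucket/docs/report.pdf", "mimeType": "application/pdf"}}
op_name = client.import_documents_with_metadata(gcs_uri="documents.jsonl")
```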
sunholo-0.118.10.dist-info/METADATA → sunholo-0.119.3.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: sunholo
-Version: 0.118.10
+Version: 0.119.3
 Summary: Large Language Model DevOps - a package to help deploy LLMs to the Cloud.
 Author-email: Holosun ApS <multivac@sunholo.com>
 License: Apache License, Version 2.0
@@ -47,7 +47,7 @@ Requires-Dist: google-cloud-service-control; extra == "all"
 Requires-Dist: google-cloud-logging; extra == "all"
 Requires-Dist: google-cloud-storage; extra == "all"
 Requires-Dist: google-cloud-pubsub; extra == "all"
-Requires-Dist: google-cloud-discoveryengine; extra == "all"
+Requires-Dist: google-cloud-discoveryengine>=0.13.4; extra == "all"
 Requires-Dist: google-cloud-texttospeech; extra == "all"
 Requires-Dist: google-generativeai>=0.7.1; extra == "all"
 Requires-Dist: google-genai; extra == "all"
@@ -133,7 +133,7 @@ Requires-Dist: google-cloud-service-control; extra == "gcp"
 Requires-Dist: google-cloud-storage; extra == "gcp"
 Requires-Dist: google-cloud-logging; extra == "gcp"
 Requires-Dist: google-cloud-pubsub; extra == "gcp"
-Requires-Dist: google-cloud-discoveryengine; extra == "gcp"
+Requires-Dist: google-cloud-discoveryengine>=0.13.4; extra == "gcp"
 Requires-Dist: google-cloud-texttospeech; extra == "gcp"
 Requires-Dist: google-genai; extra == "gcp"
 Requires-Dist: google-generativeai>=0.8.3; extra == "gcp"
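The new floor on `google-cloud-discoveryengine` matches the GA `discoveryengine` namespace the client now imports; installing with the `gcp` or `all` extra (e.g. `pip install "sunholo[gcp]"`) should therefore pull in 0.13.4 or later.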
sunholo-0.118.10.dist-info/RECORD → sunholo-0.119.3.dist-info/RECORD

@@ -72,9 +72,9 @@ sunholo/database/sql/sb/delete_source_row.sql,sha256=r6fEuUKdbiLHCDGKSbKINDCpJjs
 sunholo/database/sql/sb/return_sources.sql,sha256=89KAnxfK8n_qGK9jy1OQT8f9n4uYUtYL5cCxbC2mj_c,255
 sunholo/database/sql/sb/setup.sql,sha256=CvoFvZQev2uWjmFa3aj3m3iuPFzAAJZ0S7Qi3L3-zZI,89
 sunholo/discovery_engine/__init__.py,sha256=hLgqRDJ22Aov9o2QjAEfsVgnL3kMdM-g5p8RJ9OyKdQ,130
-sunholo/discovery_engine/chunker_handler.py,sha256=5tw5_jalNQosf7uFyCmsYA__VdNpWC1PPVVa420CzWU,5479
-sunholo/discovery_engine/create_new.py,sha256=jWg5LW-QpFE8zq50ShaQJB3Wu8loiWB0P4lRWaCHpss,1023
-sunholo/discovery_engine/discovery_engine_client.py,sha256=jfIayVUOPM4svGF1S5Kk60rIG-xSo_e3zOHtBRg0nZA,22002
+sunholo/discovery_engine/chunker_handler.py,sha256=E3z-rVUuhjDebJY6nderr9QBYe8CrjwKskwIkOa_e68,5591
+sunholo/discovery_engine/create_new.py,sha256=WUi4_xh_dFaGX3xA9jkNKZhaR6LCELjMPeRb0hyj4FU,1226
+sunholo/discovery_engine/discovery_engine_client.py,sha256=C9fz341ZFMPtVSvqw2DbAgosJ5r5-YjfigRK-uFsldY,27407
 sunholo/discovery_engine/get_ai_search_chunks.py,sha256=hsFGOQugSeTMPEaQ16XTs_D45F8NABBm2IsAEdTk7kQ,4316
 sunholo/embedder/__init__.py,sha256=sI4N_CqgEVcrMDxXgxKp1FsfsB4FpjoXgPGkl4N_u4I,44
 sunholo/embedder/embed_chunk.py,sha256=Vvvj3-H4pSb1a2sLik3-X3X459j2jrUq1dBNAsOcQLo,7156
@@ -164,9 +164,9 @@ sunholo/vertex/init.py,sha256=1OQwcPBKZYBTDPdyU7IM4X4OmiXLdsNV30C-fee2scQ,2875
 sunholo/vertex/memory_tools.py,sha256=tBZxqVZ4InTmdBvLlOYwoSEWu4-kGquc-gxDwZCC4FA,7667
 sunholo/vertex/safety.py,sha256=S9PgQT1O_BQAkcqauWncRJaydiP8Q_Jzmu9gxYfy1VA,2482
 sunholo/vertex/type_dict_to_json.py,sha256=uTzL4o9tJRao4u-gJOFcACgWGkBOtqACmb6ihvCErL8,4694
-sunholo-0.118.10.dist-info/LICENSE.txt,sha256=SdE3QjnD3GEmqqg9EX3TM9f7WmtOzqS1KJve8rhbYmU,11345
-sunholo-0.118.10.dist-info/METADATA,sha256=prYJcbvo8lNaYqKHCeNC-I2_9ObUb8hr_AZHt6YMRRY,9639
-sunholo-0.118.10.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-sunholo-0.118.10.dist-info/entry_points.txt,sha256=bZuN5AIHingMPt4Ro1b_T-FnQvZ3teBes-3OyO0asl4,49
-sunholo-0.118.10.dist-info/top_level.txt,sha256=wt5tadn5--5JrZsjJz2LceoUvcrIvxjHJe-RxuudxAk,8
-sunholo-0.118.10.dist-info/RECORD,,
+sunholo-0.119.3.dist-info/LICENSE.txt,sha256=SdE3QjnD3GEmqqg9EX3TM9f7WmtOzqS1KJve8rhbYmU,11345
+sunholo-0.119.3.dist-info/METADATA,sha256=OmzF8MtMCAYTxGe_VmAh8Zv9eMFwfwQ5ZC85d0FtZHg,9654
+sunholo-0.119.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+sunholo-0.119.3.dist-info/entry_points.txt,sha256=bZuN5AIHingMPt4Ro1b_T-FnQvZ3teBes-3OyO0asl4,49
+sunholo-0.119.3.dist-info/top_level.txt,sha256=wt5tadn5--5JrZsjJz2LceoUvcrIvxjHJe-RxuudxAk,8
+sunholo-0.119.3.dist-info/RECORD,,