sunholo 0.118.9__py3-none-any.whl → 0.119.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -65,8 +65,9 @@ def do_discovery_engine(message_data:str, metadata:dict, config:ConfigManager=No
65
65
  return None
66
66
  for corp in corpuses:
67
67
  try:
68
- response = corp.import_documents(
69
- gcs_uri=message_data
68
+ response = corp.import_document_with_metadata(
69
+ gcs_uri=message_data,
70
+ metadata=metadata
70
71
  )
71
72
  log.info(f"Imported file to corpus: {response} with metadata: {metadata}")
72
73
  except Exception as err:
@@ -81,8 +82,9 @@ def do_discovery_engine(message_data:str, metadata:dict, config:ConfigManager=No
81
82
  continue
82
83
  if new_corp:
83
84
  log.info(f"Found new DiscoveryEngine {config.vector_name=} - {new_corp=}")
84
- response = corp.import_documents(
85
- gcs_uri=message_data
85
+ response = corp.import_document_with_metadata(
86
+ gcs_uri=message_data,
87
+ metadata=metadata
86
88
  )
87
89
 
88
90
  continue
@@ -1,6 +1,6 @@
1
1
  try:
2
2
  from google.api_core.client_options import ClientOptions
3
- from google.cloud import discoveryengine_v1alpha as discoveryengine
3
+ from google.cloud import discoveryengine
4
4
  from google.api_core.retry import Retry, if_exception_type
5
5
  from google.api_core.exceptions import ResourceExhausted, AlreadyExists
6
6
  from google.cloud.discoveryengine_v1alpha import SearchResponse, Chunk
@@ -13,6 +13,9 @@ except ImportError:
13
13
  from ..custom_logging import log
14
14
  from typing import Optional, List
15
15
  import asyncio
16
+ import json
17
+ import uuid
18
+ from ..utils.mime import guess_mime_type
16
19
 
17
20
  class DiscoveryEngineClient:
18
21
  """
@@ -431,7 +434,37 @@ class DiscoveryEngineClient:
431
434
 
432
435
  return operation.operation.name
433
436
 
437
+ def _import_document_request(self,
438
+ request: discoveryengine.ImportDocumentsRequest # type: ignore
439
+ ) -> str:
440
+ """
441
+ Handles the common logic for making an ImportDocumentsRequest, including retrying.
434
442
 
443
+ Args:
444
+ request (discoveryengine.ImportDocumentsRequest): The prepared request object.
445
+
446
+ Returns:
447
+ str: The operation name.
448
+ """
449
+ @self.my_retry()
450
+ def import_documents_with_retry(doc_client, request):
451
+ return doc_client.import_documents(request=request)
452
+
453
+ try:
454
+ operation = import_documents_with_retry(self.doc_client, request)
455
+ except ResourceExhausted as e:
456
+ log.error(f"DiscoveryEngine Operation failed after retries due to quota exceeded: {e}")
457
+ raise e
458
+ except AlreadyExists as e:
459
+ # Extract relevant info from the request to log
460
+ gcs_uri = request.gcs_source.input_uris if request.gcs_source else None
461
+ bigquery_table = request.bigquery_source.table_id if request.bigquery_source else None
462
+ log.warning(f"DiscoveryEngine - Already exists: {e} - {gcs_uri=} {bigquery_table=}")
463
+ except Exception as e:
464
+ log.error(f"An unexpected DiscoveryEngine error occurred: {e}")
465
+ raise e
466
+
467
+ return operation.operation.name
435
468
 
436
469
  def import_documents(self,
437
470
  gcs_uri: Optional[str] = None,
@@ -479,23 +512,117 @@ class DiscoveryEngineClient:
479
512
  )
480
513
 
481
514
  # Make the request
482
- @self.my_retry()
483
- def import_documents_with_retry(doc_client, request):
484
- return doc_client.import_documents(request=request)
485
-
486
- try:
487
- operation = import_documents_with_retry(self.doc_client, request)
488
- except ResourceExhausted as e:
489
- log.error(f"DiscoveryEngine Operation failed after retries due to quota exceeded: {e}")
515
+ return self._import_document_request(request)
490
516
 
491
- raise e
492
- except AlreadyExists as e:
493
- log.warning(f"DiscoveryEngine - Already exists: {e} - {gcs_uri=} {bigquery_table=}")
494
517
 
495
- except Exception as e:
496
- log.error(f"An unexpected DiscoveryEngine error occurred: {e}")
518
def import_documents_with_metadata(self, gcs_uri: str, data_schema="content", branch="default_branch"):
    """
    Import every document listed in a JSON Lines file, together with its metadata.

    Each non-blank line of the file must hold one JSON object. Recognised keys:

    - ``id``: document ID; a random UUID4 string is generated when absent.
    - ``structData``: metadata passed as the document's ``struct_data``.
    - ``content``: object with optional ``mimeType`` (default ``"text/plain"``)
      and ``uri`` (default ``""``).
    - ``jsonData``: when present, assigned to the document's ``json_data``.

    Lines that are not valid JSON, or whose JSON value is not an object, are
    logged and skipped; the remaining documents are still imported.

    Args:
        gcs_uri: Path of the JSON Lines file.
            NOTE(review): the file is read with the builtin ``open()``, which
            only handles local paths. If callers pass ``gs://`` URIs the file
            must be downloaded first; confirm against callers.
        data_schema: Not used by this method; the request always sends
            ``data_schema="document"``. Kept so existing callers still work.
        branch: Branch ID used to build the parent path.

    Returns:
        The value returned by ``self._import_document_request`` for the
        request (the import operation name).
    """
    parent = self.doc_client.branch_path(
        self.project_id,
        self.location,
        self.data_store_id,
        branch,
    )

    # 1. Build one Document per JSON line.
    documents_with_metadata = []
    with open(gcs_uri, "r", encoding="utf-8") as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) are not errors.
                continue

            try:
                document_data = json.loads(stripped)
            except json.JSONDecodeError as e:
                log.error(f"Error decoding JSON in line: {line.strip()}. Error: {e}")
                continue  # Skip to the next line if there's an error

            if not isinstance(document_data, dict):
                # A list or scalar would otherwise fail on the key lookups
                # below and abort the whole import.
                log.error(f"Expected a JSON object in line: {stripped}. Got: {type(document_data).__name__}")
                continue

            if "id" in document_data:
                document_id = document_data["id"]
            else:
                document_id = str(uuid.uuid4())
            # `or {}` also covers an explicit JSON null for these keys.
            struct_data = document_data.get("structData") or {}
            content_data = document_data.get("content") or {}

            document = discoveryengine.Document(
                name=f"{parent}/documents/{document_id}",  # full resource name
                id=document_id,
                struct_data=struct_data,
                # Content is nested under Document, as in
                # import_document_with_metadata.
                content=discoveryengine.Document.Content(
                    mime_type=content_data.get("mimeType", "text/plain"),
                    uri=content_data.get("uri", ""),
                ),
            )

            if "jsonData" in document_data:
                json_data = document_data["jsonData"]
                # json_data is a string field, so serialise parsed JSON values.
                # NOTE(review): json_data and struct_data appear to be
                # alternatives in the API, so setting this may replace the
                # struct_data above; confirm against the Document reference.
                if not isinstance(json_data, str):
                    json_data = json.dumps(json_data)
                document.json_data = json_data

            documents_with_metadata.append(document)

    # 2. Send the documents inline; "document" is the schema for full Documents.
    request = discoveryengine.ImportDocumentsRequest(
        parent=parent,
        inline_source=discoveryengine.ImportDocumentsRequest.InlineSource(
            documents=documents_with_metadata,
            data_schema="document",
        ),
        reconciliation_mode=discoveryengine.ImportDocumentsRequest.ReconciliationMode.INCREMENTAL,
    )

    return self._import_document_request(request)
573
+
574
+
575
def import_document_with_metadata(self, gcs_uri: str, metadata: dict, branch="default_branch"):
    """
    Import one GCS object as a document carrying the supplied metadata.

    A fresh UUID4 is used as the document ID on every call, and the document
    is sent inline with incremental reconciliation.

    Args:
        gcs_uri: The GCS URI of the document to import.
        metadata: Dictionary stored as the document's struct_data.
        branch: The branch to import the document into.

    Returns:
        The value returned by ``self._import_document_request`` (the operation
        name), or None when any step raised; the error is logged, not re-raised.
    """
    try:
        new_id = str(uuid.uuid4())
        branch_parent = self.doc_client.branch_path(
            self.project_id, self.location, self.data_store_id, branch
        )

        # The document points at the file by URI; the MIME type comes from
        # self.get_mime_type.
        file_content = discoveryengine.Document.Content(
            uri=gcs_uri,
            mime_type=self.get_mime_type(gcs_uri),
        )
        single_document = discoveryengine.Document(
            name=f"{branch_parent}/documents/{new_id}",
            id=new_id,
            struct_data=metadata,
            content=file_content,
        )

        inline = discoveryengine.ImportDocumentsRequest.InlineSource(
            documents=[single_document],
            data_schema="document",
        )
        import_request = discoveryengine.ImportDocumentsRequest(
            parent=branch_parent,
            inline_source=inline,
            reconciliation_mode=discoveryengine.ImportDocumentsRequest.ReconciliationMode.INCREMENTAL,
        )

        # Hand over to the shared request/retry helper.
        return self._import_document_request(import_request)
    except Exception as e:
        log.error(f"Error importing document with metadata: {e}")
        return None
620
+
621
def get_mime_type(self, uri:str):
    """Return whatever ``guess_mime_type`` reports for ``uri``."""
    mime_type = guess_mime_type(uri)
    return mime_type
623
+
624
def search_with_filters(self, query, folder=None, date=None,
                        num_previous_chunks=3, num_next_chunks=3,
                        page_size=10, parse_chunks_to_string=True,
                        serving_config="default_serving_config"):
    """
    Placeholder for a filtered search; not implemented yet.

    All arguments are currently ignored and the method always returns None.
    """
    return None
@@ -132,7 +132,7 @@ async def construct_file_content(gs_list, bucket:str, genai_lib=False):
132
132
  myfile = genai.get_file(name)
133
133
  else:
134
134
  client = genaiv2.Client()
135
- myfile = client.files.get(file=name)
135
+ myfile = client.files.get(name=name)
136
136
  content.append(myfile)
137
137
  content.append(f"You have been given the ability to work with file {display_name=} with {mime_type=} {display_url=}")
138
138
  log.info(f"Found existing genai.get_file {name=}")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: sunholo
3
- Version: 0.118.9
3
+ Version: 0.119.1
4
4
  Summary: Large Language Model DevOps - a package to help deploy LLMs to the Cloud.
5
5
  Author-email: Holosun ApS <multivac@sunholo.com>
6
6
  License: Apache License, Version 2.0
@@ -20,6 +20,7 @@ Description-Content-Type: text/markdown
20
20
  License-File: LICENSE.txt
21
21
  Requires-Dist: aiohttp
22
22
  Requires-Dist: google-auth
23
+ Requires-Dist: google-cloud-discoveryengine>=0.13.4
23
24
  Requires-Dist: pydantic
24
25
  Requires-Dist: requests
25
26
  Requires-Dist: ruamel.yaml
@@ -47,7 +48,7 @@ Requires-Dist: google-cloud-service-control; extra == "all"
47
48
  Requires-Dist: google-cloud-logging; extra == "all"
48
49
  Requires-Dist: google-cloud-storage; extra == "all"
49
50
  Requires-Dist: google-cloud-pubsub; extra == "all"
50
- Requires-Dist: google-cloud-discoveryengine; extra == "all"
51
+ Requires-Dist: google-cloud-discoveryengine>=0.13.4; extra == "all"
51
52
  Requires-Dist: google-cloud-texttospeech; extra == "all"
52
53
  Requires-Dist: google-generativeai>=0.7.1; extra == "all"
53
54
  Requires-Dist: google-genai; extra == "all"
@@ -133,7 +134,7 @@ Requires-Dist: google-cloud-service-control; extra == "gcp"
133
134
  Requires-Dist: google-cloud-storage; extra == "gcp"
134
135
  Requires-Dist: google-cloud-logging; extra == "gcp"
135
136
  Requires-Dist: google-cloud-pubsub; extra == "gcp"
136
- Requires-Dist: google-cloud-discoveryengine; extra == "gcp"
137
+ Requires-Dist: google-cloud-discoveryengine>=0.13.4; extra == "gcp"
137
138
  Requires-Dist: google-cloud-texttospeech; extra == "gcp"
138
139
  Requires-Dist: google-genai; extra == "gcp"
139
140
  Requires-Dist: google-generativeai>=0.8.3; extra == "gcp"
@@ -72,9 +72,9 @@ sunholo/database/sql/sb/delete_source_row.sql,sha256=r6fEuUKdbiLHCDGKSbKINDCpJjs
72
72
  sunholo/database/sql/sb/return_sources.sql,sha256=89KAnxfK8n_qGK9jy1OQT8f9n4uYUtYL5cCxbC2mj_c,255
73
73
  sunholo/database/sql/sb/setup.sql,sha256=CvoFvZQev2uWjmFa3aj3m3iuPFzAAJZ0S7Qi3L3-zZI,89
74
74
  sunholo/discovery_engine/__init__.py,sha256=hLgqRDJ22Aov9o2QjAEfsVgnL3kMdM-g5p8RJ9OyKdQ,130
75
- sunholo/discovery_engine/chunker_handler.py,sha256=5tw5_jalNQosf7uFyCmsYA__VdNpWC1PPVVa420CzWU,5479
75
+ sunholo/discovery_engine/chunker_handler.py,sha256=E3z-rVUuhjDebJY6nderr9QBYe8CrjwKskwIkOa_e68,5591
76
76
  sunholo/discovery_engine/create_new.py,sha256=jWg5LW-QpFE8zq50ShaQJB3Wu8loiWB0P4lRWaCHpss,1023
77
- sunholo/discovery_engine/discovery_engine_client.py,sha256=jfIayVUOPM4svGF1S5Kk60rIG-xSo_e3zOHtBRg0nZA,22002
77
+ sunholo/discovery_engine/discovery_engine_client.py,sha256=uZFNRtFtG9AfWM1Go3aPWmciyQ_4bQwhgiR2A9XjPPE,27606
78
78
  sunholo/discovery_engine/get_ai_search_chunks.py,sha256=hsFGOQugSeTMPEaQ16XTs_D45F8NABBm2IsAEdTk7kQ,4316
79
79
  sunholo/embedder/__init__.py,sha256=sI4N_CqgEVcrMDxXgxKp1FsfsB4FpjoXgPGkl4N_u4I,44
80
80
  sunholo/embedder/embed_chunk.py,sha256=Vvvj3-H4pSb1a2sLik3-X3X459j2jrUq1dBNAsOcQLo,7156
@@ -87,7 +87,7 @@ sunholo/gcs/download_url.py,sha256=Ul81n1rklr8WogPsuxWWD1Nr8RHU451LzHPMJNhAKzw,6
87
87
  sunholo/gcs/extract_and_sign.py,sha256=paRrTCvCN5vkQwCB7OSkxWi-pfOgOtZ0bwdXE08c3Ps,1546
88
88
  sunholo/gcs/metadata.py,sha256=oQLcXi4brsZ74aegWyC1JZmhlaEV270HS5_UWtAYYWE,898
89
89
  sunholo/genai/__init__.py,sha256=TV3PYHWoR4cChdmCOaYB0PtAEQ86qol9XYYEtb1JmSA,239
90
- sunholo/genai/file_handling.py,sha256=o4MkWpf0FiEOlqDllORCnKwgrOnXzORYTk0JofoWNDo,9564
90
+ sunholo/genai/file_handling.py,sha256=5wN8ynrrLLiY4JRRCCWLcqnxXVeYjgw26IH4itTrFCc,9564
91
91
  sunholo/genai/genaiv2.py,sha256=uqWcfvlsPVPyQo-W_cP9h2TTzyYfzj4lyJlyqPyKTkI,20269
92
92
  sunholo/genai/images.py,sha256=EyjsDqt6XQw99pZUQamomCpMOoIah9bp3XY94WPU7Ms,1678
93
93
  sunholo/genai/init.py,sha256=yG8E67TduFCTQPELo83OJuWfjwTnGZsyACospahyEaY,687
@@ -164,9 +164,9 @@ sunholo/vertex/init.py,sha256=1OQwcPBKZYBTDPdyU7IM4X4OmiXLdsNV30C-fee2scQ,2875
164
164
  sunholo/vertex/memory_tools.py,sha256=tBZxqVZ4InTmdBvLlOYwoSEWu4-kGquc-gxDwZCC4FA,7667
165
165
  sunholo/vertex/safety.py,sha256=S9PgQT1O_BQAkcqauWncRJaydiP8Q_Jzmu9gxYfy1VA,2482
166
166
  sunholo/vertex/type_dict_to_json.py,sha256=uTzL4o9tJRao4u-gJOFcACgWGkBOtqACmb6ihvCErL8,4694
167
- sunholo-0.118.9.dist-info/LICENSE.txt,sha256=SdE3QjnD3GEmqqg9EX3TM9f7WmtOzqS1KJve8rhbYmU,11345
168
- sunholo-0.118.9.dist-info/METADATA,sha256=8ssTrQCY69NbsQeUhwowK5R04aqcw65cw231BGNDmk0,9638
169
- sunholo-0.118.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
170
- sunholo-0.118.9.dist-info/entry_points.txt,sha256=bZuN5AIHingMPt4Ro1b_T-FnQvZ3teBes-3OyO0asl4,49
171
- sunholo-0.118.9.dist-info/top_level.txt,sha256=wt5tadn5--5JrZsjJz2LceoUvcrIvxjHJe-RxuudxAk,8
172
- sunholo-0.118.9.dist-info/RECORD,,
167
+ sunholo-0.119.1.dist-info/LICENSE.txt,sha256=SdE3QjnD3GEmqqg9EX3TM9f7WmtOzqS1KJve8rhbYmU,11345
168
+ sunholo-0.119.1.dist-info/METADATA,sha256=0UGUFxVnrxTEFwLKYjbSWEu_8S74mPHVQgypdr1E-S4,9706
169
+ sunholo-0.119.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
170
+ sunholo-0.119.1.dist-info/entry_points.txt,sha256=bZuN5AIHingMPt4Ro1b_T-FnQvZ3teBes-3OyO0asl4,49
171
+ sunholo-0.119.1.dist-info/top_level.txt,sha256=wt5tadn5--5JrZsjJz2LceoUvcrIvxjHJe-RxuudxAk,8
172
+ sunholo-0.119.1.dist-info/RECORD,,