sunholo 0.118.9__py3-none-any.whl → 0.119.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sunholo/discovery_engine/chunker_handler.py +6 -4
- sunholo/discovery_engine/discovery_engine_client.py +143 -16
- sunholo/genai/file_handling.py +1 -1
- {sunholo-0.118.9.dist-info → sunholo-0.119.1.dist-info}/METADATA +4 -3
- {sunholo-0.118.9.dist-info → sunholo-0.119.1.dist-info}/RECORD +9 -9
- {sunholo-0.118.9.dist-info → sunholo-0.119.1.dist-info}/LICENSE.txt +0 -0
- {sunholo-0.118.9.dist-info → sunholo-0.119.1.dist-info}/WHEEL +0 -0
- {sunholo-0.118.9.dist-info → sunholo-0.119.1.dist-info}/entry_points.txt +0 -0
- {sunholo-0.118.9.dist-info → sunholo-0.119.1.dist-info}/top_level.txt +0 -0
|
@@ -65,8 +65,9 @@ def do_discovery_engine(message_data:str, metadata:dict, config:ConfigManager=No
|
|
|
65
65
|
return None
|
|
66
66
|
for corp in corpuses:
|
|
67
67
|
try:
|
|
68
|
-
response = corp.
|
|
69
|
-
gcs_uri=message_data
|
|
68
|
+
response = corp.import_document_with_metadata(
|
|
69
|
+
gcs_uri=message_data,
|
|
70
|
+
metadata=metadata
|
|
70
71
|
)
|
|
71
72
|
log.info(f"Imported file to corpus: {response} with metadata: {metadata}")
|
|
72
73
|
except Exception as err:
|
|
@@ -81,8 +82,9 @@ def do_discovery_engine(message_data:str, metadata:dict, config:ConfigManager=No
|
|
|
81
82
|
continue
|
|
82
83
|
if new_corp:
|
|
83
84
|
log.info(f"Found new DiscoveryEngine {config.vector_name=} - {new_corp=}")
|
|
84
|
-
response = corp.
|
|
85
|
-
gcs_uri=message_data
|
|
85
|
+
response = corp.import_document_with_metadata(
|
|
86
|
+
gcs_uri=message_data,
|
|
87
|
+
metadata=metadata
|
|
86
88
|
)
|
|
87
89
|
|
|
88
90
|
continue
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
try:
|
|
2
2
|
from google.api_core.client_options import ClientOptions
|
|
3
|
-
from google.cloud import
|
|
3
|
+
from google.cloud import discoveryengine
|
|
4
4
|
from google.api_core.retry import Retry, if_exception_type
|
|
5
5
|
from google.api_core.exceptions import ResourceExhausted, AlreadyExists
|
|
6
6
|
from google.cloud.discoveryengine_v1alpha import SearchResponse, Chunk
|
|
@@ -13,6 +13,9 @@ except ImportError:
|
|
|
13
13
|
from ..custom_logging import log
|
|
14
14
|
from typing import Optional, List
|
|
15
15
|
import asyncio
|
|
16
|
+
import json
|
|
17
|
+
import uuid
|
|
18
|
+
from ..utils.mime import guess_mime_type
|
|
16
19
|
|
|
17
20
|
class DiscoveryEngineClient:
|
|
18
21
|
"""
|
|
@@ -431,7 +434,37 @@ class DiscoveryEngineClient:
|
|
|
431
434
|
|
|
432
435
|
return operation.operation.name
|
|
433
436
|
|
|
437
|
+
def _import_document_request(self,
|
|
438
|
+
request: discoveryengine.ImportDocumentsRequest # type: ignore
|
|
439
|
+
) -> str:
|
|
440
|
+
"""
|
|
441
|
+
Handles the common logic for making an ImportDocumentsRequest, including retrying.
|
|
434
442
|
|
|
443
|
+
Args:
|
|
444
|
+
request (discoveryengine.ImportDocumentsRequest): The prepared request object.
|
|
445
|
+
|
|
446
|
+
Returns:
|
|
447
|
+
str: The operation name.
|
|
448
|
+
"""
|
|
449
|
+
@self.my_retry()
|
|
450
|
+
def import_documents_with_retry(doc_client, request):
|
|
451
|
+
return doc_client.import_documents(request=request)
|
|
452
|
+
|
|
453
|
+
try:
|
|
454
|
+
operation = import_documents_with_retry(self.doc_client, request)
|
|
455
|
+
except ResourceExhausted as e:
|
|
456
|
+
log.error(f"DiscoveryEngine Operation failed after retries due to quota exceeded: {e}")
|
|
457
|
+
raise e
|
|
458
|
+
except AlreadyExists as e:
|
|
459
|
+
# Extract relevant info from the request to log
|
|
460
|
+
gcs_uri = request.gcs_source.input_uris if request.gcs_source else None
|
|
461
|
+
bigquery_table = request.bigquery_source.table_id if request.bigquery_source else None
|
|
462
|
+
log.warning(f"DiscoveryEngine - Already exists: {e} - {gcs_uri=} {bigquery_table=}")
|
|
463
|
+
except Exception as e:
|
|
464
|
+
log.error(f"An unexpected DiscoveryEngine error occurred: {e}")
|
|
465
|
+
raise e
|
|
466
|
+
|
|
467
|
+
return operation.operation.name
|
|
435
468
|
|
|
436
469
|
def import_documents(self,
|
|
437
470
|
gcs_uri: Optional[str] = None,
|
|
@@ -479,23 +512,117 @@ class DiscoveryEngineClient:
|
|
|
479
512
|
)
|
|
480
513
|
|
|
481
514
|
# Make the request
|
|
482
|
-
|
|
483
|
-
def import_documents_with_retry(doc_client, request):
|
|
484
|
-
return doc_client.import_documents(request=request)
|
|
485
|
-
|
|
486
|
-
try:
|
|
487
|
-
operation = import_documents_with_retry(self.doc_client, request)
|
|
488
|
-
except ResourceExhausted as e:
|
|
489
|
-
log.error(f"DiscoveryEngine Operation failed after retries due to quota exceeded: {e}")
|
|
515
|
+
return self._import_document_request(request)
|
|
490
516
|
|
|
491
|
-
raise e
|
|
492
|
-
except AlreadyExists as e:
|
|
493
|
-
log.warning(f"DiscoveryEngine - Already exists: {e} - {gcs_uri=} {bigquery_table=}")
|
|
494
517
|
|
|
495
|
-
|
|
496
|
-
|
|
518
|
+
def import_documents_with_metadata(self, gcs_uri: str, data_schema="content", branch="default_branch"):
|
|
519
|
+
"""
|
|
520
|
+
Supply a JSONLD GCS location to import all the GS URIs within and their metadata
|
|
521
|
+
"""
|
|
522
|
+
parent = self.doc_client.branch_path(
|
|
523
|
+
self.project_id,
|
|
524
|
+
self.location,
|
|
525
|
+
self.data_store_id,
|
|
526
|
+
branch
|
|
527
|
+
)
|
|
497
528
|
|
|
498
|
-
|
|
529
|
+
# 1. Prepare your documents with metadata:
|
|
530
|
+
documents_with_metadata = []
|
|
531
|
+
with open(gcs_uri, 'r') as f: # Assuming one JSON object per line in your GCS file
|
|
532
|
+
for line in f:
|
|
533
|
+
try:
|
|
534
|
+
document_data = json.loads(line) # Load the JSON from the line
|
|
535
|
+
# Check if it has the required fields, if not create them
|
|
536
|
+
if "id" not in document_data:
|
|
537
|
+
document_data["id"] = str(uuid.uuid4())
|
|
538
|
+
if "structData" not in document_data:
|
|
539
|
+
document_data["structData"] = {}
|
|
540
|
+
if "content" not in document_data:
|
|
541
|
+
document_data["content"] = {}
|
|
542
|
+
# Create the Document object with your metadata
|
|
543
|
+
document = discoveryengine.Document(
|
|
544
|
+
name = f"{parent}/documents/{document_data['id']}", # important!
|
|
545
|
+
id=document_data["id"],
|
|
546
|
+
struct_data=document_data.get("structData", {}), # Your metadata here
|
|
547
|
+
content = discoveryengine.Content(
|
|
548
|
+
mime_type = document_data.get("content", {}).get("mimeType", "text/plain"),
|
|
549
|
+
uri = document_data.get("content", {}).get("uri", ""),
|
|
550
|
+
)
|
|
551
|
+
)
|
|
552
|
+
|
|
553
|
+
if "jsonData" in document_data:
|
|
554
|
+
document.json_data = document_data["jsonData"]
|
|
555
|
+
|
|
556
|
+
documents_with_metadata.append(document)
|
|
557
|
+
|
|
558
|
+
except json.JSONDecodeError as e:
|
|
559
|
+
log.error(f"Error decoding JSON in line: {line.strip()}. Error: {e}")
|
|
560
|
+
continue # Skip to the next line if there's an error
|
|
561
|
+
|
|
562
|
+
# 2. Use InlineSource to import:
|
|
563
|
+
request = discoveryengine.ImportDocumentsRequest(
|
|
564
|
+
parent=parent,
|
|
565
|
+
inline_source=discoveryengine.ImportDocumentsRequest.InlineSource(
|
|
566
|
+
documents=documents_with_metadata, # Pass the list of Document objects
|
|
567
|
+
data_schema="document" # Important: Set to "document" when providing full Documents
|
|
568
|
+
),
|
|
569
|
+
reconciliation_mode=discoveryengine.ImportDocumentsRequest.ReconciliationMode.INCREMENTAL,
|
|
570
|
+
)
|
|
499
571
|
|
|
500
|
-
return
|
|
572
|
+
return self._import_document_request(request)
|
|
573
|
+
|
|
574
|
+
|
|
575
|
+
def import_document_with_metadata(self, gcs_uri: str, metadata: dict, branch="default_branch"):
|
|
576
|
+
"""
|
|
577
|
+
Imports a single document with metadata.
|
|
578
|
+
|
|
579
|
+
Args:
|
|
580
|
+
gcs_uri: The GCS URI of the document to import.
|
|
581
|
+
metadata: A dictionary containing the metadata for the document.
|
|
582
|
+
branch: The branch to import the document into.
|
|
501
583
|
|
|
584
|
+
Returns:
|
|
585
|
+
str: The operation name.
|
|
586
|
+
"""
|
|
587
|
+
try:
|
|
588
|
+
# 1. Generate a unique document ID
|
|
589
|
+
document_id = str(uuid.uuid4())
|
|
590
|
+
|
|
591
|
+
# 2. Create a Document object
|
|
592
|
+
parent = self.doc_client.branch_path(
|
|
593
|
+
self.project_id, self.location, self.data_store_id, branch
|
|
594
|
+
)
|
|
595
|
+
document = discoveryengine.Document(
|
|
596
|
+
name=f"{parent}/documents/{document_id}",
|
|
597
|
+
id=document_id,
|
|
598
|
+
struct_data=metadata,
|
|
599
|
+
content=discoveryengine.Document.Content(
|
|
600
|
+
uri=gcs_uri,
|
|
601
|
+
mime_type=self.get_mime_type(gcs_uri)
|
|
602
|
+
)
|
|
603
|
+
)
|
|
604
|
+
|
|
605
|
+
# 3. Use InlineSource for import
|
|
606
|
+
request = discoveryengine.ImportDocumentsRequest(
|
|
607
|
+
parent=parent,
|
|
608
|
+
inline_source=discoveryengine.ImportDocumentsRequest.InlineSource(
|
|
609
|
+
documents=[document],
|
|
610
|
+
data_schema="document"
|
|
611
|
+
),
|
|
612
|
+
reconciliation_mode=discoveryengine.ImportDocumentsRequest.ReconciliationMode.INCREMENTAL
|
|
613
|
+
)
|
|
614
|
+
|
|
615
|
+
# 4. Make the import request (using the common method)
|
|
616
|
+
return self._import_document_request(request)
|
|
617
|
+
|
|
618
|
+
except Exception as e:
|
|
619
|
+
log.error(f"Error importing document with metadata: {e}")
|
|
620
|
+
|
|
621
|
+
def get_mime_type(self, uri:str):
|
|
622
|
+
return guess_mime_type(uri)
|
|
623
|
+
|
|
624
|
+
def search_with_filters(self, query, folder=None, date=None,
|
|
625
|
+
num_previous_chunks=3, num_next_chunks=3,
|
|
626
|
+
page_size=10, parse_chunks_to_string=True,
|
|
627
|
+
serving_config="default_serving_config"):
|
|
628
|
+
pass
|
sunholo/genai/file_handling.py
CHANGED
|
@@ -132,7 +132,7 @@ async def construct_file_content(gs_list, bucket:str, genai_lib=False):
|
|
|
132
132
|
myfile = genai.get_file(name)
|
|
133
133
|
else:
|
|
134
134
|
client = genaiv2.Client()
|
|
135
|
-
myfile = client.files.get(
|
|
135
|
+
myfile = client.files.get(name=name)
|
|
136
136
|
content.append(myfile)
|
|
137
137
|
content.append(f"You have been given the ability to work with file {display_name=} with {mime_type=} {display_url=}")
|
|
138
138
|
log.info(f"Found existing genai.get_file {name=}")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: sunholo
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.119.1
|
|
4
4
|
Summary: Large Language Model DevOps - a package to help deploy LLMs to the Cloud.
|
|
5
5
|
Author-email: Holosun ApS <multivac@sunholo.com>
|
|
6
6
|
License: Apache License, Version 2.0
|
|
@@ -20,6 +20,7 @@ Description-Content-Type: text/markdown
|
|
|
20
20
|
License-File: LICENSE.txt
|
|
21
21
|
Requires-Dist: aiohttp
|
|
22
22
|
Requires-Dist: google-auth
|
|
23
|
+
Requires-Dist: google-cloud-discoveryengine>=0.13.4
|
|
23
24
|
Requires-Dist: pydantic
|
|
24
25
|
Requires-Dist: requests
|
|
25
26
|
Requires-Dist: ruamel.yaml
|
|
@@ -47,7 +48,7 @@ Requires-Dist: google-cloud-service-control; extra == "all"
|
|
|
47
48
|
Requires-Dist: google-cloud-logging; extra == "all"
|
|
48
49
|
Requires-Dist: google-cloud-storage; extra == "all"
|
|
49
50
|
Requires-Dist: google-cloud-pubsub; extra == "all"
|
|
50
|
-
Requires-Dist: google-cloud-discoveryengine; extra == "all"
|
|
51
|
+
Requires-Dist: google-cloud-discoveryengine>=0.13.4; extra == "all"
|
|
51
52
|
Requires-Dist: google-cloud-texttospeech; extra == "all"
|
|
52
53
|
Requires-Dist: google-generativeai>=0.7.1; extra == "all"
|
|
53
54
|
Requires-Dist: google-genai; extra == "all"
|
|
@@ -133,7 +134,7 @@ Requires-Dist: google-cloud-service-control; extra == "gcp"
|
|
|
133
134
|
Requires-Dist: google-cloud-storage; extra == "gcp"
|
|
134
135
|
Requires-Dist: google-cloud-logging; extra == "gcp"
|
|
135
136
|
Requires-Dist: google-cloud-pubsub; extra == "gcp"
|
|
136
|
-
Requires-Dist: google-cloud-discoveryengine; extra == "gcp"
|
|
137
|
+
Requires-Dist: google-cloud-discoveryengine>=0.13.4; extra == "gcp"
|
|
137
138
|
Requires-Dist: google-cloud-texttospeech; extra == "gcp"
|
|
138
139
|
Requires-Dist: google-genai; extra == "gcp"
|
|
139
140
|
Requires-Dist: google-generativeai>=0.8.3; extra == "gcp"
|
|
@@ -72,9 +72,9 @@ sunholo/database/sql/sb/delete_source_row.sql,sha256=r6fEuUKdbiLHCDGKSbKINDCpJjs
|
|
|
72
72
|
sunholo/database/sql/sb/return_sources.sql,sha256=89KAnxfK8n_qGK9jy1OQT8f9n4uYUtYL5cCxbC2mj_c,255
|
|
73
73
|
sunholo/database/sql/sb/setup.sql,sha256=CvoFvZQev2uWjmFa3aj3m3iuPFzAAJZ0S7Qi3L3-zZI,89
|
|
74
74
|
sunholo/discovery_engine/__init__.py,sha256=hLgqRDJ22Aov9o2QjAEfsVgnL3kMdM-g5p8RJ9OyKdQ,130
|
|
75
|
-
sunholo/discovery_engine/chunker_handler.py,sha256=
|
|
75
|
+
sunholo/discovery_engine/chunker_handler.py,sha256=E3z-rVUuhjDebJY6nderr9QBYe8CrjwKskwIkOa_e68,5591
|
|
76
76
|
sunholo/discovery_engine/create_new.py,sha256=jWg5LW-QpFE8zq50ShaQJB3Wu8loiWB0P4lRWaCHpss,1023
|
|
77
|
-
sunholo/discovery_engine/discovery_engine_client.py,sha256=
|
|
77
|
+
sunholo/discovery_engine/discovery_engine_client.py,sha256=uZFNRtFtG9AfWM1Go3aPWmciyQ_4bQwhgiR2A9XjPPE,27606
|
|
78
78
|
sunholo/discovery_engine/get_ai_search_chunks.py,sha256=hsFGOQugSeTMPEaQ16XTs_D45F8NABBm2IsAEdTk7kQ,4316
|
|
79
79
|
sunholo/embedder/__init__.py,sha256=sI4N_CqgEVcrMDxXgxKp1FsfsB4FpjoXgPGkl4N_u4I,44
|
|
80
80
|
sunholo/embedder/embed_chunk.py,sha256=Vvvj3-H4pSb1a2sLik3-X3X459j2jrUq1dBNAsOcQLo,7156
|
|
@@ -87,7 +87,7 @@ sunholo/gcs/download_url.py,sha256=Ul81n1rklr8WogPsuxWWD1Nr8RHU451LzHPMJNhAKzw,6
|
|
|
87
87
|
sunholo/gcs/extract_and_sign.py,sha256=paRrTCvCN5vkQwCB7OSkxWi-pfOgOtZ0bwdXE08c3Ps,1546
|
|
88
88
|
sunholo/gcs/metadata.py,sha256=oQLcXi4brsZ74aegWyC1JZmhlaEV270HS5_UWtAYYWE,898
|
|
89
89
|
sunholo/genai/__init__.py,sha256=TV3PYHWoR4cChdmCOaYB0PtAEQ86qol9XYYEtb1JmSA,239
|
|
90
|
-
sunholo/genai/file_handling.py,sha256=
|
|
90
|
+
sunholo/genai/file_handling.py,sha256=5wN8ynrrLLiY4JRRCCWLcqnxXVeYjgw26IH4itTrFCc,9564
|
|
91
91
|
sunholo/genai/genaiv2.py,sha256=uqWcfvlsPVPyQo-W_cP9h2TTzyYfzj4lyJlyqPyKTkI,20269
|
|
92
92
|
sunholo/genai/images.py,sha256=EyjsDqt6XQw99pZUQamomCpMOoIah9bp3XY94WPU7Ms,1678
|
|
93
93
|
sunholo/genai/init.py,sha256=yG8E67TduFCTQPELo83OJuWfjwTnGZsyACospahyEaY,687
|
|
@@ -164,9 +164,9 @@ sunholo/vertex/init.py,sha256=1OQwcPBKZYBTDPdyU7IM4X4OmiXLdsNV30C-fee2scQ,2875
|
|
|
164
164
|
sunholo/vertex/memory_tools.py,sha256=tBZxqVZ4InTmdBvLlOYwoSEWu4-kGquc-gxDwZCC4FA,7667
|
|
165
165
|
sunholo/vertex/safety.py,sha256=S9PgQT1O_BQAkcqauWncRJaydiP8Q_Jzmu9gxYfy1VA,2482
|
|
166
166
|
sunholo/vertex/type_dict_to_json.py,sha256=uTzL4o9tJRao4u-gJOFcACgWGkBOtqACmb6ihvCErL8,4694
|
|
167
|
-
sunholo-0.
|
|
168
|
-
sunholo-0.
|
|
169
|
-
sunholo-0.
|
|
170
|
-
sunholo-0.
|
|
171
|
-
sunholo-0.
|
|
172
|
-
sunholo-0.
|
|
167
|
+
sunholo-0.119.1.dist-info/LICENSE.txt,sha256=SdE3QjnD3GEmqqg9EX3TM9f7WmtOzqS1KJve8rhbYmU,11345
|
|
168
|
+
sunholo-0.119.1.dist-info/METADATA,sha256=0UGUFxVnrxTEFwLKYjbSWEu_8S74mPHVQgypdr1E-S4,9706
|
|
169
|
+
sunholo-0.119.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
170
|
+
sunholo-0.119.1.dist-info/entry_points.txt,sha256=bZuN5AIHingMPt4Ro1b_T-FnQvZ3teBes-3OyO0asl4,49
|
|
171
|
+
sunholo-0.119.1.dist-info/top_level.txt,sha256=wt5tadn5--5JrZsjJz2LceoUvcrIvxjHJe-RxuudxAk,8
|
|
172
|
+
sunholo-0.119.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|