sunholo 0.69.0__tar.gz → 0.69.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sunholo-0.69.0 → sunholo-0.69.3}/PKG-INFO +2 -2
- {sunholo-0.69.0 → sunholo-0.69.3}/setup.py +1 -1
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/chunker/data_to_embed_pubsub.py +8 -3
- sunholo-0.69.3/sunholo/discovery_engine/chunker_handler.py +109 -0
- sunholo-0.69.3/sunholo/discovery_engine/create_new.py +28 -0
- sunholo-0.69.0/sunholo/database/discovery_engine.py → sunholo-0.69.3/sunholo/discovery_engine/discovery_engine_client.py +60 -38
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/llamaindex/import_files.py +2 -22
- sunholo-0.69.3/sunholo/patches/langchain/__init__.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo.egg-info/PKG-INFO +2 -2
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo.egg-info/SOURCES.txt +4 -1
- {sunholo-0.69.0 → sunholo-0.69.3}/LICENSE.txt +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/MANIFEST.in +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/README.md +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/setup.cfg +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/__init__.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/agents/__init__.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/agents/chat_history.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/agents/dispatch_to_qa.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/agents/fastapi/__init__.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/agents/fastapi/base.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/agents/fastapi/qna_routes.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/agents/flask/__init__.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/agents/flask/base.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/agents/flask/qna_routes.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/agents/flask/vac_routes.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/agents/langserve.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/agents/pubsub.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/agents/route.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/agents/special_commands.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/agents/swagger.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/archive/__init__.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/archive/archive.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/auth/__init__.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/auth/run.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/bots/__init__.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/bots/discord.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/bots/github_webhook.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/bots/webapp.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/chunker/__init__.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/chunker/doc_handling.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/chunker/images.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/chunker/loaders.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/chunker/message_data.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/chunker/pdfs.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/chunker/publish.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/chunker/splitter.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/cli/__init__.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/cli/chat_vac.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/cli/cli.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/cli/cli_init.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/cli/configs.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/cli/deploy.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/cli/embedder.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/cli/merge_texts.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/cli/run_proxy.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/cli/sun_rich.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/cli/swagger.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/components/__init__.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/components/llm.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/components/retriever.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/components/vectorstore.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/database/__init__.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/database/alloydb.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/database/alloydb_client.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/database/database.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/database/lancedb.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/database/sql/sb/create_function.sql +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/database/sql/sb/create_function_time.sql +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/database/sql/sb/create_table.sql +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/database/sql/sb/delete_source_row.sql +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/database/sql/sb/return_sources.sql +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/database/sql/sb/setup.sql +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/database/static_dbs.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/database/uuid.py +0 -0
- {sunholo-0.69.0/sunholo/langfuse → sunholo-0.69.3/sunholo/discovery_engine}/__init__.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/embedder/__init__.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/embedder/embed_chunk.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/gcs/__init__.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/gcs/add_file.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/gcs/download_url.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/gcs/metadata.py +0 -0
- {sunholo-0.69.0/sunholo/llamaindex → sunholo-0.69.3/sunholo/langfuse}/__init__.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/langfuse/callback.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/langfuse/prompts.py +0 -0
- {sunholo-0.69.0/sunholo/lookup → sunholo-0.69.3/sunholo/llamaindex}/__init__.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/llamaindex/generate.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/llamaindex/get_files.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/logging.py +0 -0
- {sunholo-0.69.0/sunholo/patches → sunholo-0.69.3/sunholo/lookup}/__init__.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/lookup/model_lookup.yaml +0 -0
- {sunholo-0.69.0/sunholo/patches/langchain → sunholo-0.69.3/sunholo/patches}/__init__.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/patches/langchain/lancedb.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/patches/langchain/vertexai.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/pubsub/__init__.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/pubsub/process_pubsub.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/pubsub/pubsub_manager.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/qna/__init__.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/qna/parsers.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/qna/retry.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/streaming/__init__.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/streaming/content_buffer.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/streaming/langserve.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/streaming/stream_lookup.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/streaming/streaming.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/summarise/__init__.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/summarise/summarise.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/utils/__init__.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/utils/api_key.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/utils/big_context.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/utils/config.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/utils/config_schema.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/utils/gcp.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/utils/gcp_project.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/utils/parsers.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/utils/timedelta.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/utils/user_ids.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/utils/version.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/vertex/__init__.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/vertex/init.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/vertex/memory_tools.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo/vertex/safety.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo.egg-info/dependency_links.txt +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo.egg-info/entry_points.txt +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo.egg-info/requires.txt +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/sunholo.egg-info/top_level.txt +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/tests/test_chat_history.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/tests/test_chunker.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/tests/test_config.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/tests/test_dispatch_to_qa.py +0 -0
- {sunholo-0.69.0 → sunholo-0.69.3}/tests/test_swagger.py +0 -0
{sunholo-0.69.0 → sunholo-0.69.3}/PKG-INFO

````diff
@@ -1,9 +1,9 @@
 Metadata-Version: 2.1
 Name: sunholo
-Version: 0.69.0
+Version: 0.69.3
 Summary: Large Language Model DevOps - a package to help deploy LLMs to the Cloud.
 Home-page: https://github.com/sunholo-data/sunholo-py
-Download-URL: https://github.com/sunholo-data/sunholo-py/archive/refs/tags/v0.69.0.tar.gz
+Download-URL: https://github.com/sunholo-data/sunholo-py/archive/refs/tags/v0.69.3.tar.gz
 Author: Holosun ApS
 Author-email: multivac@sunholo.com
 License: Apache License, Version 2.0
````
{sunholo-0.69.0 → sunholo-0.69.3}/sunholo/chunker/data_to_embed_pubsub.py

````diff
@@ -20,6 +20,8 @@ from .publish import process_docs_chunks_vector_name
 from .splitter import chunk_doc_to_docs
 
 from ..llamaindex.import_files import llamaindex_chunker_check
+from ..discovery_engine.chunker_handler import discovery_engine_chunker_check
+
 from . import loaders
 
 def direct_file_to_embed(file_name: pathlib.Path, metadata: dict, vector_name: str):
@@ -58,17 +60,20 @@ def process_chunker_data(message_data, metadata, vector_name):
     metadata["vector_name"] = vector_name
 
     if message_data is None:
-        log.error("No message_data was found in data: {
+        log.error(f"No message_data was found in data: {message_data}")
         return
 
     log.debug(f"Found metadata in pubsub: {metadata}")
 
     # checks if only a llamaindex chunking/embedder, return early as no other processing needed
     llamacheck = llamaindex_chunker_check(message_data, metadata, vector_name)
-
     if llamacheck:
-
         return llamacheck
+
+    # if only a discovery engine memory, return early as no other processing needed
+    discovery_check = discovery_engine_chunker_check(message_data, metadata, vector_name)
+    if discovery_check:
+        return discovery_check
 
     chunks = []
 
````
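The net effect of the two hunks above is that `process_chunker_data` now tries each memory-specific handler in turn and returns early if one claims the message. A minimal sketch of that dispatch pattern, where the two check functions are the real ones from this diff but the loop itself is illustrative rather than the package's actual code:

```python
from sunholo.llamaindex.import_files import llamaindex_chunker_check
from sunholo.discovery_engine.chunker_handler import discovery_engine_chunker_check

def dispatch(message_data, metadata, vector_name):
    # Mirrors the early-return logic in process_chunker_data: the first
    # handler that returns a truthy result owns the message.
    for check in (llamaindex_chunker_check, discovery_engine_chunker_check):
        result = check(message_data, metadata, vector_name)
        if result:
            return result
    return None  # fall through to the regular chunking pipeline
```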
sunholo-0.69.3/sunholo/discovery_engine/chunker_handler.py (new file)

````diff
@@ -0,0 +1,109 @@
+from ..logging import log
+from ..utils.config import load_config_key
+from ..components import load_memories
+
+from .discovery_engine_client import DiscoveryEngineClient
+
+
+def do_discovery_engine(message_data, metadata, vector_name):
+    """
+
+    Example:
+    ```python
+    message_data = "gs://bucket_name/path_to_file.txt"
+    metadata = {"user": "admin"}
+    vector_name = "example_vector"
+    response = do_discovery_engine(message_data, metadata, vector_name)
+    print(response)
+    # Imported file to corpus: {'status': 'success'}
+    ```
+    """
+
+    gcp_config = load_config_key("gcp_config", vector_name=vector_name, kind="vacConfig")
+    if not gcp_config:
+        raise ValueError(f"Need config.{vector_name}.gcp_config to configure discovery engine")
+
+    global_project_id = gcp_config.get('project_id')
+    #global_location = gcp_config.get('location')
+    global_data_store_id = gcp_config.get('data_store_id')
+
+    memories = load_memories(vector_name)
+    tools = []
+
+    if not memories:
+        return tools
+
+    corpuses = []
+    for memory in memories:
+        for key, value in memory.items():  # Now iterate over the dictionary
+            log.info(f"Found memory {key}")
+            vectorstore = value.get('vectorstore')
+            if vectorstore == "discovery_engine" or vectorstore == "vertex_ai_search":
+                log.info(f"Found vectorstore {vectorstore}")
+                data_store_id = value.get('data_store_id')
+                project_id = gcp_config.get('project_id')
+                #location = gcp_config.get('location')
+                corpus = DiscoveryEngineClient(
+                    data_store_id=data_store_id or global_data_store_id,
+                    project_id=project_id or global_project_id,
+                    # location needs to be 'eu' or 'us' which doesn't work with other configurations
+                    #location=location or global_location
+                )
+
+                corpuses.append(corpus)
+    if not corpuses:
+        log.error("Could not find any Discovery Engine corpus to import data to")
+        return None
+
+    log.info(f"Found Discovery Engine / Vertex AI Search {corpuses=}")
+
+    if message_data.startswith("gs://"):
+        log.info(f"DiscoveryEngineClient.import_files for {message_data}")
+        for corp in corpuses:
+            try:
+                response = corp.import_documents(
+                    gcs_uri=message_data
+                )
+                log.info(f"Imported file to corpus: {response} with metadata: {metadata}")
+            except Exception as err:
+                log.error(f"Error importing {message_data} - {corp=} - {str(err)}")
+                continue
+
+        metadata["source"] = message_data
+        return metadata
+
+    else:
+        log.warning("Only gs:// data is supported for Discovery Engine")
+
+
+def check_discovery_engine_in_memory(vector_name):
+    memories = load_config_key("memory", vector_name=vector_name, kind="vacConfig")
+    for memory in memories:  # Iterate over the list
+        for key, value in memory.items():  # Now iterate over the dictionary
+            log.info(f"Found memory {key}")
+            vectorstore = value.get('vectorstore')
+            if vectorstore:
+                if vectorstore == "discovery_engine" or vectorstore == "vertex_ai_search":
+                    log.info(f"Found vectorstore {vectorstore}")
+                    return True
+
+    return False
+
+def discovery_engine_chunker_check(message_data, metadata, vector_name):
+    # discovery engine handles its own chunking/embedding
+    memories = load_config_key("memory", vector_name=vector_name, kind="vacConfig")
+    total_memories = len(memories)
+    llama = None
+    if check_discovery_engine_in_memory(vector_name):
+        llama = do_discovery_engine(message_data, metadata, vector_name)
+        log.info(f"Processed discovery engine: {llama}")
+
+    # If discovery engine is the only entry, return
+    if llama and total_memories == 1:
+
+        return llama
+
+    elif llama:
+        log.info("Discovery Engine found but not the only memory, continuing with other processes.")
+
+    return None
````
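The nested loops in `do_discovery_engine` and `check_discovery_engine_in_memory` assume `load_config_key("memory", ...)` returns a list of single-key dicts. A sketch of that shape, with illustrative memory names and values (only the `vectorstore` and `data_store_id` keys are taken from the code above):

```python
# Hypothetical return value of load_config_key("memory", vector_name=..., kind="vacConfig"):
# a list of {memory_name: settings} dicts, as the loops above expect.
memories = [
    {
        "my_search_memory": {                   # illustrative name
            "vectorstore": "vertex_ai_search",  # or "discovery_engine"
            "data_store_id": "my-data-store",   # optional; falls back to gcp_config
        }
    },
]

for memory in memories:
    for key, value in memory.items():
        if value.get("vectorstore") in ("discovery_engine", "vertex_ai_search"):
            print(f"{key} routes to Discovery Engine")
```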
sunholo-0.69.3/sunholo/discovery_engine/create_new.py (new file)

````diff
@@ -0,0 +1,28 @@
+from .discovery_engine_client import DiscoveryEngineClient
+from ..utils.config import load_config_key
+
+def create_new_discovery_engine(vector_name):
+    gcp_config = load_config_key("gcp_config", vector_name=vector_name, kind="vacConfig")
+
+    chunker_config = load_config_key("chunker", vector_name=vector_name, kind="vacConfig")
+
+    if chunker_config:
+        chunk_size = chunker_config.get("chunk_size")
+
+        if not chunk_size:
+            chunk_size = 500
+
+
+    project_id = gcp_config.get('project_id')
+    #location = gcp_config.get('location')
+
+    de = DiscoveryEngineClient(
+        data_store_id=vector_name,
+        project_id=project_id,
+        # location needs to be 'eu' or 'us' which doesn't work with other configurations
+        #location=location
+    )
+
+    new_store = de.create_data_store(chunk_size=chunk_size)
+
+    return new_store
````
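A minimal usage sketch for the new helper, assuming a vacConfig that defines `gcp_config.project_id` for the named VAC. Note that `chunk_size` is only bound inside the `if chunker_config:` branch, so the call below also assumes a `chunker` section exists in the config:

```python
from sunholo.discovery_engine.create_new import create_new_discovery_engine

# "my_vector" is a placeholder VAC name; create_data_store() returns the
# name of the long-running data store creation operation.
operation_name = create_new_discovery_engine("my_vector")
print(operation_name)
```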
sunholo-0.69.0/sunholo/database/discovery_engine.py → sunholo-0.69.3/sunholo/discovery_engine/discovery_engine_client.py (renamed)

````diff
@@ -6,6 +6,7 @@ except ImportError:
     discoveryengine = None
 
 from ..logging import log
+from typing import Optional
 
 class DiscoveryEngineClient:
     """
@@ -19,7 +20,7 @@ class DiscoveryEngineClient:
     Example:
     ```python
    client = DiscoveryEngineClient(project_id='your-project-id', data_store_id='your-data-store-id')
-
+
     # Create a collection
     collection_name = client.create_collection("my_new_collection")
 
@@ -61,39 +62,13 @@ class DiscoveryEngineClient:
             else None
         )
         self.client = discoveryengine.DataStoreServiceClient(client_options=client_options)
-
-
-
-
-
-
-        Args:
-            collection_id (str): The ID of the collection to create.
-
-        Returns:
-            str: The resource name of the created collection.
-
-        Example:
-        ```python
-        collection_name = client.create_collection('my_new_collection')
-        ```
-        """
-
-        parent = self.client.data_store_path(
-            project=self.project_id, location=self.location, data_store=self.data_store_id
-        )
-
-        collection = discoveryengine.Collection(display_name=collection_id)
-        request = discoveryengine.CreateCollectionRequest(
-            parent=parent, collection_id=collection_id, collection=collection
+        self.parent = self.client.branch_path(
+            project=project_id,
+            location=location,
+            data_store=data_store_id,
+            branch="default_branch",
         )
 
-        operation = self.client.create_collection(request=request)
-        log.info(f"Waiting for operation to complete: {operation.operation.name}")
-        response = operation.result()
-
-        return response.name
-
     def create_data_store(
         self, chunk_size: int = 500
     ) -> str:
@@ -106,7 +81,6 @@ class DiscoveryEngineClient:
         Returns:
             str: The name of the long-running operation for data store creation.
         """
-        parent = self.client.common_location_path(project=self.project_id, location=self.location)
 
         # https://cloud.google.com/python/docs/reference/discoveryengine/latest/google.cloud.discoveryengine_v1alpha.types.DocumentProcessingConfig
         doc_config = discoveryengine.DocumentProcessingConfig(
@@ -137,7 +111,7 @@ class DiscoveryEngineClient:
 
         # https://cloud.google.com/python/docs/reference/discoveryengine/0.11.4/google.cloud.discoveryengine_v1alpha.types.CreateDataStoreRequest
         request = discoveryengine.CreateDataStoreRequest(
-            parent=parent,
+            parent=self.parent,
             data_store_id=self.data_store_id,
             data_store=data_store,
             # Optional: For Advanced Site Search Only
@@ -162,7 +136,6 @@ class DiscoveryEngineClient:
     def get_chunks(
         self,
         query: str,
-        collection_id: str,
         num_previous_chunks: int = 3,
         num_next_chunks: int = 3,
         page_size: int = 10,
@@ -196,13 +169,10 @@ class DiscoveryEngineClient:
                 serving_config="default_serving_config")
         ).name
 
-        filter = f'content_search=true AND collection_id="{collection_id}"'
-
        search_request = discoveryengine.SearchRequest(
             serving_config=serving_config,
             query=query,
             page_size=page_size,
-            filter=filter,
             content_search_spec=discoveryengine.SearchRequest.ContentSearchSpec(
                 #snippet_spec=discoveryengine.SearchRequest.ContentSearchSpec.SnippetSpec(
                 #    return_snippet=True
@@ -219,3 +189,55 @@ class DiscoveryEngineClient:
 
         return search_response
 
+    def import_documents(self,
+                         gcs_uri: Optional[str] = None,
+                         data_schema="content",
+                         bigquery_dataset: Optional[str] = None,
+                         bigquery_table: Optional[str] = None,
+                         bigquery_project_id: Optional[str] = None,
+        ) -> str:
+        """
+        Args:
+        - gcs_uri: Required. List of Cloud Storage URIs to input files. Each URI can be up to 2000 characters long. URIs can match the full object path (for example, gs://bucket/directory/object.json) or a pattern matching one or more files, such as gs://bucket/directory/*.json. A request can contain at most 100 files (or 100,000 files if data_schema is content). Each file can be up to 2 GB (or 100 MB if data_schema is content).
+        - data_schema: Must be one of 'user_event', 'custom' or 'document' if using BigQuery. Default 'content' only for GCS. The schema to use when parsing the data from the source. Supported values for document imports: - document (default): One JSON Document per line. Each document must have a valid Document.id. - content: Unstructured data (e.g. PDF, HTML). Each file matched by input_uris becomes a document, with the ID set to the first 128 bits of SHA256(URI) encoded as a hex string. - custom: One custom data JSON per row in arbitrary format that conforms to the defined Schema of the data store. This can only be used by the GENERIC Data Store vertical. - csv: A CSV file with header conforming to the defined Schema of the data store. Each entry after the header is imported as a Document. This can only be used by the GENERIC Data Store vertical. Supported values for user event imports: - user_event (default): One JSON UserEvent per line.
+
+        """
+
+        if gcs_uri:
+            request = discoveryengine.ImportDocumentsRequest(
+                parent=self.parent,
+                # https://cloud.google.com/python/docs/reference/discoveryengine/latest/google.cloud.discoveryengine_v1alpha.types.GcsSource
+                gcs_source=discoveryengine.GcsSource(
+                    input_uris=[gcs_uri], data_schema=data_schema,
+                ),
+                # Options: `FULL`, `INCREMENTAL`
+                reconciliation_mode=discoveryengine.ImportDocumentsRequest.ReconciliationMode.INCREMENTAL,
+            )
+        else:
+            request = discoveryengine.ImportDocumentsRequest(
+                parent=self.parent,
+                bigquery_source=discoveryengine.BigQuerySource(
+                    project_id=bigquery_project_id or self.project_id,
+                    dataset_id=bigquery_dataset,
+                    table_id=bigquery_table,
+                    data_schema=data_schema,
+                ),
+                # Options: `FULL`, `INCREMENTAL`
+                reconciliation_mode=discoveryengine.ImportDocumentsRequest.ReconciliationMode.INCREMENTAL,
+            )
+
+        # Make the request
+        operation = self.client.import_documents(request=request)
+
+        log.info(f"Waiting for operation to complete: {operation.operation.name}")
+        response = operation.result()
+
+        # Once the operation is complete,
+        # get information from operation metadata
+        metadata = discoveryengine.ImportDocumentsMetadata(operation.metadata)
+
+        # Handle the response
+        log.info(f"{response=} {metadata=}")
+
+        return operation.operation.name
+
````
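A hedged usage sketch for the new `import_documents` method, based only on the signature and docstring above. The project, data store, bucket, and BigQuery names are placeholders; per the comments in `__init__`, a `location` of 'eu' or 'us' may also need to be passed explicitly:

```python
from sunholo.discovery_engine.discovery_engine_client import DiscoveryEngineClient

client = DiscoveryEngineClient(
    project_id="my-project",        # placeholder
    data_store_id="my-data-store",  # placeholder
)

# GCS import: data_schema defaults to "content" (unstructured PDF/HTML etc.)
op_name = client.import_documents(gcs_uri="gs://my-bucket/docs/*.pdf")

# BigQuery import: data_schema must be 'user_event', 'custom' or 'document'
op_name = client.import_documents(
    data_schema="document",
    bigquery_dataset="my_dataset",  # placeholder
    bigquery_table="my_table",      # placeholder
)
print(op_name)  # the long-running operation name
```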
{sunholo-0.69.0 → sunholo-0.69.3}/sunholo/llamaindex/import_files.py

````diff
@@ -105,28 +105,8 @@ def do_llamaindex(message_data, metadata, vector_name):
         return metadata
 
     else:
-
-
-        # TODO(developer): Update and un-comment below lines
-        # project_id = "PROJECT_ID"
-        # corpus_name = "projects/{project_id}/locations/us-central1/ragCorpora/{rag_corpus_id}"
-        # path = "path/to/local/file.txt"
-        # display_name = "file_display_name"
-        # description = "file description"
-
-        # Initialize Vertex AI API once per session
-        #path = 'path/to/local/file.txt'
-
-        # Write the message_data to a file
-        #with open(path, 'w') as file:
-        #    file.write(message_data)
-
-        #rag_file = rag.upload_file(
-        #    corpus_name=corpus_name,
-        #    path=path,
-        #    display_name=display_name,
-        #    description=description,
-        #)
+        log.warning("Only gs:// and https://drive data is supported for llamaindex")
+
 
 def check_llamaindex_in_memory(vector_name):
     memories = load_config_key("memory", vector_name=vector_name, kind="vacConfig")
````
sunholo-0.69.3/sunholo/patches/langchain/__init__.py

File without changes
{sunholo-0.69.0 → sunholo-0.69.3}/sunholo.egg-info/PKG-INFO

````diff
@@ -1,9 +1,9 @@
 Metadata-Version: 2.1
 Name: sunholo
-Version: 0.69.0
+Version: 0.69.3
 Summary: Large Language Model DevOps - a package to help deploy LLMs to the Cloud.
 Home-page: https://github.com/sunholo-data/sunholo-py
-Download-URL: https://github.com/sunholo-data/sunholo-py/archive/refs/tags/v0.69.0.tar.gz
+Download-URL: https://github.com/sunholo-data/sunholo-py/archive/refs/tags/v0.69.3.tar.gz
 Author: Holosun ApS
 Author-email: multivac@sunholo.com
 License: Apache License, Version 2.0
````
{sunholo-0.69.0 → sunholo-0.69.3}/sunholo.egg-info/SOURCES.txt

````diff
@@ -62,7 +62,6 @@ sunholo/database/__init__.py
 sunholo/database/alloydb.py
 sunholo/database/alloydb_client.py
 sunholo/database/database.py
-sunholo/database/discovery_engine.py
 sunholo/database/lancedb.py
 sunholo/database/static_dbs.py
 sunholo/database/uuid.py
@@ -72,6 +71,10 @@ sunholo/database/sql/sb/create_table.sql
 sunholo/database/sql/sb/delete_source_row.sql
 sunholo/database/sql/sb/return_sources.sql
 sunholo/database/sql/sb/setup.sql
+sunholo/discovery_engine/__init__.py
+sunholo/discovery_engine/chunker_handler.py
+sunholo/discovery_engine/create_new.py
+sunholo/discovery_engine/discovery_engine_client.py
 sunholo/embedder/__init__.py
 sunholo/embedder/embed_chunk.py
 sunholo/gcs/__init__.py
````
All remaining files (the +0 -0 entries in the file list above) are without changes.