sunholo 0.119.4__tar.gz → 0.119.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sunholo-0.119.4/src/sunholo.egg-info → sunholo-0.119.7}/PKG-INFO +1 -1
- {sunholo-0.119.4 → sunholo-0.119.7}/pyproject.toml +1 -1
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/cli/cli.py +3 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/discovery_engine/chunker_handler.py +2 -1
- sunholo-0.119.7/src/sunholo/discovery_engine/cli.py +245 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/discovery_engine/discovery_engine_client.py +127 -10
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/embedder/embed_chunk.py +2 -19
- sunholo-0.119.7/src/sunholo/embedder/embed_metadata.py +27 -0
- {sunholo-0.119.4 → sunholo-0.119.7/src/sunholo.egg-info}/PKG-INFO +1 -1
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo.egg-info/SOURCES.txt +2 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/LICENSE.txt +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/MANIFEST.in +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/README.md +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/setup.cfg +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/agents/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/agents/chat_history.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/agents/dispatch_to_qa.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/agents/fastapi/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/agents/fastapi/base.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/agents/fastapi/qna_routes.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/agents/flask/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/agents/flask/base.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/agents/flask/qna_routes.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/agents/flask/vac_routes.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/agents/langserve.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/agents/pubsub.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/agents/route.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/agents/special_commands.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/agents/swagger.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/archive/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/archive/archive.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/auth/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/auth/gcloud.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/auth/refresh.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/auth/run.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/azure/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/azure/auth.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/azure/blobs.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/azure/event_grid.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/bots/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/bots/discord.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/bots/github_webhook.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/bots/webapp.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/chunker/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/chunker/azure.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/chunker/doc_handling.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/chunker/encode_metadata.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/chunker/images.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/chunker/loaders.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/chunker/message_data.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/chunker/pdfs.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/chunker/process_chunker_data.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/chunker/publish.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/chunker/pubsub.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/chunker/splitter.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/cli/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/cli/chat_vac.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/cli/cli_init.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/cli/configs.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/cli/deploy.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/cli/embedder.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/cli/merge_texts.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/cli/run_proxy.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/cli/sun_rich.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/cli/swagger.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/cli/vertex.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/components/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/components/llm.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/components/retriever.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/components/vectorstore.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/custom_logging.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/database/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/database/alloydb.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/database/alloydb_client.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/database/database.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/database/lancedb.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/database/sql/sb/create_function.sql +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/database/sql/sb/create_function_time.sql +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/database/sql/sb/create_table.sql +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/database/sql/sb/delete_source_row.sql +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/database/sql/sb/return_sources.sql +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/database/sql/sb/setup.sql +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/database/static_dbs.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/database/uuid.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/discovery_engine/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/discovery_engine/create_new.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/discovery_engine/get_ai_search_chunks.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/embedder/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/excel/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/excel/plugin.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/gcs/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/gcs/add_file.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/gcs/download_folder.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/gcs/download_url.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/gcs/extract_and_sign.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/gcs/metadata.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/genai/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/genai/file_handling.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/genai/genaiv2.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/genai/images.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/genai/init.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/genai/process_funcs_cls.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/genai/safety.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/invoke/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/invoke/async_class.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/invoke/direct_vac_func.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/invoke/invoke_vac_utils.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/langchain_types.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/langfuse/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/langfuse/callback.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/langfuse/evals.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/langfuse/prompts.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/llamaindex/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/llamaindex/get_files.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/llamaindex/import_files.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/llamaindex/llamaindex_class.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/llamaindex/user_history.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/lookup/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/lookup/model_lookup.yaml +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/mcp/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/mcp/cli.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/pubsub/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/pubsub/process_pubsub.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/pubsub/pubsub_manager.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/qna/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/qna/parsers.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/qna/retry.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/senses/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/senses/stream_voice.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/streaming/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/streaming/content_buffer.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/streaming/langserve.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/streaming/stream_lookup.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/streaming/streaming.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/summarise/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/summarise/summarise.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/templates/agent/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/templates/agent/agent_service.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/templates/agent/app.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/templates/agent/my_log.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/templates/agent/tools/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/templates/agent/tools/your_agent.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/templates/agent/vac_service.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/templates/project/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/templates/project/app.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/templates/project/my_log.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/templates/project/vac_service.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/templates/system_services/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/templates/system_services/app.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/templates/system_services/my_log.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/terraform/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/terraform/tfvars_editor.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/tools/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/tools/web_browser.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/utils/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/utils/api_key.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/utils/big_context.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/utils/config.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/utils/config_class.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/utils/config_schema.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/utils/gcp.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/utils/gcp_project.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/utils/mime.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/utils/parsers.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/utils/timedelta.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/utils/user_ids.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/utils/version.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/vertex/__init__.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/vertex/extensions_call.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/vertex/extensions_class.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/vertex/genai_functions.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/vertex/init.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/vertex/memory_tools.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/vertex/safety.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo/vertex/type_dict_to_json.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo.egg-info/dependency_links.txt +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo.egg-info/entry_points.txt +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo.egg-info/requires.txt +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/src/sunholo.egg-info/top_level.txt +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/tests/test_async.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/tests/test_async_genai2.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/tests/test_chat_history.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/tests/test_config.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/tests/test_genai2.py +0 -0
- {sunholo-0.119.4 → sunholo-0.119.7}/tests/test_unstructured.py +0 -0

--- sunholo-0.119.4/src/sunholo/cli/cli.py
+++ sunholo-0.119.7/src/sunholo/cli/cli.py
@@ -15,6 +15,7 @@ from ..excel import setup_excel_subparser
 from ..terraform import setup_tfvarseditor_subparser
 from ..senses.stream_voice import setup_tts_subparser
 from ..mcp.cli import setup_mcp_subparser
+from ..discovery_engine.cli import setup_discovery_engine_subparser

 from ..utils import ConfigManager
 from ..utils.version import sunholo_version
@@ -105,6 +106,8 @@ def main(args=None):
     setup_tts_subparser(subparsers)
     # anthropic MCP
     setup_mcp_subparser(subparsers)
+    # discovery engine
+    setup_discovery_engine_subparser(subparsers)

     #TODO: add database setup commands: alloydb and supabase


--- sunholo-0.119.4/src/sunholo/discovery_engine/chunker_handler.py
+++ sunholo-0.119.7/src/sunholo/discovery_engine/chunker_handler.py
@@ -5,7 +5,7 @@ from ..components import load_memories

 from .discovery_engine_client import DiscoveryEngineClient
 from .create_new import create_new_discovery_engine
-
+from ..embedder.embed_metadata import audit_metadata

 def do_discovery_engine(message_data:str, metadata:dict, config:ConfigManager=None):
     """
@@ -65,6 +65,7 @@ def do_discovery_engine(message_data:str, metadata:dict, config:ConfigManager=No
         return None
     for corp in corpuses:
         try:
+            metadata = audit_metadata(metadata, chunk_length=500)
            response = corp.import_document_with_metadata(
                gcs_uri=message_data,
                metadata=metadata

--- /dev/null
+++ sunholo-0.119.7/src/sunholo/discovery_engine/cli.py
@@ -0,0 +1,245 @@
+import json
+
+try:
+    from ..cli.sun_rich import console
+except ImportError:
+    console = None
+
+from ..custom_logging import log
+
+# Make sure to adjust the relative import path if needed
+from .discovery_engine_client import DiscoveryEngineClient
+
+def discovery_engine_command(args):
+    """
+    Handles the `discovery-engine` command and its subcommands.
+    """
+    if args.subcommand == 'create-datastore':
+        create_datastore_command(args)
+    elif args.subcommand == 'import-documents':
+        import_documents_command(args)
+    elif args.subcommand == 'import-documents-with-metadata':
+        import_documents_with_metadata_command(args)
+    elif args.subcommand == 'import-document-with-metadata':
+        import_document_with_metadata_command(args)
+    elif args.subcommand == 'search':
+        search_command(args)
+    elif args.subcommand == 'search-by-id-and-or-date':
+        search_by_id_and_or_date_command(args)
+    else:
+        console.print(f"[bold red]Unknown Discovery Engine subcommand: {args.subcommand}[/bold red]")
+
+def create_datastore_command(args):
+    """
+    Handles the `discovery-engine create-datastore` subcommand.
+    """
+    try:
+        client = DiscoveryEngineClient(
+            project_id=args.project,
+            data_store_id=args.data_store_id,
+            location=args.location
+        )
+        operation_name = client.create_data_store(
+            type=args.type,
+            chunk_size=args.chunk_size,
+            collection=args.collection
+        )
+        console.print(f"[bold green]Datastore creation initiated. Operation name: {operation_name}[/bold green]")
+    except Exception as e:
+        console.print(f"[bold red]Error creating datastore: {e}[/bold red]")
+
+def import_documents_command(args):
+    """
+    Handles the `discovery-engine import-documents` subcommand.
+    """
+    try:
+        client = DiscoveryEngineClient(
+            project_id=args.project,
+            data_store_id=args.data_store_id,
+            location=args.location
+        )
+        operation_name = client.import_documents(
+            gcs_uri=args.gcs_uri,
+            data_schema=args.data_schema,
+            branch=args.branch,
+            bigquery_dataset=args.bigquery_dataset,
+            bigquery_table=args.bigquery_table,
+            bigquery_project_id=args.bigquery_project_id
+        )
+        console.print(f"[bold green]Document import initiated. Operation name: {operation_name}[/bold green]")
+    except Exception as e:
+        console.print(f"[bold red]Error importing documents: {e}[/bold red]")
+
+def import_documents_with_metadata_command(args):
+    """
+    Handles the `discovery-engine import-documents-with-metadata` subcommand.
+    """
+    try:
+        client = DiscoveryEngineClient(
+            project_id=args.project,
+            data_store_id=args.data_store_id,
+            location=args.location
+        )
+        operation_name = client.import_documents_with_metadata(
+            gcs_uri=args.gcs_uri,
+            data_schema=args.data_schema,
+            branch=args.branch
+        )
+        console.print(f"[bold green]Document import with metadata initiated. Operation name: {operation_name}[/bold green]")
+    except Exception as e:
+        console.print(f"[bold red]Error importing documents with metadata: {e}[/bold red]")
+
+def import_document_with_metadata_command(args):
+    """
+    Handles the `discovery-engine import-document-with-metadata` subcommand.
+    """
+    try:
+        # Load metadata from JSON file or string
+        if args.metadata_file:
+            with open(args.metadata_file, 'r') as f:
+                metadata = json.load(f)
+        elif args.metadata_string:
+            metadata = json.loads(args.metadata_string)
+        else:
+            console.print("[bold red]Error: Must provide either --metadata-file or --metadata-string[/bold red]")
+            return
+
+        client = DiscoveryEngineClient(
+            project_id=args.project,
+            data_store_id=args.data_store_id,
+            location=args.location
+        )
+        operation_name = client.import_document_with_metadata(
+            gcs_uri=args.gcs_uri,
+            metadata=metadata,
+            branch=args.branch
+        )
+        console.print(f"[bold green]Document import with metadata initiated. Operation name: {operation_name}[/bold green]")
+    except Exception as e:
+        console.print(f"[bold red]Error importing document with metadata: {e}[/bold red]")
+
+def search_command(args):
+    """
+    Handles the `discovery-engine search` subcommand.
+    """
+    try:
+        client = DiscoveryEngineClient(
+            project_id=args.project,
+            data_store_id=args.data_store_id,
+            location=args.location
+        )
+        results = client.get_chunks(
+            query=args.query,
+            num_previous_chunks=args.num_previous_chunks,
+            num_next_chunks=args.num_next_chunks,
+            page_size=args.page_size,
+            parse_chunks_to_string=args.parse_chunks_to_string,
+            serving_config=args.serving_config,
+            data_store_ids=args.data_store_ids
+        )
+
+        if args.parse_chunks_to_string:
+            console.print(results) # Print the combined string
+        else:
+            # Process and print the results (assuming it's a SearchResponse object)
+            for result in results.results:
+                for chunk in result.document.chunks:
+                    console.print(f"Chunk: {chunk.snippet}, document name: {chunk.document_name}")
+    except Exception as e:
+        console.print(f"[bold red]Error searching: {e}[/bold red]")
+
+def search_by_id_and_or_date_command(args):
+    """
+    Handles the `discovery-engine search-by-id-and-or-date` subcommand.
+    """
+    try:
+        client = DiscoveryEngineClient(
+            project_id=args.project,
+            data_store_id=args.data_store_id,
+            location=args.location
+        )
+        results = client.search_by_objectId_and_or_date(
+            query=args.query,
+            objectId=args.object_id,
+            date=args.date,
+            num_previous_chunks=args.num_previous_chunks,
+            num_next_chunks=args.num_next_chunks,
+            page_size=args.page_size,
+            parse_chunks_to_string=args.parse_chunks_to_string,
+            serving_config=args.serving_config,
+            data_store_ids=args.data_store_ids
+        )
+
+        if args.parse_chunks_to_string:
+            console.print(results) # Print the combined string
+        else:
+            # Process and print the results (assuming it's a SearchResponse object)
+            for result in results.results:
+                for chunk in result.document.chunks:
+                    console.print(f"Chunk: {chunk.snippet}, document name: {chunk.document_name}")
+    except Exception as e:
+        console.print(f"[bold red]Error searching by ID and/or date: {e}[/bold red]")
+
+def setup_discovery_engine_subparser(subparsers):
+    """
+    Sets up the `discovery-engine` subparser and its subcommands.
+    """
+    discovery_engine_parser = subparsers.add_parser('discovery-engine', help='Interact with Google Cloud Discovery Engine')
+    discovery_engine_subparsers = discovery_engine_parser.add_subparsers(dest='subcommand', required=True)
+
+    # Create Datastore subcommand
+    create_datastore_parser = discovery_engine_subparsers.add_parser('create-datastore', help='Create a new Discovery Engine datastore')
+    create_datastore_parser.add_argument('--data-store-id', required=True, help='The ID of the datastore')
+    create_datastore_parser.add_argument('--type', choices=['chunk'], default='chunk', help='The type of datastore to create')
+    create_datastore_parser.add_argument('--chunk-size', type=int, default=500, help='The size of the chunks for documents (if applicable)')
+    create_datastore_parser.add_argument('--collection', default='default_collection', help='The collection to create the datastore in')
+    create_datastore_parser.set_defaults(func=discovery_engine_command)
+
+    # Import Documents subcommand
+    import_documents_parser = discovery_engine_subparsers.add_parser('import-documents', help='Import documents into a Discovery Engine datastore')
+    import_documents_parser.add_argument('--gcs-uri', required=True, help='The GCS URI of the documents to import')
+    import_documents_parser.add_argument('--data-schema', default='content', help='The schema of the data to import')
+    import_documents_parser.add_argument('--branch', default='default_branch', help='The branch to import the documents into')
+    import_documents_parser.add_argument('--bigquery-dataset', help='The BigQuery dataset ID (if applicable)')
+    import_documents_parser.add_argument('--bigquery-table', help='The BigQuery table ID (if applicable)')
+    import_documents_parser.add_argument('--bigquery-project-id', help='The project ID of the BigQuery dataset (if applicable)')
+    import_documents_parser.set_defaults(func=discovery_engine_command)
+
+    # Import Documents with Metadata subcommand
+    import_documents_with_metadata_parser = discovery_engine_subparsers.add_parser('import-documents-with-metadata', help='Import documents with metadata into a Discovery Engine datastore')
+    import_documents_with_metadata_parser.add_argument('--gcs-uri', required=True, help='The GCS URI of the documents to import (JSONL format with metadata)')
+    import_documents_with_metadata_parser.add_argument('--data-schema', default='content', help='The schema of the data to import')
+    import_documents_with_metadata_parser.add_argument('--branch', default='default_branch', help='The branch to import the documents into')
+    import_documents_with_metadata_parser.set_defaults(func=discovery_engine_command)
+
+    # Import Document with Metadata subcommand
+    import_document_with_metadata_parser = discovery_engine_subparsers.add_parser('import-document-with-metadata', help='Import a single document with metadata into a Discovery Engine datastore')
+    import_document_with_metadata_parser.add_argument('--gcs-uri', required=True, help='The GCS URI of the document to import')
+    import_document_with_metadata_parser.add_argument('--metadata-file', help='The path to a JSON file containing the metadata')
+    import_document_with_metadata_parser.add_argument('--metadata-string', help='A JSON string containing the metadata')
+    import_document_with_metadata_parser.add_argument('--branch', default='default_branch', help='The branch to import the document into')
+    import_document_with_metadata_parser.set_defaults(func=discovery_engine_command)
+
+    # Search subcommand
+    search_parser = discovery_engine_subparsers.add_parser('search', help='Search a Discovery Engine datastore')
+    search_parser.add_argument('--query', required=True, help='The search query')
+    search_parser.add_argument('--num-previous-chunks', type=int, default=3, help='Number of previous chunks to return for context')
+    search_parser.add_argument('--num-next-chunks', type=int, default=3, help='Number of next chunks to return for context')
+    search_parser.add_argument('--page-size', type=int, default=10, help='The maximum number of results to return per page')
+    search_parser.add_argument('--parse-chunks-to-string', action='store_true', help='Combine chunks into a single string')
+    search_parser.add_argument('--serving-config', default='default_serving_config', help='The serving configuration to use')
+    search_parser.add_argument('--data-store-ids', nargs='+', help='List of data store IDs to search (optional)')
+    search_parser.set_defaults(func=discovery_engine_command)
+
+    # Search by ID and/or Date subcommand
+    search_by_id_and_or_date_parser = discovery_engine_subparsers.add_parser('search-by-id-and-or-date', help='Search a Discovery Engine datastore by object ID and/or date')
+    search_by_id_and_or_date_parser.add_argument('--query', required=True, help='The search query')
+    search_by_id_and_or_date_parser.add_argument('--object-id', help='The exact object ID to filter by')
+    search_by_id_and_or_date_parser.add_argument('--date', help='The date to filter by (YYYY-MM-DD)')
+    search_by_id_and_or_date_parser.add_argument('--num-previous-chunks', type=int, default=3, help='Number of previous chunks to return for context')
+    search_by_id_and_or_date_parser.add_argument('--num-next-chunks', type=int, default=3, help='Number of next chunks to return for context')
+    search_by_id_and_or_date_parser.add_argument('--page-size', type=int, default=10, help='The maximum number of results to return per page')
+    search_by_id_and_or_date_parser.add_argument('--parse-chunks-to-string', action='store_true', help='Combine chunks into a single string')
+    search_by_id_and_or_date_parser.add_argument('--serving-config', default='default_serving_config', help='The serving configuration to use')
+    search_by_id_and_or_date_parser.add_argument('--data-store-ids', nargs='+', help='List of data store IDs to search (optional)')
+    search_by_id_and_or_date_parser.set_defaults(func=discovery_engine_command)
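
The new module registers a `discovery-engine` parser via `setup_discovery_engine_subparser` and dispatches through `set_defaults(func=discovery_engine_command)`. A minimal sketch of the wiring attached to a standalone argparse parser (an illustrative harness, not the actual `sunholo` entry point; `--project` and `--location` are assumed to be supplied by the main CLI and are omitted here):

    import argparse
    from sunholo.discovery_engine.cli import setup_discovery_engine_subparser, discovery_engine_command

    parser = argparse.ArgumentParser(prog="sunholo")
    subparsers = parser.add_subparsers(dest="command", required=True)
    setup_discovery_engine_subparser(subparsers)

    # Roughly equivalent to: sunholo discovery-engine search --query "quarterly revenue" --parse-chunks-to-string
    args = parser.parse_args(["discovery-engine", "search", "--query", "quarterly revenue", "--parse-chunks-to-string"])
    assert args.func is discovery_engine_command  # dispatch target set by set_defaults(func=...)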

--- sunholo-0.119.4/src/sunholo/discovery_engine/discovery_engine_client.py
+++ sunholo-0.119.7/src/sunholo/discovery_engine/discovery_engine_client.py
@@ -177,6 +177,15 @@ class DiscoveryEngineClient:

         return operation.operation.name

+    def _search_data_store_path(self,
+                                data_store_id: str,
+                                collection_id: str = "default_collection",
+                                serving_config: str = "default_serving_config"):
+        if data_store_id.startswith("projects/"):
+            return data_store_id # Already a full path
+
+        return f"projects/{self.project_id}/locations/{self.location}/collections/{collection_id}/dataStores/{data_store_id}"
+
     def get_chunks(
         self,
         query: str,
@@ -185,6 +194,7 @@
         page_size: int = 10,
         parse_chunks_to_string: bool = True,
         serving_config: str = "default_serving_config",
+        data_store_ids: Optional[List[str]] = None,
     ):
         """Retrieves chunks or documents based on a query.

@@ -196,6 +206,7 @@
             page_size (int, optional): The maximum number of results to return per page (default is 10).
             parse_chunks_to_string: If True will put chunks in one big string, False will return object
             serving_config: The resource name of the Search serving config
+            data_store_ids: If you want to search over many data stores, not just the one that was used to init the class. They should be of the format projects/{project}/locations/{location}/collections/{collection_id}/dataStores/{data_store_id}

         Returns:
             discoveryengine.SearchResponse: The search response object containing the search results.
@@ -216,7 +227,6 @@
             serving_config
         )

-
         search_request = discoveryengine.SearchRequest(
             serving_config=serving_config_path,
             query=query,
@@ -230,9 +240,21 @@
             ),
         )

-
-
-
+        if data_store_ids:
+            search_request.data_store_specs = [
+                discoveryengine.SearchRequest.DataStoreSpec(
+                    data_store=self._search_data_store_path(data_store_id, serving_config=serving_config)
+                )
+                for data_store_id in data_store_ids
+            ]
+
+        try:
+            log.info(f"Discovery engine request: {search_request=}")
+            search_response = self.search_client.search(search_request)
+        except Exception as err:
+            log.warning(f"Error searching {search_request=} - no results found? {str(err)}")
+            search_response = []
+
         if parse_chunks_to_string:

             big_string = self.process_chunks(search_response)
@@ -251,6 +273,7 @@
         page_size: int = 10,
         parse_chunks_to_string: bool = True,
         serving_config: str = "default_serving_config",
+        data_store_ids: Optional[List[str]] = None,
     ):
         """Retrieves chunks or documents based on a query.

@@ -262,6 +285,7 @@
             page_size (int, optional): The maximum number of results to return per page (default is 10).
             parse_chunks_to_string: If True will put chunks in one big string, False will return object
             serving_config: The resource name of the Search serving config
+            data_store_ids: If you want to search over many data stores, not just the one that was used to init the class. They should be of the format projects/{project}/locations/{location}/collections/{collection_id}/dataStores/{data_store_id}

         Returns:
             discoveryengine.SearchResponse: The search response object containing the search results.
@@ -296,9 +320,19 @@
             ),
         )

-
-
-
+        if data_store_ids:
+            search_request.data_store_specs = [
+                discoveryengine.SearchRequest.DataStoreSpec(data_store=data_store_id)
+                for data_store_id in data_store_ids
+            ]
+
+        try:
+            log.info(f"Discovery engine request: {search_request=}")
+            search_response = self.async_search_client.search(search_request)
+        except Exception as err:
+            log.warning(f"Error searching {search_request=} - no results found? {str(err)}")
+            search_response = []
+
         if parse_chunks_to_string:

             big_string = await self.async_process_chunks(search_response)
@@ -623,8 +657,91 @@
     def get_mime_type(self, uri:str):
         return guess_mime_type(uri)

-    def search_with_filters(self, query,
+    def search_with_filters(self, query, filter_str=None,
                             num_previous_chunks=3, num_next_chunks=3,
                             page_size=10, parse_chunks_to_string=True,
-                            serving_config="default_serving_config"
-
+                            serving_config="default_serving_config",
+                            data_store_ids: Optional[List[str]] = None):
+        """
+        Searches with a generic filter string.
+
+        Args:
+            query (str): The search query.
+            filter_str (str, optional): The filter string to apply (e.g., "source LIKE 'my_source' AND eventTime > TIMESTAMP('2024-01-01')").
+            #... other parameters from get_chunks
+
+        Returns:
+            discoveryengine.SearchResponse or str: The search response object or string of chunks.
+        """
+
+        serving_config_path = self.search_client.serving_config_path(
+            self.project_id,
+            self.location,
+            self.data_store_id,
+            serving_config
+        )
+
+        search_request = discoveryengine.SearchRequest(
+            serving_config=serving_config_path,
+            query=query,
+            page_size=page_size,
+            content_search_spec=discoveryengine.SearchRequest.ContentSearchSpec(
+                search_result_mode="CHUNKS",
+                chunk_spec=discoveryengine.SearchRequest.ContentSearchSpec.ChunkSpec(
+                    num_previous_chunks=num_previous_chunks,
+                    num_next_chunks=num_next_chunks,
+                ),
+            ),
+            filter=filter_str # name:'ANY("king kong")'
+        )
+
+        if data_store_ids:
+            search_request.data_store_specs = [
+                discoveryengine.SearchRequest.DataStoreSpec(
+                    data_store=self._search_data_store_path(data_store_id, serving_config=serving_config)
+                )
+                for data_store_id in data_store_ids
+            ]
+
+
+
+        log.info(f"Discovery engine request with filter: {search_request=}")
+        try:
+            search_response = self.search_client.search(search_request)
+        except Exception as e:
+            log.info(f"No results {search_request.data_store_specs=}: {str(e)}")
+            return None
+
+        if parse_chunks_to_string:
+            big_string = self.process_chunks(search_response)
+            log.info(f"Discovery engine chunks string sample: {big_string[:100]}")
+            return big_string
+
+        log.info("Discovery engine response object")
+        return search_response
+
+    def search_by_objectId_and_or_date(self, query, objectId=None, date=None, **kwargs):
+        """
+        Searches and filters by objectId (exact match) and/or date.
+
+        Args:
+            query (str): The search query.
+            objectId (str, optional): The exact objectId to filter by.
+            date (str, optional): The literal_iso_8601_datetime_format date to filter by e.g. 2025-02-24T12:25:30.123Z
+            **kwargs: Additional keyword arguments to pass to `search_with_filters`.
+
+        Returns:
+            list: A list of search results.
+        """
+        filter_clauses = []
+        if objectId:
+            filter_clauses.append(f'objectId: ANY("{objectId}")')
+        if date:
+            filter_clauses.append(f'eventTime >= "{date}"')
+
+        if filter_clauses:
+            filter_str = " AND ".join(filter_clauses) # Combine with AND
+            return self.search_with_filters(query, filter_str, **kwargs)
+        else:
+            # No filters, perform regular search
+            return self.search_with_filters(query, **kwargs)
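
Taken together, these changes let a caller fan a single query out over several data stores and apply server-side filters. A minimal usage sketch (the project, location and data store names below are invented, and a configured Google Cloud environment is assumed):

    from sunholo.discovery_engine.discovery_engine_client import DiscoveryEngineClient

    client = DiscoveryEngineClient(
        project_id="my-project",       # hypothetical project
        data_store_id="my-datastore",  # hypothetical data store
        location="eu",
    )

    # Short IDs are expanded to full resource names by _search_data_store_path()
    chunks = client.get_chunks(
        "quarterly revenue",
        data_store_ids=["my-datastore", "another-datastore"],
    )

    # Filter by exact objectId and/or an eventTime lower bound; the clauses are joined with AND
    filtered = client.search_by_objectId_and_or_date(
        "quarterly revenue",
        objectId="gs://my-bucket/report.pdf",
        date="2025-02-24T12:25:30.123Z",
    )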

--- sunholo-0.119.4/src/sunholo/embedder/embed_chunk.py
+++ sunholo-0.119.7/src/sunholo/embedder/embed_chunk.py
@@ -26,6 +26,7 @@ from ..components import get_embeddings, pick_vectorstore, load_memories, pick_e
 from ..custom_logging import log
 from ..database.uuid import generate_uuid_from_object_id
 from ..utils import ConfigManager
+from .embed_metadata import audit_metadata

 def embed_pubsub_chunk(data: dict):
     """Triggered from a message on a Cloud Pub/Sub topic "embed_chunk" topic
@@ -75,25 +76,7 @@

     log.info(f"Embedding: {vector_name} page_content: {page_content[:30]}...[{len(page_content)}] - {metadata}")

-
-    metadata['eventTime'] = datetime.datetime.now().isoformat(timespec='microseconds') + "Z"
-    metadata['eventtime'] = metadata['eventTime']
-
-    if 'source' not in metadata:
-        if 'objectId' in metadata:
-            metadata['source'] = metadata['objectId']
-        elif 'url' in metadata:
-            metadata['source'] = metadata['url']
-        else:
-            log.warning(f"No source found in metadata: {metadata}")
-
-    if 'original_source' not in metadata:
-        metadata['original_source'] = metadata.get('source')
-    else:
-        metadata['source'] = metadata['original_source']
-
-    if 'chunk_length' not in metadata:
-        metadata['chunk_length'] = len(page_content)
+    metadata = audit_metadata(metadata, chunk_length=len(page_content))

     if 'doc_id' not in metadata:
         log.warning(f"No doc_id found in metadata for {metadata['source']}- creating one")

--- /dev/null
+++ sunholo-0.119.7/src/sunholo/embedder/embed_metadata.py
@@ -0,0 +1,27 @@
+
+import datetime
+from ..custom_logging import log
+
+def audit_metadata(metadata, chunk_length=None):
+
+    if 'eventTime' not in metadata:
+        metadata['eventTime'] = datetime.datetime.now().isoformat(timespec='microseconds') + "Z"
+        metadata['eventtime'] = metadata['eventTime']
+
+    if 'source' not in metadata:
+        if 'objectId' in metadata:
+            metadata['source'] = metadata['objectId']
+        elif 'url' in metadata:
+            metadata['source'] = metadata['url']
+        else:
+            log.warning(f"No source found in metadata: {metadata}")
+
+    if 'original_source' not in metadata:
+        metadata['original_source'] = metadata.get('source')
+    else:
+        metadata['source'] = metadata['original_source']
+
+    if 'chunk_length' not in metadata:
+        metadata['chunk_length'] = chunk_length
+
+    return metadata
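
`audit_metadata` centralises the metadata defaulting that previously lived inline in `embed_chunk.py`, and is now also called from the Discovery Engine chunker handler. An illustrative call (the object name is invented):

    from sunholo.embedder.embed_metadata import audit_metadata

    meta = audit_metadata({"objectId": "gs://my-bucket/report.pdf"}, chunk_length=500)
    # Per the implementation above, the returned dict gains:
    #   source / original_source -> "gs://my-bucket/report.pdf" (copied from objectId)
    #   chunk_length             -> 500
    #   eventTime / eventtime    -> current timestamp in ISO-8601 format with a trailing "Z"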

--- sunholo-0.119.4/src/sunholo.egg-info/SOURCES.txt
+++ sunholo-0.119.7/src/sunholo.egg-info/SOURCES.txt
@@ -83,11 +83,13 @@ src/sunholo/database/sql/sb/return_sources.sql
 src/sunholo/database/sql/sb/setup.sql
 src/sunholo/discovery_engine/__init__.py
 src/sunholo/discovery_engine/chunker_handler.py
+src/sunholo/discovery_engine/cli.py
 src/sunholo/discovery_engine/create_new.py
 src/sunholo/discovery_engine/discovery_engine_client.py
 src/sunholo/discovery_engine/get_ai_search_chunks.py
 src/sunholo/embedder/__init__.py
 src/sunholo/embedder/embed_chunk.py
+src/sunholo/embedder/embed_metadata.py
 src/sunholo/excel/__init__.py
 src/sunholo/excel/plugin.py
 src/sunholo/gcs/__init__.py