unique_internal_search 1.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,62 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [1.2.2] - 2025-11-10
9
+ - Temporarily reverting the removal of the `get_tool_call_result_for_loop_history` function, as it is still required for the Investment Research Agent.
10
+
11
+ ## [1.2.1] - 2025-11-06
12
+ - Upload and chat system reminder cleanup
13
+
14
+ ## [1.2.0] - 2025-11-04
15
+ - Include system reminder for upload and chat tool about it being a forced tool in UniqueAI
16
+
17
+ ## [1.1.0] - 2025-10-30
18
+ - Add support for multiple search strings in a single tool call
19
+ - Search results from multiple queries are interleaved for better diversity
20
+ - Add automatic deduplication of chunks by `chunk_id` when using multiple search queries
21
+ - Prevents duplicate content from appearing in results when multiple related queries return the same chunks
22
+ - Preserves first occurrence and logs number of duplicates removed
23
+ - Add automatic parsing and cleaning of search query operators
24
+ - Removes QDF (QueryDeservedFreshness) operators: `--QDF=0` to `--QDF=5` (freshness ratings)
25
+ - Removes boost operators: `+(term)` and `+(multi word phrase)` for query term boosting
26
+
27
+ ## [1.0.4] - 2025-10-28
28
+ - Removing unused tool specific `get_tool_call_result_for_loop_history` function
29
+ - Removing unused config `source_format_config`
30
+
31
+ ## [1.0.3] - 2025-10-25
32
+ - Fix appending of metadata to chunks
33
+
34
+ ## [1.0.2] - 2025-10-17
35
+ - Remove print statements originating from tool refactor
36
+
37
+ ## [1.0.1] - 2025-09-30
38
+ - Fix bug in metadata filter in the search method.
39
+
40
+ ## [1.0.0] - 2025-09-18
41
+ - Bump toolkit version to allow for both patch and minor updates
42
+
43
+ ## [0.0.7] - 2025-09-17
44
+ - Updated to latest toolkit
45
+
46
+ ## [0.0.6] - 2025-09-15
47
+ - Fix Minor bug in transforming toolResponse to toolCallResult
48
+
49
+ ## [0.0.5] - 2025-09-05
50
+ - Fixed a bug around metadata-filter assignment
51
+
52
+ ## [0.0.4] - 2025-09-05
53
+ - Fixed a bug around metadata-filter deep-copy
54
+
55
+ ## [0.0.3] - 2025-09-01
56
+ - Migrated the `uploaded_search` into this package.
57
+
58
+ ## [0.0.2] - 2025-09-01
59
+ - Migrated the `internal_search`.
60
+
61
+ ## [0.0.1] - 2025-08-18
62
+ - Initial release of `internal_search`.
@@ -0,0 +1 @@
1
+ `unique_toolkit` is covered by the [`Unique License v1`](https://github.com/Unique-AG/license/releases/tag/unique-license.v1), unless the/a header or a nested LICENSE specifies another license.
@@ -0,0 +1,86 @@
1
+ Metadata-Version: 2.1
2
+ Name: unique_internal_search
3
+ Version: 1.2.2
4
+ Summary:
5
+ License: Proprietary
6
+ Author: Martin Fadler
7
+ Author-email: martin.fadler@unique.ch
8
+ Requires-Python: >=3.12,<4.0
9
+ Classifier: License :: Other/Proprietary License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Requires-Dist: pillow (>=10.4.0,<11.0.0)
13
+ Requires-Dist: pydantic (>=2.8.2,<3.0.0)
14
+ Requires-Dist: pydantic-settings (>=2.10.1,<3.0.0)
15
+ Requires-Dist: pytest (>=8.4.1,<9.0.0)
16
+ Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
17
+ Requires-Dist: typing-extensions (>=4.9.0,<5.0.0)
18
+ Requires-Dist: unique-sdk (>=0.10.0,<0.11.0)
19
+ Requires-Dist: unique-toolkit (>=1.18.1,<2.0.0)
20
+ Description-Content-Type: text/markdown
21
+
22
+ # Internal Search Tool
23
+
24
+ Internal Search Tool to find documents in the Knowledge Base
25
+ # Changelog
26
+
27
+ All notable changes to this project will be documented in this file.
28
+
29
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
30
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
31
+
32
+ ## [1.2.2] - 2025-11-10
33
+ - Temporarily reverting the removal of the `get_tool_call_result_for_loop_history` function, as it is still required for the Investment Research Agent.
34
+
35
+ ## [1.2.1] - 2025-11-06
36
+ - Upload and chat system reminder cleanup
37
+
38
+ ## [1.2.0] - 2025-11-04
39
+ - Include system reminder for upload and chat tool about it being a forced tool in UniqueAI
40
+
41
+ ## [1.1.0] - 2025-10-30
42
+ - Add support for multiple search strings in a single tool call
43
+ - Search results from multiple queries are interleaved for better diversity
44
+ - Add automatic deduplication of chunks by `chunk_id` when using multiple search queries
45
+ - Prevents duplicate content from appearing in results when multiple related queries return the same chunks
46
+ - Preserves first occurrence and logs number of duplicates removed
47
+ - Add automatic parsing and cleaning of search query operators
48
+ - Removes QDF (QueryDeservedFreshness) operators: `--QDF=0` to `--QDF=5` (freshness ratings)
49
+ - Removes boost operators: `+(term)` and `+(multi word phrase)` for query term boosting
50
+
51
+ ## [1.0.4] - 2025-10-28
52
+ - Removing unused tool specific `get_tool_call_result_for_loop_history` function
53
+ - Removing unused config `source_format_config`
54
+
55
+ ## [1.0.3] - 2025-10-25
56
+ - Fix appending of metadata to chunks
57
+
58
+ ## [1.0.2] - 2025-10-17
59
+ - Remove print statements originating from tool refactor
60
+
61
+ ## [1.0.1] - 2025-09-30
62
+ - Fix bug in metadata filter in the search method.
63
+
64
+ ## [1.0.0] - 2025-09-18
65
+ - Bump toolkit version to allow for both patch and minor updates
66
+
67
+ ## [0.0.7] - 2025-09-17
68
+ - Updated to latest toolkit
69
+
70
+ ## [0.0.6] - 2025-09-15
71
+ - Fix Minor bug in transforming toolResponse to toolCallResult
72
+
73
+ ## [0.0.5] - 2025-09-05
74
+ - Fixed a bug around metadata-filter assignment
75
+
76
+ ## [0.0.4] - 2025-09-05
77
+ - Fixed a bug around metadata-filter deep-copy
78
+
79
+ ## [0.0.3] - 2025-09-01
80
+ - Migrated the `uploaded_search` into this package.
81
+
82
+ ## [0.0.2] - 2025-09-01
83
+ - Migrated the `internal_search`.
84
+
85
+ ## [0.0.1] - 2025-08-18
86
+ - Initial release of `internal_search`.
@@ -0,0 +1,3 @@
1
+ # Internal Search Tool
2
+
3
+ Internal Search Tool to find documents in the Knowledge Base
@@ -0,0 +1,46 @@
1
+ [tool.poetry]
2
+ name = "unique_internal_search"
3
+ version = "1.2.2"
4
+ description = ""
5
+ authors = [
6
+ "Martin Fadler <martin.fadler@unique.ch>",
7
+ "Sadique Sheik <sadique@unique.ch>",
8
+ "Fabian Schläpfer <fabian@unique.ch>",
9
+ "Pascal Hauri <pascal@unique.ch>",
10
+ ]
11
+ readme = ["README.md", "CHANGELOG.md"]
12
+ license = "Proprietary"
13
+
14
+ [tool.poetry.dependencies]
15
+ python = "^3.12"
16
+ typing-extensions = "^4.9.0"
17
+ pydantic = "^2.8.2"
18
+ pydantic-settings = "^2.10.1"
19
+ python-dotenv = "^1.0.1"
20
+ pytest = "^8.4.1"
21
+ pillow = "^10.4.0"
22
+ unique-sdk = "^0.10.0"
23
+ unique-toolkit = "^1.18.1"
24
+
25
+
26
+ [tool.poetry.group.dev.dependencies]
27
+ python = "^3.12"
28
+ typing-extensions = "^4.9.0"
29
+ pydantic = "^2.8.2"
30
+ pydantic-settings = "^2.10.1"
31
+ python-dotenv = "^1.0.1"
32
+ pytest = "^8.4.1"
33
+ unique-sdk = { path = "../../unique_sdk" }
34
+ unique-toolkit = { path = "../../unique_toolkit" }
35
+ ruff = "^0.12.10"
36
+
37
+
38
+ [build-system]
39
+ requires = ["poetry-core"]
40
+ build-backend = "poetry.core.masonry.api"
41
+
42
+ [tool.ruff]
43
+ target-version = "py311"
44
+
45
+ [tool.ruff.lint]
46
+ extend-select = ["I"]
@@ -0,0 +1,153 @@
1
+ from typing import Annotated, Any
2
+
3
+ from pydantic import (
4
+ AliasChoices,
5
+ Field,
6
+ )
7
+ from pydantic.json_schema import SkipJsonSchema
8
+ from unique_toolkit._common.chunk_relevancy_sorter.config import (
9
+ ChunkRelevancySortConfig,
10
+ )
11
+ from unique_toolkit._common.feature_flags.schema import (
12
+ FeatureExtendedSourceSerialization,
13
+ )
14
+ from unique_toolkit.agentic.evaluation.schemas import EvaluationMetricName
15
+ from unique_toolkit.agentic.history_manager.history_manager import DeactivatedNone
16
+ from unique_toolkit.agentic.tools.schemas import BaseToolConfig
17
+ from unique_toolkit.content.schemas import (
18
+ ContentRerankerConfig,
19
+ ContentSearchType,
20
+ )
21
+
22
+ from unique_internal_search.prompts import (
23
+ DEFAULT_LANGUAGE_PARAM_DESCRIPTION,
24
+ DEFAULT_SEARCH_STRING_PARAM_DESCRIPTION,
25
+ DEFAULT_TOOL_DESCRIPTION,
26
+ DEFAULT_TOOL_DESCRIPTION_FOR_SYSTEM_PROMPT,
27
+ DEFAULT_TOOL_FORMAT_INFORMATION_FOR_SYSTEM_PROMPT,
28
+ )
29
+ from unique_internal_search.validators import get_string_field_with_pattern_validation
30
+
31
+
32
+ class ExperimentalFeatures(FeatureExtendedSourceSerialization):
33
+ enable_multiple_search_strings_execution: bool = Field(
34
+ default=False,
35
+ description="Allow execution of multiple search strings in one call. When set to True, each string is searched individually and results are merged into a single response.",
36
+ )
37
+
38
+
39
+ DEFAULT_LIMIT_CHUNK_RELEVANCY_SORT_ENABLED = 200
40
+ DEFAULT_LIMIT_CHUNK_RELEVANCY_SORT_DISABLED = 1000
41
+
42
+
43
+ def _search_limit_factory(data: dict[str, Any]) -> int:
44
+ return (
45
+ DEFAULT_LIMIT_CHUNK_RELEVANCY_SORT_ENABLED
46
+ if data["chunk_relevancy_sort_config"].enabled
47
+ else DEFAULT_LIMIT_CHUNK_RELEVANCY_SORT_DISABLED
48
+ )
49
+
50
+
51
+ class InternalSearchConfig(BaseToolConfig):
52
+ search_type: ContentSearchType = Field(
53
+ default=ContentSearchType.COMBINED,
54
+ description="The type of search to perform. Two possible values: `COMBINED` or `VECTOR`.",
55
+ )
56
+ max_tokens_for_sources: SkipJsonSchema[int] = (
57
+ Field( # TODO: Remove SkipJsonSchema once UI (Spaces 2.0) can be configured to not include certain fields
58
+ default=30_000,
59
+ description="The maximum number of tokens to use for the sources.",
60
+ )
61
+ )
62
+ percentage_of_input_tokens_for_sources: float = Field(
63
+ default=0.4,
64
+ description="The percentage of the maximum input tokens of the language model to use for the tool response.",
65
+ ge=0.0,
66
+ le=1.0,
67
+ )
68
+ language_model_max_input_tokens: SkipJsonSchema[int | None] = Field(
69
+ default=None,
70
+ description="Language model maximum input tokens",
71
+ )
72
+ scope_ids: Annotated[list[str], Field(title="Active")] | DeactivatedNone = Field(
73
+ default=None,
74
+ description="The scope ids to use for the search.",
75
+ )
76
+ scope_to_chat_on_upload: bool = Field(
77
+ default=False,
78
+ description="Whether to scope the search should be limited to files uploaded within the chat session when uploaded files are present.",
79
+ )
80
+ chunked_sources: bool = Field(
81
+ default=True,
82
+ description="Whether each chunk is added as an individual source in the final LLM prompt. If set to False, all chunks from the same document are combined into a single source.",
83
+ )
84
+ reranker_config: (
85
+ Annotated[ContentRerankerConfig, Field(title="Active")] | DeactivatedNone
86
+ ) = Field(
87
+ default=None,
88
+ description="The reranker config to use for the search.",
89
+ )
90
+ search_language: str = Field(
91
+ default="english",
92
+ validation_alias=AliasChoices("ftsSearchLanguage", "searchLanguage"),
93
+ description="The language to use for the search.",
94
+ )
95
+ # evaluation_config: EvaluationMetricConfig = EvaluationMetricConfig()
96
+ chunk_relevancy_sort_config: ChunkRelevancySortConfig = Field(
97
+ default_factory=ChunkRelevancySortConfig,
98
+ description="The chunk relevancy sort config to use for the search.",
99
+ )
100
+ limit: int = Field(
101
+ default_factory=_search_limit_factory,
102
+ description="The limit of chunks to return.",
103
+ )
104
+ chat_only: bool = Field(
105
+ default=False,
106
+ description="Whether to only chat on the upload.",
107
+ )
108
+
109
+ tool_description: str = get_string_field_with_pattern_validation(
110
+ DEFAULT_TOOL_DESCRIPTION,
111
+ description="Tool description.",
112
+ )
113
+ param_description_search_string: str = Field(
114
+ default=DEFAULT_SEARCH_STRING_PARAM_DESCRIPTION,
115
+ description="`search_string` parameter description.",
116
+ )
117
+ param_description_language: str = get_string_field_with_pattern_validation(
118
+ DEFAULT_LANGUAGE_PARAM_DESCRIPTION,
119
+ description="`language` parameter description.",
120
+ )
121
+ tool_description_for_system_prompt: str = get_string_field_with_pattern_validation(
122
+ DEFAULT_TOOL_DESCRIPTION_FOR_SYSTEM_PROMPT,
123
+ description="Tool description for the system prompt.",
124
+ )
125
+ tool_format_information_for_system_prompt: str = (
126
+ get_string_field_with_pattern_validation(
127
+ DEFAULT_TOOL_FORMAT_INFORMATION_FOR_SYSTEM_PROMPT,
128
+ description="Tool format information for the system prompt.",
129
+ )
130
+ )
131
+ evaluation_check_list: list[EvaluationMetricName] = Field(
132
+ default=[EvaluationMetricName.HALLUCINATION],
133
+ description="The list of evaluation metrics to check.",
134
+ )
135
+ experimental_features: ExperimentalFeatures = ExperimentalFeatures()
136
+
137
+ metadata_chunk_sections: dict[str, str] = Field(
138
+ default={},
139
+ description=(
140
+ "Metadata sections to be appended to each search result chunk’s text. The keys represent metadata field names (e.g., 'metadata_key'), and the values are template strings that define how the metadata should be embedded, using {} as a placeholder for the actual value (e.g., '<|metadata_key|>{}<|/metadata_key|>')."
141
+ ),
142
+ )
143
+
144
+ score_threshold: float = Field(
145
+ default=0.0,
146
+ ge=0.0,
147
+ le=1.0,
148
+ description="The score threshold to use for the search to filter chunks on relevancy.",
149
+ )
150
+ exclude_uploaded_files: bool = Field(
151
+ default=False,
152
+ description="Whether to exclude uploaded files from the search. Overrides the `chat_only` parameter as it removes the `chat_id` from the search.",
153
+ )
@@ -0,0 +1,64 @@
1
+ DEFAULT_TOOL_DESCRIPTION_FOR_SYSTEM_PROMPT = (
2
+ "You can use the InternalSearch tool to access internal company documentations, including information on policies, procedures, benefits, groups, financial details, and specific individuals. "
3
+ "If this tool can help answer your question, feel free to use it to search the internal knowledge base for more information. "
4
+ "If possible always try to get information from the internal knowledge base with the InternalSearch tool before using other tools.\n"
5
+ "Use cases for the Internal Knowledge Search are:\n"
6
+ "- User asks to work with a document: Most likely the document is uploaded to the chat and mentioned in a message and can be loaded with this tool\n"
7
+ "- Policy and Procedure Verification: Use the internal search tool to find the most current company policies, procedures, or guidelines to ensure compliance and accuracy in responses.\n"
8
+ "- Project-Specific Information: When answering questions related to ongoing projects or initiatives, use the internal search to access project documents, reports, or meeting notes for precise details.\n"
9
+ "- Employee Directory and Contact Information: Utilize the internal search to locate contact details or organizational charts to facilitate communication and collaboration within the company.\n"
10
+ "- Confidential and Proprietary Information: When dealing with sensitive topics that require proprietary knowledge or confidential data, use the internal search to ensure the information is sourced from secure and authorized company documents.\n\n"
11
+ "**Instruction Query Splitting**\n"
12
+ 'You should split the user question into multiple search strings when the user\'s question needs to be decomposed / rewritten to find different facts. Perform for each search string an individual tool call. Avoid short queries that are extremely broad and will return unrelated results. Strip the search string of any extraneous details, e.g. instructions or unnecessary context. However, you must fill in relevant context from the rest of the conversation to make the question complete. E.g. "What was their age?" => "What was Kevin\'s age?" because the preceding conversation makes it clear that the user is talking about Kevin.\n\n'
13
+ "Here are some examples of how to use the InternalSearch tool:\n"
14
+ 'User: What was the GDP of France and Italy in the 1970s? => search strings: ["france gdp 1970", "italy gdp 1970"] # Splitting of the query into 2 queries and perform 2 tool calls\n'
15
+ 'User: What does the report say about the GPT4 performance on MMLU? => search strings: ["GPT4 performance on MMLU?"] # Simplify the query'
16
+ )
17
+
18
+ DEFAULT_TOOL_FORMAT_INFORMATION_FOR_SYSTEM_PROMPT = (
19
+ "Whenever you use information retrieved with the InternalSearch, you must adhere to strict reference guidelines. "
20
+ "You must strictly reference each fact used with the `source_number` of the corresponding passage, in the following format: '[source<source_number>]'.\n\n"
21
+ "Example:\n"
22
+ "- The stock price of Apple Inc. is $150 [source0] and the company's revenue increased by 10% [source1].\n"
23
+ "- Moreover, the company's market capitalization is $2 trillion [source2][source3].\n"
24
+ "- Our internal documents tell us to invest[source4] (Internal)\n\n"
25
+ "A fact is preferably referenced by ONLY ONE source, e.g [sourceX], which should be the most relevant source for the fact.\n"
26
+ "Follow these guidelines closely and be sure to use the proper `source_number` when referencing facts.\n"
27
+ "Make sure that your reference follow the format [sourceX] and that the source number is correct.\n"
28
+ "Source is written in singular form and the number is written in digits.\n\n"
29
+ "IT IS VERY IMPORTANT TO FOLLOW THESE GUIDELINES!!\n"
30
+ "NEVER CITE A source_number THAT YOU DON'T SEE IN THE TOOL CALL RESPONSE!!!\n"
31
+ "The source_number in old assistant messages are no longer valid.\n"
32
+ "EXAMPLE: If you see [source34] and [source35] in the assistant message, you can't use [source34] again in the next assistant message, this has to be the number you find in the message with role 'tool'.\n"
33
+ "BE AWARE:All tool calls have been filtered to remove uncited sources. Tool calls return much more data than you see\n\n"
34
+ "### Internal Document Answering Protocol for Employee Questions\n"
35
+ "When assisting employees using internal documents, follow\n"
36
+ "this structured approach to ensure precise, well-grounded,\n"
37
+ "and context-aware responses:\n\n"
38
+ "#### 1. Locate and Prioritize Relevant Internal Sources\n"
39
+ "Give strong preference to:\n"
40
+ "- **Most relevant documents**, such as:\n"
41
+ "- **Documents authored by or involving** the employee or team in question\n"
42
+ "- **Cross-validated sources**, especially when multiple documents agree\n"
43
+ " - Project trackers, design docs, decision logs, and OKRs\n"
44
+ " - Recently updated or active files\n\n"
45
+ "#### 2. Source Reliability Guidelines\n"
46
+ "- Prioritize information that is:\n"
47
+ " - **Directly written by domain experts or stakeholders**\n"
48
+ " - **Part of approved or finalized documentation**\n"
49
+ " - **Recently modified or reviewed**, if recency matters\n"
50
+ "- Be cautious with:\n"
51
+ " - Outdated drafts\n"
52
+ " - Undocumented opinions or partial records\n\n"
53
+ "#### 3. Acknowledge Limitations\n"
54
+ "- If no relevant information is found, or documents conflict, clearly state this\n"
55
+ "- Indicate where further clarification or investigation may be required"
56
+ )
57
+
58
+ DEFAULT_TOOL_DESCRIPTION = (
59
+ "Search in the company knowledge base for information on policies, procedures, benefits, groups, financial information or specific people. "
60
+ "This should be your go-to tool if no other tools are applicable."
61
+ )
62
+ DEFAULT_SEARCH_STRING_PARAM_DESCRIPTION = "An expanded term that is optimized for vector and full text search based on the users query it must be in english."
63
+
64
+ DEFAULT_LANGUAGE_PARAM_DESCRIPTION = "The language that the user wrote the query in"
@@ -0,0 +1,458 @@
1
+ from logging import Logger
2
+
3
+ from pydantic import Field, create_model
4
+ from typing_extensions import override
5
+ from unique_toolkit._common.chunk_relevancy_sorter.exception import (
6
+ ChunkRelevancySorterException,
7
+ )
8
+ from unique_toolkit._common.chunk_relevancy_sorter.service import ChunkRelevancySorter
9
+ from unique_toolkit.agentic.evaluation.schemas import EvaluationMetricName
10
+ from unique_toolkit.agentic.history_manager.utils import transform_chunks_to_string
11
+ from unique_toolkit.agentic.tools.agent_chunks_hanlder import AgentChunksHandler
12
+ from unique_toolkit.agentic.tools.factory import ToolFactory
13
+ from unique_toolkit.agentic.tools.schemas import ToolCallResponse
14
+ from unique_toolkit.agentic.tools.tool import Tool
15
+ from unique_toolkit.agentic.tools.tool_progress_reporter import ProgressState
16
+ from unique_toolkit.app.schemas import BaseEvent, ChatEvent, Event
17
+ from unique_toolkit.chat.service import LanguageModelToolDescription
18
+ from unique_toolkit.content.schemas import Content, ContentChunk
19
+ from unique_toolkit.content.service import ContentService
20
+ from unique_toolkit.content.utils import (
21
+ merge_content_chunks,
22
+ pick_content_chunks_for_token_window,
23
+ sort_content_chunks,
24
+ )
25
+ from unique_toolkit.language_model.schemas import (
26
+ LanguageModelFunction,
27
+ LanguageModelMessage,
28
+ LanguageModelToolMessage,
29
+ )
30
+
31
+ from unique_internal_search.config import InternalSearchConfig
32
+ from unique_internal_search.utils import (
33
+ SearchStringResult,
34
+ append_metadata_in_chunks,
35
+ clean_search_string,
36
+ interleave_search_results_round_robin,
37
+ )
38
+
39
+
40
+ class InternalSearchService:
41
+ def __init__(
42
+ self,
43
+ config: InternalSearchConfig,
44
+ content_service: ContentService,
45
+ chunk_relevancy_sorter: ChunkRelevancySorter,
46
+ chat_id: str | None,
47
+ logger: Logger,
48
+ ):
49
+ self.config = config
50
+ self.content_service = content_service
51
+ self.chunk_relevancy_sorter = chunk_relevancy_sorter
52
+ self.chat_id = chat_id
53
+ self.logger = logger
54
+ self.tool_execution_message_name = "Internal search"
55
+
56
+ async def post_progress_message(self, message: str, *args, **kwargs):
57
+ pass
58
+
59
+ async def get_uploaded_files(self) -> list[Content]:
60
+ chat_results = await self.content_service.search_contents_async(
61
+ where={
62
+ "ownerId": {
63
+ "equals": self.chat_id,
64
+ }
65
+ },
66
+ )
67
+ sorted_chat_results: list[Content] = sorted(
68
+ chat_results,
69
+ key=lambda x: x.created_at, # type: ignore
70
+ reverse=True,
71
+ )
72
+ return sorted_chat_results
73
+
74
+ async def is_chat_only(self, **kwargs) -> bool:
75
+ """Check whether the assistant should limit itself to files in chat"""
76
+ if self.config.chat_only:
77
+ return True
78
+ if self.config.scope_to_chat_on_upload:
79
+ chat_files = await self.get_uploaded_files()
80
+ if len(chat_files) > 0:
81
+ return True
82
+ return False
83
+
84
+ async def search(
85
+ self,
86
+ search_string: str | list[str],
87
+ content_ids: list[str] | None = None,
88
+ metadata_filter: dict | None = None,
89
+ **kwargs,
90
+ ) -> list[ContentChunk]:
91
+ """
92
+ Perform a search with one or more search strings.
93
+
94
+ Args:
95
+ search_string: List of search strings or single search string
96
+ content_ids: List of content IDs
97
+ metadata_filter: Metadata filter
98
+ """
99
+
100
+ # Convert single string to list
101
+ if isinstance(search_string, str):
102
+ search_strings = [search_string]
103
+ else:
104
+ search_strings = search_string
105
+
106
+ """
107
+ Perform a search in the Vector DB based on the user's message and generate a response.
108
+ """
109
+
110
+ # Clean search strings by removing QDF and boost operators
111
+ search_strings = [clean_search_string(s) for s in search_strings]
112
+
113
+ ###
114
+ # 2. Search for context in the Vector DB
115
+ ###
116
+ chat_only = await self.is_chat_only(**kwargs)
117
+
118
+ """
119
+ Handle the fact that metadata can exclude uploaded content
120
+ and that the search service is hardcoded to use the metadata_filter
121
+ from the event if set to None
122
+ """
123
+ # Take a backup of the metadata filter
124
+ metadata_filter_copy = self.content_service._metadata_filter
125
+
126
+ if metadata_filter is None:
127
+ metadata_filter = self.content_service._metadata_filter
128
+ if chat_only and metadata_filter:
129
+ # if this is not set to none search_content_chunks_async will overwrite it inside its call thats why it needs to stay.
130
+ self.content_service._metadata_filter = None
131
+ metadata_filter = None
132
+
133
+ found_chunks_per_search_string: list[SearchStringResult] = []
134
+ for i, search_string in enumerate(search_strings):
135
+ try:
136
+ found_chunks: list[
137
+ ContentChunk
138
+ ] = await self.content_service.search_content_chunks_async(
139
+ search_string=search_string, # type: ignore
140
+ search_type=self.config.search_type,
141
+ limit=self.config.limit,
142
+ reranker_config=self.config.reranker_config,
143
+ search_language=self.config.search_language,
144
+ scope_ids=self.config.scope_ids,
145
+ metadata_filter=metadata_filter,
146
+ chat_id=self.chat_id
147
+ if self.config.exclude_uploaded_files and self.chat_id
148
+ else "",
149
+ chat_only=chat_only,
150
+ content_ids=content_ids,
151
+ score_threshold=self.config.score_threshold,
152
+ )
153
+ self.logger.info(
154
+ f"Found {len(found_chunks)} chunks (Query {i + 1}/{len(search_strings)})"
155
+ )
156
+ except Exception as e:
157
+ self.logger.error(f"Error in search_document_chunks call: {e}")
158
+ raise e
159
+
160
+ found_chunks_per_search_string.append(
161
+ SearchStringResult(
162
+ query=search_string,
163
+ chunks=found_chunks,
164
+ )
165
+ )
166
+
167
+ # Reset the metadata filter in case it was disabled
168
+ self.content_service._metadata_filter = metadata_filter_copy
169
+
170
+ # Apply chunk relevancy sorter if enabled
171
+ if self.config.chunk_relevancy_sort_config.enabled:
172
+ for i, result in enumerate(found_chunks_per_search_string):
173
+ await self.post_progress_message(
174
+ f"{result.query} (_Resorting {len(result.chunks)} search results_ 🔄 in query {i + 1}/{len(search_strings)})",
175
+ **kwargs,
176
+ )
177
+ result.chunks = await self._resort_found_chunks_if_enabled(
178
+ found_chunks=result.chunks,
179
+ search_string=result.query,
180
+ )
181
+
182
+ ###
183
+ # 3. Pick a subset of the search results
184
+ ###
185
+ if (
186
+ self.config.experimental_features.enable_multiple_search_strings_execution
187
+ and len(found_chunks_per_search_string) > 1
188
+ ):
189
+ found_chunks_per_search_string = interleave_search_results_round_robin(
190
+ found_chunks_per_search_string
191
+ )
192
+
193
+ await self.post_progress_message(
194
+ f"{', '.join(search_strings)} (_Postprocessing search results_)",
195
+ **kwargs,
196
+ )
197
+ found_chunks = [
198
+ chunk
199
+ for result in found_chunks_per_search_string
200
+ for chunk in result.chunks
201
+ ]
202
+ selected_chunks = pick_content_chunks_for_token_window(
203
+ found_chunks, self._get_max_tokens()
204
+ )
205
+
206
+ ###
207
+ # 4. cache them add index to search results & join them together
208
+ ###
209
+ if not self.config.chunked_sources:
210
+ selected_chunks = merge_content_chunks(selected_chunks)
211
+ else:
212
+ selected_chunks = sort_content_chunks(selected_chunks)
213
+
214
+ self.debug_info = {
215
+ "searchStrings": search_strings,
216
+ "metadataFilter": metadata_filter,
217
+ "chatOnly": chat_only,
218
+ }
219
+ return selected_chunks
220
+
221
+ async def _resort_found_chunks_if_enabled(
222
+ self, found_chunks: list[ContentChunk], search_string: str
223
+ ) -> list[ContentChunk]:
224
+ try:
225
+ total_chunks = len(found_chunks)
226
+ self.logger.info(f"Resorting {total_chunks} search result...")
227
+ chunk_relevancy_sorter_result = await self.chunk_relevancy_sorter.run(
228
+ input_text=search_string,
229
+ chunks=found_chunks,
230
+ config=self.config.chunk_relevancy_sort_config,
231
+ )
232
+ found_chunks = chunk_relevancy_sorter_result.content_chunks
233
+ except ChunkRelevancySorterException as e:
234
+ self.logger.warning(f"Error while sorting chunks: {e.error_message}")
235
+ finally:
236
+ return found_chunks
237
+
238
+ def _get_max_tokens(self) -> int:
239
+ if self.config.language_model_max_input_tokens is not None:
240
+ max_tokens = int(
241
+ self.config.language_model_max_input_tokens
242
+ * self.config.percentage_of_input_tokens_for_sources
243
+ )
244
+ self.logger.debug(
245
+ "Using %s of max tokens %s as token limit: %s",
246
+ self.config.percentage_of_input_tokens_for_sources,
247
+ self.config.language_model_max_input_tokens,
248
+ max_tokens,
249
+ )
250
+ return max_tokens
251
+ else:
252
+ self.logger.debug(
253
+ "language model input context size is not set, using default max tokens"
254
+ )
255
+ return self.config.max_tokens_for_sources
256
+
257
+
258
+ class InternalSearchTool(Tool[InternalSearchConfig], InternalSearchService):
259
+ name = "InternalSearch"
260
+
261
+ def __init__(
262
+ self,
263
+ configuration: InternalSearchConfig,
264
+ event: BaseEvent,
265
+ *args,
266
+ **kwargs,
267
+ ):
268
+ Tool.__init__(self, configuration, event, *args, **kwargs)
269
+
270
+ content_service = ContentService.from_event(self.event)
271
+ chunk_relevancy_sorter = ChunkRelevancySorter.from_event(self.event)
272
+ # Determing chat_id if possible
273
+ if isinstance(self.event, (ChatEvent, Event)):
274
+ chat_id = self.event.payload.chat_id
275
+ else:
276
+ chat_id = None
277
+ InternalSearchService.__init__(
278
+ self,
279
+ config=configuration,
280
+ content_service=content_service,
281
+ chunk_relevancy_sorter=chunk_relevancy_sorter,
282
+ chat_id=chat_id,
283
+ logger=self.logger,
284
+ )
285
+
286
+ async def post_progress_message(
287
+ self, message: str, tool_call: LanguageModelFunction, **kwargs
288
+ ):
289
+ if self.tool_progress_reporter:
290
+ await self.tool_progress_reporter.notify_from_tool_call(
291
+ tool_call=tool_call,
292
+ name=f"**{self.tool_execution_message_name}**",
293
+ message=message,
294
+ state=ProgressState.RUNNING,
295
+ )
296
+
297
+ async def is_chat_only(
298
+ self, tool_call: LanguageModelFunction | None = None, **kwargs
299
+ ) -> bool:
300
+ if await super().is_chat_only(**kwargs):
301
+ return True
302
+ if (
303
+ tool_call
304
+ and isinstance(tool_call.arguments, dict)
305
+ and tool_call.arguments.get("chat_only") is True
306
+ ):
307
+ return True
308
+ return False
309
+
310
+ @override
311
+ def tool_description(self) -> LanguageModelToolDescription:
312
+ # Conditionally set the type based on config
313
+ search_string_type = (
314
+ list[str]
315
+ if self.config.experimental_features.enable_multiple_search_strings_execution
316
+ else str
317
+ )
318
+
319
+ internal_search_tool_input = create_model(
320
+ "InternalSearchToolInput",
321
+ search_string=(
322
+ search_string_type,
323
+ Field(description=self.config.param_description_search_string),
324
+ ),
325
+ language=(
326
+ str,
327
+ Field(description=self.config.param_description_language),
328
+ ),
329
+ )
330
+ return LanguageModelToolDescription(
331
+ name=self.name,
332
+ description=self.config.tool_description,
333
+ parameters=internal_search_tool_input,
334
+ )
335
+
336
+ def tool_description_for_system_prompt(self) -> str:
337
+ return self.config.tool_description_for_system_prompt
338
+
339
    def tool_format_information_for_system_prompt(self) -> str:
        """Return the configured formatting/citation instructions for the system prompt."""
        return self.config.tool_format_information_for_system_prompt
341
+
342
    def evaluation_check_list(self) -> list[EvaluationMetricName]:
        """Return the evaluation metrics configured for this tool."""
        return self.config.evaluation_check_list
344
+
345
+ def get_evaluation_checks_based_on_tool_response(
346
+ self, tool_response: ToolCallResponse
347
+ ) -> list[EvaluationMetricName]:
348
+ evaluation_check_list = self.evaluation_check_list()
349
+
350
+ # Check if the tool response is empty
351
+ if not tool_response.content_chunks:
352
+ return []
353
+ return evaluation_check_list
354
+
355
+ # TODO: find a solution for tracking
356
+ # @track(name="internal_search_tool_run")
357
+ async def run(self, tool_call: LanguageModelFunction) -> ToolCallResponse:
358
+ """
359
+ Perform a search in the Vector DB based on the user's message and generate a response.
360
+ """
361
+ if (
362
+ tool_call.arguments is None
363
+ or not isinstance(tool_call.arguments, dict)
364
+ or (
365
+ "search_strings" not in tool_call.arguments
366
+ and "search_string"
367
+ not in tool_call.arguments # Backwards compatibility
368
+ )
369
+ ):
370
+ self.logger.error("Tool call arguments are missing or invalid")
371
+ return ToolCallResponse(
372
+ id=tool_call.id, # type: ignore
373
+ name=self.name,
374
+ content_chunks=[],
375
+ debug_info={},
376
+ )
377
+
378
+ # Extract the search strings (handle both new and old parameter names)
379
+ search_strings_data = tool_call.arguments.get(
380
+ "search_strings", tool_call.arguments.get("search_string")
381
+ )
382
+ # Ensure it's always a list for the progress message
383
+ search_strings_list: list[str] = []
384
+ if isinstance(search_strings_data, str):
385
+ search_strings_list = [search_strings_data]
386
+ elif isinstance(search_strings_data, list):
387
+ search_strings_list = search_strings_data
388
+ else:
389
+ raise ValueError("Invalid search strings data")
390
+
391
+ await self.post_progress_message(f"{'; '.join(search_strings_list)}", tool_call)
392
+
393
+ selected_chunks = await self.search(
394
+ **tool_call.arguments,
395
+ tool_call=tool_call, # Need to pass tool_call to post_progress_message
396
+ )
397
+
398
+ ## Modify metadata in chunks
399
+ selected_chunks = append_metadata_in_chunks(
400
+ chunks=selected_chunks,
401
+ metadata_chunk_sections=self.config.metadata_chunk_sections,
402
+ )
403
+
404
+ tool_response = ToolCallResponse(
405
+ id=tool_call.id, # type: ignore
406
+ name=self.name,
407
+ content_chunks=selected_chunks,
408
+ debug_info=self.debug_info,
409
+ )
410
+
411
+ if self.tool_progress_reporter:
412
+ await self.tool_progress_reporter.notify_from_tool_call(
413
+ tool_call=tool_call,
414
+ name=f"**{self.tool_execution_message_name}**",
415
+ message=f"{'; '.join(search_strings_list)}",
416
+ state=ProgressState.FINISHED,
417
+ )
418
+
419
+ return tool_response
420
+
421
+ ## Note: This function is only used by the Investment Research Agent and Agentic Search. Once these agents are moved out of the monorepo, this function should be removed.
422
+ def get_tool_call_result_for_loop_history(
423
+ self,
424
+ tool_response: ToolCallResponse,
425
+ agent_chunks_handler: AgentChunksHandler,
426
+ ) -> LanguageModelMessage:
427
+ """
428
+ Process the results of the tool.
429
+ Args:
430
+ tool_response: The tool response.
431
+ loop_history: The loop history.
432
+ Returns:
433
+ The tool result to append to the loop history.
434
+ """
435
+ self.logger.debug(
436
+ f"Appending tool call result to history: {tool_response.name}"
437
+ )
438
+ # Initialize content_chunks if None
439
+ content_chunks = tool_response.content_chunks or []
440
+
441
+ # Get the maximum source number in the loop history
442
+ max_source_number = len(agent_chunks_handler.chunks)
443
+
444
+ # Transform content chunks into sources to be appended to tool result
445
+ sources, _ = transform_chunks_to_string(
446
+ content_chunks,
447
+ max_source_number,
448
+ )
449
+
450
+ # Append the result to the history
451
+ return LanguageModelToolMessage(
452
+ content=sources,
453
+ tool_call_id=tool_response.id, # type: ignore
454
+ name=tool_response.name,
455
+ )
456
+
457
+
458
+ ToolFactory.register_tool(InternalSearchTool, InternalSearchConfig)
@@ -0,0 +1,29 @@
1
+ from unique_internal_search.config import InternalSearchConfig
2
+ from unique_internal_search.uploaded_search.prompts import (
3
+ DEFAULT_LANGUAGE_PARAM_DESCRIPTION,
4
+ DEFAULT_SEARCH_STRING_PARAM_DESCRIPTION,
5
+ DEFAULT_TOOL_DESCRIPTION_FOR_SYSTEM_PROMPT,
6
+ DEFAULT_TOOL_FORMAT_INFORMATION_FOR_SYSTEM_PROMPT,
7
+ )
8
+ from unique_internal_search.validators import get_string_field_with_pattern_validation
9
+
10
+
11
class UploadedSearchConfig(InternalSearchConfig):
    """Configuration for the UploadedSearch tool.

    Overrides the internal-search prompt defaults with texts tailored to
    documents uploaded directly to the chat. Each field uses
    ``get_string_field_with_pattern_validation`` so overridden values must
    keep the placeholders present in the default template.
    """

    # Description of the `search_string` tool parameter shown to the model.
    param_description_search_string: str = get_string_field_with_pattern_validation(
        DEFAULT_SEARCH_STRING_PARAM_DESCRIPTION,
        description="`search_string` parameter description.",
    )
    # Description of the `language` tool parameter shown to the model.
    param_description_language: str = get_string_field_with_pattern_validation(
        DEFAULT_LANGUAGE_PARAM_DESCRIPTION,
        description="`language` parameter description.",
    )
    # Tool description embedded in the system prompt (the tool appends the
    # list of uploaded documents after it).
    tool_description_for_system_prompt: str = get_string_field_with_pattern_validation(
        DEFAULT_TOOL_DESCRIPTION_FOR_SYSTEM_PROMPT,
        description="Tool description for the system prompt.",
    )
    # Citation/formatting instructions embedded in the system prompt.
    tool_format_information_for_system_prompt: str = (
        get_string_field_with_pattern_validation(
            DEFAULT_TOOL_FORMAT_INFORMATION_FOR_SYSTEM_PROMPT,
            description="Tool format information for the system prompt.",
        )
    )
@@ -0,0 +1,57 @@
1
# System-prompt description of the UploadedSearch tool. The tool appends the
# list of currently uploaded documents after this text (see the trailing
# "**The currently uploaded documents are the following**" header).
DEFAULT_TOOL_DESCRIPTION_FOR_SYSTEM_PROMPT = (
    "You can use the UploadedSearch tool to access and analyze documents uploaded by users during a chat. This tool is designed to handle a variety of document-related tasks, including summarization, explanation, and detailed information retrieval. "
    "Use cases for the UploadedSearch tool include:\n"
    "- Document Analysis: When a user uploads a document and asks for a summary, explanation, or specific details, this tool can extract and provide the requested information.\n"
    "- Named Document Queries: If a user refers to a previously uploaded document by name (e.g., 'What does the Q2_Report.pdf say about revenue?'), this tool can locate and analyze the document to answer the query.\n"
    "- Policy and Procedure Verification: Use the tool to find the most current company policies, procedures, or guidelines within uploaded documents.\n"
    "- Project-Specific Information: Access project documents, reports, or meeting notes uploaded by users to provide precise details.\n"
    "- Confidential and Proprietary Information: Ensure that sensitive topics requiring proprietary knowledge or confidential data are sourced securely from uploaded documents.\n\n"
    "**Instruction Query Splitting**\n"
    "You should split the user question into multiple search strings when the user's question needs to be decomposed / rewritten to find different facts. Perform an individual tool call for each search string. Avoid overly broad queries that may return unrelated results. Ensure the search string is specific and relevant to the uploaded document(s).\n\n"
    "Examples:\n"
    'User: "What does the Q2_Report.pdf say about revenue and expenses?" => search strings: ["Q2_Report.pdf revenue", "Q2_Report.pdf expenses"]\n'
    'User: "Summarize the uploaded document." => search string: ["Summarize the uploaded document"]\n'
    "**The currently uploaded documents are the following**\n\n"
)

# Citation/formatting instructions injected into the system prompt.
# NOTE(review): several example lines below look like their "[sourceX]"
# citation markers were lost (e.g. "$5M , while expenses" and "If you see
# and in the assistant message") — verify against the original prompt text.
DEFAULT_TOOL_FORMAT_INFORMATION_FOR_SYSTEM_PROMPT = (
    "Whenever you use information retrieved with the UploadedSearch, you must adhere to strict reference guidelines. "
    "You must strictly reference each fact used with the `source_number` of the corresponding passage, in the following format: '[source<source_number>]'.\n\n"
    "Example:\n"
    "- The revenue for Q2 was $5M , while expenses were $3M .\n"
    "- The uploaded document highlights a 20% increase in productivity .\n\n"
    "A fact is preferably referenced by ONLY ONE source, e.g [sourceX], which should be the most relevant source for the fact.\n"
    "Follow these guidelines closely and be sure to use the proper `source_number` when referencing facts.\n"
    "Make sure that your reference follow the format [sourceX] and that the source number is correct.\n"
    "Source is written in singular form and the number is written in digits.\n\n"
    "IT IS VERY IMPORTANT TO FOLLOW THESE GUIDELINES!!\n"
    "NEVER CITE A source_number THAT YOU DON'T SEE IN THE TOOL CALL RESPONSE!!!\n"
    "The source_number in old assistant messages are no longer valid.\n"
    "EXAMPLE: If you see and in the assistant message, you can't use again in the next assistant message, this has to be the number you find in the message with role 'tool'.\n"
    "BE AWARE: All tool calls have been filtered to remove uncited sources. Tool calls return much more data than you see.\n\n"
    "### Internal Document Answering Protocol for Uploaded Documents\n"
    "When assisting users with uploaded documents, follow\n"
    "this structured approach to ensure precise, well-grounded,\n"
    "and context-aware responses:\n\n"
    "#### 1. Locate and Prioritize Relevant Information\n"
    "Focus on the **most relevant sections** of the uploaded document.\n"
    "Prioritize documents that are:\n"
    "- **Directly referenced by the user** (e.g., by name or context).\n"
    "- **Recently uploaded** or actively discussed.\n\n"
    "#### 2. Source Reliability Guidelines\n"
    "- Prioritize information that is:\n"
    "  - **Clearly stated in the document**.\n"
    "  - **Part of finalized or approved sections**.\n"
    "- Be cautious with:\n"
    "  - Drafts or incomplete sections.\n"
    "  - Ambiguous or conflicting information.\n\n"
    "#### 3. Acknowledge Limitations\n"
    "- If no relevant information is found, or the document is unclear, state this explicitly.\n"
    "- Indicate where further clarification or investigation may be required."
)

# Short description shown in the tool's schema (model-facing).
DEFAULT_TOOL_DESCRIPTION = "Search within uploaded documents for information on policies, procedures, benefits, projects, or specific details. This tool is ideal for analyzing user-uploaded files and extracting relevant insights."
# Default description of the `search_string` parameter.
DEFAULT_SEARCH_STRING_PARAM_DESCRIPTION = "An expanded term optimized for vector and full-text search based on the user’s query. It must be in English."
# Default description of the `language` parameter.
DEFAULT_LANGUAGE_PARAM_DESCRIPTION = (
    "The language in which the user’s query is written."
)
@@ -0,0 +1,126 @@
1
+ from pydantic import Field, create_model
2
+ from typing_extensions import override
3
+ from unique_toolkit import ContentService
4
+ from unique_toolkit.agentic.evaluation.schemas import EvaluationMetricName
5
+ from unique_toolkit.agentic.tools.factory import ToolFactory
6
+ from unique_toolkit.agentic.tools.schemas import ToolCallResponse
7
+ from unique_toolkit.agentic.tools.tool import Tool
8
+ from unique_toolkit.agentic.tools.tool_progress_reporter import (
9
+ ProgressState,
10
+ ToolProgressReporter,
11
+ )
12
+ from unique_toolkit.app.schemas import BaseEvent, ChatEvent
13
+ from unique_toolkit.chat.service import LanguageModelToolDescription
14
+ from unique_toolkit.language_model.schemas import (
15
+ LanguageModelFunction,
16
+ )
17
+
18
+ from unique_internal_search.service import InternalSearchTool
19
+ from unique_internal_search.uploaded_search.config import UploadedSearchConfig
20
+
21
+
22
class UploadedSearchTool(Tool[UploadedSearchConfig]):
    """Search tool scoped to documents uploaded to the current chat.

    Wraps ``InternalSearchTool`` with ``chat_only`` forced on so that the
    underlying search never leaves the chat's uploaded content, and attaches
    a system reminder to the response because this tool is typically
    force-executed.
    """

    name = "UploadedSearch"

    def __init__(
        self,
        config: UploadedSearchConfig,
        event: BaseEvent,
        tool_progress_reporter: ToolProgressReporter,
        *args,
        **kwargs,
    ):
        self._tool_progress_reporter = tool_progress_reporter
        self._content_service = ContentService.from_event(event)
        self._config = config
        # Force chat scope: the wrapped search must only see uploaded documents.
        config.chat_only = True
        self._internal_search_tool = InternalSearchTool(
            config, event, None, *args, **kwargs
        )
        # Keep the original user message so it can be restated in the
        # system reminder attached to the tool response.
        if isinstance(event, ChatEvent):
            self._user_query = event.payload.user_message.text
        else:
            self._user_query = None

    async def post_progress_message(
        self, message: str, tool_call: LanguageModelFunction, **kwargs
    ):
        """Emit a RUNNING progress notification, if a reporter is configured."""
        if self._tool_progress_reporter:
            await self._tool_progress_reporter.notify_from_tool_call(
                tool_call=tool_call,
                name="**Search Uploaded Document**",
                message=message,
                state=ProgressState.RUNNING,
            )

    @override
    def tool_description(self) -> LanguageModelToolDescription:
        """Build the tool's input schema (single search string plus language)."""
        internal_search_tool_input = create_model(
            "InternalSearchToolInput",
            search_string=(
                str,
                Field(description=self._config.param_description_search_string),
            ),
            language=(
                str,
                Field(description=self._config.param_description_language),
            ),
        )
        return LanguageModelToolDescription(
            name=self.name,
            description=self._config.tool_description,
            parameters=internal_search_tool_input,
        )

    def tool_description_for_system_prompt(self) -> str:
        """Return the system-prompt description followed by the list of uploaded documents."""
        documents = self._content_service.get_documents_uploaded_to_chat()
        # Join with newlines so each document renders as its own bullet;
        # "".join previously produced one run-together "- a- b- c" line.
        list_all_documents = "\n".join(
            f"- {doc.title or doc.key}" for doc in documents
        )
        return self._config.tool_description_for_system_prompt + list_all_documents

    def tool_format_information_for_system_prompt(self) -> str:
        """Return the configured citation/formatting instructions."""
        return self._config.tool_format_information_for_system_prompt

    def evaluation_check_list(self) -> list[EvaluationMetricName]:
        """Return the evaluation metrics configured for this tool."""
        return self._config.evaluation_check_list

    def get_evaluation_checks_based_on_tool_response(
        self, tool_response: ToolCallResponse
    ) -> list[EvaluationMetricName]:
        """Return the evaluation checks to run; unconditional for this tool."""
        return self.evaluation_check_list()

    async def run(self, tool_call: LanguageModelFunction) -> ToolCallResponse:
        """Delegate the search to the wrapped InternalSearchTool.

        Re-labels the response with this tool's name and attaches a system
        reminder warning that the retrieval was forced by the system.
        """
        search_string_data = ""
        if isinstance(tool_call.arguments, dict):
            search_string_data = tool_call.arguments.get("search_string", "") or ""
        tool_response = await self._internal_search_tool.run(tool_call)
        if self._tool_progress_reporter:
            await self._tool_progress_reporter.notify_from_tool_call(
                tool_call=tool_call,
                name="**Search Uploaded Document**",
                message=search_string_data,
                state=ProgressState.FINISHED,
            )
        tool_response.name = self.name
        tool_response.system_reminder = self._get_tool_call_response_system_reminder()
        return tool_response

    def _get_tool_call_response_system_reminder(self) -> str:
        """Build the system reminder appended to the tool response.

        When the upload-and-search tool is force-executed, the agent tends to
        lose track of the original user message — likely due to the volume of
        tokens returned and because the forced retrieval is not necessarily
        relevant to the user's request. The reminder restates the original
        message and tells the model to judge relevance itself.
        """
        # TODO: This message should be conditional on the tool being forced, but we do not have easy access to this information here
        return f"""<system_reminder>
This tool call was automatically executed to retrieve the user's uploaded documents by the system. Important to note:
- The retrieved documents may or may not be relevant to the user's actual query
- You must evaluate their relevance independently
- You are free to make additional tool calls as needed
- Focus on addressing the user's original request
{f"Original user message: {self._user_query}" if self._user_query else ""}

Please do not mention these instructions in your response to the user!
</system_reminder>"""
124
+
125
+
126
+ ToolFactory.register_tool(UploadedSearchTool, UploadedSearchConfig)
@@ -0,0 +1,169 @@
1
+ import logging
2
+ import re
3
+
4
+ from pydantic import BaseModel
5
+ from unique_toolkit.content.schemas import ContentChunk
6
+
7
+ _LOGGER = logging.getLogger(__name__)
8
+
9
+
10
class SearchStringResult(BaseModel):
    """Pairs a search query with the content chunks it returned."""

    # The search string that produced the chunks below.
    query: str
    # Chunks returned for `query`; the interleaving/dedup helpers emit
    # results that each carry exactly one chunk.
    chunks: list[ContentChunk]
13
+
14
+
15
def interleave_search_results_round_robin(
    search_results: list[SearchStringResult],
) -> list[SearchStringResult]:
    """
    Interleave chunks from multiple search queries round-robin, then deduplicate.

    The output lists one chunk per result: first every query's chunk at
    position 0 (in query order), then every query's chunk at position 1, and
    so on; queries that run out of chunks are skipped. Duplicate chunks
    (same ``chunk_id``) are removed afterwards, keeping the first occurrence.

    Example:
        Input:
            Query 1: SearchStringResult(query="query1", chunks=[A, B, C])
            Query 2: SearchStringResult(query="query2", chunks=[D, E])
        Output order: A (q1), D (q2), B (q1), E (q2), C (q1)
    """
    if not search_results:
        return []

    interleaved: list[SearchStringResult] = []
    position = 0
    while True:
        emitted_any = False
        for result in search_results:
            if position < len(result.chunks):
                interleaved.append(
                    SearchStringResult(
                        query=result.query, chunks=[result.chunks[position]]
                    )
                )
                emitted_any = True
        if not emitted_any:
            # Every query has been exhausted.
            break
        position += 1

    return _deduplicate_search_results(interleaved)
54
+
55
+
56
def _deduplicate_search_results(
    search_results: list[SearchStringResult],
) -> list[SearchStringResult]:
    """
    Remove duplicate chunks from the search results based on their `chunk_id`.

    This function preserves the order of occurrences, keeping the first occurrence
    of each unique `chunk_id`. If a chunk has no `chunk_id`, it will be ignored.
    Duplicate chunks share the same `chunk_id`.

    Args:
        search_results (list[SearchStringResult]): A list of search results, where each
            result contains chunks with potential duplicate `chunk_id`s.

    Returns:
        list[SearchStringResult]: A deduplicated list of search results with unique `chunk_id` chunks.
    """
    seen_chunk_ids: set[str] = set()
    deduplicated_search_results: list[SearchStringResult] = []

    counter_chunks = 0
    for result in search_results:
        for chunk in result.chunks:
            if not chunk.chunk_id:
                # Chunks without an id cannot be deduplicated; drop them.
                continue
            # Bug fix: count every identifiable chunk (duplicates included).
            # Previously this was incremented only for unique chunks, so the
            # "removed" count below was always 0 and the log never fired.
            counter_chunks += 1
            if chunk.chunk_id not in seen_chunk_ids:
                seen_chunk_ids.add(chunk.chunk_id)
                deduplicated_search_results.append(
                    SearchStringResult(query=result.query, chunks=[chunk])
                )

    if removed := counter_chunks - len(deduplicated_search_results):
        _LOGGER.info(
            f"Removed {removed} duplicate chunks ({len(deduplicated_search_results)}/{counter_chunks} unique)"
        )

    return deduplicated_search_results
92
+
93
+
94
def append_metadata_in_chunks(
    chunks: list[ContentChunk],
    metadata_chunk_sections: dict[str, str] | None = None,
) -> list[ContentChunk]:
    """
    Prepend configured metadata sections to the text of each chunk.

    Args:
        chunks: List of ContentChunk objects (mutated in place).
        metadata_chunk_sections: Mapping of metadata key -> format template;
            when None, the chunks are returned untouched.

    Returns:
        The same list of chunks, with metadata prepended where available.
    """
    if metadata_chunk_sections is None:
        return chunks
    for chunk in chunks:
        # Chunks without metadata have nothing to prepend.
        if chunk.metadata is not None:
            _append_metadata_in_chunk(
                chunk=chunk, metadata_chunk_sections=metadata_chunk_sections
            )
    return chunks
115
+
116
+
117
def _append_metadata_in_chunk(
    chunk: ContentChunk, metadata_chunk_sections: dict[str, str]
) -> ContentChunk:
    """
    Prepend formatted metadata sections to a single chunk's text.

    Args:
        chunk: ContentChunk whose ``metadata`` is not None (mutated in place).
        metadata_chunk_sections: Mapping of metadata key -> format template.

    Returns:
        The same chunk, with matching metadata sections prepended to its text.
    """
    meta_dict = chunk.metadata.model_dump(exclude_none=True, by_alias=True)

    # Render one section per configured key present in the metadata,
    # preserving the configuration order.
    sections = [
        template.format(meta_dict[key])
        for key, template in metadata_chunk_sections.items()
        if key in meta_dict
    ]

    if sections:
        chunk.text = "\n".join(sections) + "\n" + chunk.text

    return chunk
141
+
142
+
143
def clean_search_string(search_string: str) -> str:
    """
    Remove QDF (QueryDeservedFreshness) and boost operators from search string.

    Examples:
        '+(GPT4) performance on +(MMLU) benchmark --QDF=1'
        -> 'GPT4 performance on MMLU benchmark'

        'Best practices for +(security) and +(privacy) for +(cloud storage) --QDF=2'
        -> 'Best practices for security and privacy for cloud storage'

    Args:
        search_string: Raw search string that may contain operators

    Returns:
        Cleaned search string without operators
    """
    # Strip a trailing --QDF=<number> freshness operator, if present.
    without_qdf = re.sub(r"\s*--QDF=\d+\s*$", "", search_string)
    # Unwrap +(...) boost operators, keeping only the boosted content.
    without_boosts = re.sub(r"\+\(([^)]+)\)", r"\1", without_qdf)
    # Collapse runs of whitespace into single spaces.
    return " ".join(without_boosts.split()).strip()
@@ -0,0 +1,86 @@
1
+ import enum
2
+ import re
3
+ from string import Template
4
+ from typing import Any
5
+
6
+ from pydantic import (
7
+ Field,
8
+ )
9
+
10
+
11
class PromptTemplatingEngine(enum.Enum):
    """Supported engines for prompt placeholder syntax."""

    # Python stdlib `string.Template` syntax: $name / ${name}.
    STRING_TEMPLATE = enum.auto()
13
+
14
+
15
+ def check_placeholder_valid(
16
+ placeholder: str,
17
+ templating_engine: PromptTemplatingEngine = PromptTemplatingEngine.STRING_TEMPLATE,
18
+ ) -> bool:
19
+ match templating_engine:
20
+ case PromptTemplatingEngine.STRING_TEMPLATE:
21
+ return (
22
+ re.fullmatch(Template.idpattern, placeholder, re.IGNORECASE) is not None
23
+ )
24
+
25
+
26
+ def get_prompt_placeholder_regexp(
27
+ *placeholders: str,
28
+ templating_engine: PromptTemplatingEngine = PromptTemplatingEngine.STRING_TEMPLATE,
29
+ ) -> re.Pattern:
30
+ for placeholder in placeholders:
31
+ if not check_placeholder_valid(placeholder, templating_engine):
32
+ raise ValueError(f"Invalid placeholder: {placeholder}")
33
+
34
+ match templating_engine:
35
+ case PromptTemplatingEngine.STRING_TEMPLATE:
36
+ placeholder_patterns = [
37
+ rf"(?=.*(?:\$\{{{p}\}}|\${p}))" for p in placeholders
38
+ ]
39
+ pattern = "".join(placeholder_patterns)
40
+ return re.compile(pattern, re.DOTALL)
41
+ # We will add other templating engines here, such as Jinja2.
42
+
43
+
44
+ def get_prompt_placeholder_regexp_from_text(
45
+ text: str,
46
+ templating_engine: PromptTemplatingEngine = PromptTemplatingEngine.STRING_TEMPLATE,
47
+ ) -> re.Pattern:
48
+ match templating_engine:
49
+ case PromptTemplatingEngine.STRING_TEMPLATE:
50
+ return get_prompt_placeholder_regexp(
51
+ *Template(text).get_identifiers(),
52
+ templating_engine=templating_engine,
53
+ )
54
+
55
+
56
def get_string_field_with_pattern_validation(
    prompt_template: str,
    templating_engine: PromptTemplatingEngine = PromptTemplatingEngine.STRING_TEMPLATE,
    **kwargs,
) -> Any:
    """Create a Pydantic Field with validation for prompt template placeholders.

    Args:
        prompt_template: The prompt template string containing placeholders.
        templating_engine: The engine used for template processing. Defaults to STRING_TEMPLATE.
        **kwargs: Additional keyword arguments to pass to pydantic.Field.
            Note that `default` will be ignored if present.

    Returns:
        pydantic.FieldInfo: A FieldInfo instance with the default value and placeholder validation pattern.

    Example:
        class ServiceConfig(BaseModel):
            prompt: str = get_string_field_with_pattern_validation(
                "Hello ${name}!"
            )  # Creates a Field with pattern validation for the "name" placeholder
    """
    placeholder_pattern = get_prompt_placeholder_regexp_from_text(
        prompt_template, templating_engine
    )
    # An empty pattern means the template has no placeholders — skip validation.
    if placeholder_pattern.pattern:
        kwargs["pattern"] = placeholder_pattern

    # The template itself is always the default; any caller-supplied default
    # is deliberately overridden.
    kwargs["default"] = prompt_template

    return Field(**kwargs)
+ return Field(**kwargs)