unique_internal_search 1.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unique_internal_search-1.2.2/CHANGELOG.md +62 -0
- unique_internal_search-1.2.2/LICENSE +1 -0
- unique_internal_search-1.2.2/PKG-INFO +86 -0
- unique_internal_search-1.2.2/README.md +3 -0
- unique_internal_search-1.2.2/pyproject.toml +46 -0
- unique_internal_search-1.2.2/unique_internal_search/config.py +153 -0
- unique_internal_search-1.2.2/unique_internal_search/prompts.py +64 -0
- unique_internal_search-1.2.2/unique_internal_search/service.py +458 -0
- unique_internal_search-1.2.2/unique_internal_search/uploaded_search/config.py +29 -0
- unique_internal_search-1.2.2/unique_internal_search/uploaded_search/prompts.py +57 -0
- unique_internal_search-1.2.2/unique_internal_search/uploaded_search/service.py +126 -0
- unique_internal_search-1.2.2/unique_internal_search/utils.py +169 -0
- unique_internal_search-1.2.2/unique_internal_search/validators.py +86 -0
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [1.2.2] - 2025-11-10
|
|
9
|
+
- Temporarily reverting the removal of the `get_tool_call_result_for_loop_history` function, as it is still required for the Investment Research Agent.
|
|
10
|
+
|
|
11
|
+
## [1.2.1] - 2025-11-06
|
|
12
|
+
- Upload and chat system reminder cleanup
|
|
13
|
+
|
|
14
|
+
## [1.2.0] - 2025-11-04
|
|
15
|
+
- Include system reminder for upload and chat tool about it being a forced tool in UniqueAI
|
|
16
|
+
|
|
17
|
+
## [1.1.0] - 2025-10-30
|
|
18
|
+
- Add support for multiple search strings in a single tool call
|
|
19
|
+
- Search results from multiple queries are interleaved for better diversity
|
|
20
|
+
- Add automatic deduplication of chunks by `chunk_id` when using multiple search queries
|
|
21
|
+
- Prevents duplicate content from appearing in results when multiple related queries return the same chunks
|
|
22
|
+
- Preserves first occurrence and logs number of duplicates removed
|
|
23
|
+
- Add automatic parsing and cleaning of search query operators
|
|
24
|
+
- Removes QDF (Query Deserves Freshness) operators: `--QDF=0` to `--QDF=5` (freshness ratings)
|
|
25
|
+
- Removes boost operators: `+(term)` and `+(multi word phrase)` for query term boosting
|
|
26
|
+
|
|
27
|
+
## [1.0.4] - 2025-10-28
|
|
28
|
+
- Removing unused tool specific `get_tool_call_result_for_loop_history` function
|
|
29
|
+
- Removing unused config `source_format_config`
|
|
30
|
+
|
|
31
|
+
## [1.0.3] - 2025-10-25
|
|
32
|
+
- Fix appending of metadata to chunks
|
|
33
|
+
|
|
34
|
+
## [1.0.2] - 2025-10-17
|
|
35
|
+
- Remove print statements originating from tool refactor
|
|
36
|
+
|
|
37
|
+
## [1.0.1] - 2025-09-30
|
|
38
|
+
- Fix bug in metadata filter in the search method.
|
|
39
|
+
|
|
40
|
+
## [1.0.0] - 2025-09-18
|
|
41
|
+
- Bump toolkit version to allow for both patch and minor updates
|
|
42
|
+
|
|
43
|
+
## [0.0.7] - 2025-09-17
|
|
44
|
+
- Updated to latest toolkit
|
|
45
|
+
|
|
46
|
+
## [0.0.6] - 2025-09-15
|
|
47
|
+
- Fix Minor bug in transforming toolResponse to toolCallResult
|
|
48
|
+
|
|
49
|
+
## [0.0.5] - 2025-09-05
|
|
50
|
+
- Fixed a bug around metadata-filter assignment
|
|
51
|
+
|
|
52
|
+
## [0.0.4] - 2025-09-05
|
|
53
|
+
- Fixed a bug around metadata-filter deep-copy
|
|
54
|
+
|
|
55
|
+
## [0.0.3] - 2025-09-01
|
|
56
|
+
- Migrated the `uploaded_search` into this package.
|
|
57
|
+
|
|
58
|
+
## [0.0.2] - 2025-09-01
|
|
59
|
+
- Migrated the `internal_search`.
|
|
60
|
+
|
|
61
|
+
## [0.0.1] - 2025-08-18
|
|
62
|
+
- Initial release of `internal_search`.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
`unique_toolkit` is covered by the [`Unique License v1`](https://github.com/Unique-AG/license/releases/tag/unique-license.v1), unless the/a header or a nested LICENSE specifies another license.
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: unique_internal_search
|
|
3
|
+
Version: 1.2.2
|
|
4
|
+
Summary:
|
|
5
|
+
License: Proprietary
|
|
6
|
+
Author: Martin Fadler
|
|
7
|
+
Author-email: martin.fadler@unique.ch
|
|
8
|
+
Requires-Python: >=3.12,<4.0
|
|
9
|
+
Classifier: License :: Other/Proprietary License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Requires-Dist: pillow (>=10.4.0,<11.0.0)
|
|
13
|
+
Requires-Dist: pydantic (>=2.8.2,<3.0.0)
|
|
14
|
+
Requires-Dist: pydantic-settings (>=2.10.1,<3.0.0)
|
|
15
|
+
Requires-Dist: pytest (>=8.4.1,<9.0.0)
|
|
16
|
+
Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
|
|
17
|
+
Requires-Dist: typing-extensions (>=4.9.0,<5.0.0)
|
|
18
|
+
Requires-Dist: unique-sdk (>=0.10.0,<0.11.0)
|
|
19
|
+
Requires-Dist: unique-toolkit (>=1.18.1,<2.0.0)
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# Internal Search Tool
|
|
23
|
+
|
|
24
|
+
Internal Search Tool to find documents in the Knowledge Base
|
|
25
|
+
# Changelog
|
|
26
|
+
|
|
27
|
+
All notable changes to this project will be documented in this file.
|
|
28
|
+
|
|
29
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
30
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
31
|
+
|
|
32
|
+
## [1.2.2] - 2025-11-10
|
|
33
|
+
- Temporarily reverting the removal of the `get_tool_call_result_for_loop_history` function, as it is still required for the Investment Research Agent.
|
|
34
|
+
|
|
35
|
+
## [1.2.1] - 2025-11-06
|
|
36
|
+
- Upload and chat system reminder cleanup
|
|
37
|
+
|
|
38
|
+
## [1.2.0] - 2025-11-04
|
|
39
|
+
- Include system reminder for upload and chat tool about it being a forced tool in UniqueAI
|
|
40
|
+
|
|
41
|
+
## [1.1.0] - 2025-10-30
|
|
42
|
+
- Add support for multiple search strings in a single tool call
|
|
43
|
+
- Search results from multiple queries are interleaved for better diversity
|
|
44
|
+
- Add automatic deduplication of chunks by `chunk_id` when using multiple search queries
|
|
45
|
+
- Prevents duplicate content from appearing in results when multiple related queries return the same chunks
|
|
46
|
+
- Preserves first occurrence and logs number of duplicates removed
|
|
47
|
+
- Add automatic parsing and cleaning of search query operators
|
|
48
|
+
- Removes QDF (Query Deserves Freshness) operators: `--QDF=0` to `--QDF=5` (freshness ratings)
|
|
49
|
+
- Removes boost operators: `+(term)` and `+(multi word phrase)` for query term boosting
|
|
50
|
+
|
|
51
|
+
## [1.0.4] - 2025-10-28
|
|
52
|
+
- Removing unused tool specific `get_tool_call_result_for_loop_history` function
|
|
53
|
+
- Removing unused config `source_format_config`
|
|
54
|
+
|
|
55
|
+
## [1.0.3] - 2025-10-25
|
|
56
|
+
- Fix appending of metadata to chunks
|
|
57
|
+
|
|
58
|
+
## [1.0.2] - 2025-10-17
|
|
59
|
+
- Remove print statements originating from tool refactor
|
|
60
|
+
|
|
61
|
+
## [1.0.1] - 2025-09-30
|
|
62
|
+
- Fix bug in metadata filter in the search method.
|
|
63
|
+
|
|
64
|
+
## [1.0.0] - 2025-09-18
|
|
65
|
+
- Bump toolkit version to allow for both patch and minor updates
|
|
66
|
+
|
|
67
|
+
## [0.0.7] - 2025-09-17
|
|
68
|
+
- Updated to latest toolkit
|
|
69
|
+
|
|
70
|
+
## [0.0.6] - 2025-09-15
|
|
71
|
+
- Fix Minor bug in transforming toolResponse to toolCallResult
|
|
72
|
+
|
|
73
|
+
## [0.0.5] - 2025-09-05
|
|
74
|
+
- Fixed a bug around metadata-filter assignment
|
|
75
|
+
|
|
76
|
+
## [0.0.4] - 2025-09-05
|
|
77
|
+
- Fixed a bug around metadata-filter deep-copy
|
|
78
|
+
|
|
79
|
+
## [0.0.3] - 2025-09-01
|
|
80
|
+
- Migrated the `uploaded_search` into this package.
|
|
81
|
+
|
|
82
|
+
## [0.0.2] - 2025-09-01
|
|
83
|
+
- Migrated the `internal_search`.
|
|
84
|
+
|
|
85
|
+
## [0.0.1] - 2025-08-18
|
|
86
|
+
- Initial release of `internal_search`.
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "unique_internal_search"
|
|
3
|
+
version = "1.2.2"
|
|
4
|
+
description = ""
|
|
5
|
+
authors = [
|
|
6
|
+
"Martin Fadler <martin.fadler@unique.ch>",
|
|
7
|
+
"Sadique Sheik <sadique@unique.ch>",
|
|
8
|
+
"Fabian Schläpfer <fabian@unique.ch>",
|
|
9
|
+
"Pascal Hauri <pascal@unique.ch>",
|
|
10
|
+
]
|
|
11
|
+
readme = ["README.md", "CHANGELOG.md"]
|
|
12
|
+
license = "Proprietary"
|
|
13
|
+
|
|
14
|
+
[tool.poetry.dependencies]
|
|
15
|
+
python = "^3.12"
|
|
16
|
+
typing-extensions = "^4.9.0"
|
|
17
|
+
pydantic = "^2.8.2"
|
|
18
|
+
pydantic-settings = "^2.10.1"
|
|
19
|
+
python-dotenv = "^1.0.1"
|
|
20
|
+
pytest = "^8.4.1"
|
|
21
|
+
pillow = "^10.4.0"
|
|
22
|
+
unique-sdk = "^0.10.0"
|
|
23
|
+
unique-toolkit = "^1.18.1"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
[tool.poetry.group.dev.dependencies]
|
|
27
|
+
python = "^3.12"
|
|
28
|
+
typing-extensions = "^4.9.0"
|
|
29
|
+
pydantic = "^2.8.2"
|
|
30
|
+
pydantic-settings = "^2.10.1"
|
|
31
|
+
python-dotenv = "^1.0.1"
|
|
32
|
+
pytest = "^8.4.1"
|
|
33
|
+
unique-sdk = { path = "../../unique_sdk" }
|
|
34
|
+
unique-toolkit = { path = "../../unique_toolkit" }
|
|
35
|
+
ruff = "^0.12.10"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
[build-system]
|
|
39
|
+
requires = ["poetry-core"]
|
|
40
|
+
build-backend = "poetry.core.masonry.api"
|
|
41
|
+
|
|
42
|
+
[tool.ruff]
|
|
43
|
+
target-version = "py311"
|
|
44
|
+
|
|
45
|
+
[tool.ruff.lint]
|
|
46
|
+
extend-select = ["I"]
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
from typing import Annotated, Any
|
|
2
|
+
|
|
3
|
+
from pydantic import (
|
|
4
|
+
AliasChoices,
|
|
5
|
+
Field,
|
|
6
|
+
)
|
|
7
|
+
from pydantic.json_schema import SkipJsonSchema
|
|
8
|
+
from unique_toolkit._common.chunk_relevancy_sorter.config import (
|
|
9
|
+
ChunkRelevancySortConfig,
|
|
10
|
+
)
|
|
11
|
+
from unique_toolkit._common.feature_flags.schema import (
|
|
12
|
+
FeatureExtendedSourceSerialization,
|
|
13
|
+
)
|
|
14
|
+
from unique_toolkit.agentic.evaluation.schemas import EvaluationMetricName
|
|
15
|
+
from unique_toolkit.agentic.history_manager.history_manager import DeactivatedNone
|
|
16
|
+
from unique_toolkit.agentic.tools.schemas import BaseToolConfig
|
|
17
|
+
from unique_toolkit.content.schemas import (
|
|
18
|
+
ContentRerankerConfig,
|
|
19
|
+
ContentSearchType,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
from unique_internal_search.prompts import (
|
|
23
|
+
DEFAULT_LANGUAGE_PARAM_DESCRIPTION,
|
|
24
|
+
DEFAULT_SEARCH_STRING_PARAM_DESCRIPTION,
|
|
25
|
+
DEFAULT_TOOL_DESCRIPTION,
|
|
26
|
+
DEFAULT_TOOL_DESCRIPTION_FOR_SYSTEM_PROMPT,
|
|
27
|
+
DEFAULT_TOOL_FORMAT_INFORMATION_FOR_SYSTEM_PROMPT,
|
|
28
|
+
)
|
|
29
|
+
from unique_internal_search.validators import get_string_field_with_pattern_validation
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class ExperimentalFeatures(FeatureExtendedSourceSerialization):
|
|
33
|
+
enable_multiple_search_strings_execution: bool = Field(
|
|
34
|
+
default=False,
|
|
35
|
+
description="Allow execution of multiple search strings in one call. When set to True, each string is searched individually and results are merged into a single response.",
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
DEFAULT_LIMIT_CHUNK_RELEVANCY_SORT_ENABLED = 200
|
|
40
|
+
DEFAULT_LIMIT_CHUNK_RELEVANCY_SORT_DISABLED = 1000
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _search_limit_factory(data: dict[str, Any]) -> int:
|
|
44
|
+
return (
|
|
45
|
+
DEFAULT_LIMIT_CHUNK_RELEVANCY_SORT_ENABLED
|
|
46
|
+
if data["chunk_relevancy_sort_config"].enabled
|
|
47
|
+
else DEFAULT_LIMIT_CHUNK_RELEVANCY_SORT_DISABLED
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class InternalSearchConfig(BaseToolConfig):
|
|
52
|
+
search_type: ContentSearchType = Field(
|
|
53
|
+
default=ContentSearchType.COMBINED,
|
|
54
|
+
description="The type of search to perform. Two possible values: `COMBINED` or `VECTOR`.",
|
|
55
|
+
)
|
|
56
|
+
max_tokens_for_sources: SkipJsonSchema[int] = (
|
|
57
|
+
Field( # TODO: Remove SkipJsonSchema once UI (Spaces 2.0) can be configured to not include certain fields
|
|
58
|
+
default=30_000,
|
|
59
|
+
description="The maximum number of tokens to use for the sources.",
|
|
60
|
+
)
|
|
61
|
+
)
|
|
62
|
+
percentage_of_input_tokens_for_sources: float = Field(
|
|
63
|
+
default=0.4,
|
|
64
|
+
description="The percentage of the maximum input tokens of the language model to use for the tool response.",
|
|
65
|
+
ge=0.0,
|
|
66
|
+
le=1.0,
|
|
67
|
+
)
|
|
68
|
+
language_model_max_input_tokens: SkipJsonSchema[int | None] = Field(
|
|
69
|
+
default=None,
|
|
70
|
+
description="Language model maximum input tokens",
|
|
71
|
+
)
|
|
72
|
+
scope_ids: Annotated[list[str], Field(title="Active")] | DeactivatedNone = Field(
|
|
73
|
+
default=None,
|
|
74
|
+
description="The scope ids to use for the search.",
|
|
75
|
+
)
|
|
76
|
+
scope_to_chat_on_upload: bool = Field(
|
|
77
|
+
default=False,
|
|
78
|
+
description="Whether to scope the search should be limited to files uploaded within the chat session when uploaded files are present.",
|
|
79
|
+
)
|
|
80
|
+
chunked_sources: bool = Field(
|
|
81
|
+
default=True,
|
|
82
|
+
description="Whether each chunk is added as an individual source in the final LLM prompt. If set to False, all chunks from the same document are combined into a single source.",
|
|
83
|
+
)
|
|
84
|
+
reranker_config: (
|
|
85
|
+
Annotated[ContentRerankerConfig, Field(title="Active")] | DeactivatedNone
|
|
86
|
+
) = Field(
|
|
87
|
+
default=None,
|
|
88
|
+
description="The reranker config to use for the search.",
|
|
89
|
+
)
|
|
90
|
+
search_language: str = Field(
|
|
91
|
+
default="english",
|
|
92
|
+
validation_alias=AliasChoices("ftsSearchLanguage", "searchLanguage"),
|
|
93
|
+
description="The language to use for the search.",
|
|
94
|
+
)
|
|
95
|
+
# evaluation_config: EvaluationMetricConfig = EvaluationMetricConfig()
|
|
96
|
+
chunk_relevancy_sort_config: ChunkRelevancySortConfig = Field(
|
|
97
|
+
default_factory=ChunkRelevancySortConfig,
|
|
98
|
+
description="The chunk relevancy sort config to use for the search.",
|
|
99
|
+
)
|
|
100
|
+
limit: int = Field(
|
|
101
|
+
default_factory=_search_limit_factory,
|
|
102
|
+
description="The limit of chunks to return.",
|
|
103
|
+
)
|
|
104
|
+
chat_only: bool = Field(
|
|
105
|
+
default=False,
|
|
106
|
+
description="Whether to only chat on the upload.",
|
|
107
|
+
)
|
|
108
|
+
|
|
109
|
+
tool_description: str = get_string_field_with_pattern_validation(
|
|
110
|
+
DEFAULT_TOOL_DESCRIPTION,
|
|
111
|
+
description="Tool description.",
|
|
112
|
+
)
|
|
113
|
+
param_description_search_string: str = Field(
|
|
114
|
+
default=DEFAULT_SEARCH_STRING_PARAM_DESCRIPTION,
|
|
115
|
+
description="`search_string` parameter description.",
|
|
116
|
+
)
|
|
117
|
+
param_description_language: str = get_string_field_with_pattern_validation(
|
|
118
|
+
DEFAULT_LANGUAGE_PARAM_DESCRIPTION,
|
|
119
|
+
description="`language` parameter description.",
|
|
120
|
+
)
|
|
121
|
+
tool_description_for_system_prompt: str = get_string_field_with_pattern_validation(
|
|
122
|
+
DEFAULT_TOOL_DESCRIPTION_FOR_SYSTEM_PROMPT,
|
|
123
|
+
description="Tool description for the system prompt.",
|
|
124
|
+
)
|
|
125
|
+
tool_format_information_for_system_prompt: str = (
|
|
126
|
+
get_string_field_with_pattern_validation(
|
|
127
|
+
DEFAULT_TOOL_FORMAT_INFORMATION_FOR_SYSTEM_PROMPT,
|
|
128
|
+
description="Tool format information for the system prompt.",
|
|
129
|
+
)
|
|
130
|
+
)
|
|
131
|
+
evaluation_check_list: list[EvaluationMetricName] = Field(
|
|
132
|
+
default=[EvaluationMetricName.HALLUCINATION],
|
|
133
|
+
description="The list of evaluation metrics to check.",
|
|
134
|
+
)
|
|
135
|
+
experimental_features: ExperimentalFeatures = ExperimentalFeatures()
|
|
136
|
+
|
|
137
|
+
metadata_chunk_sections: dict[str, str] = Field(
|
|
138
|
+
default={},
|
|
139
|
+
description=(
|
|
140
|
+
"Metadata sections to be appended to each search result chunk’s text. The keys represent metadata field names (e.g., 'metadata_key'), and the values are template strings that define how the metadata should be embedded, using {} as a placeholder for the actual value (e.g., '<|metadata_key|>{}<|/metadata_key|>')."
|
|
141
|
+
),
|
|
142
|
+
)
|
|
143
|
+
|
|
144
|
+
score_threshold: float = Field(
|
|
145
|
+
default=0.0,
|
|
146
|
+
ge=0.0,
|
|
147
|
+
le=1.0,
|
|
148
|
+
description="The score threshold to use for the search to filter chunks on relevancy.",
|
|
149
|
+
)
|
|
150
|
+
exclude_uploaded_files: bool = Field(
|
|
151
|
+
default=False,
|
|
152
|
+
description="Whether to exclude uploaded files from the search. Overrides the `chat_only` parameter as it removes the `chat_id` from the search.",
|
|
153
|
+
)
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
DEFAULT_TOOL_DESCRIPTION_FOR_SYSTEM_PROMPT = (
|
|
2
|
+
"You can use the InternalSearch tool to access internal company documentations, including information on policies, procedures, benefits, groups, financial details, and specific individuals. "
|
|
3
|
+
"If this tool can help answer your question, feel free to use it to search the internal knowledge base for more information. "
|
|
4
|
+
"If possible always try to get information from the internal knowledge base with the InternalSearch tool before using other tools.\n"
|
|
5
|
+
"Use cases for the Internal Knowledge Search are:\n"
|
|
6
|
+
"- User asks to work with a document: Most likely the document is uploaded to the chat and mentioned in a message and can be loaded with this tool\n"
|
|
7
|
+
"- Policy and Procedure Verification: Use the internal search tool to find the most current company policies, procedures, or guidelines to ensure compliance and accuracy in responses.\n"
|
|
8
|
+
"- Project-Specific Information: When answering questions related to ongoing projects or initiatives, use the internal search to access project documents, reports, or meeting notes for precise details.\n"
|
|
9
|
+
"- Employee Directory and Contact Information: Utilize the internal search to locate contact details or organizational charts to facilitate communication and collaboration within the company.\n"
|
|
10
|
+
"- Confidential and Proprietary Information: When dealing with sensitive topics that require proprietary knowledge or confidential data, use the internal search to ensure the information is sourced from secure and authorized company documents.\n\n"
|
|
11
|
+
"**Instruction Query Splitting**\n"
|
|
12
|
+
'You should split the user question into multiple search strings when the user\'s question needs to be decomposed / rewritten to find different facts. Perform for each search string an individual tool call. Avoid short queries that are extremely broad and will return unrelated results. Strip the search string of any extraneous details, e.g. instructions or unnecessary context. However, you must fill in relevant context from the rest of the conversation to make the question complete. E.g. "What was their age?" => "What was Kevin\'s age?" because the preceding conversation makes it clear that the user is talking about Kevin.\n\n'
|
|
13
|
+
"Here are some examples of how to use the InternalSearch tool:\n"
|
|
14
|
+
'User: What was the GDP of France and Italy in the 1970s? => search strings: ["france gdp 1970", "italy gdp 1970"] # Splitting of the query into 2 queries and perform 2 tool calls\n'
|
|
15
|
+
'User: What does the report say about the GPT4 performance on MMLU? => search strings: ["GPT4 performance on MMLU?"] # Simplify the query'
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
DEFAULT_TOOL_FORMAT_INFORMATION_FOR_SYSTEM_PROMPT = (
|
|
19
|
+
"Whenever you use information retrieved with the InternalSearch, you must adhere to strict reference guidelines. "
|
|
20
|
+
"You must strictly reference each fact used with the `source_number` of the corresponding passage, in the following format: '[source<source_number>]'.\n\n"
|
|
21
|
+
"Example:\n"
|
|
22
|
+
"- The stock price of Apple Inc. is $150 [source0] and the company's revenue increased by 10% [source1].\n"
|
|
23
|
+
"- Moreover, the company's market capitalization is $2 trillion [source2][source3].\n"
|
|
24
|
+
"- Our internal documents tell us to invest[source4] (Internal)\n\n"
|
|
25
|
+
"A fact is preferably referenced by ONLY ONE source, e.g [sourceX], which should be the most relevant source for the fact.\n"
|
|
26
|
+
"Follow these guidelines closely and be sure to use the proper `source_number` when referencing facts.\n"
|
|
27
|
+
"Make sure that your reference follow the format [sourceX] and that the source number is correct.\n"
|
|
28
|
+
"Source is written in singular form and the number is written in digits.\n\n"
|
|
29
|
+
"IT IS VERY IMPORTANT TO FOLLOW THESE GUIDELINES!!\n"
|
|
30
|
+
"NEVER CITE A source_number THAT YOU DON'T SEE IN THE TOOL CALL RESPONSE!!!\n"
|
|
31
|
+
"The source_number in old assistant messages are no longer valid.\n"
|
|
32
|
+
"EXAMPLE: If you see [source34] and [source35] in the assistant message, you can't use [source34] again in the next assistant message, this has to be the number you find in the message with role 'tool'.\n"
|
|
33
|
+
"BE AWARE:All tool calls have been filtered to remove uncited sources. Tool calls return much more data than you see\n\n"
|
|
34
|
+
"### Internal Document Answering Protocol for Employee Questions\n"
|
|
35
|
+
"When assisting employees using internal documents, follow\n"
|
|
36
|
+
"this structured approach to ensure precise, well-grounded,\n"
|
|
37
|
+
"and context-aware responses:\n\n"
|
|
38
|
+
"#### 1. Locate and Prioritize Relevant Internal Sources\n"
|
|
39
|
+
"Give strong preference to:\n"
|
|
40
|
+
"- **Most relevant documents**, such as:\n"
|
|
41
|
+
"- **Documents authored by or involving** the employee or team in question\n"
|
|
42
|
+
"- **Cross-validated sources**, especially when multiple documents agree\n"
|
|
43
|
+
" - Project trackers, design docs, decision logs, and OKRs\n"
|
|
44
|
+
" - Recently updated or active files\n\n"
|
|
45
|
+
"#### 2. Source Reliability Guidelines\n"
|
|
46
|
+
"- Prioritize information that is:\n"
|
|
47
|
+
" - **Directly written by domain experts or stakeholders**\n"
|
|
48
|
+
" - **Part of approved or finalized documentation**\n"
|
|
49
|
+
" - **Recently modified or reviewed**, if recency matters\n"
|
|
50
|
+
"- Be cautious with:\n"
|
|
51
|
+
" - Outdated drafts\n"
|
|
52
|
+
" - Undocumented opinions or partial records\n\n"
|
|
53
|
+
"#### 3. Acknowledge Limitations\n"
|
|
54
|
+
"- If no relevant information is found, or documents conflict, clearly state this\n"
|
|
55
|
+
"- Indicate where further clarification or investigation may be required"
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
DEFAULT_TOOL_DESCRIPTION = (
|
|
59
|
+
"Search in the company knowledge base for information on policies, procedures, benefits, groups, financial information or specific people. "
|
|
60
|
+
"This should be your go-to tool if no other tools are applicable."
|
|
61
|
+
)
|
|
62
|
+
DEFAULT_SEARCH_STRING_PARAM_DESCRIPTION = "An expanded term that is optimized for vector and full text search based on the users query it must be in english."
|
|
63
|
+
|
|
64
|
+
DEFAULT_LANGUAGE_PARAM_DESCRIPTION = "The language that the user wrote the query in"
|
|
@@ -0,0 +1,458 @@
|
|
|
1
|
+
from logging import Logger
|
|
2
|
+
|
|
3
|
+
from pydantic import Field, create_model
|
|
4
|
+
from typing_extensions import override
|
|
5
|
+
from unique_toolkit._common.chunk_relevancy_sorter.exception import (
|
|
6
|
+
ChunkRelevancySorterException,
|
|
7
|
+
)
|
|
8
|
+
from unique_toolkit._common.chunk_relevancy_sorter.service import ChunkRelevancySorter
|
|
9
|
+
from unique_toolkit.agentic.evaluation.schemas import EvaluationMetricName
|
|
10
|
+
from unique_toolkit.agentic.history_manager.utils import transform_chunks_to_string
|
|
11
|
+
from unique_toolkit.agentic.tools.agent_chunks_hanlder import AgentChunksHandler
|
|
12
|
+
from unique_toolkit.agentic.tools.factory import ToolFactory
|
|
13
|
+
from unique_toolkit.agentic.tools.schemas import ToolCallResponse
|
|
14
|
+
from unique_toolkit.agentic.tools.tool import Tool
|
|
15
|
+
from unique_toolkit.agentic.tools.tool_progress_reporter import ProgressState
|
|
16
|
+
from unique_toolkit.app.schemas import BaseEvent, ChatEvent, Event
|
|
17
|
+
from unique_toolkit.chat.service import LanguageModelToolDescription
|
|
18
|
+
from unique_toolkit.content.schemas import Content, ContentChunk
|
|
19
|
+
from unique_toolkit.content.service import ContentService
|
|
20
|
+
from unique_toolkit.content.utils import (
|
|
21
|
+
merge_content_chunks,
|
|
22
|
+
pick_content_chunks_for_token_window,
|
|
23
|
+
sort_content_chunks,
|
|
24
|
+
)
|
|
25
|
+
from unique_toolkit.language_model.schemas import (
|
|
26
|
+
LanguageModelFunction,
|
|
27
|
+
LanguageModelMessage,
|
|
28
|
+
LanguageModelToolMessage,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
from unique_internal_search.config import InternalSearchConfig
|
|
32
|
+
from unique_internal_search.utils import (
|
|
33
|
+
SearchStringResult,
|
|
34
|
+
append_metadata_in_chunks,
|
|
35
|
+
clean_search_string,
|
|
36
|
+
interleave_search_results_round_robin,
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class InternalSearchService:
|
|
41
|
+
def __init__(
|
|
42
|
+
self,
|
|
43
|
+
config: InternalSearchConfig,
|
|
44
|
+
content_service: ContentService,
|
|
45
|
+
chunk_relevancy_sorter: ChunkRelevancySorter,
|
|
46
|
+
chat_id: str | None,
|
|
47
|
+
logger: Logger,
|
|
48
|
+
):
|
|
49
|
+
self.config = config
|
|
50
|
+
self.content_service = content_service
|
|
51
|
+
self.chunk_relevancy_sorter = chunk_relevancy_sorter
|
|
52
|
+
self.chat_id = chat_id
|
|
53
|
+
self.logger = logger
|
|
54
|
+
self.tool_execution_message_name = "Internal search"
|
|
55
|
+
|
|
56
|
+
async def post_progress_message(self, message: str, *args, **kwargs):
|
|
57
|
+
pass
|
|
58
|
+
|
|
59
|
+
async def get_uploaded_files(self) -> list[Content]:
|
|
60
|
+
chat_results = await self.content_service.search_contents_async(
|
|
61
|
+
where={
|
|
62
|
+
"ownerId": {
|
|
63
|
+
"equals": self.chat_id,
|
|
64
|
+
}
|
|
65
|
+
},
|
|
66
|
+
)
|
|
67
|
+
sorted_chat_results: list[Content] = sorted(
|
|
68
|
+
chat_results,
|
|
69
|
+
key=lambda x: x.created_at, # type: ignore
|
|
70
|
+
reverse=True,
|
|
71
|
+
)
|
|
72
|
+
return sorted_chat_results
|
|
73
|
+
|
|
74
|
+
async def is_chat_only(self, **kwargs) -> bool:
|
|
75
|
+
"""Check whether the assistant should limit itself to files in chat"""
|
|
76
|
+
if self.config.chat_only:
|
|
77
|
+
return True
|
|
78
|
+
if self.config.scope_to_chat_on_upload:
|
|
79
|
+
chat_files = await self.get_uploaded_files()
|
|
80
|
+
if len(chat_files) > 0:
|
|
81
|
+
return True
|
|
82
|
+
return False
|
|
83
|
+
|
|
84
|
+
async def search(
|
|
85
|
+
self,
|
|
86
|
+
search_string: str | list[str],
|
|
87
|
+
content_ids: list[str] | None = None,
|
|
88
|
+
metadata_filter: dict | None = None,
|
|
89
|
+
**kwargs,
|
|
90
|
+
) -> list[ContentChunk]:
|
|
91
|
+
"""
|
|
92
|
+
Perform a search with one or more search strings.
|
|
93
|
+
|
|
94
|
+
Args:
|
|
95
|
+
search_string: List of search strings or single search string
|
|
96
|
+
content_ids: List of content IDs
|
|
97
|
+
metadata_filter: Metadata filter
|
|
98
|
+
"""
|
|
99
|
+
|
|
100
|
+
# Convert single string to list
|
|
101
|
+
if isinstance(search_string, str):
|
|
102
|
+
search_strings = [search_string]
|
|
103
|
+
else:
|
|
104
|
+
search_strings = search_string
|
|
105
|
+
|
|
106
|
+
"""
|
|
107
|
+
Perform a search in the Vector DB based on the user's message and generate a response.
|
|
108
|
+
"""
|
|
109
|
+
|
|
110
|
+
# Clean search strings by removing QDF and boost operators
|
|
111
|
+
search_strings = [clean_search_string(s) for s in search_strings]
|
|
112
|
+
|
|
113
|
+
###
|
|
114
|
+
# 2. Search for context in the Vector DB
|
|
115
|
+
###
|
|
116
|
+
chat_only = await self.is_chat_only(**kwargs)
|
|
117
|
+
|
|
118
|
+
"""
|
|
119
|
+
Handle the fact that metadata can exclude uploaded content
|
|
120
|
+
and that the search service is hardcoded to use the metadata_filter
|
|
121
|
+
from the event if set to None
|
|
122
|
+
"""
|
|
123
|
+
# Take a backup of the metadata filter
|
|
124
|
+
metadata_filter_copy = self.content_service._metadata_filter
|
|
125
|
+
|
|
126
|
+
if metadata_filter is None:
|
|
127
|
+
metadata_filter = self.content_service._metadata_filter
|
|
128
|
+
if chat_only and metadata_filter:
|
|
129
|
+
# if this is not set to none search_content_chunks_async will overwrite it inside its call thats why it needs to stay.
|
|
130
|
+
self.content_service._metadata_filter = None
|
|
131
|
+
metadata_filter = None
|
|
132
|
+
|
|
133
|
+
found_chunks_per_search_string: list[SearchStringResult] = []
|
|
134
|
+
for i, search_string in enumerate(search_strings):
|
|
135
|
+
try:
|
|
136
|
+
found_chunks: list[
|
|
137
|
+
ContentChunk
|
|
138
|
+
] = await self.content_service.search_content_chunks_async(
|
|
139
|
+
search_string=search_string, # type: ignore
|
|
140
|
+
search_type=self.config.search_type,
|
|
141
|
+
limit=self.config.limit,
|
|
142
|
+
reranker_config=self.config.reranker_config,
|
|
143
|
+
search_language=self.config.search_language,
|
|
144
|
+
scope_ids=self.config.scope_ids,
|
|
145
|
+
metadata_filter=metadata_filter,
|
|
146
|
+
chat_id=self.chat_id
|
|
147
|
+
if self.config.exclude_uploaded_files and self.chat_id
|
|
148
|
+
else "",
|
|
149
|
+
chat_only=chat_only,
|
|
150
|
+
content_ids=content_ids,
|
|
151
|
+
score_threshold=self.config.score_threshold,
|
|
152
|
+
)
|
|
153
|
+
self.logger.info(
|
|
154
|
+
f"Found {len(found_chunks)} chunks (Query {i + 1}/{len(search_strings)})"
|
|
155
|
+
)
|
|
156
|
+
except Exception as e:
|
|
157
|
+
self.logger.error(f"Error in search_document_chunks call: {e}")
|
|
158
|
+
raise e
|
|
159
|
+
|
|
160
|
+
found_chunks_per_search_string.append(
|
|
161
|
+
SearchStringResult(
|
|
162
|
+
query=search_string,
|
|
163
|
+
chunks=found_chunks,
|
|
164
|
+
)
|
|
165
|
+
)
|
|
166
|
+
|
|
167
|
+
# Reset the metadata filter in case it was disabled
|
|
168
|
+
self.content_service._metadata_filter = metadata_filter_copy
|
|
169
|
+
|
|
170
|
+
# Apply chunk relevancy sorter if enabled
|
|
171
|
+
if self.config.chunk_relevancy_sort_config.enabled:
|
|
172
|
+
for i, result in enumerate(found_chunks_per_search_string):
|
|
173
|
+
await self.post_progress_message(
|
|
174
|
+
f"{result.query} (_Resorting {len(result.chunks)} search results_ 🔄 in query {i + 1}/{len(search_strings)})",
|
|
175
|
+
**kwargs,
|
|
176
|
+
)
|
|
177
|
+
result.chunks = await self._resort_found_chunks_if_enabled(
|
|
178
|
+
found_chunks=result.chunks,
|
|
179
|
+
search_string=result.query,
|
|
180
|
+
)
|
|
181
|
+
|
|
182
|
+
###
|
|
183
|
+
# 3. Pick a subset of the search results
|
|
184
|
+
###
|
|
185
|
+
if (
|
|
186
|
+
self.config.experimental_features.enable_multiple_search_strings_execution
|
|
187
|
+
and len(found_chunks_per_search_string) > 1
|
|
188
|
+
):
|
|
189
|
+
found_chunks_per_search_string = interleave_search_results_round_robin(
|
|
190
|
+
found_chunks_per_search_string
|
|
191
|
+
)
|
|
192
|
+
|
|
193
|
+
await self.post_progress_message(
|
|
194
|
+
f"{', '.join(search_strings)} (_Postprocessing search results_)",
|
|
195
|
+
**kwargs,
|
|
196
|
+
)
|
|
197
|
+
found_chunks = [
|
|
198
|
+
chunk
|
|
199
|
+
for result in found_chunks_per_search_string
|
|
200
|
+
for chunk in result.chunks
|
|
201
|
+
]
|
|
202
|
+
selected_chunks = pick_content_chunks_for_token_window(
|
|
203
|
+
found_chunks, self._get_max_tokens()
|
|
204
|
+
)
|
|
205
|
+
|
|
206
|
+
###
|
|
207
|
+
# 4. cache them add index to search results & join them together
|
|
208
|
+
###
|
|
209
|
+
if not self.config.chunked_sources:
|
|
210
|
+
selected_chunks = merge_content_chunks(selected_chunks)
|
|
211
|
+
else:
|
|
212
|
+
selected_chunks = sort_content_chunks(selected_chunks)
|
|
213
|
+
|
|
214
|
+
self.debug_info = {
|
|
215
|
+
"searchStrings": search_strings,
|
|
216
|
+
"metadataFilter": metadata_filter,
|
|
217
|
+
"chatOnly": chat_only,
|
|
218
|
+
}
|
|
219
|
+
return selected_chunks
|
|
220
|
+
|
|
221
|
+
async def _resort_found_chunks_if_enabled(
    self, found_chunks: list[ContentChunk], search_string: str
) -> list[ContentChunk]:
    """Re-rank ``found_chunks`` by relevancy to ``search_string``.

    Delegates to the configured chunk relevancy sorter. If the sorter raises
    its domain exception, the original chunk order is returned unchanged so a
    sorting failure never breaks the search itself.

    Args:
        found_chunks: Chunks returned by the search backend.
        search_string: The query the chunks were retrieved for.

    Returns:
        The re-sorted chunks, or the original list when sorting fails.
    """
    try:
        total_chunks = len(found_chunks)
        self.logger.info(f"Resorting {total_chunks} search result...")
        chunk_relevancy_sorter_result = await self.chunk_relevancy_sorter.run(
            input_text=search_string,
            chunks=found_chunks,
            config=self.config.chunk_relevancy_sort_config,
        )
        return chunk_relevancy_sorter_result.content_chunks
    except ChunkRelevancySorterException as e:
        # Fall back to the unsorted chunks on a known sorter failure.
        # NOTE: previously this used `finally: return found_chunks`, which
        # silently swallowed *every* exception (including unexpected ones);
        # unexpected errors now propagate to the caller.
        self.logger.warning(f"Error while sorting chunks: {e.error_message}")
        return found_chunks
|
|
237
|
+
|
|
238
|
+
def _get_max_tokens(self) -> int:
|
|
239
|
+
if self.config.language_model_max_input_tokens is not None:
|
|
240
|
+
max_tokens = int(
|
|
241
|
+
self.config.language_model_max_input_tokens
|
|
242
|
+
* self.config.percentage_of_input_tokens_for_sources
|
|
243
|
+
)
|
|
244
|
+
self.logger.debug(
|
|
245
|
+
"Using %s of max tokens %s as token limit: %s",
|
|
246
|
+
self.config.percentage_of_input_tokens_for_sources,
|
|
247
|
+
self.config.language_model_max_input_tokens,
|
|
248
|
+
max_tokens,
|
|
249
|
+
)
|
|
250
|
+
return max_tokens
|
|
251
|
+
else:
|
|
252
|
+
self.logger.debug(
|
|
253
|
+
"language model input context size is not set, using default max tokens"
|
|
254
|
+
)
|
|
255
|
+
return self.config.max_tokens_for_sources
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
class InternalSearchTool(Tool[InternalSearchConfig], InternalSearchService):
    """Tool wrapper exposing ``InternalSearchService`` as an agent tool.

    Wires the service's dependencies (content service, chunk relevancy
    sorter, chat id) from the incoming event and adapts the tool-call
    protocol (argument parsing, progress reporting, response building).
    """

    name = "InternalSearch"

    def __init__(
        self,
        configuration: InternalSearchConfig,
        event: BaseEvent,
        *args,
        **kwargs,
    ):
        Tool.__init__(self, configuration, event, *args, **kwargs)

        content_service = ContentService.from_event(self.event)
        chunk_relevancy_sorter = ChunkRelevancySorter.from_event(self.event)
        # Determine chat_id if possible; non-chat events carry no chat context.
        if isinstance(self.event, (ChatEvent, Event)):
            chat_id = self.event.payload.chat_id
        else:
            chat_id = None
        InternalSearchService.__init__(
            self,
            config=configuration,
            content_service=content_service,
            chunk_relevancy_sorter=chunk_relevancy_sorter,
            chat_id=chat_id,
            logger=self.logger,
        )

    async def post_progress_message(
        self,
        message: str,
        tool_call: LanguageModelFunction,
        state: ProgressState = ProgressState.RUNNING,
        **kwargs,
    ):
        """Report tool progress to the UI, if a progress reporter is configured.

        Args:
            message: Progress text shown to the user.
            tool_call: The tool call this progress belongs to.
            state: Progress state to report; defaults to RUNNING so existing
                callers are unaffected.
        """
        if self.tool_progress_reporter:
            await self.tool_progress_reporter.notify_from_tool_call(
                tool_call=tool_call,
                name=f"**{self.tool_execution_message_name}**",
                message=message,
                state=state,
            )

    async def is_chat_only(
        self, tool_call: LanguageModelFunction | None = None, **kwargs
    ) -> bool:
        """Return True when the search must be restricted to chat uploads.

        Either the base configuration forces chat-only mode, or the model
        explicitly requested it via the ``chat_only`` tool-call argument.
        """
        if await super().is_chat_only(**kwargs):
            return True
        if (
            tool_call
            and isinstance(tool_call.arguments, dict)
            and tool_call.arguments.get("chat_only") is True
        ):
            return True
        return False

    @override
    def tool_description(self) -> LanguageModelToolDescription:
        """Build the tool's parameter schema for the language model."""
        # Conditionally set the type based on config: with multi-search
        # enabled the model may pass a list of search strings.
        search_string_type = (
            list[str]
            if self.config.experimental_features.enable_multiple_search_strings_execution
            else str
        )

        internal_search_tool_input = create_model(
            "InternalSearchToolInput",
            search_string=(
                search_string_type,
                Field(description=self.config.param_description_search_string),
            ),
            language=(
                str,
                Field(description=self.config.param_description_language),
            ),
        )
        return LanguageModelToolDescription(
            name=self.name,
            description=self.config.tool_description,
            parameters=internal_search_tool_input,
        )

    def tool_description_for_system_prompt(self) -> str:
        """Return the tool description injected into the system prompt."""
        return self.config.tool_description_for_system_prompt

    def tool_format_information_for_system_prompt(self) -> str:
        """Return the citation/formatting instructions for the system prompt."""
        return self.config.tool_format_information_for_system_prompt

    def evaluation_check_list(self) -> list[EvaluationMetricName]:
        """Return the configured evaluation metrics for this tool."""
        return self.config.evaluation_check_list

    def get_evaluation_checks_based_on_tool_response(
        self, tool_response: ToolCallResponse
    ) -> list[EvaluationMetricName]:
        """Return evaluation metrics to run; none when the search found nothing."""
        evaluation_check_list = self.evaluation_check_list()

        # Check if the tool response is empty
        if not tool_response.content_chunks:
            return []
        return evaluation_check_list

    # TODO: find a solution for tracking
    # @track(name="internal_search_tool_run")
    async def run(self, tool_call: LanguageModelFunction) -> ToolCallResponse:
        """
        Perform a search in the Vector DB based on the user's message and generate a response.
        """
        if (
            tool_call.arguments is None
            or not isinstance(tool_call.arguments, dict)
            or (
                "search_strings" not in tool_call.arguments
                and "search_string"
                not in tool_call.arguments  # Backwards compatibility
            )
        ):
            self.logger.error("Tool call arguments are missing or invalid")
            return ToolCallResponse(
                id=tool_call.id,  # type: ignore
                name=self.name,
                content_chunks=[],
                debug_info={},
            )

        # Extract the search strings (handle both new and old parameter names)
        search_strings_data = tool_call.arguments.get(
            "search_strings", tool_call.arguments.get("search_string")
        )
        # Ensure it's always a list for the progress message
        search_strings_list: list[str] = []
        if isinstance(search_strings_data, str):
            search_strings_list = [search_strings_data]
        elif isinstance(search_strings_data, list):
            search_strings_list = search_strings_data
        else:
            raise ValueError("Invalid search strings data")

        await self.post_progress_message(f"{'; '.join(search_strings_list)}", tool_call)

        selected_chunks = await self.search(
            **tool_call.arguments,
            tool_call=tool_call,  # Need to pass tool_call to post_progress_message
        )

        ## Modify metadata in chunks
        selected_chunks = append_metadata_in_chunks(
            chunks=selected_chunks,
            metadata_chunk_sections=self.config.metadata_chunk_sections,
        )

        tool_response = ToolCallResponse(
            id=tool_call.id,  # type: ignore
            name=self.name,
            content_chunks=selected_chunks,
            debug_info=self.debug_info,
        )

        # Reuse the progress helper instead of duplicating the notify call.
        await self.post_progress_message(
            f"{'; '.join(search_strings_list)}",
            tool_call,
            state=ProgressState.FINISHED,
        )

        return tool_response

    ## Note: This function is only used by the Investment Research Agent and Agentic Search. Once these agents are moved out of the monorepo, this function should be removed.
    def get_tool_call_result_for_loop_history(
        self,
        tool_response: ToolCallResponse,
        agent_chunks_handler: AgentChunksHandler,
    ) -> LanguageModelMessage:
        """
        Process the results of the tool.

        Args:
            tool_response: The tool response.
            agent_chunks_handler: Holds the chunks already present in the loop
                history; its length is used to continue the source numbering.

        Returns:
            The tool result to append to the loop history.
        """
        self.logger.debug(
            f"Appending tool call result to history: {tool_response.name}"
        )
        # Initialize content_chunks if None
        content_chunks = tool_response.content_chunks or []

        # Get the maximum source number in the loop history
        max_source_number = len(agent_chunks_handler.chunks)

        # Transform content chunks into sources to be appended to tool result
        sources, _ = transform_chunks_to_string(
            content_chunks,
            max_source_number,
        )

        # Append the result to the history
        return LanguageModelToolMessage(
            content=sources,
            tool_call_id=tool_response.id,  # type: ignore
            name=tool_response.name,
        )
|
|
456
|
+
|
|
457
|
+
|
|
458
|
+
# Make the tool discoverable by name through the toolkit's factory registry.
ToolFactory.register_tool(InternalSearchTool, InternalSearchConfig)
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from unique_internal_search.config import InternalSearchConfig
|
|
2
|
+
from unique_internal_search.uploaded_search.prompts import (
|
|
3
|
+
DEFAULT_LANGUAGE_PARAM_DESCRIPTION,
|
|
4
|
+
DEFAULT_SEARCH_STRING_PARAM_DESCRIPTION,
|
|
5
|
+
DEFAULT_TOOL_DESCRIPTION_FOR_SYSTEM_PROMPT,
|
|
6
|
+
DEFAULT_TOOL_FORMAT_INFORMATION_FOR_SYSTEM_PROMPT,
|
|
7
|
+
)
|
|
8
|
+
from unique_internal_search.validators import get_string_field_with_pattern_validation
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class UploadedSearchConfig(InternalSearchConfig):
    """Configuration for the UploadedSearch tool.

    Inherits all settings from ``InternalSearchConfig`` and overrides the
    prompt-facing defaults with uploaded-document-specific wording. Each field
    is built via ``get_string_field_with_pattern_validation`` so that any
    override must keep the placeholders present in the default template.
    """

    # Description of the `search_string` tool parameter shown to the model.
    param_description_search_string: str = get_string_field_with_pattern_validation(
        DEFAULT_SEARCH_STRING_PARAM_DESCRIPTION,
        description="`search_string` parameter description.",
    )
    # Description of the `language` tool parameter shown to the model.
    param_description_language: str = get_string_field_with_pattern_validation(
        DEFAULT_LANGUAGE_PARAM_DESCRIPTION,
        description="`language` parameter description.",
    )
    # Tool description injected into the system prompt.
    tool_description_for_system_prompt: str = get_string_field_with_pattern_validation(
        DEFAULT_TOOL_DESCRIPTION_FOR_SYSTEM_PROMPT,
        description="Tool description for the system prompt.",
    )
    # Citation/formatting instructions injected into the system prompt.
    tool_format_information_for_system_prompt: str = (
        get_string_field_with_pattern_validation(
            DEFAULT_TOOL_FORMAT_INFORMATION_FOR_SYSTEM_PROMPT,
            description="Tool format information for the system prompt.",
        )
    )
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# System-prompt description of the UploadedSearch tool. The caller appends the
# list of currently uploaded documents after the trailing header line.
DEFAULT_TOOL_DESCRIPTION_FOR_SYSTEM_PROMPT = (
    "You can use the UploadedSearch tool to access and analyze documents uploaded by users during a chat. This tool is designed to handle a variety of document-related tasks, including summarization, explanation, and detailed information retrieval. "
    "Use cases for the UploadedSearch tool include:\n"
    "- Document Analysis: When a user uploads a document and asks for a summary, explanation, or specific details, this tool can extract and provide the requested information.\n"
    "- Named Document Queries: If a user refers to a previously uploaded document by name (e.g., 'What does the Q2_Report.pdf say about revenue?'), this tool can locate and analyze the document to answer the query.\n"
    "- Policy and Procedure Verification: Use the tool to find the most current company policies, procedures, or guidelines within uploaded documents.\n"
    "- Project-Specific Information: Access project documents, reports, or meeting notes uploaded by users to provide precise details.\n"
    "- Confidential and Proprietary Information: Ensure that sensitive topics requiring proprietary knowledge or confidential data are sourced securely from uploaded documents.\n\n"
    "**Instruction Query Splitting**\n"
    "You should split the user question into multiple search strings when the user's question needs to be decomposed / rewritten to find different facts. Perform an individual tool call for each search string. Avoid overly broad queries that may return unrelated results. Ensure the search string is specific and relevant to the uploaded document(s).\n\n"
    "Examples:\n"
    'User: "What does the Q2_Report.pdf say about revenue and expenses?" => search strings: ["Q2_Report.pdf revenue", "Q2_Report.pdf expenses"]\n'
    'User: "Summarize the uploaded document." => search string: ["Summarize the uploaded document"]\n'
    "**The currently uploaded documents are the following**\n\n"
)

# Citation-format instructions injected into the system prompt whenever the
# UploadedSearch tool is available.
# NOTE(review): several example sentences below appear to be missing literal
# [sourceN] tags (e.g. "was $5M ," and "If you see and in the assistant
# message"); this looks like an extraction artifact — verify against the
# original source before relying on these examples.
DEFAULT_TOOL_FORMAT_INFORMATION_FOR_SYSTEM_PROMPT = (
    "Whenever you use information retrieved with the UploadedSearch, you must adhere to strict reference guidelines. "
    "You must strictly reference each fact used with the `source_number` of the corresponding passage, in the following format: '[source<source_number>]'.\n\n"
    "Example:\n"
    "- The revenue for Q2 was $5M , while expenses were $3M .\n"
    "- The uploaded document highlights a 20% increase in productivity .\n\n"
    "A fact is preferably referenced by ONLY ONE source, e.g [sourceX], which should be the most relevant source for the fact.\n"
    "Follow these guidelines closely and be sure to use the proper `source_number` when referencing facts.\n"
    "Make sure that your reference follow the format [sourceX] and that the source number is correct.\n"
    "Source is written in singular form and the number is written in digits.\n\n"
    "IT IS VERY IMPORTANT TO FOLLOW THESE GUIDELINES!!\n"
    "NEVER CITE A source_number THAT YOU DON'T SEE IN THE TOOL CALL RESPONSE!!!\n"
    "The source_number in old assistant messages are no longer valid.\n"
    "EXAMPLE: If you see and in the assistant message, you can't use again in the next assistant message, this has to be the number you find in the message with role 'tool'.\n"
    "BE AWARE: All tool calls have been filtered to remove uncited sources. Tool calls return much more data than you see.\n\n"
    "### Internal Document Answering Protocol for Uploaded Documents\n"
    "When assisting users with uploaded documents, follow\n"
    "this structured approach to ensure precise, well-grounded,\n"
    "and context-aware responses:\n\n"
    "#### 1. Locate and Prioritize Relevant Information\n"
    "Focus on the **most relevant sections** of the uploaded document.\n"
    "Prioritize documents that are:\n"
    "- **Directly referenced by the user** (e.g., by name or context).\n"
    "- **Recently uploaded** or actively discussed.\n\n"
    "#### 2. Source Reliability Guidelines\n"
    "- Prioritize information that is:\n"
    "  - **Clearly stated in the document**.\n"
    "  - **Part of finalized or approved sections**.\n"
    "- Be cautious with:\n"
    "  - Drafts or incomplete sections.\n"
    "  - Ambiguous or conflicting information.\n\n"
    "#### 3. Acknowledge Limitations\n"
    "- If no relevant information is found, or the document is unclear, state this explicitly.\n"
    "- Indicate where further clarification or investigation may be required."
)

# Short tool description shown to the model in the tool schema.
DEFAULT_TOOL_DESCRIPTION = "Search within uploaded documents for information on policies, procedures, benefits, projects, or specific details. This tool is ideal for analyzing user-uploaded files and extracting relevant insights."
# Description of the `search_string` tool parameter.
DEFAULT_SEARCH_STRING_PARAM_DESCRIPTION = "An expanded term optimized for vector and full-text search based on the user’s query. It must be in English."
# Description of the `language` tool parameter.
DEFAULT_LANGUAGE_PARAM_DESCRIPTION = (
    "The language in which the user’s query is written."
)
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
from pydantic import Field, create_model
|
|
2
|
+
from typing_extensions import override
|
|
3
|
+
from unique_toolkit import ContentService
|
|
4
|
+
from unique_toolkit.agentic.evaluation.schemas import EvaluationMetricName
|
|
5
|
+
from unique_toolkit.agentic.tools.factory import ToolFactory
|
|
6
|
+
from unique_toolkit.agentic.tools.schemas import ToolCallResponse
|
|
7
|
+
from unique_toolkit.agentic.tools.tool import Tool
|
|
8
|
+
from unique_toolkit.agentic.tools.tool_progress_reporter import (
|
|
9
|
+
ProgressState,
|
|
10
|
+
ToolProgressReporter,
|
|
11
|
+
)
|
|
12
|
+
from unique_toolkit.app.schemas import BaseEvent, ChatEvent
|
|
13
|
+
from unique_toolkit.chat.service import LanguageModelToolDescription
|
|
14
|
+
from unique_toolkit.language_model.schemas import (
|
|
15
|
+
LanguageModelFunction,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
from unique_internal_search.service import InternalSearchTool
|
|
19
|
+
from unique_internal_search.uploaded_search.config import UploadedSearchConfig
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class UploadedSearchTool(Tool[UploadedSearchConfig]):
    """Search tool restricted to documents uploaded to the current chat.

    Delegates retrieval to an ``InternalSearchTool`` that is forced into
    chat-only mode, and decorates the response with a system reminder so the
    model treats the (often force-triggered) results with appropriate care.
    """

    name = "UploadedSearch"

    def __init__(
        self,
        config: UploadedSearchConfig,
        event: BaseEvent,
        tool_progress_reporter: ToolProgressReporter,
        *args,
        **kwargs,
    ):
        self._tool_progress_reporter = tool_progress_reporter
        self._content_service = ContentService.from_event(event)
        self._config = config
        # Restrict the delegated search to content uploaded to this chat.
        config.chat_only = True
        self._internal_search_tool = InternalSearchTool(
            config, event, None, *args, **kwargs
        )
        # Keep the original user message (chat events only) so the system
        # reminder can restate it after a forced tool call.
        if isinstance(event, ChatEvent):
            self._user_query = event.payload.user_message.text
        else:
            self._user_query = None

    async def post_progress_message(
        self, message: str, tool_call: LanguageModelFunction, **kwargs
    ):
        """Report tool progress to the UI, if a progress reporter is available."""
        if self._tool_progress_reporter:
            await self._tool_progress_reporter.notify_from_tool_call(
                tool_call=tool_call,
                name="**Search Uploaded Document**",
                message=message,
                state=ProgressState.RUNNING,
            )

    @override
    def tool_description(self) -> LanguageModelToolDescription:
        """Build the tool's parameter schema for the language model."""
        internal_search_tool_input = create_model(
            "InternalSearchToolInput",
            search_string=(
                str,
                Field(description=self._config.param_description_search_string),
            ),
            language=(
                str,
                Field(description=self._config.param_description_language),
            ),
        )
        return LanguageModelToolDescription(
            name=self.name,
            description=self._config.tool_description,
            parameters=internal_search_tool_input,
        )

    def tool_description_for_system_prompt(self) -> str:
        """Return the system-prompt description including the uploaded document list."""
        documents = self._content_service.get_documents_uploaded_to_chat()
        # One bullet per document; without the newline separator the titles
        # would all run together on a single line.
        list_all_documents = "\n".join(
            f"- {doc.title or doc.key}" for doc in documents
        )
        return self._config.tool_description_for_system_prompt + list_all_documents

    def tool_format_information_for_system_prompt(self) -> str:
        """Return the citation/formatting instructions for the system prompt."""
        return self._config.tool_format_information_for_system_prompt

    def evaluation_check_list(self) -> list[EvaluationMetricName]:
        """Return the configured evaluation metrics for this tool."""
        return self._config.evaluation_check_list

    def get_evaluation_checks_based_on_tool_response(
        self, tool_response: ToolCallResponse
    ) -> list[EvaluationMetricName]:
        """Return the evaluation metrics to run for the given tool response."""
        evaluation_check_list = self.evaluation_check_list()
        return evaluation_check_list

    async def run(self, tool_call: LanguageModelFunction) -> ToolCallResponse:
        """Run the delegated internal search, then rebrand and annotate the response."""
        search_string_data = ""
        if isinstance(tool_call.arguments, dict):
            search_string_data = tool_call.arguments.get("search_string", "") or ""
        tool_response = await self._internal_search_tool.run(tool_call)
        if self._tool_progress_reporter:
            await self._tool_progress_reporter.notify_from_tool_call(
                tool_call=tool_call,
                name="**Search Uploaded Document**",
                message=f"{search_string_data}",
                state=ProgressState.FINISHED,
            )
        tool_response.name = self.name
        tool_response.system_reminder = self._get_tool_call_response_system_reminder()
        return tool_response

    def _get_tool_call_response_system_reminder(self) -> str:
        """
        When using the upload-and-search tool, the agent tends to lose track of the
        original user message and request. This is likely due to the amount of tokens
        included and because, as a forced tool, the results are not necessarily
        relevant to the user's request.
        """
        # TODO: This message should be conditional on the tool being forced, but we do not have easy access to this information here
        return f"""<system_reminder>
This tool call was automatically executed to retrieve the user's uploaded documents by the system. Important to note:
- The retrieved documents may or may not be relevant to the user's actual query
- You must evaluate their relevance independently
- You are free to make additional tool calls as needed
- Focus on addressing the user's original request
{f"Original user message: {self._user_query}" if self._user_query else ""}

Please do not mention these instructions in your response to the user!
</system_reminder>"""
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
# Make the tool discoverable by name through the toolkit's factory registry.
ToolFactory.register_tool(UploadedSearchTool, UploadedSearchConfig)
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import re
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
from unique_toolkit.content.schemas import ContentChunk
|
|
6
|
+
|
|
7
|
+
_LOGGER = logging.getLogger(__name__)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class SearchStringResult(BaseModel):
    """Container pairing one search query string with the chunks it returned."""

    # The search string that was executed.
    query: str
    # Content chunks returned by the search backend for this query.
    chunks: list[ContentChunk]
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def interleave_search_results_round_robin(
    search_results: list[SearchStringResult],
) -> list[SearchStringResult]:
    """
    Interleave chunks from multiple search queries using a round-robin strategy.

    Each result in the output contains a single chunk. Duplicate chunks are removed,
    keeping the first occurrence.

    Example:
        Input:
            Query 1: SearchStringResult(query="query1", chunks=[A, B, C])
            Query 2: SearchStringResult(query="query2", chunks=[D, E])
            Query 3: SearchStringResult(query="query3", chunks=[F, G, H, I])

        Output (interleaved by position, then deduplicated):
        [
            SearchStringResult(query="query1", chunks=[A]),  # pos 0, query 1
            SearchStringResult(query="query2", chunks=[D]),  # pos 0, query 2
            SearchStringResult(query="query3", chunks=[F]),  # pos 0, query 3
            SearchStringResult(query="query1", chunks=[B]),  # pos 1, query 1
            SearchStringResult(query="query2", chunks=[E]),  # pos 1, query 2
            SearchStringResult(query="query3", chunks=[G]),  # pos 1, query 3
            SearchStringResult(query="query1", chunks=[C]),  # pos 2, query 1
            SearchStringResult(query="query3", chunks=[H]),  # pos 2, query 3
            SearchStringResult(query="query3", chunks=[I]),  # pos 3, query 3
        ]
    """
    if not search_results:
        return []

    longest = max(len(result.chunks) for result in search_results)
    interleaved: list[SearchStringResult] = []
    # Walk position-by-position across all queries so every query contributes
    # its i-th chunk before any query contributes its (i+1)-th chunk.
    for position in range(longest):
        for result in search_results:
            if position < len(result.chunks):
                interleaved.append(
                    SearchStringResult(
                        query=result.query, chunks=[result.chunks[position]]
                    )
                )

    return _deduplicate_search_results(interleaved)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _deduplicate_search_results(
    search_results: list[SearchStringResult],
) -> list[SearchStringResult]:
    """
    Remove duplicate chunks from the search results based on their `chunk_id`.

    This function preserves the order of occurrences, keeping the first occurrence
    of each unique `chunk_id`. If a chunk has no `chunk_id`, it will be ignored.
    Duplicate chunks share the same `chunk_id`.

    Args:
        search_results (list[SearchStringResult]): A list of search results, where each
            result contains chunks with potential duplicate `chunk_id`s.

    Returns:
        list[SearchStringResult]: A deduplicated list of search results with unique `chunk_id` chunks.
    """
    seen_chunk_ids: set[str] = set()
    deduplicated_search_results: list[SearchStringResult] = []

    counter_chunks = 0
    for result in search_results:
        for chunk in result.chunks:
            if not chunk.chunk_id:
                # Chunks without an id cannot be deduplicated; drop them.
                continue
            # Count every id-bearing chunk we inspect. Previously this was
            # incremented only for *unique* chunks, so the duplicate count
            # below was always zero and the log line never fired.
            counter_chunks += 1
            if chunk.chunk_id not in seen_chunk_ids:
                seen_chunk_ids.add(chunk.chunk_id)
                deduplicated_search_results.append(
                    SearchStringResult(query=result.query, chunks=[chunk])
                )

    if removed := counter_chunks - len(deduplicated_search_results):
        _LOGGER.info(
            f"Removed {removed} duplicate chunks ({len(deduplicated_search_results)}/{counter_chunks} unique)"
        )

    return deduplicated_search_results
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def append_metadata_in_chunks(
    chunks: list[ContentChunk],
    metadata_chunk_sections: dict[str, str] | None = None,
) -> list[ContentChunk]:
    """
    Append metadata to chunks.

    Args:
        chunks: List of ContentChunk objects
        metadata_chunk_sections: Dictionary of metadata sections to add to the chunk text
    Returns:
        List of ContentChunk objects with metadata appended
    """
    if metadata_chunk_sections is None:
        return chunks
    for chunk in chunks:
        # Chunks without metadata have nothing to prepend.
        if chunk.metadata is not None:
            _append_metadata_in_chunk(
                chunk=chunk, metadata_chunk_sections=metadata_chunk_sections
            )
    return chunks
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _append_metadata_in_chunk(
    chunk: ContentChunk, metadata_chunk_sections: dict[str, str]
) -> ContentChunk:
    """
    Format chunk text by prepending metadata according to sections config.

    Args:
        chunk: ContentChunk object
        metadata_chunk_sections: Dictionary of metadata sections to add to the chunk text
    Returns:
        Formatted text with metadata prepended
    """
    meta_dict = chunk.metadata.model_dump(exclude_none=True, by_alias=True)

    # Render each configured section whose key is present in the metadata.
    parts: list[str] = [
        template.format(meta_dict[key])
        for key, template in metadata_chunk_sections.items()
        if key in meta_dict
    ]

    # Combine metadata parts with the main text
    if parts:
        chunk.text = "\n".join(parts) + "\n" + chunk.text

    return chunk
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def clean_search_string(search_string: str) -> str:
    """
    Remove QDF (QueryDeservedFreshness) and boost operators from search string.

    Examples:
        '+(GPT4) performance on +(MMLU) benchmark --QDF=1'
        -> 'GPT4 performance on MMLU benchmark'

        'Best practices for +(security) and +(privacy) for +(cloud storage) --QDF=2'
        -> 'Best practices for security and privacy for cloud storage'

    Args:
        search_string: Raw search string that may contain operators

    Returns:
        Cleaned search string without operators
    """
    # Strip a trailing --QDF=<number> operator, then unwrap +(...) boost
    # operators down to their inner content.
    without_qdf = re.sub(r"\s*--QDF=\d+\s*$", "", search_string)
    without_boosts = re.sub(r"\+\(([^)]+)\)", r"\1", without_qdf)

    # Collapse runs of whitespace into single spaces.
    normalized = " ".join(without_boosts.split())
    return normalized.strip()
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import enum
|
|
2
|
+
import re
|
|
3
|
+
from string import Template
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from pydantic import (
|
|
7
|
+
Field,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class PromptTemplatingEngine(enum.Enum):
    """Supported engines for prompt placeholder syntax."""

    # Python's built-in ``string.Template`` ``$name`` / ``${name}`` syntax.
    STRING_TEMPLATE = enum.auto()


def check_placeholder_valid(
    placeholder: str,
    templating_engine: PromptTemplatingEngine = PromptTemplatingEngine.STRING_TEMPLATE,
) -> bool:
    """Return True when ``placeholder`` is a valid identifier for the engine."""
    if templating_engine is PromptTemplatingEngine.STRING_TEMPLATE:
        # ``Template.idpattern`` is lowercase-only; IGNORECASE extends the
        # check to uppercase identifiers as well.
        return (
            re.fullmatch(Template.idpattern, placeholder, re.IGNORECASE) is not None
        )
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def get_prompt_placeholder_regexp(
    *placeholders: str,
    templating_engine: PromptTemplatingEngine = PromptTemplatingEngine.STRING_TEMPLATE,
) -> re.Pattern:
    """Build a regexp that matches only text containing every placeholder.

    Each placeholder contributes a lookahead that accepts either the
    ``${name}`` or the ``$name`` spelling; joining the lookaheads requires
    all placeholders to be present, in any order.

    Raises:
        ValueError: If any placeholder is not a valid identifier.
    """
    for candidate in placeholders:
        if not check_placeholder_valid(candidate, templating_engine):
            raise ValueError(f"Invalid placeholder: {candidate}")

    if templating_engine is PromptTemplatingEngine.STRING_TEMPLATE:
        lookaheads = (rf"(?=.*(?:\$\{{{p}\}}|\${p}))" for p in placeholders)
        return re.compile("".join(lookaheads), re.DOTALL)
    # We will add other templating engines here, such as Jinja2.
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def get_prompt_placeholder_regexp_from_text(
    text: str,
    templating_engine: PromptTemplatingEngine = PromptTemplatingEngine.STRING_TEMPLATE,
) -> re.Pattern:
    """Extract the placeholders used in ``text`` and build the matching regexp."""
    if templating_engine is PromptTemplatingEngine.STRING_TEMPLATE:
        identifiers = Template(text).get_identifiers()
        return get_prompt_placeholder_regexp(
            *identifiers,
            templating_engine=templating_engine,
        )
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def get_string_field_with_pattern_validation(
    prompt_template: str,
    templating_engine: PromptTemplatingEngine = PromptTemplatingEngine.STRING_TEMPLATE,
    **kwargs,
) -> Any:
    """Create a Pydantic Field with validation for prompt template placeholders.

    Args:
        prompt_template: The prompt template string containing placeholders.
        templating_engine: The engine used for template processing. Defaults to STRING_TEMPLATE.
        **kwargs: Additional keyword arguments to pass to pydantic.Field.
            Note that `default` (and, when the template has placeholders,
            `pattern`) will be overwritten if present.

    Returns:
        pydantic.FieldInfo: A FieldInfo instance with the default value and placeholder validation pattern.

    Example:
        class ServiceConfig(BaseModel):
            prompt: str = get_string_field_with_pattern_validation(
                "Hello ${name}!"
            )  # Creates a Field with pattern validation for the "name" placeholder
    """
    pattern = get_prompt_placeholder_regexp_from_text(
        prompt_template, templating_engine
    )
    # Only attach a pattern when the template actually contains placeholders;
    # an empty pattern would add no constraint.
    if pattern.pattern:
        kwargs["pattern"] = pattern

    kwargs["default"] = prompt_template

    return Field(**kwargs)
|