alita-sdk 0.3.257__py3-none-any.whl → 0.3.584__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of alita-sdk might be problematic.
- alita_sdk/cli/__init__.py +10 -0
- alita_sdk/cli/__main__.py +17 -0
- alita_sdk/cli/agent/__init__.py +5 -0
- alita_sdk/cli/agent/default.py +258 -0
- alita_sdk/cli/agent_executor.py +155 -0
- alita_sdk/cli/agent_loader.py +215 -0
- alita_sdk/cli/agent_ui.py +228 -0
- alita_sdk/cli/agents.py +3794 -0
- alita_sdk/cli/callbacks.py +647 -0
- alita_sdk/cli/cli.py +168 -0
- alita_sdk/cli/config.py +306 -0
- alita_sdk/cli/context/__init__.py +30 -0
- alita_sdk/cli/context/cleanup.py +198 -0
- alita_sdk/cli/context/manager.py +731 -0
- alita_sdk/cli/context/message.py +285 -0
- alita_sdk/cli/context/strategies.py +289 -0
- alita_sdk/cli/context/token_estimation.py +127 -0
- alita_sdk/cli/formatting.py +182 -0
- alita_sdk/cli/input_handler.py +419 -0
- alita_sdk/cli/inventory.py +1073 -0
- alita_sdk/cli/mcp_loader.py +315 -0
- alita_sdk/cli/toolkit.py +327 -0
- alita_sdk/cli/toolkit_loader.py +85 -0
- alita_sdk/cli/tools/__init__.py +43 -0
- alita_sdk/cli/tools/approval.py +224 -0
- alita_sdk/cli/tools/filesystem.py +1751 -0
- alita_sdk/cli/tools/planning.py +389 -0
- alita_sdk/cli/tools/terminal.py +414 -0
- alita_sdk/community/__init__.py +72 -12
- alita_sdk/community/inventory/__init__.py +236 -0
- alita_sdk/community/inventory/config.py +257 -0
- alita_sdk/community/inventory/enrichment.py +2137 -0
- alita_sdk/community/inventory/extractors.py +1469 -0
- alita_sdk/community/inventory/ingestion.py +3172 -0
- alita_sdk/community/inventory/knowledge_graph.py +1457 -0
- alita_sdk/community/inventory/parsers/__init__.py +218 -0
- alita_sdk/community/inventory/parsers/base.py +295 -0
- alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
- alita_sdk/community/inventory/parsers/go_parser.py +851 -0
- alita_sdk/community/inventory/parsers/html_parser.py +389 -0
- alita_sdk/community/inventory/parsers/java_parser.py +593 -0
- alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
- alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
- alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
- alita_sdk/community/inventory/parsers/python_parser.py +604 -0
- alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
- alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
- alita_sdk/community/inventory/parsers/text_parser.py +322 -0
- alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
- alita_sdk/community/inventory/patterns/__init__.py +61 -0
- alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
- alita_sdk/community/inventory/patterns/loader.py +348 -0
- alita_sdk/community/inventory/patterns/registry.py +198 -0
- alita_sdk/community/inventory/presets.py +535 -0
- alita_sdk/community/inventory/retrieval.py +1403 -0
- alita_sdk/community/inventory/toolkit.py +173 -0
- alita_sdk/community/inventory/toolkit_utils.py +176 -0
- alita_sdk/community/inventory/visualize.py +1370 -0
- alita_sdk/configurations/__init__.py +11 -0
- alita_sdk/configurations/ado.py +148 -2
- alita_sdk/configurations/azure_search.py +1 -1
- alita_sdk/configurations/bigquery.py +1 -1
- alita_sdk/configurations/bitbucket.py +94 -2
- alita_sdk/configurations/browser.py +18 -0
- alita_sdk/configurations/carrier.py +19 -0
- alita_sdk/configurations/confluence.py +130 -1
- alita_sdk/configurations/delta_lake.py +1 -1
- alita_sdk/configurations/figma.py +76 -5
- alita_sdk/configurations/github.py +65 -1
- alita_sdk/configurations/gitlab.py +81 -0
- alita_sdk/configurations/google_places.py +17 -0
- alita_sdk/configurations/jira.py +103 -0
- alita_sdk/configurations/openapi.py +323 -0
- alita_sdk/configurations/postman.py +1 -1
- alita_sdk/configurations/qtest.py +72 -3
- alita_sdk/configurations/report_portal.py +115 -0
- alita_sdk/configurations/salesforce.py +19 -0
- alita_sdk/configurations/service_now.py +1 -12
- alita_sdk/configurations/sharepoint.py +167 -0
- alita_sdk/configurations/sonar.py +18 -0
- alita_sdk/configurations/sql.py +20 -0
- alita_sdk/configurations/testio.py +101 -0
- alita_sdk/configurations/testrail.py +88 -0
- alita_sdk/configurations/xray.py +94 -1
- alita_sdk/configurations/zephyr_enterprise.py +94 -1
- alita_sdk/configurations/zephyr_essential.py +95 -0
- alita_sdk/runtime/clients/artifact.py +21 -4
- alita_sdk/runtime/clients/client.py +458 -67
- alita_sdk/runtime/clients/mcp_discovery.py +342 -0
- alita_sdk/runtime/clients/mcp_manager.py +262 -0
- alita_sdk/runtime/clients/sandbox_client.py +352 -0
- alita_sdk/runtime/langchain/_constants_bkup.py +1318 -0
- alita_sdk/runtime/langchain/assistant.py +183 -43
- alita_sdk/runtime/langchain/constants.py +647 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
- alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +209 -31
- alita_sdk/runtime/langchain/document_loaders/AlitaImageLoader.py +1 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLinesLoader.py +77 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +10 -3
- alita_sdk/runtime/langchain/document_loaders/AlitaMarkdownLoader.py +66 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaPDFLoader.py +79 -10
- alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +52 -15
- alita_sdk/runtime/langchain/document_loaders/AlitaPythonLoader.py +9 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaTableLoader.py +1 -4
- alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +15 -2
- alita_sdk/runtime/langchain/document_loaders/ImageParser.py +30 -0
- alita_sdk/runtime/langchain/document_loaders/constants.py +189 -41
- alita_sdk/runtime/langchain/interfaces/llm_processor.py +4 -2
- alita_sdk/runtime/langchain/langraph_agent.py +493 -105
- alita_sdk/runtime/langchain/utils.py +118 -8
- alita_sdk/runtime/llms/preloaded.py +2 -6
- alita_sdk/runtime/models/mcp_models.py +61 -0
- alita_sdk/runtime/skills/__init__.py +91 -0
- alita_sdk/runtime/skills/callbacks.py +498 -0
- alita_sdk/runtime/skills/discovery.py +540 -0
- alita_sdk/runtime/skills/executor.py +610 -0
- alita_sdk/runtime/skills/input_builder.py +371 -0
- alita_sdk/runtime/skills/models.py +330 -0
- alita_sdk/runtime/skills/registry.py +355 -0
- alita_sdk/runtime/skills/skill_runner.py +330 -0
- alita_sdk/runtime/toolkits/__init__.py +28 -0
- alita_sdk/runtime/toolkits/application.py +14 -4
- alita_sdk/runtime/toolkits/artifact.py +25 -9
- alita_sdk/runtime/toolkits/datasource.py +13 -6
- alita_sdk/runtime/toolkits/mcp.py +782 -0
- alita_sdk/runtime/toolkits/planning.py +178 -0
- alita_sdk/runtime/toolkits/skill_router.py +238 -0
- alita_sdk/runtime/toolkits/subgraph.py +11 -6
- alita_sdk/runtime/toolkits/tools.py +314 -70
- alita_sdk/runtime/toolkits/vectorstore.py +11 -5
- alita_sdk/runtime/tools/__init__.py +24 -0
- alita_sdk/runtime/tools/application.py +16 -4
- alita_sdk/runtime/tools/artifact.py +367 -33
- alita_sdk/runtime/tools/data_analysis.py +183 -0
- alita_sdk/runtime/tools/function.py +100 -4
- alita_sdk/runtime/tools/graph.py +81 -0
- alita_sdk/runtime/tools/image_generation.py +218 -0
- alita_sdk/runtime/tools/llm.py +1032 -177
- alita_sdk/runtime/tools/loop.py +3 -1
- alita_sdk/runtime/tools/loop_output.py +3 -1
- alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
- alita_sdk/runtime/tools/mcp_remote_tool.py +181 -0
- alita_sdk/runtime/tools/mcp_server_tool.py +3 -1
- alita_sdk/runtime/tools/planning/__init__.py +36 -0
- alita_sdk/runtime/tools/planning/models.py +246 -0
- alita_sdk/runtime/tools/planning/wrapper.py +607 -0
- alita_sdk/runtime/tools/router.py +2 -1
- alita_sdk/runtime/tools/sandbox.py +375 -0
- alita_sdk/runtime/tools/skill_router.py +776 -0
- alita_sdk/runtime/tools/tool.py +3 -1
- alita_sdk/runtime/tools/vectorstore.py +69 -65
- alita_sdk/runtime/tools/vectorstore_base.py +163 -90
- alita_sdk/runtime/utils/AlitaCallback.py +137 -21
- alita_sdk/runtime/utils/constants.py +5 -1
- alita_sdk/runtime/utils/mcp_client.py +492 -0
- alita_sdk/runtime/utils/mcp_oauth.py +361 -0
- alita_sdk/runtime/utils/mcp_sse_client.py +434 -0
- alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
- alita_sdk/runtime/utils/streamlit.py +41 -14
- alita_sdk/runtime/utils/toolkit_utils.py +28 -9
- alita_sdk/runtime/utils/utils.py +48 -0
- alita_sdk/tools/__init__.py +135 -37
- alita_sdk/tools/ado/__init__.py +2 -2
- alita_sdk/tools/ado/repos/__init__.py +16 -19
- alita_sdk/tools/ado/repos/repos_wrapper.py +12 -20
- alita_sdk/tools/ado/test_plan/__init__.py +27 -8
- alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +56 -28
- alita_sdk/tools/ado/wiki/__init__.py +28 -12
- alita_sdk/tools/ado/wiki/ado_wrapper.py +114 -40
- alita_sdk/tools/ado/work_item/__init__.py +28 -12
- alita_sdk/tools/ado/work_item/ado_wrapper.py +95 -11
- alita_sdk/tools/advanced_jira_mining/__init__.py +13 -8
- alita_sdk/tools/aws/delta_lake/__init__.py +15 -11
- alita_sdk/tools/aws/delta_lake/tool.py +5 -1
- alita_sdk/tools/azure_ai/search/__init__.py +14 -8
- alita_sdk/tools/base/tool.py +5 -1
- alita_sdk/tools/base_indexer_toolkit.py +454 -110
- alita_sdk/tools/bitbucket/__init__.py +28 -19
- alita_sdk/tools/bitbucket/api_wrapper.py +285 -27
- alita_sdk/tools/bitbucket/cloud_api_wrapper.py +5 -5
- alita_sdk/tools/browser/__init__.py +41 -16
- alita_sdk/tools/browser/crawler.py +3 -1
- alita_sdk/tools/browser/utils.py +15 -6
- alita_sdk/tools/carrier/__init__.py +18 -17
- alita_sdk/tools/carrier/backend_reports_tool.py +8 -4
- alita_sdk/tools/carrier/excel_reporter.py +8 -4
- alita_sdk/tools/chunkers/__init__.py +3 -1
- alita_sdk/tools/chunkers/code/codeparser.py +1 -1
- alita_sdk/tools/chunkers/sematic/json_chunker.py +2 -1
- alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
- alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
- alita_sdk/tools/chunkers/universal_chunker.py +270 -0
- alita_sdk/tools/cloud/aws/__init__.py +12 -7
- alita_sdk/tools/cloud/azure/__init__.py +12 -7
- alita_sdk/tools/cloud/gcp/__init__.py +12 -7
- alita_sdk/tools/cloud/k8s/__init__.py +12 -7
- alita_sdk/tools/code/linter/__init__.py +10 -8
- alita_sdk/tools/code/loaders/codesearcher.py +3 -2
- alita_sdk/tools/code/sonar/__init__.py +21 -13
- alita_sdk/tools/code_indexer_toolkit.py +199 -0
- alita_sdk/tools/confluence/__init__.py +22 -14
- alita_sdk/tools/confluence/api_wrapper.py +197 -58
- alita_sdk/tools/confluence/loader.py +14 -2
- alita_sdk/tools/custom_open_api/__init__.py +12 -5
- alita_sdk/tools/elastic/__init__.py +11 -8
- alita_sdk/tools/elitea_base.py +546 -64
- alita_sdk/tools/figma/__init__.py +60 -11
- alita_sdk/tools/figma/api_wrapper.py +1400 -167
- alita_sdk/tools/figma/figma_client.py +73 -0
- alita_sdk/tools/figma/toon_tools.py +2748 -0
- alita_sdk/tools/github/__init__.py +18 -17
- alita_sdk/tools/github/api_wrapper.py +9 -26
- alita_sdk/tools/github/github_client.py +81 -12
- alita_sdk/tools/github/schemas.py +2 -1
- alita_sdk/tools/github/tool.py +5 -1
- alita_sdk/tools/gitlab/__init__.py +19 -13
- alita_sdk/tools/gitlab/api_wrapper.py +256 -80
- alita_sdk/tools/gitlab_org/__init__.py +14 -10
- alita_sdk/tools/google/bigquery/__init__.py +14 -13
- alita_sdk/tools/google/bigquery/tool.py +5 -1
- alita_sdk/tools/google_places/__init__.py +21 -11
- alita_sdk/tools/jira/__init__.py +22 -11
- alita_sdk/tools/jira/api_wrapper.py +315 -168
- alita_sdk/tools/keycloak/__init__.py +11 -8
- alita_sdk/tools/localgit/__init__.py +9 -3
- alita_sdk/tools/localgit/local_git.py +62 -54
- alita_sdk/tools/localgit/tool.py +5 -1
- alita_sdk/tools/memory/__init__.py +38 -14
- alita_sdk/tools/non_code_indexer_toolkit.py +7 -2
- alita_sdk/tools/ocr/__init__.py +11 -8
- alita_sdk/tools/openapi/__init__.py +491 -106
- alita_sdk/tools/openapi/api_wrapper.py +1357 -0
- alita_sdk/tools/openapi/tool.py +20 -0
- alita_sdk/tools/pandas/__init__.py +20 -12
- alita_sdk/tools/pandas/api_wrapper.py +40 -45
- alita_sdk/tools/pandas/dataframe/generator/base.py +3 -1
- alita_sdk/tools/postman/__init__.py +11 -11
- alita_sdk/tools/postman/api_wrapper.py +19 -8
- alita_sdk/tools/postman/postman_analysis.py +8 -1
- alita_sdk/tools/pptx/__init__.py +11 -10
- alita_sdk/tools/qtest/__init__.py +22 -14
- alita_sdk/tools/qtest/api_wrapper.py +1784 -88
- alita_sdk/tools/rally/__init__.py +13 -10
- alita_sdk/tools/report_portal/__init__.py +23 -16
- alita_sdk/tools/salesforce/__init__.py +22 -16
- alita_sdk/tools/servicenow/__init__.py +21 -16
- alita_sdk/tools/servicenow/api_wrapper.py +1 -1
- alita_sdk/tools/sharepoint/__init__.py +17 -14
- alita_sdk/tools/sharepoint/api_wrapper.py +179 -39
- alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
- alita_sdk/tools/sharepoint/utils.py +8 -2
- alita_sdk/tools/slack/__init__.py +13 -8
- alita_sdk/tools/sql/__init__.py +22 -19
- alita_sdk/tools/sql/api_wrapper.py +71 -23
- alita_sdk/tools/testio/__init__.py +21 -13
- alita_sdk/tools/testrail/__init__.py +13 -11
- alita_sdk/tools/testrail/api_wrapper.py +214 -46
- alita_sdk/tools/utils/__init__.py +28 -4
- alita_sdk/tools/utils/content_parser.py +241 -55
- alita_sdk/tools/utils/text_operations.py +254 -0
- alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +83 -27
- alita_sdk/tools/xray/__init__.py +18 -14
- alita_sdk/tools/xray/api_wrapper.py +58 -113
- alita_sdk/tools/yagmail/__init__.py +9 -3
- alita_sdk/tools/zephyr/__init__.py +12 -7
- alita_sdk/tools/zephyr_enterprise/__init__.py +16 -9
- alita_sdk/tools/zephyr_enterprise/api_wrapper.py +30 -15
- alita_sdk/tools/zephyr_essential/__init__.py +16 -10
- alita_sdk/tools/zephyr_essential/api_wrapper.py +297 -54
- alita_sdk/tools/zephyr_essential/client.py +6 -4
- alita_sdk/tools/zephyr_scale/__init__.py +13 -8
- alita_sdk/tools/zephyr_scale/api_wrapper.py +39 -31
- alita_sdk/tools/zephyr_squad/__init__.py +12 -7
- {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.584.dist-info}/METADATA +184 -37
- alita_sdk-0.3.584.dist-info/RECORD +452 -0
- alita_sdk-0.3.584.dist-info/entry_points.txt +2 -0
- alita_sdk/tools/bitbucket/tools.py +0 -304
- alita_sdk-0.3.257.dist-info/RECORD +0 -343
- {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.584.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.584.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.584.dist-info}/top_level.txt +0 -0
alita_sdk/tools/code_indexer_toolkit.py
@@ -0,0 +1,199 @@
+import ast
+import fnmatch
+import json
+import logging
+from typing import Optional, List, Generator
+
+from langchain_core.documents import Document
+from langchain_core.tools import ToolException
+from pydantic import Field
+
+from alita_sdk.tools.base_indexer_toolkit import BaseIndexerToolkit
+
+logger = logging.getLogger(__name__)
+
+
+class CodeIndexerToolkit(BaseIndexerToolkit):
+    def _get_indexed_data(self, index_name: str):
+        self._ensure_vectorstore_initialized()
+        if not self.vector_adapter:
+            raise ToolException("Vector adapter is not initialized. "
+                                "Check your configuration: embedding_model and vectorstore_type.")
+        return self.vector_adapter.get_code_indexed_data(self, index_name)
+
+    def key_fn(self, document: Document):
+        return document.metadata.get("filename")
+
+    def compare_fn(self, document: Document, idx_data):
+        return (document.metadata.get('commit_hash') and
+                idx_data.get('commit_hashes') and
+                document.metadata.get('commit_hash') in idx_data.get('commit_hashes')
+                )
+
+    def remove_ids_fn(self, idx_data, key: str):
+        return idx_data[key]['ids']
+
+    def _base_loader(
+            self,
+            branch: Optional[str] = None,
+            whitelist: Optional[List[str]] = None,
+            blacklist: Optional[List[str]] = None,
+            **kwargs) -> Generator[Document, None, None]:
+        """Index repository files in the vector store using code parsing."""
+        yield from self.loader(
+            branch=branch,
+            whitelist=whitelist,
+            blacklist=blacklist
+        )
+
+    def _extend_data(self, documents: Generator[Document, None, None]):
+        yield from documents
+
+    def _index_tool_params(self):
+        """Return the parameters for indexing data."""
+        return {
+            "branch": (Optional[str], Field(
+                description="Branch to index files from. Defaults to active branch if None.",
+                default=None)),
+            "whitelist": (Optional[List[str]], Field(
+                description='File extensions or paths to include. Defaults to all files if None. Example: ["*.md", "*.java"]',
+                default=None)),
+            "blacklist": (Optional[List[str]], Field(
+                description='File extensions or paths to exclude. Defaults to no exclusions if None. Example: ["*.md", "*.java"]',
+                default=None)),
+        }
+
+    def loader(self,
+               branch: Optional[str] = None,
+               whitelist: Optional[List[str]] = None,
+               blacklist: Optional[List[str]] = None,
+               chunked: bool = True) -> Generator[Document, None, None]:
+        """
+        Generates Documents from files in a branch, respecting whitelist and blacklist patterns.
+
+        Parameters:
+        - branch (Optional[str]): Branch for listing files. Defaults to the current branch if None.
+        - whitelist (Optional[List[str]]): File extensions or paths to include. Defaults to all files if None.
+        - blacklist (Optional[List[str]]): File extensions or paths to exclude. Defaults to no exclusions if None.
+        - chunked (bool): If True (default), applies universal chunker based on file type.
+          If False, returns raw Documents without chunking.
+
+        Returns:
+        - generator: Yields Documents from files matching the whitelist but not the blacklist.
+          Each document has exactly the key 'filename' in metadata, which is used as an ID
+          for further operations (indexing, deduplication, and retrieval).
+
+        Example:
+        # Use 'feature-branch', include '.py' files, exclude 'test_' files
+        for doc in loader(branch='feature-branch', whitelist=['*.py'], blacklist=['*test_*']):
+            print(doc.page_content)
+
+        Notes:
+        - Whitelist and blacklist use Unix shell-style wildcards.
+        - Files must match the whitelist and not the blacklist to be included.
+        - Each document MUST have exactly the key 'filename' in metadata. This key is used as an ID
+          for further operations such as indexing, deduplication, and retrieval.
+        - When chunked=True:
+          - .md files → markdown chunker (header-based splitting)
+          - .py/.js/.ts/etc → code parser (TreeSitter-based)
+          - .json files → JSON chunker
+          - other files → default text chunker
+        """
+        import hashlib
+
+        _files = self.__handle_get_files("", self.__get_branch(branch))
+        self._log_tool_event(message="Listing files in branch", tool_name="loader")
+        logger.info(f"Files in branch: {_files}")
+
+        def is_whitelisted(file_path: str) -> bool:
+            if whitelist:
+                return (any(fnmatch.fnmatch(file_path, pattern) for pattern in whitelist)
+                        or any(file_path.endswith(f'.{pattern}') for pattern in whitelist))
+            return True
+
+        def is_blacklisted(file_path: str) -> bool:
+            if blacklist:
+                return (any(fnmatch.fnmatch(file_path, pattern) for pattern in blacklist)
+                        or any(file_path.endswith(f'.{pattern}') for pattern in blacklist))
+            return False
+
+        def raw_document_generator() -> Generator[Document, None, None]:
+            """Yields raw Documents without chunking."""
+            self._log_tool_event(message="Reading the files", tool_name="loader")
+            total_files = len(_files)
+            processed = 0
+
+            for idx, file in enumerate(_files, 1):
+                if is_whitelisted(file) and not is_blacklisted(file):
+                    try:
+                        file_content = self._read_file(file, self.__get_branch(branch))
+                    except Exception as e:
+                        logger.error(f"Failed to read file {file}: {e}")
+                        continue
+
+                    if not file_content:
+                        continue
+
+                    # Ensure file content is a string
+                    if isinstance(file_content, bytes):
+                        file_content = file_content.decode("utf-8", errors="ignore")
+                    elif isinstance(file_content, dict) and file.endswith('.json'):
+                        file_content = json.dumps(file_content)
+                    elif not isinstance(file_content, str):
+                        file_content = str(file_content)
+
+                    # Hash the file content for uniqueness tracking
+                    file_hash = hashlib.sha256(file_content.encode("utf-8")).hexdigest()
+                    processed += 1
+
+                    yield Document(
+                        page_content=file_content,
+                        metadata={
+                            'file_path': file,
+                            'filename': file,
+                            'source': file,
+                            'commit_hash': file_hash,
+                        }
+                    )
+
+                if idx % 10 == 0 or idx == total_files:
+                    self._log_tool_event(
+                        message=f"{idx} out of {total_files} files checked, {processed} matched",
+                        tool_name="loader"
+                    )
+
+            self._log_tool_event(message=f"{processed} files loaded", tool_name="loader")
+
+        if not chunked:
+            # Return raw documents without chunking
+            return raw_document_generator()
+
+        # Apply universal chunker based on file type
+        from .chunkers.universal_chunker import universal_chunker
+        return universal_chunker(raw_document_generator())
+
+    def __handle_get_files(self, path: str, branch: str):
+        """
+        Handles the retrieval of files from a specific path and branch.
+        This method should be implemented in subclasses to provide the actual file retrieval logic.
+        """
+        _files = self._get_files(path=path, branch=branch)
+        if isinstance(_files, str):
+            try:
+                # Attempt to convert the string to a list using ast.literal_eval
+                _files = ast.literal_eval(_files)
+                # Ensure that the result is actually a list of strings
+                if not isinstance(_files, list) or not all(isinstance(item, str) for item in _files):
+                    raise ValueError("The evaluated result is not a list of strings")
+            except (SyntaxError, ValueError):
+                # Handle the case where the string cannot be converted to a list
+                raise ValueError("Expected a list of strings, but got a string that cannot be converted")
+
+        # Ensure _files is a list of strings
+        if not isinstance(_files, list) or not all(isinstance(item, str) for item in _files):
+            raise ValueError("Expected a list of strings")
+        return _files
+
+    def __get_branch(self, branch):
+        return (branch or getattr(self, 'active_branch', None)
+                or getattr(self, '_active_branch', None) or getattr(self, 'branch', None))
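The whitelist/blacklist handling in the new loader accepts a pattern either as a Unix shell wildcard or as a bare extension. A minimal standalone sketch of that matching rule (the file list, pattern values, and matches helper are hypothetical; the logic mirrors is_whitelisted/is_blacklisted above):

import fnmatch

def matches(file_path, patterns):
    # A pattern matches either as a shell wildcard ("*.py") or as a
    # bare extension ("py" matches any path ending in ".py").
    return (any(fnmatch.fnmatch(file_path, p) for p in patterns)
            or any(file_path.endswith(f'.{p}') for p in patterns))

files = ["src/app.py", "test_app.py", "README.md"]  # hypothetical listing
whitelist, blacklist = ["*.py"], ["*test_*"]
print([f for f in files if matches(f, whitelist) and not matches(f, blacklist)])
# ['src/app.py']
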
alita_sdk/tools/confluence/__init__.py
@@ -4,17 +4,19 @@ from .api_wrapper import ConfluenceAPIWrapper
 from langchain_core.tools import BaseTool
 from ..base.tool import BaseAction
 from pydantic import create_model, BaseModel, ConfigDict, Field
-
+
+from ..elitea_base import filter_missconfigured_index_tools
+from ..utils import clean_string, get_max_toolkit_length, parse_list, check_connection_response
 from ...configurations.confluence import ConfluenceConfiguration
 from ...configurations.pgvector import PgVectorConfiguration
 import requests
+from ...runtime.utils.constants import TOOLKIT_NAME_META, TOOL_NAME_META, TOOLKIT_TYPE_META
 
 name = "confluence"
 
-def get_tools(tool):
+def get_toolkit(tool):
     return ConfluenceToolkit().get_toolkit(
         selected_tools=tool['settings'].get('selected_tools', []),
-        base_url=tool['settings']['base_url'],
         space=tool['settings'].get('space', None),
         cloud=tool['settings'].get('cloud', True),
         confluence_configuration=tool['settings']['confluence_configuration'],
@@ -32,18 +34,19 @@ def get_tools(tool):
         doctype='doc',
         embedding_model=tool['settings'].get('embedding_model'),
         vectorstore_type="PGVector"
-    )
+    )
+
+def get_tools(tool):
+    return get_toolkit(tool).get_tools()
 
 
 class ConfluenceToolkit(BaseToolkit):
     tools: List[BaseTool] = []
-    toolkit_max_length: int = 0
 
     @staticmethod
     def toolkit_config_schema() -> BaseModel:
         selected_tools = {x['name']: x['args_schema'].schema() for x in
                           ConfluenceAPIWrapper.model_construct().get_available_tools()}
-        ConfluenceToolkit.toolkit_max_length = get_max_toolkit_length(selected_tools)
 
         @check_connection_response
         def check_connection(self):
@@ -66,8 +69,7 @@ class ConfluenceToolkit(BaseToolkit):
 
         model = create_model(
             name,
-            space=(str, Field(description="Space",
-                'max_toolkit_length': ConfluenceToolkit.toolkit_max_length})),
+            space=(str, Field(description="Space")),
             cloud=(bool, Field(description="Hosting Option", json_schema_extra={'configuration': True})),
             limit=(int, Field(description="Pages limit per request", default=5)),
             labels=(Optional[str], Field(
@@ -80,8 +82,8 @@ class ConfluenceToolkit(BaseToolkit):
             min_retry_seconds=(int, Field(description="Min retry, sec", default=10)),
             max_retry_seconds=(int, Field(description="Max retry, sec", default=60)),
             # optional field for custom headers as dictionary
-            custom_headers=(Optional[dict], Field(description="Custom headers for API requests", default=
-            confluence_configuration=(
+            custom_headers=(Optional[dict], Field(description="Custom headers for API requests", default={})),
+            confluence_configuration=(ConfluenceConfiguration, Field(description="Confluence Configuration", json_schema_extra={'configuration_types': ['confluence']})),
             pgvector_configuration=(Optional[PgVectorConfiguration], Field(default = None,
                 description="PgVector Configuration",
                 json_schema_extra={'configuration_types': ['pgvector']})),
@@ -103,6 +105,7 @@ class ConfluenceToolkit(BaseToolkit):
         return model
 
     @classmethod
+    @filter_missconfigured_index_tools
    def get_toolkit(cls, selected_tools: list[str] | None = None, toolkit_name: Optional[str] = None, **kwargs):
         if selected_tools is None:
             selected_tools = []
@@ -113,18 +116,23 @@ class ConfluenceToolkit(BaseToolkit):
             **(kwargs.get('pgvector_configuration') or {}),
         }
         confluence_api_wrapper = ConfluenceAPIWrapper(**wrapper_payload)
-        prefix = clean_string(toolkit_name, ConfluenceToolkit.toolkit_max_length) + TOOLKIT_SPLITTER if toolkit_name else ''
         available_tools = confluence_api_wrapper.get_available_tools()
         tools = []
         for tool in available_tools:
             if selected_tools:
                 if tool["name"] not in selected_tools:
                     continue
+            description = tool["description"]
+            if toolkit_name:
+                description = f"Toolkit: {toolkit_name}\n{description}"
+            description = f"Confluence space: {confluence_api_wrapper.space}\n{description}"
+            description = description[:1000]
             tools.append(BaseAction(
                 api_wrapper=confluence_api_wrapper,
-                name=
-                description=
-                args_schema=tool["args_schema"]
+                name=tool["name"],
+                description=description,
+                args_schema=tool["args_schema"],
+                metadata={TOOLKIT_NAME_META: toolkit_name, TOOLKIT_TYPE_META: name, TOOL_NAME_META: tool["name"]} if toolkit_name else {TOOL_NAME_META: tool["name"]}
             ))
         return cls(tools=tools)
 
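With name prefixing gone, toolkit context now travels in each tool's description and metadata instead. A small sketch of the description construction added in get_toolkit (toolkit name, space, and base description are hypothetical values):

# Mirrors the new description logic in ConfluenceToolkit.get_toolkit.
toolkit_name = "team_confluence"   # hypothetical
space = "ENG"                      # hypothetical
description = "Create a new page in the Confluence space."

if toolkit_name:
    description = f"Toolkit: {toolkit_name}\n{description}"
description = f"Confluence space: {space}\n{description}"
description = description[:1000]   # hard cap at 1000 characters
print(description)
# Confluence space: ENG
# Toolkit: team_confluence
# Create a new page in the Confluence space.
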
alita_sdk/tools/confluence/api_wrapper.py
@@ -1,24 +1,27 @@
-import re
-import logging
-import requests
-import json
 import base64
+import json
+import logging
+import re
 import traceback
-from typing import Optional, List, Any, Dict, Callable, Generator, Literal
 from json import JSONDecodeError
+from typing import Optional, List, Any, Dict, Callable, Generator, Literal
 
-
-from
-
+import requests
+from atlassian.errors import ApiError
+from langchain_community.document_loaders.confluence import ContentFormat
 from langchain_core.documents import Document
-from langchain_core.tools import ToolException
 from langchain_core.messages import HumanMessage
+from langchain_core.tools import ToolException
 from markdownify import markdownify
-from
+from pydantic import Field, PrivateAttr, model_validator, create_model, SecretStr
+from requests import HTTPError
+from tenacity import retry, stop_after_attempt, wait_exponential, before_sleep_log
 
-from
+from alita_sdk.tools.non_code_indexer_toolkit import NonCodeIndexerToolkit
+from alita_sdk.tools.utils.available_tools_decorator import extend_with_parent_available_tools
 from ..llm.img_utils import ImageDescriptionCache
 from ..utils import is_cookie_token, parse_cookie_string
+from ...runtime.utils.utils import IndexerKeywords
 
 logger = logging.getLogger(__name__)
 
@@ -171,7 +174,7 @@ def parse_payload_params(params: Optional[str]) -> Dict[str, Any]:
         return {}
 
 
-class ConfluenceAPIWrapper(BaseVectorStoreToolApiWrapper):
+class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
     # Changed from PrivateAttr to Optional field with exclude=True
     client: Optional[Any] = Field(default=None, exclude=True)
     base_url: str
@@ -193,15 +196,7 @@ class ConfluenceAPIWrapper(BaseVectorStoreToolApiWrapper):
     keep_markdown_format: Optional[bool] = True
     ocr_languages: Optional[str] = None
     keep_newlines: Optional[bool] = True
-
-    # indexer related
-    connection_string: Optional[SecretStr] = None
-    collection_name: Optional[str] = None
-    doctype: Optional[str] = 'doc'
-    embedding_model: Optional[str] = "HuggingFaceEmbeddings"
-    embedding_model_params: Optional[Dict[str, Any]] = {"model_name": "sentence-transformers/all-MiniLM-L6-v2"}
-    vectorstore_type: Optional[str] = "PGVector"
-
+    _errors: Optional[list[str]] = None
     _image_cache: ImageDescriptionCache = PrivateAttr(default_factory=ImageDescriptionCache)
 
     @model_validator(mode='before')
@@ -232,13 +227,13 @@ class ConfluenceAPIWrapper(BaseVectorStoreToolApiWrapper):
         else:
             client_instance = Confluence(url=url, username=username, password=api_key, cloud=cloud)
 
-        custom_headers = values.get('custom_headers'
-        logger.info(f"
+        custom_headers = values.get('custom_headers') or {}
+        logger.info(f"Confluence tool: custom headers length: {len(custom_headers)}")
         for header, value in custom_headers.items():
             client_instance._update_header(header, value)
 
         values['client'] = client_instance
-        return values
+        return super().validate_toolkit(values)
 
     def __unquote_confluence_space(self) -> str | None:
         if self.space:
@@ -485,28 +480,78 @@ class ConfluenceAPIWrapper(BaseVectorStoreToolApiWrapper):
         """Gets pages with specific label in the Confluence space."""
 
         start = 0
-        pages_info = []
-
-
-
+        pages_info: List[Dict[str, Any]] = []
+        seen_ids: set[str] = set()
+
+        # Use a while-loop driven by unique pages collected and
+        # presence of additional results instead of a fixed number
+        # of iterations based purely on max_pages/limit.
+        while len(pages_info) < (self.max_pages or 0):
+            pages = self.client.get_all_pages_by_label(
+                label,
+                start=start,
+                limit=self.limit,
+            )  # , expand="body.view.value"
             if not pages:
                 break
 
-
-
-
-
-
-
+            # Collect only ids we haven't processed yet to avoid
+            # calling get_page_by_id multiple times for the same
+            # Confluence page.
+            new_ids: List[str] = []
+            for p in pages:
+                page_id = p["id"] if isinstance(p, dict) else getattr(p, "id", None)
+                if page_id is None:
+                    continue
+                if page_id in seen_ids:
+                    continue
+                seen_ids.add(page_id)
+                new_ids.append(page_id)
+
+            if new_ids:
+                for page in self.get_pages_by_id(new_ids):
+                    meta = getattr(page, "metadata", {}) or {}
+                    page_id = meta.get("id")
+                    page_title = meta.get("title")
+                    page_url = meta.get("source")
+                    content = getattr(page, "page_content", None)
+
+                    if page_id is None:
+                        continue
+
+                    pages_info.append(
+                        {
+                            "page_id": page_id,
+                            "page_title": page_title,
+                            "page_url": page_url,
+                            "content": content,
+                        }
+                    )
+
+            # Respect max_pages on unique pages collected.
+            if len(pages_info) >= (self.max_pages or 0):
+                break
+
+            # Advance the offset by the requested page size.
             start += self.limit
-
+
+            # Defensive break: if the API returns fewer items than
+            # requested, there are likely no more pages to fetch.
+            if len(pages) < self.limit:
+                break
+
+        # Slice as an extra safety net in case of any race conditions
+        # around the max_pages guard in the loop above.
+        return pages_info[: (self.max_pages or len(pages_info))]
 
     def is_public_page(self, page: dict) -> bool:
         """Check if a page is publicly accessible."""
         restrictions = self.client.get_all_restrictions_for_content(page["id"])
 
         return (
-            page["status"] == "current"
+            (page["status"] == "current"
+             # allow user to see archived content if needed
+             or page["status"] == "archived")
            and not restrictions["read"]["restrictions"]["user"]["results"]
            and not restrictions["read"]["restrictions"]["group"]["results"]
        )
@@ -526,18 +571,35 @@ class ConfluenceAPIWrapper(BaseVectorStoreToolApiWrapper):
             ),
             before_sleep=before_sleep_log(logger, logging.WARNING),
         )(self.client.get_page_by_id)
-
-
-
-
-
+        try:
+            page = get_page(
+                page_id=page_id, expand=f"{self.content_format.value},version"
+            )
+        except (ApiError, HTTPError) as e:
+            logger.error(f"Error fetching page with ID {page_id}: {e}")
+            page_content_temp = f"Confluence API Error: cannot fetch the page with ID {page_id}: {e}"
+            # store errors
+            if self._errors is None:
+                self._errors = []
+            self._errors.append(page_content_temp)
+            return Document(page_content=page_content_temp,
+                            metadata={})
+        # TODO: update on toolkit advanced settings level as a separate feature
+        # if not self.include_restricted_content and not self.is_public_page(page):
+        #     continue
         yield self.process_page(page, skip_images)
 
+    def _log_errors(self):
+        """ Log errors encountered during toolkit execution. """
+        if self._errors:
+            logger.info(f"Errors encountered during toolkit execution: {self._errors}")
+
     def read_page_by_id(self, page_id: str, skip_images: bool = False):
         """Reads a page by its id in the Confluence space. If id is not available, but there is a title - use get_page_id first."""
         result = list(self.get_pages_by_id([page_id], skip_images))
         if not result:
-            "
+            return f"Pages not found. Errors: {self._errors}" if self._errors \
+                else "Pages not found or you do not have access to them."
         return result[0].page_content
         # return self._strip_base64_images(result[0].page_content) if skip_images else result[0].page_content
 
@@ -547,7 +609,9 @@ class ConfluenceAPIWrapper(BaseVectorStoreToolApiWrapper):
         :param title: title
         :param type: type of content: page or blogpost. Defaults to page
         """
-
+
+        result = self.client.get_page_id(space=self.space, title=title, type=type)
+        return result if result else "Page not found. Check the title or space."
 
     def _strip_base64_images(self, content):
         base64_md_pattern = r'data:image/(png|jpeg|gif);base64,[a-zA-Z0-9+/=]+'
@@ -570,7 +634,7 @@ class ConfluenceAPIWrapper(BaseVectorStoreToolApiWrapper):
             }
             pages_info.append(page_info)
             start += self.limit
-        return str(pages_info)
+        return str(pages_info) if pages_info else f"Unable to find anything using query {cql}. Check space or query."
 
     def search_pages(self, query: str, skip_images: bool = False):
         """Search pages in Confluence by query text in title or page content."""
@@ -821,6 +885,10 @@ class ConfluenceAPIWrapper(BaseVectorStoreToolApiWrapper):
         from .loader import AlitaConfluenceLoader
         from copy import copy
         content_format = kwargs.get('content_format', 'view').lower()
+
+        self._index_include_attachments = kwargs.get('include_attachments', False)
+        self._include_extensions = kwargs.get('include_extensions', [])
+        self._skip_extensions = kwargs.get('skip_extensions', [])
         base_params = {
             'url': self.base_url,
             'space_key': self.space,
@@ -852,12 +920,81 @@ class ConfluenceAPIWrapper(BaseVectorStoreToolApiWrapper):
             yield document
 
     def _process_document(self, document: Document) -> Generator[Document, None, None]:
-
-
-
-
-
-
+        try:
+            if self._index_include_attachments:
+                page_id = document.metadata.get('id')
+                attachments = self.client.get_attachments_from_content(page_id)
+                if not attachments or not attachments.get('results'):
+                    return f"No attachments found for page ID {page_id}."
+
+                # Get attachment history for created/updated info
+                history_map = {}
+                for attachment in attachments['results']:
+                    try:
+                        hist = self.client.history(attachment['id'])
+                        history_map[attachment['id']] = hist
+                    except Exception as e:
+                        logger.warning(f"Failed to fetch history for attachment {attachment.get('title', '')}: {str(e)}")
+                        history_map[attachment['id']] = None
+
+                import re
+                for attachment in attachments['results']:
+                    title = attachment.get('title', '')
+                    file_ext = title.lower().split('.')[-1] if '.' in title else ''
+
+                    # Re-verify extension filters
+                    # Check if file should be skipped based on skip_extensions
+                    if any(re.match(re.escape(pattern).replace(r'\*', '.*') + '$', title, re.IGNORECASE)
+                           for pattern in self._skip_extensions):
+                        continue
+
+                    # Check if file should be included based on include_extensions
+                    # If include_extensions is empty, process all files (that weren't skipped)
+                    if self._include_extensions and not (
+                            any(re.match(re.escape(pattern).replace(r'\*', '.*') + '$', title, re.IGNORECASE)
+                                for pattern in self._include_extensions)):
+                        continue
+
+                    media_type = attachment.get('metadata', {}).get('mediaType', '')
+                    # Core metadata extraction with history
+                    hist = history_map.get(attachment['id']) or {}
+                    created_by = hist.get('createdBy', {}).get('displayName', '') if hist else attachment.get('creator', {}).get('displayName', '')
+                    created_date = hist.get('createdDate', '') if hist else attachment.get('created', '')
+                    last_updated = hist.get('lastUpdated', {}).get('when', '') if hist else ''
+
+                    attachment_path = attachment['_links']['download'] if attachment.get(
+                        '_links', {}).get('download') else ''
+                    download_url = self.client.url.rstrip('/') + attachment_path
+                    metadata = {
+                        'name': title,
+                        'size': attachment.get('extensions', {}).get('fileSize', None),
+                        'creator': created_by,
+                        'created': created_date,
+                        'updated': last_updated,
+                        'media_type': media_type,
+                        'labels': [label['name'] for label in
+                                   attachment.get('metadata', {}).get('labels', {}).get('results', [])],
+                        'download_url': download_url
+                    }
+                    try:
+                        resp = self.client.request(method="GET", path=attachment_path, advanced_mode=True)
+                        if resp.status_code == 200:
+                            content = resp.content
+                        else:
+                            content = f"[Failed to download {download_url}: HTTP status code {resp.status_code}]"
+                    except Exception as e:
+                        content = f"[Error downloading content: {str(e)}]"
+
+                    if isinstance(content, str):
+                        yield Document(page_content=content, metadata=metadata)
+                    else:
+                        yield Document(page_content="", metadata={
+                            **metadata,
+                            IndexerKeywords.CONTENT_FILE_NAME.value: f".{file_ext}",
+                            IndexerKeywords.CONTENT_IN_BYTES.value: content
+                        })
+        except Exception as e:
+            yield from ()
 
     def _download_image(self, image_url):
         """
@@ -1598,18 +1735,24 @@ class ConfluenceAPIWrapper(BaseVectorStoreToolApiWrapper):
             "include_restricted_content": (Optional[bool], Field(description="Include restricted content.", default=False)),
             "include_archived_content": (Optional[bool], Field(description="Include archived content.", default=False)),
             "include_attachments": (Optional[bool], Field(description="Include attachments.", default=False)),
+            'include_extensions': (Optional[List[str]], Field(
+                description="List of file extensions to include when processing attachments: i.e. ['*.png', '*.jpg']. "
+                            "If empty, all files will be processed (except skip_extensions).",
+                default=[])),
+            'skip_extensions': (Optional[List[str]], Field(
+                description="List of file extensions to skip when processing attachments: i.e. ['*.png', '*.jpg']",
+                default=[])),
             "include_comments": (Optional[bool], Field(description="Include comments.", default=False)),
-            "include_labels": (Optional[bool], Field(description="Include labels.", default=
+            "include_labels": (Optional[bool], Field(description="Include labels.", default=False)),
             "ocr_languages": (Optional[str], Field(description="OCR languages for processing attachments.", default='eng')),
             "keep_markdown_format": (Optional[bool], Field(description="Keep the markdown format.", default=True)),
             "keep_newlines": (Optional[bool], Field(description="Keep newlines in the content.", default=True)),
             "bins_with_llm": (Optional[bool], Field(description="Use LLM for processing binary files.", default=False)),
         }
 
-    @
+    @extend_with_parent_available_tools
     def get_available_tools(self):
-
-        confluence_tools = [
+        return [
             {
                 "name": "create_page",
                 "ref": self.create_page,
@@ -1726,7 +1869,3 @@ class ConfluenceAPIWrapper(BaseVectorStoreToolApiWrapper):
             }
         ]
 
-        # Add standardized vector search tools from base class
-        vector_search_tools = self._get_vector_search_tools()
-
-        return confluence_tools + vector_search_tools
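The attachment filters in _process_document translate glob-style patterns like '*.png' into anchored, case-insensitive regular expressions. A short sketch of that translation, using hypothetical attachment titles and a hypothetical attachment_matches helper:

import re

def attachment_matches(pattern, title):
    # Same translation as in _process_document: escape the pattern,
    # restore '*' as '.*', and anchor the match at the end of the title.
    return bool(re.match(re.escape(pattern).replace(r'\*', '.*') + '$',
                         title, re.IGNORECASE))

print(attachment_matches('*.png', 'Diagram.PNG'))    # True  (case-insensitive)
print(attachment_matches('*.png', 'notes.png.txt'))  # False (anchored at the end)
print(attachment_matches('*.png', 'spec.pdf'))       # False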