alita-sdk 0.3.379__py3-none-any.whl → 0.3.627__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alita_sdk/cli/__init__.py +10 -0
- alita_sdk/cli/__main__.py +17 -0
- alita_sdk/cli/agent/__init__.py +5 -0
- alita_sdk/cli/agent/default.py +258 -0
- alita_sdk/cli/agent_executor.py +156 -0
- alita_sdk/cli/agent_loader.py +245 -0
- alita_sdk/cli/agent_ui.py +228 -0
- alita_sdk/cli/agents.py +3113 -0
- alita_sdk/cli/callbacks.py +647 -0
- alita_sdk/cli/cli.py +168 -0
- alita_sdk/cli/config.py +306 -0
- alita_sdk/cli/context/__init__.py +30 -0
- alita_sdk/cli/context/cleanup.py +198 -0
- alita_sdk/cli/context/manager.py +731 -0
- alita_sdk/cli/context/message.py +285 -0
- alita_sdk/cli/context/strategies.py +289 -0
- alita_sdk/cli/context/token_estimation.py +127 -0
- alita_sdk/cli/formatting.py +182 -0
- alita_sdk/cli/input_handler.py +419 -0
- alita_sdk/cli/inventory.py +1073 -0
- alita_sdk/cli/mcp_loader.py +315 -0
- alita_sdk/cli/testcases/__init__.py +94 -0
- alita_sdk/cli/testcases/data_generation.py +119 -0
- alita_sdk/cli/testcases/discovery.py +96 -0
- alita_sdk/cli/testcases/executor.py +84 -0
- alita_sdk/cli/testcases/logger.py +85 -0
- alita_sdk/cli/testcases/parser.py +172 -0
- alita_sdk/cli/testcases/prompts.py +91 -0
- alita_sdk/cli/testcases/reporting.py +125 -0
- alita_sdk/cli/testcases/setup.py +108 -0
- alita_sdk/cli/testcases/test_runner.py +282 -0
- alita_sdk/cli/testcases/utils.py +39 -0
- alita_sdk/cli/testcases/validation.py +90 -0
- alita_sdk/cli/testcases/workflow.py +196 -0
- alita_sdk/cli/toolkit.py +327 -0
- alita_sdk/cli/toolkit_loader.py +85 -0
- alita_sdk/cli/tools/__init__.py +43 -0
- alita_sdk/cli/tools/approval.py +224 -0
- alita_sdk/cli/tools/filesystem.py +1751 -0
- alita_sdk/cli/tools/planning.py +389 -0
- alita_sdk/cli/tools/terminal.py +414 -0
- alita_sdk/community/__init__.py +72 -12
- alita_sdk/community/inventory/__init__.py +236 -0
- alita_sdk/community/inventory/config.py +257 -0
- alita_sdk/community/inventory/enrichment.py +2137 -0
- alita_sdk/community/inventory/extractors.py +1469 -0
- alita_sdk/community/inventory/ingestion.py +3172 -0
- alita_sdk/community/inventory/knowledge_graph.py +1457 -0
- alita_sdk/community/inventory/parsers/__init__.py +218 -0
- alita_sdk/community/inventory/parsers/base.py +295 -0
- alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
- alita_sdk/community/inventory/parsers/go_parser.py +851 -0
- alita_sdk/community/inventory/parsers/html_parser.py +389 -0
- alita_sdk/community/inventory/parsers/java_parser.py +593 -0
- alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
- alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
- alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
- alita_sdk/community/inventory/parsers/python_parser.py +604 -0
- alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
- alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
- alita_sdk/community/inventory/parsers/text_parser.py +322 -0
- alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
- alita_sdk/community/inventory/patterns/__init__.py +61 -0
- alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
- alita_sdk/community/inventory/patterns/loader.py +348 -0
- alita_sdk/community/inventory/patterns/registry.py +198 -0
- alita_sdk/community/inventory/presets.py +535 -0
- alita_sdk/community/inventory/retrieval.py +1403 -0
- alita_sdk/community/inventory/toolkit.py +173 -0
- alita_sdk/community/inventory/toolkit_utils.py +176 -0
- alita_sdk/community/inventory/visualize.py +1370 -0
- alita_sdk/configurations/__init__.py +1 -1
- alita_sdk/configurations/ado.py +141 -20
- alita_sdk/configurations/bitbucket.py +94 -2
- alita_sdk/configurations/confluence.py +130 -1
- alita_sdk/configurations/figma.py +76 -0
- alita_sdk/configurations/gitlab.py +91 -0
- alita_sdk/configurations/jira.py +103 -0
- alita_sdk/configurations/openapi.py +329 -0
- alita_sdk/configurations/qtest.py +72 -1
- alita_sdk/configurations/report_portal.py +96 -0
- alita_sdk/configurations/sharepoint.py +148 -0
- alita_sdk/configurations/testio.py +83 -0
- alita_sdk/configurations/testrail.py +88 -0
- alita_sdk/configurations/xray.py +93 -0
- alita_sdk/configurations/zephyr_enterprise.py +93 -0
- alita_sdk/configurations/zephyr_essential.py +75 -0
- alita_sdk/runtime/clients/artifact.py +3 -3
- alita_sdk/runtime/clients/client.py +388 -46
- alita_sdk/runtime/clients/mcp_discovery.py +342 -0
- alita_sdk/runtime/clients/mcp_manager.py +262 -0
- alita_sdk/runtime/clients/sandbox_client.py +8 -21
- alita_sdk/runtime/langchain/_constants_bkup.py +1318 -0
- alita_sdk/runtime/langchain/assistant.py +157 -39
- alita_sdk/runtime/langchain/constants.py +647 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
- alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +103 -60
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLinesLoader.py +77 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +10 -4
- alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +226 -7
- alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +5 -2
- alita_sdk/runtime/langchain/document_loaders/constants.py +40 -19
- alita_sdk/runtime/langchain/langraph_agent.py +405 -84
- alita_sdk/runtime/langchain/utils.py +106 -7
- alita_sdk/runtime/llms/preloaded.py +2 -6
- alita_sdk/runtime/models/mcp_models.py +61 -0
- alita_sdk/runtime/skills/__init__.py +91 -0
- alita_sdk/runtime/skills/callbacks.py +498 -0
- alita_sdk/runtime/skills/discovery.py +540 -0
- alita_sdk/runtime/skills/executor.py +610 -0
- alita_sdk/runtime/skills/input_builder.py +371 -0
- alita_sdk/runtime/skills/models.py +330 -0
- alita_sdk/runtime/skills/registry.py +355 -0
- alita_sdk/runtime/skills/skill_runner.py +330 -0
- alita_sdk/runtime/toolkits/__init__.py +31 -0
- alita_sdk/runtime/toolkits/application.py +29 -10
- alita_sdk/runtime/toolkits/artifact.py +20 -11
- alita_sdk/runtime/toolkits/datasource.py +13 -6
- alita_sdk/runtime/toolkits/mcp.py +783 -0
- alita_sdk/runtime/toolkits/mcp_config.py +1048 -0
- alita_sdk/runtime/toolkits/planning.py +178 -0
- alita_sdk/runtime/toolkits/skill_router.py +238 -0
- alita_sdk/runtime/toolkits/subgraph.py +251 -6
- alita_sdk/runtime/toolkits/tools.py +356 -69
- alita_sdk/runtime/toolkits/vectorstore.py +11 -5
- alita_sdk/runtime/tools/__init__.py +10 -3
- alita_sdk/runtime/tools/application.py +27 -6
- alita_sdk/runtime/tools/artifact.py +511 -28
- alita_sdk/runtime/tools/data_analysis.py +183 -0
- alita_sdk/runtime/tools/function.py +67 -35
- alita_sdk/runtime/tools/graph.py +10 -4
- alita_sdk/runtime/tools/image_generation.py +148 -46
- alita_sdk/runtime/tools/llm.py +1003 -128
- alita_sdk/runtime/tools/loop.py +3 -1
- alita_sdk/runtime/tools/loop_output.py +3 -1
- alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
- alita_sdk/runtime/tools/mcp_remote_tool.py +181 -0
- alita_sdk/runtime/tools/mcp_server_tool.py +8 -5
- alita_sdk/runtime/tools/planning/__init__.py +36 -0
- alita_sdk/runtime/tools/planning/models.py +246 -0
- alita_sdk/runtime/tools/planning/wrapper.py +607 -0
- alita_sdk/runtime/tools/router.py +2 -4
- alita_sdk/runtime/tools/sandbox.py +65 -48
- alita_sdk/runtime/tools/skill_router.py +776 -0
- alita_sdk/runtime/tools/tool.py +3 -1
- alita_sdk/runtime/tools/vectorstore.py +9 -3
- alita_sdk/runtime/tools/vectorstore_base.py +70 -14
- alita_sdk/runtime/utils/AlitaCallback.py +137 -21
- alita_sdk/runtime/utils/constants.py +5 -1
- alita_sdk/runtime/utils/mcp_client.py +492 -0
- alita_sdk/runtime/utils/mcp_oauth.py +361 -0
- alita_sdk/runtime/utils/mcp_sse_client.py +434 -0
- alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
- alita_sdk/runtime/utils/serialization.py +155 -0
- alita_sdk/runtime/utils/streamlit.py +40 -13
- alita_sdk/runtime/utils/toolkit_utils.py +30 -9
- alita_sdk/runtime/utils/utils.py +36 -0
- alita_sdk/tools/__init__.py +134 -35
- alita_sdk/tools/ado/repos/__init__.py +51 -32
- alita_sdk/tools/ado/repos/repos_wrapper.py +148 -89
- alita_sdk/tools/ado/test_plan/__init__.py +25 -9
- alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +23 -1
- alita_sdk/tools/ado/utils.py +1 -18
- alita_sdk/tools/ado/wiki/__init__.py +25 -12
- alita_sdk/tools/ado/wiki/ado_wrapper.py +291 -22
- alita_sdk/tools/ado/work_item/__init__.py +26 -13
- alita_sdk/tools/ado/work_item/ado_wrapper.py +73 -11
- alita_sdk/tools/advanced_jira_mining/__init__.py +11 -8
- alita_sdk/tools/aws/delta_lake/__init__.py +13 -9
- alita_sdk/tools/aws/delta_lake/tool.py +5 -1
- alita_sdk/tools/azure_ai/search/__init__.py +11 -8
- alita_sdk/tools/azure_ai/search/api_wrapper.py +1 -1
- alita_sdk/tools/base/tool.py +5 -1
- alita_sdk/tools/base_indexer_toolkit.py +271 -84
- alita_sdk/tools/bitbucket/__init__.py +17 -11
- alita_sdk/tools/bitbucket/api_wrapper.py +59 -11
- alita_sdk/tools/bitbucket/cloud_api_wrapper.py +49 -35
- alita_sdk/tools/browser/__init__.py +5 -4
- alita_sdk/tools/carrier/__init__.py +5 -6
- alita_sdk/tools/carrier/backend_reports_tool.py +6 -6
- alita_sdk/tools/carrier/run_ui_test_tool.py +6 -6
- alita_sdk/tools/carrier/ui_reports_tool.py +5 -5
- alita_sdk/tools/chunkers/__init__.py +3 -1
- alita_sdk/tools/chunkers/code/treesitter/treesitter.py +37 -13
- alita_sdk/tools/chunkers/sematic/json_chunker.py +1 -0
- alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
- alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
- alita_sdk/tools/chunkers/universal_chunker.py +270 -0
- alita_sdk/tools/cloud/aws/__init__.py +10 -7
- alita_sdk/tools/cloud/azure/__init__.py +10 -7
- alita_sdk/tools/cloud/gcp/__init__.py +10 -7
- alita_sdk/tools/cloud/k8s/__init__.py +10 -7
- alita_sdk/tools/code/linter/__init__.py +10 -8
- alita_sdk/tools/code/loaders/codesearcher.py +3 -2
- alita_sdk/tools/code/sonar/__init__.py +11 -8
- alita_sdk/tools/code_indexer_toolkit.py +82 -22
- alita_sdk/tools/confluence/__init__.py +22 -16
- alita_sdk/tools/confluence/api_wrapper.py +107 -30
- alita_sdk/tools/confluence/loader.py +14 -2
- alita_sdk/tools/custom_open_api/__init__.py +12 -5
- alita_sdk/tools/elastic/__init__.py +11 -8
- alita_sdk/tools/elitea_base.py +493 -30
- alita_sdk/tools/figma/__init__.py +58 -11
- alita_sdk/tools/figma/api_wrapper.py +1235 -143
- alita_sdk/tools/figma/figma_client.py +73 -0
- alita_sdk/tools/figma/toon_tools.py +2748 -0
- alita_sdk/tools/github/__init__.py +14 -15
- alita_sdk/tools/github/github_client.py +224 -100
- alita_sdk/tools/github/graphql_client_wrapper.py +119 -33
- alita_sdk/tools/github/schemas.py +14 -5
- alita_sdk/tools/github/tool.py +5 -1
- alita_sdk/tools/github/tool_prompts.py +9 -22
- alita_sdk/tools/gitlab/__init__.py +16 -11
- alita_sdk/tools/gitlab/api_wrapper.py +218 -48
- alita_sdk/tools/gitlab_org/__init__.py +10 -9
- alita_sdk/tools/gitlab_org/api_wrapper.py +63 -64
- alita_sdk/tools/google/bigquery/__init__.py +13 -12
- alita_sdk/tools/google/bigquery/tool.py +5 -1
- alita_sdk/tools/google_places/__init__.py +11 -8
- alita_sdk/tools/google_places/api_wrapper.py +1 -1
- alita_sdk/tools/jira/__init__.py +17 -10
- alita_sdk/tools/jira/api_wrapper.py +92 -41
- alita_sdk/tools/keycloak/__init__.py +11 -8
- alita_sdk/tools/localgit/__init__.py +9 -3
- alita_sdk/tools/localgit/local_git.py +62 -54
- alita_sdk/tools/localgit/tool.py +5 -1
- alita_sdk/tools/memory/__init__.py +12 -4
- alita_sdk/tools/non_code_indexer_toolkit.py +1 -0
- alita_sdk/tools/ocr/__init__.py +11 -8
- alita_sdk/tools/openapi/__init__.py +491 -106
- alita_sdk/tools/openapi/api_wrapper.py +1368 -0
- alita_sdk/tools/openapi/tool.py +20 -0
- alita_sdk/tools/pandas/__init__.py +20 -12
- alita_sdk/tools/pandas/api_wrapper.py +38 -25
- alita_sdk/tools/pandas/dataframe/generator/base.py +3 -1
- alita_sdk/tools/postman/__init__.py +10 -9
- alita_sdk/tools/pptx/__init__.py +11 -10
- alita_sdk/tools/pptx/pptx_wrapper.py +1 -1
- alita_sdk/tools/qtest/__init__.py +31 -11
- alita_sdk/tools/qtest/api_wrapper.py +2135 -86
- alita_sdk/tools/rally/__init__.py +10 -9
- alita_sdk/tools/rally/api_wrapper.py +1 -1
- alita_sdk/tools/report_portal/__init__.py +12 -8
- alita_sdk/tools/salesforce/__init__.py +10 -8
- alita_sdk/tools/servicenow/__init__.py +17 -15
- alita_sdk/tools/servicenow/api_wrapper.py +1 -1
- alita_sdk/tools/sharepoint/__init__.py +10 -7
- alita_sdk/tools/sharepoint/api_wrapper.py +129 -38
- alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
- alita_sdk/tools/sharepoint/utils.py +8 -2
- alita_sdk/tools/slack/__init__.py +10 -7
- alita_sdk/tools/slack/api_wrapper.py +2 -2
- alita_sdk/tools/sql/__init__.py +12 -9
- alita_sdk/tools/testio/__init__.py +10 -7
- alita_sdk/tools/testrail/__init__.py +11 -10
- alita_sdk/tools/testrail/api_wrapper.py +1 -1
- alita_sdk/tools/utils/__init__.py +9 -4
- alita_sdk/tools/utils/content_parser.py +103 -18
- alita_sdk/tools/utils/text_operations.py +410 -0
- alita_sdk/tools/utils/tool_prompts.py +79 -0
- alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +30 -13
- alita_sdk/tools/xray/__init__.py +13 -9
- alita_sdk/tools/yagmail/__init__.py +9 -3
- alita_sdk/tools/zephyr/__init__.py +10 -7
- alita_sdk/tools/zephyr_enterprise/__init__.py +11 -7
- alita_sdk/tools/zephyr_essential/__init__.py +10 -7
- alita_sdk/tools/zephyr_essential/api_wrapper.py +30 -13
- alita_sdk/tools/zephyr_essential/client.py +2 -2
- alita_sdk/tools/zephyr_scale/__init__.py +11 -8
- alita_sdk/tools/zephyr_scale/api_wrapper.py +2 -2
- alita_sdk/tools/zephyr_squad/__init__.py +10 -7
- {alita_sdk-0.3.379.dist-info → alita_sdk-0.3.627.dist-info}/METADATA +154 -8
- alita_sdk-0.3.627.dist-info/RECORD +468 -0
- alita_sdk-0.3.627.dist-info/entry_points.txt +2 -0
- alita_sdk-0.3.379.dist-info/RECORD +0 -360
- {alita_sdk-0.3.379.dist-info → alita_sdk-0.3.627.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.379.dist-info → alita_sdk-0.3.627.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.379.dist-info → alita_sdk-0.3.627.dist-info}/top_level.txt +0 -0
|
@@ -5,8 +5,9 @@ from pydantic import create_model, BaseModel, ConfigDict, Field
|
|
|
5
5
|
from .api_wrapper import SonarApiWrapper
|
|
6
6
|
from ...base.tool import BaseAction
|
|
7
7
|
from ...elitea_base import filter_missconfigured_index_tools
|
|
8
|
-
from ...utils import clean_string,
|
|
8
|
+
from ...utils import clean_string, get_max_toolkit_length
|
|
9
9
|
from ....configurations.sonar import SonarConfiguration
|
|
10
|
+
from ....runtime.utils.constants import TOOLKIT_NAME_META, TOOL_NAME_META, TOOLKIT_TYPE_META
|
|
10
11
|
|
|
11
12
|
name = "sonar"
|
|
12
13
|
|
|
@@ -21,15 +22,13 @@ def get_tools(tool):
|
|
|
21
22
|
|
|
22
23
|
class SonarToolkit(BaseToolkit):
|
|
23
24
|
tools: list[BaseTool] = []
|
|
24
|
-
toolkit_max_length: int = 0
|
|
25
25
|
|
|
26
26
|
@staticmethod
|
|
27
27
|
def toolkit_config_schema() -> BaseModel:
|
|
28
28
|
selected_tools = {x['name']: x['args_schema'].schema() for x in SonarApiWrapper.model_construct().get_available_tools()}
|
|
29
|
-
SonarToolkit.toolkit_max_length = get_max_toolkit_length(selected_tools)
|
|
30
29
|
return create_model(
|
|
31
30
|
name,
|
|
32
|
-
sonar_project_name=(str, Field(description="Project name of the desired repository"
|
|
31
|
+
sonar_project_name=(str, Field(description="Project name of the desired repository")),
|
|
33
32
|
sonar_configuration=(SonarConfiguration, Field(description="Sonar Configuration", json_schema_extra={'configuration_types': ['sonar']})),
|
|
34
33
|
selected_tools=(List[Literal[tuple(selected_tools)]], Field(default=[], json_schema_extra={'args_schemas': selected_tools})),
|
|
35
34
|
__config__=ConfigDict(json_schema_extra=
|
|
@@ -55,15 +54,19 @@ class SonarToolkit(BaseToolkit):
|
|
|
55
54
|
sonar_api_wrapper = SonarApiWrapper(**wrapper_payload)
|
|
56
55
|
available_tools = sonar_api_wrapper.get_available_tools()
|
|
57
56
|
tools = []
|
|
58
|
-
prefix = clean_string(toolkit_name, SonarToolkit.toolkit_max_length) + TOOLKIT_SPLITTER if toolkit_name else ''
|
|
59
57
|
for tool in available_tools:
|
|
60
58
|
if selected_tools and tool["name"] not in selected_tools:
|
|
61
59
|
continue
|
|
60
|
+
description = tool["description"]
|
|
61
|
+
if toolkit_name:
|
|
62
|
+
description = f"Toolkit: {toolkit_name}\n{description}"
|
|
63
|
+
description = description[:1000]
|
|
62
64
|
tools.append(BaseAction(
|
|
63
65
|
api_wrapper=sonar_api_wrapper,
|
|
64
|
-
name=
|
|
65
|
-
description=
|
|
66
|
-
args_schema=tool["args_schema"]
|
|
66
|
+
name=tool["name"],
|
|
67
|
+
description=description,
|
|
68
|
+
args_schema=tool["args_schema"],
|
|
69
|
+
metadata={TOOLKIT_NAME_META: toolkit_name, TOOLKIT_TYPE_META: name, TOOL_NAME_META: tool["name"]} if toolkit_name else {TOOL_NAME_META: tool["name"]}
|
|
67
70
|
))
|
|
68
71
|
return cls(tools=tools)
|
|
69
72
|
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import ast
|
|
2
2
|
import fnmatch
|
|
3
|
+
import json
|
|
3
4
|
import logging
|
|
4
5
|
from typing import Optional, List, Generator
|
|
5
6
|
|
|
@@ -8,20 +9,20 @@ from langchain_core.tools import ToolException
|
|
|
8
9
|
from pydantic import Field
|
|
9
10
|
|
|
10
11
|
from alita_sdk.tools.base_indexer_toolkit import BaseIndexerToolkit
|
|
11
|
-
from .chunkers.code.codeparser import parse_code_files_for_db
|
|
12
12
|
|
|
13
13
|
logger = logging.getLogger(__name__)
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
class CodeIndexerToolkit(BaseIndexerToolkit):
|
|
17
17
|
def _get_indexed_data(self, index_name: str):
|
|
18
|
+
self._ensure_vectorstore_initialized()
|
|
18
19
|
if not self.vector_adapter:
|
|
19
20
|
raise ToolException("Vector adapter is not initialized. "
|
|
20
21
|
"Check your configuration: embedding_model and vectorstore_type.")
|
|
21
22
|
return self.vector_adapter.get_code_indexed_data(self, index_name)
|
|
22
23
|
|
|
23
24
|
def key_fn(self, document: Document):
|
|
24
|
-
return document.metadata.get(
|
|
25
|
+
return document.metadata.get("filename")
|
|
25
26
|
|
|
26
27
|
def compare_fn(self, document: Document, idx_data):
|
|
27
28
|
return (document.metadata.get('commit_hash') and
|
|
@@ -37,16 +38,18 @@ class CodeIndexerToolkit(BaseIndexerToolkit):
|
|
|
37
38
|
branch: Optional[str] = None,
|
|
38
39
|
whitelist: Optional[List[str]] = None,
|
|
39
40
|
blacklist: Optional[List[str]] = None,
|
|
41
|
+
chunking_config: Optional[dict] = None,
|
|
40
42
|
**kwargs) -> Generator[Document, None, None]:
|
|
41
43
|
"""Index repository files in the vector store using code parsing."""
|
|
42
44
|
yield from self.loader(
|
|
43
45
|
branch=branch,
|
|
44
46
|
whitelist=whitelist,
|
|
45
|
-
blacklist=blacklist
|
|
47
|
+
blacklist=blacklist,
|
|
48
|
+
chunking_config=chunking_config
|
|
46
49
|
)
|
|
47
50
|
|
|
48
51
|
def _extend_data(self, documents: Generator[Document, None, None]):
|
|
49
|
-
yield from
|
|
52
|
+
yield from documents
|
|
50
53
|
|
|
51
54
|
def _index_tool_params(self):
|
|
52
55
|
"""Return the parameters for indexing data."""
|
|
@@ -65,26 +68,55 @@ class CodeIndexerToolkit(BaseIndexerToolkit):
|
|
|
65
68
|
def loader(self,
|
|
66
69
|
branch: Optional[str] = None,
|
|
67
70
|
whitelist: Optional[List[str]] = None,
|
|
68
|
-
blacklist: Optional[List[str]] = None
|
|
71
|
+
blacklist: Optional[List[str]] = None,
|
|
72
|
+
chunked: bool = True,
|
|
73
|
+
chunking_config: Optional[dict] = None) -> Generator[Document, None, None]:
|
|
69
74
|
"""
|
|
70
|
-
Generates
|
|
75
|
+
Generates Documents from files in a branch, respecting whitelist and blacklist patterns.
|
|
71
76
|
|
|
72
77
|
Parameters:
|
|
73
78
|
- branch (Optional[str]): Branch for listing files. Defaults to the current branch if None.
|
|
74
79
|
- whitelist (Optional[List[str]]): File extensions or paths to include. Defaults to all files if None.
|
|
75
80
|
- blacklist (Optional[List[str]]): File extensions or paths to exclude. Defaults to no exclusions if None.
|
|
81
|
+
- chunked (bool): If True (default), applies universal chunker based on file type.
|
|
82
|
+
If False, returns raw Documents without chunking.
|
|
83
|
+
- chunking_config (Optional[dict]): Chunking configuration by file extension
|
|
76
84
|
|
|
77
85
|
Returns:
|
|
78
|
-
- generator: Yields
|
|
86
|
+
- generator: Yields Documents from files matching the whitelist but not the blacklist.
|
|
87
|
+
Each document has exactly the key 'filename' in metadata, which is used as an ID
|
|
88
|
+
for further operations (indexing, deduplication, and retrieval).
|
|
79
89
|
|
|
80
90
|
Example:
|
|
81
91
|
# Use 'feature-branch', include '.py' files, exclude 'test_' files
|
|
82
|
-
|
|
92
|
+
for doc in loader(branch='feature-branch', whitelist=['*.py'], blacklist=['*test_*']):
|
|
93
|
+
print(doc.page_content)
|
|
83
94
|
|
|
84
95
|
Notes:
|
|
85
96
|
- Whitelist and blacklist use Unix shell-style wildcards.
|
|
86
97
|
- Files must match the whitelist and not the blacklist to be included.
|
|
98
|
+
- Each document MUST have exactly the key 'filename' in metadata. This key is used as an ID
|
|
99
|
+
for further operations such as indexing, deduplication, and retrieval.
|
|
100
|
+
- When chunked=True:
|
|
101
|
+
- .md files → markdown chunker (header-based splitting)
|
|
102
|
+
- .py/.js/.ts/etc → code parser (TreeSitter-based)
|
|
103
|
+
- .json files → JSON chunker
|
|
104
|
+
- other files → default text chunker
|
|
87
105
|
"""
|
|
106
|
+
import hashlib
|
|
107
|
+
|
|
108
|
+
# Auto-include extensions from chunking_config if whitelist is specified
|
|
109
|
+
# This allows chunking config to work without manually adding extensions to whitelist
|
|
110
|
+
if chunking_config and whitelist:
|
|
111
|
+
for ext_pattern in chunking_config.keys():
|
|
112
|
+
# Normalize extension pattern (both ".cbl" and "*.cbl" should work)
|
|
113
|
+
normalized = ext_pattern if ext_pattern.startswith('*') else f'*{ext_pattern}'
|
|
114
|
+
if normalized not in whitelist:
|
|
115
|
+
whitelist.append(normalized)
|
|
116
|
+
self._log_tool_event(
|
|
117
|
+
message=f"Auto-included extension '{normalized}' from chunking_config",
|
|
118
|
+
tool_name="loader"
|
|
119
|
+
)
|
|
88
120
|
|
|
89
121
|
_files = self.__handle_get_files("", self.__get_branch(branch))
|
|
90
122
|
self._log_tool_event(message="Listing files in branch", tool_name="loader")
|
|
@@ -102,32 +134,60 @@ class CodeIndexerToolkit(BaseIndexerToolkit):
|
|
|
102
134
|
or any(file_path.endswith(f'.{pattern}') for pattern in blacklist))
|
|
103
135
|
return False
|
|
104
136
|
|
|
105
|
-
def
|
|
137
|
+
def raw_document_generator() -> Generator[Document, None, None]:
|
|
138
|
+
"""Yields raw Documents without chunking."""
|
|
106
139
|
self._log_tool_event(message="Reading the files", tool_name="loader")
|
|
107
|
-
# log the progress of file reading
|
|
108
140
|
total_files = len(_files)
|
|
141
|
+
processed = 0
|
|
142
|
+
|
|
109
143
|
for idx, file in enumerate(_files, 1):
|
|
110
144
|
if is_whitelisted(file) and not is_blacklisted(file):
|
|
111
|
-
# read file ONLY if it matches whitelist and does not match blacklist
|
|
112
145
|
try:
|
|
113
146
|
file_content = self._read_file(file, self.__get_branch(branch))
|
|
114
147
|
except Exception as e:
|
|
115
148
|
logger.error(f"Failed to read file {file}: {e}")
|
|
116
|
-
|
|
149
|
+
continue
|
|
150
|
+
|
|
117
151
|
if not file_content:
|
|
118
|
-
# empty file, skip
|
|
119
152
|
continue
|
|
120
|
-
|
|
121
|
-
|
|
153
|
+
|
|
154
|
+
# Ensure file content is a string
|
|
155
|
+
if isinstance(file_content, bytes):
|
|
156
|
+
file_content = file_content.decode("utf-8", errors="ignore")
|
|
157
|
+
elif isinstance(file_content, dict) and file.endswith('.json'):
|
|
158
|
+
file_content = json.dumps(file_content)
|
|
159
|
+
elif not isinstance(file_content, str):
|
|
160
|
+
file_content = str(file_content)
|
|
161
|
+
|
|
162
|
+
# Hash the file content for uniqueness tracking
|
|
122
163
|
file_hash = hashlib.sha256(file_content.encode("utf-8")).hexdigest()
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
164
|
+
processed += 1
|
|
165
|
+
|
|
166
|
+
yield Document(
|
|
167
|
+
page_content=file_content,
|
|
168
|
+
metadata={
|
|
169
|
+
'file_path': file,
|
|
170
|
+
'filename': file,
|
|
171
|
+
'source': file,
|
|
172
|
+
'commit_hash': file_hash,
|
|
173
|
+
}
|
|
174
|
+
)
|
|
175
|
+
|
|
126
176
|
if idx % 10 == 0 or idx == total_files:
|
|
127
|
-
self._log_tool_event(
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
177
|
+
self._log_tool_event(
|
|
178
|
+
message=f"{idx} out of {total_files} files checked, {processed} matched",
|
|
179
|
+
tool_name="loader"
|
|
180
|
+
)
|
|
181
|
+
|
|
182
|
+
self._log_tool_event(message=f"{processed} files loaded", tool_name="loader")
|
|
183
|
+
|
|
184
|
+
if not chunked:
|
|
185
|
+
# Return raw documents without chunking
|
|
186
|
+
return raw_document_generator()
|
|
187
|
+
|
|
188
|
+
# Apply universal chunker based on file type
|
|
189
|
+
from .chunkers.universal_chunker import universal_chunker
|
|
190
|
+
return universal_chunker(raw_document_generator())
|
|
131
191
|
|
|
132
192
|
def __handle_get_files(self, path: str, branch: str):
|
|
133
193
|
"""
|
|
@@ -6,14 +6,15 @@ from ..base.tool import BaseAction
|
|
|
6
6
|
from pydantic import create_model, BaseModel, ConfigDict, Field
|
|
7
7
|
|
|
8
8
|
from ..elitea_base import filter_missconfigured_index_tools
|
|
9
|
-
from ..utils import clean_string,
|
|
9
|
+
from ..utils import clean_string, get_max_toolkit_length, parse_list, check_connection_response
|
|
10
10
|
from ...configurations.confluence import ConfluenceConfiguration
|
|
11
11
|
from ...configurations.pgvector import PgVectorConfiguration
|
|
12
12
|
import requests
|
|
13
|
+
from ...runtime.utils.constants import TOOLKIT_NAME_META, TOOL_NAME_META, TOOLKIT_TYPE_META
|
|
13
14
|
|
|
14
15
|
name = "confluence"
|
|
15
16
|
|
|
16
|
-
def
|
|
17
|
+
def get_toolkit(tool):
|
|
17
18
|
return ConfluenceToolkit().get_toolkit(
|
|
18
19
|
selected_tools=tool['settings'].get('selected_tools', []),
|
|
19
20
|
space=tool['settings'].get('space', None),
|
|
@@ -33,18 +34,19 @@ def get_tools(tool):
|
|
|
33
34
|
doctype='doc',
|
|
34
35
|
embedding_model=tool['settings'].get('embedding_model'),
|
|
35
36
|
vectorstore_type="PGVector"
|
|
36
|
-
)
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
def get_tools(tool):
|
|
40
|
+
return get_toolkit(tool).get_tools()
|
|
37
41
|
|
|
38
42
|
|
|
39
43
|
class ConfluenceToolkit(BaseToolkit):
|
|
40
44
|
tools: List[BaseTool] = []
|
|
41
|
-
toolkit_max_length: int = 0
|
|
42
45
|
|
|
43
46
|
@staticmethod
|
|
44
47
|
def toolkit_config_schema() -> BaseModel:
|
|
45
48
|
selected_tools = {x['name']: x['args_schema'].schema() for x in
|
|
46
49
|
ConfluenceAPIWrapper.model_construct().get_available_tools()}
|
|
47
|
-
ConfluenceToolkit.toolkit_max_length = get_max_toolkit_length(selected_tools)
|
|
48
50
|
|
|
49
51
|
@check_connection_response
|
|
50
52
|
def check_connection(self):
|
|
@@ -67,19 +69,18 @@ class ConfluenceToolkit(BaseToolkit):
|
|
|
67
69
|
|
|
68
70
|
model = create_model(
|
|
69
71
|
name,
|
|
70
|
-
space=(str, Field(description="Space",
|
|
71
|
-
'max_toolkit_length': ConfluenceToolkit.toolkit_max_length})),
|
|
72
|
+
space=(str, Field(description="Space")),
|
|
72
73
|
cloud=(bool, Field(description="Hosting Option", json_schema_extra={'configuration': True})),
|
|
73
|
-
limit=(int, Field(description="Pages limit per request", default=5)),
|
|
74
|
+
limit=(int, Field(description="Pages limit per request", default=5, gt=0)),
|
|
74
75
|
labels=(Optional[str], Field(
|
|
75
76
|
description="List of comma separated labels used for labeling of agent's created or updated entities",
|
|
76
77
|
default=None,
|
|
77
78
|
examples="alita,elitea;another-label"
|
|
78
79
|
)),
|
|
79
|
-
max_pages=(int, Field(description="Max total pages", default=10)),
|
|
80
|
-
number_of_retries=(int, Field(description="Number of retries", default=2)),
|
|
81
|
-
min_retry_seconds=(int, Field(description="Min retry, sec", default=10)),
|
|
82
|
-
max_retry_seconds=(int, Field(description="Max retry, sec", default=60)),
|
|
80
|
+
max_pages=(int, Field(description="Max total pages", default=10, gt=0)),
|
|
81
|
+
number_of_retries=(int, Field(description="Number of retries", default=2, ge=0)),
|
|
82
|
+
min_retry_seconds=(int, Field(description="Min retry, sec", default=10, ge=0)),
|
|
83
|
+
max_retry_seconds=(int, Field(description="Max retry, sec", default=60, ge=0)),
|
|
83
84
|
# optional field for custom headers as dictionary
|
|
84
85
|
custom_headers=(Optional[dict], Field(description="Custom headers for API requests", default={})),
|
|
85
86
|
confluence_configuration=(ConfluenceConfiguration, Field(description="Confluence Configuration", json_schema_extra={'configuration_types': ['confluence']})),
|
|
@@ -115,18 +116,23 @@ class ConfluenceToolkit(BaseToolkit):
|
|
|
115
116
|
**(kwargs.get('pgvector_configuration') or {}),
|
|
116
117
|
}
|
|
117
118
|
confluence_api_wrapper = ConfluenceAPIWrapper(**wrapper_payload)
|
|
118
|
-
prefix = clean_string(toolkit_name, ConfluenceToolkit.toolkit_max_length) + TOOLKIT_SPLITTER if toolkit_name else ''
|
|
119
119
|
available_tools = confluence_api_wrapper.get_available_tools()
|
|
120
120
|
tools = []
|
|
121
121
|
for tool in available_tools:
|
|
122
122
|
if selected_tools:
|
|
123
123
|
if tool["name"] not in selected_tools:
|
|
124
124
|
continue
|
|
125
|
+
description = tool["description"]
|
|
126
|
+
if toolkit_name:
|
|
127
|
+
description = f"Toolkit: {toolkit_name}\n{description}"
|
|
128
|
+
description = f"Confluence space: {confluence_api_wrapper.space}\n{description}"
|
|
129
|
+
description = description[:1000]
|
|
125
130
|
tools.append(BaseAction(
|
|
126
131
|
api_wrapper=confluence_api_wrapper,
|
|
127
|
-
name=
|
|
128
|
-
description=
|
|
129
|
-
args_schema=tool["args_schema"]
|
|
132
|
+
name=tool["name"],
|
|
133
|
+
description=description,
|
|
134
|
+
args_schema=tool["args_schema"],
|
|
135
|
+
metadata={TOOLKIT_NAME_META: toolkit_name, TOOLKIT_TYPE_META: name, TOOL_NAME_META: tool["name"]} if toolkit_name else {TOOL_NAME_META: tool["name"]}
|
|
130
136
|
))
|
|
131
137
|
return cls(tools=tools)
|
|
132
138
|
|
|
@@ -7,12 +7,14 @@ from json import JSONDecodeError
|
|
|
7
7
|
from typing import Optional, List, Any, Dict, Callable, Generator, Literal
|
|
8
8
|
|
|
9
9
|
import requests
|
|
10
|
+
from atlassian.errors import ApiError
|
|
10
11
|
from langchain_community.document_loaders.confluence import ContentFormat
|
|
11
12
|
from langchain_core.documents import Document
|
|
12
13
|
from langchain_core.messages import HumanMessage
|
|
13
14
|
from langchain_core.tools import ToolException
|
|
14
15
|
from markdownify import markdownify
|
|
15
16
|
from pydantic import Field, PrivateAttr, model_validator, create_model, SecretStr
|
|
17
|
+
from requests import HTTPError
|
|
16
18
|
from tenacity import retry, stop_after_attempt, wait_exponential, before_sleep_log
|
|
17
19
|
|
|
18
20
|
from alita_sdk.tools.non_code_indexer_toolkit import NonCodeIndexerToolkit
|
|
@@ -194,6 +196,7 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
|
|
|
194
196
|
keep_markdown_format: Optional[bool] = True
|
|
195
197
|
ocr_languages: Optional[str] = None
|
|
196
198
|
keep_newlines: Optional[bool] = True
|
|
199
|
+
_errors: Optional[list[str]] = None
|
|
197
200
|
_image_cache: ImageDescriptionCache = PrivateAttr(default_factory=ImageDescriptionCache)
|
|
198
201
|
|
|
199
202
|
@model_validator(mode='before')
|
|
@@ -477,28 +480,78 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
|
|
|
477
480
|
"""Gets pages with specific label in the Confluence space."""
|
|
478
481
|
|
|
479
482
|
start = 0
|
|
480
|
-
pages_info = []
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
483
|
+
pages_info: List[Dict[str, Any]] = []
|
|
484
|
+
seen_ids: set[str] = set()
|
|
485
|
+
|
|
486
|
+
# Use a while-loop driven by unique pages collected and
|
|
487
|
+
# presence of additional results instead of a fixed number
|
|
488
|
+
# of iterations based purely on max_pages/limit.
|
|
489
|
+
while len(pages_info) < (self.max_pages or 0):
|
|
490
|
+
pages = self.client.get_all_pages_by_label(
|
|
491
|
+
label,
|
|
492
|
+
start=start,
|
|
493
|
+
limit=self.limit,
|
|
494
|
+
) # , expand="body.view.value"
|
|
484
495
|
if not pages:
|
|
485
496
|
break
|
|
486
497
|
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
498
|
+
# Collect only ids we haven't processed yet to avoid
|
|
499
|
+
# calling get_page_by_id multiple times for the same
|
|
500
|
+
# Confluence page.
|
|
501
|
+
new_ids: List[str] = []
|
|
502
|
+
for p in pages:
|
|
503
|
+
page_id = p["id"] if isinstance(p, dict) else getattr(p, "id", None)
|
|
504
|
+
if page_id is None:
|
|
505
|
+
continue
|
|
506
|
+
if page_id in seen_ids:
|
|
507
|
+
continue
|
|
508
|
+
seen_ids.add(page_id)
|
|
509
|
+
new_ids.append(page_id)
|
|
510
|
+
|
|
511
|
+
if new_ids:
|
|
512
|
+
for page in self.get_pages_by_id(new_ids):
|
|
513
|
+
meta = getattr(page, "metadata", {}) or {}
|
|
514
|
+
page_id = meta.get("id")
|
|
515
|
+
page_title = meta.get("title")
|
|
516
|
+
page_url = meta.get("source")
|
|
517
|
+
content = getattr(page, "page_content", None)
|
|
518
|
+
|
|
519
|
+
if page_id is None:
|
|
520
|
+
continue
|
|
521
|
+
|
|
522
|
+
pages_info.append(
|
|
523
|
+
{
|
|
524
|
+
"page_id": page_id,
|
|
525
|
+
"page_title": page_title,
|
|
526
|
+
"page_url": page_url,
|
|
527
|
+
"content": content,
|
|
528
|
+
}
|
|
529
|
+
)
|
|
530
|
+
|
|
531
|
+
# Respect max_pages on unique pages collected.
|
|
532
|
+
if len(pages_info) >= (self.max_pages or 0):
|
|
533
|
+
break
|
|
534
|
+
|
|
535
|
+
# Advance the offset by the requested page size.
|
|
493
536
|
start += self.limit
|
|
494
|
-
|
|
537
|
+
|
|
538
|
+
# Defensive break: if the API returns fewer items than
|
|
539
|
+
# requested, there are likely no more pages to fetch.
|
|
540
|
+
if len(pages) < self.limit:
|
|
541
|
+
break
|
|
542
|
+
|
|
543
|
+
# Slice as an extra safety net in case of any race conditions
|
|
544
|
+
# around the max_pages guard in the loop above.
|
|
545
|
+
return pages_info[: (self.max_pages or len(pages_info))]
|
|
495
546
|
|
|
496
547
|
def is_public_page(self, page: dict) -> bool:
|
|
497
548
|
"""Check if a page is publicly accessible."""
|
|
498
549
|
restrictions = self.client.get_all_restrictions_for_content(page["id"])
|
|
499
550
|
|
|
500
551
|
return (
|
|
501
|
-
page["status"] == "current"
|
|
552
|
+
(page["status"] == "current"
|
|
553
|
+
# allow user to see archived content if needed
|
|
554
|
+
or page["status"] == "archived")
|
|
502
555
|
and not restrictions["read"]["restrictions"]["user"]["results"]
|
|
503
556
|
and not restrictions["read"]["restrictions"]["group"]["results"]
|
|
504
557
|
)
|
|
@@ -518,18 +571,35 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
|
|
|
518
571
|
),
|
|
519
572
|
before_sleep=before_sleep_log(logger, logging.WARNING),
|
|
520
573
|
)(self.client.get_page_by_id)
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
574
|
+
try:
|
|
575
|
+
page = get_page(
|
|
576
|
+
page_id=page_id, expand=f"{self.content_format.value},version"
|
|
577
|
+
)
|
|
578
|
+
except (ApiError, HTTPError) as e:
|
|
579
|
+
logger.error(f"Error fetching page with ID {page_id}: {e}")
|
|
580
|
+
page_content_temp = f"Confluence API Error: cannot fetch the page with ID {page_id}: {e}"
|
|
581
|
+
# store errors
|
|
582
|
+
if self._errors is None:
|
|
583
|
+
self._errors = []
|
|
584
|
+
self._errors.append(page_content_temp)
|
|
585
|
+
return Document(page_content=page_content_temp,
|
|
586
|
+
metadata={})
|
|
587
|
+
# TODO: update on toolkit advanced settings level as a separate feature
|
|
588
|
+
# if not self.include_restricted_content and not self.is_public_page(page):
|
|
589
|
+
# continue
|
|
526
590
|
yield self.process_page(page, skip_images)
|
|
527
591
|
|
|
592
|
+
def _log_errors(self):
|
|
593
|
+
""" Log errors encountered during toolkit execution. """
|
|
594
|
+
if self._errors:
|
|
595
|
+
logger.info(f"Errors encountered during toolkit execution: {self._errors}")
|
|
596
|
+
|
|
528
597
|
def read_page_by_id(self, page_id: str, skip_images: bool = False):
|
|
529
598
|
"""Reads a page by its id in the Confluence space. If id is not available, but there is a title - use get_page_id first."""
|
|
530
599
|
result = list(self.get_pages_by_id([page_id], skip_images))
|
|
531
600
|
if not result:
|
|
532
|
-
"
|
|
601
|
+
return f"Pages not found. Errors: {self._errors}" if self._errors \
|
|
602
|
+
else "Pages not found or you do not have access to them."
|
|
533
603
|
return result[0].page_content
|
|
534
604
|
# return self._strip_base64_images(result[0].page_content) if skip_images else result[0].page_content
|
|
535
605
|
|
|
@@ -550,11 +620,18 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
|
|
|
550
620
|
def _process_search(self, cql, skip_images: bool = False):
|
|
551
621
|
start = 0
|
|
552
622
|
pages_info = []
|
|
623
|
+
seen_ids: set = set() # Track seen page IDs to avoid duplicates
|
|
553
624
|
for _ in range((self.max_pages + self.limit - 1) // self.limit):
|
|
554
625
|
pages = self.client.cql(cql, start=start, limit=self.limit).get("results", [])
|
|
555
626
|
if not pages:
|
|
556
627
|
break
|
|
557
|
-
|
|
628
|
+
# Deduplicate page IDs before processing
|
|
629
|
+
page_ids = []
|
|
630
|
+
for page in pages:
|
|
631
|
+
page_id = page['content']['id']
|
|
632
|
+
if page_id not in seen_ids:
|
|
633
|
+
seen_ids.add(page_id)
|
|
634
|
+
page_ids.append(page_id)
|
|
558
635
|
for page in self.get_pages_by_id(page_ids, skip_images):
|
|
559
636
|
page_info = {
|
|
560
637
|
'content': page.page_content,
|
|
@@ -874,14 +951,14 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
|
|
|
874
951
|
|
|
875
952
|
# Re-verify extension filters
|
|
876
953
|
# Check if file should be skipped based on skip_extensions
|
|
877
|
-
if any(re.match(pattern.replace('
|
|
954
|
+
if any(re.match(re.escape(pattern).replace(r'\*', '.*') + '$', title, re.IGNORECASE)
|
|
878
955
|
for pattern in self._skip_extensions):
|
|
879
956
|
continue
|
|
880
957
|
|
|
881
958
|
# Check if file should be included based on include_extensions
|
|
882
959
|
# If include_extensions is empty, process all files (that weren't skipped)
|
|
883
960
|
if self._include_extensions and not (
|
|
884
|
-
any(re.match(pattern.replace('
|
|
961
|
+
any(re.match(re.escape(pattern).replace(r'\*', '.*') + '$', title, re.IGNORECASE)
|
|
885
962
|
for pattern in self._include_extensions)):
|
|
886
963
|
continue
|
|
887
964
|
|
|
@@ -892,6 +969,9 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
|
|
|
892
969
|
created_date = hist.get('createdDate', '') if hist else attachment.get('created', '')
|
|
893
970
|
last_updated = hist.get('lastUpdated', {}).get('when', '') if hist else ''
|
|
894
971
|
|
|
972
|
+
attachment_path = attachment['_links']['download'] if attachment.get(
|
|
973
|
+
'_links', {}).get('download') else ''
|
|
974
|
+
download_url = self.client.url.rstrip('/') + attachment_path
|
|
895
975
|
metadata = {
|
|
896
976
|
'name': title,
|
|
897
977
|
'size': attachment.get('extensions', {}).get('fileSize', None),
|
|
@@ -901,14 +981,10 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
|
|
|
901
981
|
'media_type': media_type,
|
|
902
982
|
'labels': [label['name'] for label in
|
|
903
983
|
attachment.get('metadata', {}).get('labels', {}).get('results', [])],
|
|
904
|
-
'download_url':
|
|
905
|
-
'_links', {}).get('download') else None
|
|
984
|
+
'download_url': download_url
|
|
906
985
|
}
|
|
907
|
-
|
|
908
|
-
download_url = self.base_url.rstrip('/') + attachment['_links']['download']
|
|
909
|
-
|
|
910
986
|
try:
|
|
911
|
-
resp = self.client.request(method="GET", path=
|
|
987
|
+
resp = self.client.request(method="GET", path=attachment_path, advanced_mode=True)
|
|
912
988
|
if resp.status_code == 200:
|
|
913
989
|
content = resp.content
|
|
914
990
|
else:
|
|
@@ -1661,8 +1737,8 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
|
|
|
1661
1737
|
"page_ids": (Optional[List[str]], Field(description="List of page IDs to retrieve.", default=None)),
|
|
1662
1738
|
"label": (Optional[str], Field(description="Label to filter pages.", default=None)),
|
|
1663
1739
|
"cql": (Optional[str], Field(description="CQL query to filter pages.", default=None)),
|
|
1664
|
-
"limit": (Optional[int], Field(description="Limit the number of results.", default=10)),
|
|
1665
|
-
"max_pages": (Optional[int], Field(description="Maximum number of pages to retrieve.", default=1000)),
|
|
1740
|
+
"limit": (Optional[int], Field(description="Limit the number of results.", default=10, gt=0)),
|
|
1741
|
+
"max_pages": (Optional[int], Field(description="Maximum number of pages to retrieve.", default=1000, gt=0)),
|
|
1666
1742
|
"include_restricted_content": (Optional[bool], Field(description="Include restricted content.", default=False)),
|
|
1667
1743
|
"include_archived_content": (Optional[bool], Field(description="Include archived content.", default=False)),
|
|
1668
1744
|
"include_attachments": (Optional[bool], Field(description="Include attachments.", default=False)),
|
|
@@ -1798,4 +1874,5 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
|
|
|
1798
1874
|
"description": self.get_page_attachments.__doc__,
|
|
1799
1875
|
"args_schema": GetPageAttachmentsInput,
|
|
1800
1876
|
}
|
|
1801
|
-
]
|
|
1877
|
+
]
|
|
1878
|
+
|
|
@@ -3,6 +3,7 @@ from typing import Optional, List
|
|
|
3
3
|
from logging import getLogger
|
|
4
4
|
|
|
5
5
|
import requests
|
|
6
|
+
from langchain_core.documents import Document
|
|
6
7
|
|
|
7
8
|
logger = getLogger(__name__)
|
|
8
9
|
from PIL import Image
|
|
@@ -47,7 +48,8 @@ class AlitaConfluenceLoader(ConfluenceLoader):
|
|
|
47
48
|
del kwargs[key]
|
|
48
49
|
except:
|
|
49
50
|
pass
|
|
50
|
-
|
|
51
|
+
# utilize adjusted URL from Confluence instance for base_url
|
|
52
|
+
self.base_url = confluence_client.url
|
|
51
53
|
self.space_key = kwargs.get('space_key')
|
|
52
54
|
self.page_ids = kwargs.get('page_ids')
|
|
53
55
|
self.label = kwargs.get('label')
|
|
@@ -107,7 +109,8 @@ class AlitaConfluenceLoader(ConfluenceLoader):
|
|
|
107
109
|
texts = []
|
|
108
110
|
for attachment in attachments:
|
|
109
111
|
media_type = attachment["metadata"]["mediaType"]
|
|
110
|
-
|
|
112
|
+
# utilize adjusted URL from Confluence instance for attachment download URL
|
|
113
|
+
absolute_url = self.confluence.url + attachment["_links"]["download"]
|
|
111
114
|
title = attachment["title"]
|
|
112
115
|
try:
|
|
113
116
|
if media_type == "application/pdf":
|
|
@@ -193,6 +196,15 @@ class AlitaConfluenceLoader(ConfluenceLoader):
|
|
|
193
196
|
else:
|
|
194
197
|
return super().process_image(link, ocr_languages)
|
|
195
198
|
|
|
199
|
+
def process_page(self, page: dict, include_attachments: bool, include_comments: bool, include_labels: bool,
|
|
200
|
+
content_format: ContentFormat, ocr_languages: Optional[str] = None,
|
|
201
|
+
keep_markdown_format: Optional[bool] = False, keep_newlines: bool = False) -> Document:
|
|
202
|
+
if not page.get("title"):
|
|
203
|
+
# if 'include_restricted_content' set to True, draft pages are loaded and can have no title
|
|
204
|
+
page["title"] = "Untitled"
|
|
205
|
+
return super().process_page(page, include_attachments, include_comments, include_labels, content_format,
|
|
206
|
+
ocr_languages, keep_markdown_format, keep_newlines)
|
|
207
|
+
|
|
196
208
|
# TODO review usage
|
|
197
209
|
# def process_svg(
|
|
198
210
|
# self,
|
|
@@ -5,7 +5,8 @@ from pydantic import create_model, BaseModel, ConfigDict, Field
|
|
|
5
5
|
|
|
6
6
|
from .api_wrapper import OpenApiWrapper
|
|
7
7
|
from ..base.tool import BaseAction
|
|
8
|
-
from ..utils import clean_string
|
|
8
|
+
from ..utils import clean_string
|
|
9
|
+
from ...runtime.utils.constants import TOOLKIT_NAME_META, TOOL_NAME_META, TOOLKIT_TYPE_META
|
|
9
10
|
|
|
10
11
|
name = "openapi"
|
|
11
12
|
|
|
@@ -43,15 +44,21 @@ class OpenApiToolkit(BaseToolkit):
|
|
|
43
44
|
openapi_api_wrapper = OpenApiWrapper(**kwargs)
|
|
44
45
|
available_tools = openapi_api_wrapper.get_available_tools()
|
|
45
46
|
tools = []
|
|
46
|
-
|
|
47
|
+
# Use clean toolkit name for context (max 1000 chars in description)
|
|
48
|
+
toolkit_context = f" [Toolkit: {clean_string(toolkit_name)}]" if toolkit_name else ''
|
|
47
49
|
for tool in available_tools:
|
|
48
50
|
if selected_tools and tool["name"] not in selected_tools:
|
|
49
51
|
continue
|
|
52
|
+
# Add toolkit context to description with character limit
|
|
53
|
+
description = tool["description"]
|
|
54
|
+
if toolkit_context and len(description + toolkit_context) <= 1000:
|
|
55
|
+
description = description + toolkit_context
|
|
50
56
|
tools.append(BaseAction(
|
|
51
57
|
api_wrapper=openapi_api_wrapper,
|
|
52
|
-
name=
|
|
53
|
-
description=
|
|
54
|
-
args_schema=tool["args_schema"]
|
|
58
|
+
name=tool["name"],
|
|
59
|
+
description=description,
|
|
60
|
+
args_schema=tool["args_schema"],
|
|
61
|
+
metadata={TOOLKIT_NAME_META: toolkit_name, TOOLKIT_TYPE_META: name, TOOL_NAME_META: tool["name"]} if toolkit_name else {TOOL_NAME_META: tool["name"]}
|
|
55
62
|
))
|
|
56
63
|
return cls(tools=tools)
|
|
57
64
|
|