alita-sdk 0.3.209__py3-none-any.whl → 0.3.210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alita_sdk/runtime/clients/artifact.py +18 -4
- alita_sdk/runtime/langchain/document_loaders/AlitaCSVLoader.py +2 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +3 -3
- alita_sdk/runtime/langchain/document_loaders/AlitaImageLoader.py +8 -4
- alita_sdk/runtime/langchain/document_loaders/AlitaTableLoader.py +1 -1
- alita_sdk/runtime/langchain/langraph_agent.py +1 -1
- alita_sdk/runtime/toolkits/artifact.py +7 -3
- alita_sdk/runtime/toolkits/tools.py +8 -1
- alita_sdk/runtime/tools/application.py +2 -0
- alita_sdk/runtime/tools/artifact.py +65 -8
- alita_sdk/runtime/tools/vectorstore.py +125 -41
- alita_sdk/runtime/utils/utils.py +3 -0
- alita_sdk/tools/ado/__init__.py +8 -0
- alita_sdk/tools/ado/repos/repos_wrapper.py +37 -0
- alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +0 -7
- alita_sdk/tools/ado/work_item/__init__.py +4 -0
- alita_sdk/tools/ado/work_item/ado_wrapper.py +37 -4
- alita_sdk/tools/aws/delta_lake/__init__.py +1 -1
- alita_sdk/tools/bitbucket/__init__.py +13 -1
- alita_sdk/tools/bitbucket/api_wrapper.py +31 -4
- alita_sdk/tools/bitbucket/cloud_api_wrapper.py +31 -0
- alita_sdk/tools/chunkers/code/codeparser.py +18 -10
- alita_sdk/tools/confluence/api_wrapper.py +35 -134
- alita_sdk/tools/confluence/loader.py +30 -28
- alita_sdk/tools/elitea_base.py +112 -11
- alita_sdk/tools/figma/__init__.py +13 -1
- alita_sdk/tools/figma/api_wrapper.py +47 -3
- alita_sdk/tools/github/api_wrapper.py +8 -0
- alita_sdk/tools/github/github_client.py +18 -0
- alita_sdk/tools/gitlab/__init__.py +4 -0
- alita_sdk/tools/gitlab/api_wrapper.py +10 -0
- alita_sdk/tools/google/bigquery/__init__.py +1 -1
- alita_sdk/tools/jira/__init__.py +21 -13
- alita_sdk/tools/jira/api_wrapper.py +285 -5
- alita_sdk/tools/sharepoint/__init__.py +11 -1
- alita_sdk/tools/sharepoint/api_wrapper.py +23 -53
- alita_sdk/tools/testrail/__init__.py +4 -0
- alita_sdk/tools/testrail/api_wrapper.py +21 -54
- alita_sdk/tools/utils/content_parser.py +72 -8
- alita_sdk/tools/xray/__init__.py +8 -1
- alita_sdk/tools/xray/api_wrapper.py +505 -14
- alita_sdk/tools/zephyr_scale/api_wrapper.py +5 -5
- {alita_sdk-0.3.209.dist-info → alita_sdk-0.3.210.dist-info}/METADATA +1 -1
- {alita_sdk-0.3.209.dist-info → alita_sdk-0.3.210.dist-info}/RECORD +47 -47
- {alita_sdk-0.3.209.dist-info → alita_sdk-0.3.210.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.209.dist-info → alita_sdk-0.3.210.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.209.dist-info → alita_sdk-0.3.210.dist-info}/top_level.txt +0 -0
alita_sdk/tools/ado/work_item/ado_wrapper.py

```diff
@@ -1,19 +1,24 @@
 import json
 import logging
 import urllib.parse
-from typing import
+from typing import Dict, List, Generator, Optional
 
+from alita_sdk.tools.elitea_base import BaseVectorStoreToolApiWrapper, extend_with_vector_tools
 from azure.devops.connection import Connection
 from azure.devops.v7_1.core import CoreClient
 from azure.devops.v7_1.wiki import WikiClient
 from azure.devops.v7_1.work_item_tracking import TeamContext, Wiql, WorkItemTrackingClient
+from langchain_core.documents import Document
 from langchain_core.tools import ToolException
 from msrest.authentication import BasicAuthentication
 from pydantic import create_model, PrivateAttr, SecretStr
 from pydantic import model_validator
 from pydantic.fields import Field
 
-
+try:
+    from alita_sdk.runtime.langchain.interfaces.llm_processor import get_embeddings
+except ImportError:
+    from alita_sdk.langchain.interfaces.llm_processor import get_embeddings
 
 logger = logging.getLogger(__name__)
 
```
```diff
@@ -89,8 +94,7 @@ ADOUnlinkWorkItemsFromWikiPage = create_model(
     page_name=(str, Field(description="Wiki page path to unlink the work items from", examples=["/TargetPage"]))
 )
 
-
-class AzureDevOpsApiWrapper(BaseToolApiWrapper):
+class AzureDevOpsApiWrapper(BaseVectorStoreToolApiWrapper):
     organization_url: str
     project: str
     token: SecretStr
```
```diff
@@ -504,6 +508,35 @@ class AzureDevOpsApiWrapper(BaseToolApiWrapper):
             logger.error(f"Error unlinking work items from wiki page '{page_name}': {str(e)}")
             return ToolException(f"An unexpected error occurred while unlinking work items from wiki page '{page_name}': {str(e)}")
 
+    def _base_loader(self, wiql: str, **kwargs) -> Generator[Document, None, None]:
+        ref_items = self._client.query_by_wiql(Wiql(query=wiql)).work_items
+        for ref in ref_items:
+            wi = self._client.get_work_item(id=ref.id, project=self.project, expand='all')
+            yield Document(page_content=json.dumps(wi.fields), metadata={
+                'id': str(wi.id),
+                'type': wi.fields.get('System.WorkItemType', ''),
+                'title': wi.fields.get('System.Title', ''),
+                'state': wi.fields.get('System.State', ''),
+                'area': wi.fields.get('System.AreaPath', ''),
+                'reason': wi.fields.get('System.Reason', ''),
+                'iteration': wi.fields.get('System.IterationPath', ''),
+                'updated_on': wi.fields.get('System.ChangedDate', ''),
+                'attachment_ids': [rel.url.split('/')[-1] for rel in wi.relations or [] if rel.rel == 'AttachedFile']
+            })
+
+    def _process_document(self, document: Document) -> Generator[Document, None, None]:
+        for attachment_id in document.metadata.get('attachment_ids', []):
+            content_generator = self._client.get_attachment_content(id=attachment_id, download=True)
+            content = ''.join(str(item) for item in content_generator)
+            yield Document(page_content=content, metadata={'id': attachment_id})
+
+    def _index_tool_params(self):
+        """Return the parameters for indexing data."""
+        return {
+            "wiql": (str, Field(description="WIQL (Work Item Query Language) query string to select and filter Azure DevOps work items."))
+        }
+
+    @extend_with_vector_tools
     def get_available_tools(self):
        """Return a list of available tools."""
        return [
```
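The new loader hooks above feed work items and their attachments into the shared vector-store tooling. A minimal, hypothetical sketch of exercising them directly — `ado` stands for an already configured `AzureDevOpsApiWrapper`, and the WIQL string is a placeholder:

```python
# Illustrative only; calling the private hooks directly just shows the data they yield.
wiql = "SELECT [System.Id] FROM WorkItems WHERE [System.State] = 'Active'"  # placeholder query
for doc in ado._base_loader(wiql):
    # page_content is the work item's fields as JSON; metadata matches the keys in the diff
    print(doc.metadata["id"], doc.metadata["type"], doc.metadata["title"])
    for att in ado._process_document(doc):
        print("  attachment:", att.metadata["id"])
```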
alita_sdk/tools/aws/delta_lake/__init__.py

```diff
@@ -57,7 +57,7 @@ class DeltaLakeToolkitConfig(BaseModel):
     aws_secret_access_key: Optional[SecretStr] = Field(default=None, description="AWS secret access key", json_schema_extra={"secret": True, "configuration": True})
     aws_session_token: Optional[SecretStr] = Field(default=None, description="AWS session token (optional)", json_schema_extra={"secret": True, "configuration": True})
     aws_region: Optional[str] = Field(default=None, description="AWS region for Delta Lake storage", json_schema_extra={"configuration": True})
-    s3_path: Optional[str] = Field(default=None, description="S3 path to Delta Lake data (e.g., s3://bucket/path)", json_schema_extra={"configuration": True})
+    s3_path: Optional[str] = Field(default=None, description="S3 path to Delta Lake data (e.g., s3://bucket/path)", json_schema_extra={"configuration": True, "configuration_title": True})
     table_path: Optional[str] = Field(default=None, description="Delta Lake table path (if not using s3_path)", json_schema_extra={"configuration": True})
     selected_tools: List[str] = Field(default=[], description="Selected tools", json_schema_extra={"args_schemas": get_available_tools()})
 
```
alita_sdk/tools/bitbucket/__init__.py

```diff
@@ -24,7 +24,15 @@ def get_tools(tool):
         password=tool['settings']['password'],
         branch=tool['settings']['branch'],
         cloud=tool['settings'].get('cloud'),
-
+        llm=tool['settings'].get('llm', None),
+        alita=tool['settings'].get('alita', None),
+        connection_string=tool['settings'].get('connection_string', None),
+        collection_name=str(tool['id']),
+        doctype='code',
+        embedding_model="HuggingFaceEmbeddings",
+        embedding_model_params={"model_name": "sentence-transformers/all-MiniLM-L6-v2"},
+        vectorstore_type="PGVector",
+        toolkit_name=tool.get('toolkit_name')
     ).get_tools()
 
 
```
```diff
@@ -48,6 +56,10 @@ class AlitaBitbucketToolkit(BaseToolkit):
             username=(str, Field(description="Username", json_schema_extra={'configuration': True})),
             password=(SecretStr, Field(description="GitLab private token", json_schema_extra={'secret': True, 'configuration': True})),
             cloud=(Optional[bool], Field(description="Hosting Option", default=None)),
+            # indexer settings
+            connection_string=(Optional[SecretStr], Field(description="Connection string for vectorstore",
+                                                          default=None,
+                                                          json_schema_extra={'secret': True})),
             selected_tools=(List[Literal[tuple(selected_tools)]], Field(default=[], json_schema_extra={'args_schemas': selected_tools})),
             __config__=ConfigDict(json_schema_extra=
                 {
```
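Together, the two Bitbucket hunks above thread vector-store defaults from a tool's settings into the toolkit. A hedged sketch of a settings payload covering the keys the changed code reads (values are placeholders; keys such as url, username, project and repository come from the unchanged wrapper code shown in the validate_env hunk further down):

```python
# Hypothetical tool definition; only illustrates the keys referenced in this diff.
tool = {
    "id": 42,                      # becomes the vector-store collection name via str(tool['id'])
    "toolkit_name": "my-bitbucket",
    "settings": {
        "url": "https://bitbucket.example.com",
        "username": "svc-user",
        "project": "PROJ",
        "repository": "my-repo",
        "password": "app-password-or-token",
        "branch": "main",
        "cloud": False,
        "llm": None,
        "alita": None,
        "connection_string": "postgresql+psycopg://user:pass@host/db",  # PGVector target
    },
}
```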
alita_sdk/tools/bitbucket/api_wrapper.py

```diff
@@ -41,6 +41,18 @@ class BitbucketAPIWrapper(BaseCodeToolApiWrapper):
     """Bitbucket installation type: true for cloud, false for server.
     """
 
+    llm: Optional[Any] = None
+    # Alita instance
+    alita: Optional[Any] = None
+
+    # Vector store configuration
+    connection_string: Optional[SecretStr] = None
+    collection_name: Optional[str] = None
+    doctype: Optional[str] = 'code'
+    embedding_model: Optional[str] = "HuggingFaceEmbeddings"
+    embedding_model_params: Optional[Dict[str, Any]] = {"model_name": "sentence-transformers/all-MiniLM-L6-v2"}
+    vectorstore_type: Optional[str] = "PGVector"
+
     @model_validator(mode='before')
     @classmethod
     def validate_env(cls, values: Dict) -> Dict:
```
```diff
@@ -59,7 +71,7 @@ class BitbucketAPIWrapper(BaseCodeToolApiWrapper):
             password=values['password'],
             workspace=values['project'],
             repository=values['repository']
-        ) if values
+        ) if values.get('cloud') else BitbucketServerApi(
             url=values['url'],
             username=values['username'],
             password=values['password'],
```
```diff
@@ -213,16 +225,31 @@ class BitbucketAPIWrapper(BaseCodeToolApiWrapper):
         except Exception as e:
             return ToolException(f"Can't add comment to pull request `{pr_id}` due to error:\n{str(e)}")
 
-    def _get_files(self,
+    def _get_files(self, path: str, branch: str) -> str:
         """
         Get files from the bitbucket repo
         Parameters:
-
+            path(str): the file path
             branch(str): branch name (by default: active_branch)
         Returns:
             str: List of the files
         """
-        return str(self._bitbucket.get_files_list(file_path=
+        return str(self._bitbucket.get_files_list(file_path=path if path else '', branch=branch if branch else self._active_branch))
+
+    # TODO: review this method, it may not work as expected
+    # def _file_commit_hash(self, file_path: str, branch: str):
+    #     """
+    #     Get the commit hash of a file in the gitlab repo
+    #     Parameters:
+    #         file_path(str): the file path
+    #         branch(str): branch name (by default: active_branch)
+    #     Returns:
+    #         str: The commit hash of the file
+    #     """
+    #     try:
+    #         return self._bitbucket.get_file_commit_hash(file_path=file_path, branch=branch)
+    #     except Exception as e:
+    #         raise ToolException(f"Can't extract file commit hash (`{file_path}`) due to error:\n{str(e)}")
 
     def _read_file(self, file_path: str, branch: str) -> str:
         """
```
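A brief, illustrative call of the repaired `_get_files` signature; here `wrapper` stands for an already configured BitbucketAPIWrapper, and in practice the method is exposed through the toolkit's tools rather than called directly:

```python
# Empty arguments fall back to the repo root and the active branch,
# mirroring the defaults in the hunk above.
files = wrapper._get_files(path="", branch="")
print(files)  # stringified list of file paths
```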
alita_sdk/tools/bitbucket/cloud_api_wrapper.py

```diff
@@ -104,6 +104,22 @@ class BitbucketServerApi(BitbucketApiAbstract):
             data=json.loads(pr_json_data)
         )
 
+    # TODO: review this method, it may not work as expected
+    def get_file_commit_hash(self, file_path: str, branch: str):
+        """
+        Get the commit hash of a file in a specific branch.
+        Parameters:
+            file_path (str): The path to the file.
+            branch (str): The branch name.
+        Returns:
+            str: The commit hash of the file.
+        """
+        commits = self.api_client.get_commits(project_key=self.project, repository_slug=self.repository,
+                                              filename=file_path, at=branch, limit=1)
+        if commits:
+            return commits[0]['id']
+        return None
+
     def get_file(self, file_path: str, branch: str) -> str:
         return self.api_client.get_content_of_file(project_key=self.project, repository_slug=self.repository, at=branch,
                                                     filename=file_path).decode('utf-8')
```
```diff
@@ -262,6 +278,21 @@ class BitbucketCloudApi(BitbucketApiAbstract):
         response = self.repository.pullrequests.post(None, data=json.loads(pr_json_data))
         return response['links']['self']['href']
 
+    # TODO: review this method, it may not work as expected
+    def get_file_commit_hash(self, file_path: str, branch: str):
+        """
+        Get the commit hash of a file in a specific branch.
+        Parameters:
+            file_path (str): The path to the file.
+            branch (str): The branch name.
+        Returns:
+            str: The commit hash of the file.
+        """
+        commits = self.repository.commits.get(path=file_path, branch=branch, pagelen=1)
+        if commits['values']:
+            return commits['values'][0]['hash']
+        return None
+
     def get_file(self, file_path: str, branch: str) -> str:
         return self.repository.get(path=f'src/{branch}/{file_path}')
 
```
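Both the server and cloud API classes now expose the same helper (flagged TODO in the diff, so its behavior is unverified). A hedged call sketch, with `api` standing for either a configured BitbucketServerApi or BitbucketCloudApi instance:

```python
# Returns None when no commit for that file/branch is found, per the hunks above.
commit_hash = api.get_file_commit_hash(file_path="README.md", branch="main")
if commit_hash:
    print(f"Latest commit touching README.md on main: {commit_hash}")
```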
alita_sdk/tools/chunkers/code/codeparser.py

```diff
@@ -39,13 +39,17 @@ def parse_code_files_for_db(file_content_generator: Generator[str, None, None],
         if programming_language == Language.UNKNOWN:
             documents = TokenTextSplitter(encoding_name="gpt2", chunk_size=256, chunk_overlap=30).split_text(file_content)
             for document in documents:
+                metadata = {
+                    "filename": file_name,
+                    "method_name": node.name,
+                    "language": programming_language.value,
+                }
+                commit_hash = data.get("commit_hash")
+                if commit_hash is not None:
+                    metadata["commit_hash"] = commit_hash
                 document = Document(
                     page_content=document,
-                    metadata=
-                        "filename": file_name,
-                        "method_name": 'text',
-                        "language": programming_language.value,
-                    },
+                    metadata=metadata,
                 )
                 yield document
         else:
```
```diff
@@ -73,13 +77,17 @@ def parse_code_files_for_db(file_content_generator: Generator[str, None, None],
                 splitted_documents = code_splitter.split_text(method_source_code)
 
                 for splitted_document in splitted_documents:
+                    metadata = {
+                        "filename": file_name,
+                        "method_name": node.name,
+                        "language": programming_language.value,
+                    }
+                    commit_hash = data.get("commit_hash")
+                    if commit_hash is not None:
+                        metadata["commit_hash"] = commit_hash
                     document = Document(
                         page_content=splitted_document,
-                        metadata=
-                            "filename": file_name,
-                            "method_name": node.name,
-                            "language": programming_language.value,
-                        },
+                        metadata=metadata,
                     )
                     yield document
             except Exception as e:
```
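Both branches of parse_code_files_for_db now attach commit_hash to chunk metadata only when the incoming file record carries one. A small, self-contained sketch of the resulting metadata shape (field values are placeholders):

```python
# Shape of the per-chunk metadata produced by the hunks above.
data = {"file_name": "app/service.py", "commit_hash": "a1b2c3d"}  # hypothetical loader record
metadata = {
    "filename": data["file_name"],
    "method_name": "handle_request",          # node.name in the real code
    "language": "python",
}
if data.get("commit_hash") is not None:
    metadata["commit_hash"] = data["commit_hash"]
print(metadata)
```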
alita_sdk/tools/confluence/api_wrapper.py

```diff
@@ -1,4 +1,3 @@
-import shortuuid
 import re
 import logging
 import requests
```
```diff
@@ -17,7 +16,7 @@ from langchain_core.messages import HumanMessage
 from markdownify import markdownify
 from langchain_community.document_loaders.confluence import ContentFormat
 
-from ..elitea_base import BaseVectorStoreToolApiWrapper,
+from ..elitea_base import BaseVectorStoreToolApiWrapper, extend_with_vector_tools
 from ..llm.img_utils import ImageDescriptionCache
 from ..utils import is_cookie_token, parse_cookie_string
 
```
```diff
@@ -141,50 +140,6 @@ pageId = create_model(
                 description="Optional JSON of parameters to be sent in request body or query params. MUST be string with valid JSON. For search/read operations, you MUST always get minimum fields and set max results, until users ask explicitly for more fields. For search/read operations you must generate CQL query string and pass it as params."))
 )
 
-# loaderParams = create_model(
-#     "LoaderParams",
-#     content_format=(str, Field(description="The format of the content to be retrieved.")),
-#     page_ids=(Optional[List[str]], Field(description="List of page IDs to retrieve.", default=None)),
-#     label=(Optional[str], Field(description="Label to filter pages.", default=None)),
-#     cql=(Optional[str], Field(description="CQL query to filter pages.", default=None)),
-#     include_restricted_content=(Optional[bool], Field(description="Include restricted content.", default=False)),
-#     include_archived_content=(Optional[bool], Field(description="Include archived content.", default=False)),
-#     include_attachments=(Optional[bool], Field(description="Include attachments.", default=False)),
-#     include_comments=(Optional[bool], Field(description="Include comments.", default=False)),
-#     include_labels=(Optional[bool], Field(description="Include labels.", default=False)),
-#     limit=(Optional[int], Field(description="Limit the number of results.", default=10)),
-#     max_pages=(Optional[int], Field(description="Maximum number of pages to retrieve.", default=1000)),
-#     ocr_languages=(Optional[str], Field(description="OCR languages for processing attachments.", default=None)),
-#     keep_markdown_format=(Optional[bool], Field(description="Keep the markdown format.", default=True)),
-#     keep_newlines=(Optional[bool], Field(description="Keep newlines in the content.", default=True)),
-#     bins_with_llm=(Optional[bool], Field(description="Use LLM for processing binary files.", default=False)),
-# )
-
-indexPagesParams = create_model(
-    "indexPagesParams",
-    __base__=BaseIndexParams,
-    content_format=(Literal['view', 'storage', 'export_view', 'editor', 'anonymous'],
-                    Field(description="The format of the content to be retrieved.")),
-    ### Loader Parameters
-    page_ids=(Optional[List[str]], Field(description="List of page IDs to retrieve.", default=None)),
-    label=(Optional[str], Field(description="Label to filter pages.", default=None)),
-    cql=(Optional[str], Field(description="CQL query to filter pages.", default=None)),
-    limit=(Optional[int], Field(description="Limit the number of results.", default=10)),
-    max_pages=(Optional[int], Field(description="Maximum number of pages to retrieve.", default=1000)),
-    include_restricted_content=(Optional[bool], Field(description="Include restricted content.", default=False)),
-    include_archived_content=(Optional[bool], Field(description="Include archived content.", default=False)),
-    include_attachments=(Optional[bool], Field(description="Include attachments.", default=False)),
-    include_comments=(Optional[bool], Field(description="Include comments.", default=False)),
-    include_labels=(Optional[bool], Field(description="Include labels.", default=True)),
-    ocr_languages=(Optional[str], Field(description="OCR languages for processing attachments.", default='eng')),
-    keep_markdown_format=(Optional[bool], Field(description="Keep the markdown format.", default=True)),
-    keep_newlines=(Optional[bool], Field(description="Keep newlines in the content.", default=True)),
-    bins_with_llm=(Optional[bool], Field(description="Use LLM for processing binary files.", default=False)),
-    ### Chunking Parameters
-    chunking_tool=(Literal['markdown', 'statistical', 'proposal'], Field(description="Name of chunking tool", default="markdown")),
-    chunking_config=(Optional[dict], Field(description="Chunking tool configuration", default_factory=dict)),
-)
-
 GetPageWithImageDescriptions = create_model(
     "GetPageWithImageDescriptionsModel",
     page_id=(str, Field(description="Confluence page ID from which content with images will be extracted")),
```
```diff
@@ -849,7 +804,7 @@ class ConfluenceAPIWrapper(BaseVectorStoreToolApiWrapper):
             docs.extend(batch)
         return docs[:max_pages]
 
-    def
+    def _base_loader(self, **kwargs) -> Generator[Document, None, None]:
         """
         Loads content from Confluence based on parameters.
         Returns:
```
```diff
@@ -858,7 +813,15 @@ class ConfluenceAPIWrapper(BaseVectorStoreToolApiWrapper):
         from .loader import AlitaConfluenceLoader
         from copy import copy
         content_format = kwargs.get('content_format', 'view').lower()
+        base_params = {
+            'url': self.base_url,
+            'space_key': self.space,
+            'min_retry_seconds': self.min_retry_seconds,
+            'max_retry_seconds': self.max_retry_seconds,
+            'number_of_retries': self.number_of_retries
+        }
         confluence_loader_params = copy(kwargs)
+        confluence_loader_params.update(base_params)
         mapping = {
             'view': ContentFormat.VIEW,
             'storage': ContentFormat.STORAGE,
```
```diff
@@ -878,86 +841,9 @@ class ConfluenceAPIWrapper(BaseVectorStoreToolApiWrapper):
         for document in loader._lazy_load(kwargs={}):
             yield document
 
-    def
-
-
-                   label: Optional[str] = None,
-                   cql: Optional[str] = None,
-                   include_restricted_content: Optional[bool] = False,
-                   include_archived_content: Optional[bool] = False,
-                   include_attachments: Optional[bool] = False,
-                   include_comments: Optional[bool] = False,
-                   include_labels: Optional[bool] = False,
-                   limit: Optional[int] = 10,
-                   max_pages: Optional[int] = 10,
-                   keep_markdown_format: Optional[bool] = True,
-                   keep_newlines: Optional[bool] = True,
-                   bins_with_llm: bool = False,
-                   chunking_tool: str = "markdown",
-                   chunking_config: Optional[Dict[str, Any]] = None,
-                   **kwargs) -> Generator[str, None, None]:
-        """Load Confluence pages and index them in the vector store."""
-
-        from alita_sdk.tools.chunkers import __confluence_chunkers__ as chunkers, __confluence_models__ as models
-        try:
-            from alita_sdk.langchain.interfaces.llm_processor import get_embeddings
-        except ImportError:
-            from alita_sdk.runtime.langchain.interfaces.llm_processor import get_embeddings
-
-        loader_params = {
-            'url': self.base_url,
-            'space_key': self.space,
-            'content_format': content_format,
-            'page_ids': page_ids,
-            'label': label,
-            'cql': cql,
-            'include_restricted_content': include_restricted_content,
-            'include_archived_content': include_archived_content,
-            'include_attachments': include_attachments,
-            'include_comments': include_comments,
-            'include_labels': include_labels,
-            'limit': limit,
-            'max_pages': max_pages,
-            'keep_markdown_format': keep_markdown_format,
-            'keep_newlines': keep_newlines,
-            'bins_with_llm': bins_with_llm,
-            'min_retry_seconds': self.min_retry_seconds,
-            'max_retry_seconds': self.max_retry_seconds,
-            'number_of_retries': self.number_of_retries
-        }
-        documents = self._loader(**loader_params)
-        embedding = get_embeddings(self.embedding_model, self.embedding_model_params)
-
-        chunker = chunkers.get(chunking_tool)
-
-        chunking_config = chunking_config or {}
-
-        if chunker:
-            # Validate and prepare chunking configuration using Pydantic models
-            config_model = models.get(chunking_tool)
-            if config_model:
-                # Set required fields that should come from the instance
-                chunking_config['embedding'] = embedding
-                chunking_config['llm'] = self.llm
-
-                try:
-                    # Validate the configuration using the appropriate Pydantic model
-                    validated_config = config_model(**chunking_config)
-                    chunking_config = validated_config.model_dump()
-                except Exception as e:
-                    logger.error(f"Invalid chunking configuration for {chunking_tool}: {e}")
-                    raise ToolException(f"Invalid chunking configuration: {e}")
-            else:
-                # Fallback for chunkers without models
-                chunking_config['embedding'] = embedding
-                chunking_config['llm'] = self.llm
-
-            documents = chunker(documents, chunking_config)
-
-        # passing embedding to avoid re-initialization
-        vectorstore = self._init_vector_store(collection_suffix, embeddings=embedding)
-        return vectorstore.index_documents(documents)
-
+    def _process_document(self, document: Document) -> Generator[Document, None, None]:
+        for attachment in self.get_page_attachments(document.metadata.get('id')):
+            yield Document(page_content=attachment.get('content', ''), metadata=attachment.get('metadata', {}))
 
     def _download_image(self, image_url):
         """
```
```diff
@@ -1685,6 +1571,28 @@ class ConfluenceAPIWrapper(BaseVectorStoreToolApiWrapper):
             logger.error(f"Error retrieving attachments for page {page_id}: {str(e)}")
             return f"Error retrieving attachments: {str(e)}"
 
+    def _index_tool_params(self):
+        """Return the parameters for indexing data."""
+        return {
+            "content_format": (Literal['view', 'storage', 'export_view', 'editor', 'anonymous'],
+                               Field(description="The format of the content to be retrieved.")),
+            "page_ids": (Optional[List[str]], Field(description="List of page IDs to retrieve.", default=None)),
+            "label": (Optional[str], Field(description="Label to filter pages.", default=None)),
+            "cql": (Optional[str], Field(description="CQL query to filter pages.", default=None)),
+            "limit": (Optional[int], Field(description="Limit the number of results.", default=10)),
+            "max_pages": (Optional[int], Field(description="Maximum number of pages to retrieve.", default=1000)),
+            "include_restricted_content": (Optional[bool], Field(description="Include restricted content.", default=False)),
+            "include_archived_content": (Optional[bool], Field(description="Include archived content.", default=False)),
+            "include_attachments": (Optional[bool], Field(description="Include attachments.", default=False)),
+            "include_comments": (Optional[bool], Field(description="Include comments.", default=False)),
+            "include_labels": (Optional[bool], Field(description="Include labels.", default=True)),
+            "ocr_languages": (Optional[str], Field(description="OCR languages for processing attachments.", default='eng')),
+            "keep_markdown_format": (Optional[bool], Field(description="Keep the markdown format.", default=True)),
+            "keep_newlines": (Optional[bool], Field(description="Keep newlines in the content.", default=True)),
+            "bins_with_llm": (Optional[bool], Field(description="Use LLM for processing binary files.", default=False)),
+        }
+
+    @extend_with_vector_tools
     def get_available_tools(self):
         # Confluence-specific tools
         confluence_tools = [
```
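With `indexPagesParams` and the bespoke `index_data` tool removed (next hunk), the wrapper now only declares its indexing inputs via `_index_tool_params()`. The `(type, Field(...))` tuples it returns are in the shape pydantic's `create_model` accepts, so an args schema can be built dynamically; the snippet below only illustrates that pattern and is not a claim about how `BaseVectorStoreToolApiWrapper` actually consumes it:

```python
from typing import Optional
from pydantic import Field, create_model

# Hypothetical subset of the tuples returned by _index_tool_params().
params = {
    "cql": (Optional[str], Field(description="CQL query to filter pages.", default=None)),
    "limit": (Optional[int], Field(description="Limit the number of results.", default=10)),
}
IndexArgs = create_model("IndexArgs", **params)
print(IndexArgs(cql="space = DEMO").model_dump())  # {'cql': 'space = DEMO', 'limit': 10}
```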
```diff
@@ -1796,13 +1704,6 @@ class ConfluenceAPIWrapper(BaseVectorStoreToolApiWrapper):
                 "description": self.get_page_id_by_title.__doc__,
                 "args_schema": getPageIdByTitleInput,
             },
-            # Confluence-specific vector store indexing
-            {
-                "name": "index_data",
-                "ref": self.index_data,
-                "description": self.index_data.__doc__,
-                "args_schema": indexPagesParams,
-            },
             {
                 "name": "get_page_attachments",
                 "ref": self.get_page_attachments,
```
alita_sdk/tools/confluence/loader.py

```diff
@@ -10,8 +10,8 @@ from langchain_community.document_loaders import ConfluenceLoader
 from langchain_community.document_loaders.confluence import ContentFormat
 from langchain_core.messages import HumanMessage
 from pdf2image import convert_from_bytes
-from reportlab.graphics import renderPM
-from svglib.svglib import svg2rlg
+# from reportlab.graphics import renderPM
+# from svglib.svglib import svg2rlg
 
 from .utils import image_to_byte_array, bytes_to_base64
 
```
```diff
@@ -125,6 +125,7 @@ class AlitaConfluenceLoader(ConfluenceLoader):
             text = title + self.process_doc(absolute_url)
         elif media_type == "application/vnd.ms-excel":
             text = title + self.process_xls(absolute_url)
+        # TODO review usage
         # elif media_type == "image/svg+xml":
         #     text = title + self.process_svg(absolute_url, ocr_languages)
         else:
```
```diff
@@ -192,29 +193,30 @@
         else:
             return super().process_image(link, ocr_languages)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # TODO review usage
+    # def process_svg(
+    #         self,
+    #         link: str,
+    #         ocr_languages: Optional[str] = None,
+    # ) -> str:
+    #     if self.bins_with_llm and self.llm:
+    #         response = self.confluence.request(path=link, absolute=True)
+    #         text = ""
+    #
+    #         if (
+    #             response.status_code != 200
+    #             or response.content == b""
+    #             or response.content is None
+    #         ):
+    #             return text
+    #
+    #         drawing = svg2rlg(BytesIO(response.content))
+    #
+    #         img_data = BytesIO()
+    #         renderPM.drawToFile(drawing, img_data, fmt="PNG")
+    #         img_data.seek(0)
+    #         image = Image.open(img_data)
+    #         result = self.__perform_llm_prediction_for_image(image)
+    #         return result
+    #     else:
+    #         return super().process_svg(link, ocr_languages)
```