alita-sdk 0.3.209__py3-none-any.whl → 0.3.210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. alita_sdk/runtime/clients/artifact.py +18 -4
  2. alita_sdk/runtime/langchain/document_loaders/AlitaCSVLoader.py +2 -1
  3. alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +3 -3
  4. alita_sdk/runtime/langchain/document_loaders/AlitaImageLoader.py +8 -4
  5. alita_sdk/runtime/langchain/document_loaders/AlitaTableLoader.py +1 -1
  6. alita_sdk/runtime/langchain/langraph_agent.py +1 -1
  7. alita_sdk/runtime/toolkits/artifact.py +7 -3
  8. alita_sdk/runtime/toolkits/tools.py +8 -1
  9. alita_sdk/runtime/tools/application.py +2 -0
  10. alita_sdk/runtime/tools/artifact.py +65 -8
  11. alita_sdk/runtime/tools/vectorstore.py +125 -41
  12. alita_sdk/runtime/utils/utils.py +3 -0
  13. alita_sdk/tools/ado/__init__.py +8 -0
  14. alita_sdk/tools/ado/repos/repos_wrapper.py +37 -0
  15. alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +0 -7
  16. alita_sdk/tools/ado/work_item/__init__.py +4 -0
  17. alita_sdk/tools/ado/work_item/ado_wrapper.py +37 -4
  18. alita_sdk/tools/aws/delta_lake/__init__.py +1 -1
  19. alita_sdk/tools/bitbucket/__init__.py +13 -1
  20. alita_sdk/tools/bitbucket/api_wrapper.py +31 -4
  21. alita_sdk/tools/bitbucket/cloud_api_wrapper.py +31 -0
  22. alita_sdk/tools/chunkers/code/codeparser.py +18 -10
  23. alita_sdk/tools/confluence/api_wrapper.py +35 -134
  24. alita_sdk/tools/confluence/loader.py +30 -28
  25. alita_sdk/tools/elitea_base.py +112 -11
  26. alita_sdk/tools/figma/__init__.py +13 -1
  27. alita_sdk/tools/figma/api_wrapper.py +47 -3
  28. alita_sdk/tools/github/api_wrapper.py +8 -0
  29. alita_sdk/tools/github/github_client.py +18 -0
  30. alita_sdk/tools/gitlab/__init__.py +4 -0
  31. alita_sdk/tools/gitlab/api_wrapper.py +10 -0
  32. alita_sdk/tools/google/bigquery/__init__.py +1 -1
  33. alita_sdk/tools/jira/__init__.py +21 -13
  34. alita_sdk/tools/jira/api_wrapper.py +285 -5
  35. alita_sdk/tools/sharepoint/__init__.py +11 -1
  36. alita_sdk/tools/sharepoint/api_wrapper.py +23 -53
  37. alita_sdk/tools/testrail/__init__.py +4 -0
  38. alita_sdk/tools/testrail/api_wrapper.py +21 -54
  39. alita_sdk/tools/utils/content_parser.py +72 -8
  40. alita_sdk/tools/xray/__init__.py +8 -1
  41. alita_sdk/tools/xray/api_wrapper.py +505 -14
  42. alita_sdk/tools/zephyr_scale/api_wrapper.py +5 -5
  43. {alita_sdk-0.3.209.dist-info → alita_sdk-0.3.210.dist-info}/METADATA +1 -1
  44. {alita_sdk-0.3.209.dist-info → alita_sdk-0.3.210.dist-info}/RECORD +47 -47
  45. {alita_sdk-0.3.209.dist-info → alita_sdk-0.3.210.dist-info}/WHEEL +0 -0
  46. {alita_sdk-0.3.209.dist-info → alita_sdk-0.3.210.dist-info}/licenses/LICENSE +0 -0
  47. {alita_sdk-0.3.209.dist-info → alita_sdk-0.3.210.dist-info}/top_level.txt +0 -0
alita_sdk/tools/ado/work_item/ado_wrapper.py
@@ -1,19 +1,24 @@
  import json
  import logging
  import urllib.parse
- from typing import Optional, Dict, List
+ from typing import Dict, List, Generator, Optional

+ from alita_sdk.tools.elitea_base import BaseVectorStoreToolApiWrapper, extend_with_vector_tools
  from azure.devops.connection import Connection
  from azure.devops.v7_1.core import CoreClient
  from azure.devops.v7_1.wiki import WikiClient
  from azure.devops.v7_1.work_item_tracking import TeamContext, Wiql, WorkItemTrackingClient
+ from langchain_core.documents import Document
  from langchain_core.tools import ToolException
  from msrest.authentication import BasicAuthentication
  from pydantic import create_model, PrivateAttr, SecretStr
  from pydantic import model_validator
  from pydantic.fields import Field

- from ...elitea_base import BaseToolApiWrapper
+ try:
+     from alita_sdk.runtime.langchain.interfaces.llm_processor import get_embeddings
+ except ImportError:
+     from alita_sdk.langchain.interfaces.llm_processor import get_embeddings

  logger = logging.getLogger(__name__)

@@ -89,8 +94,7 @@ ADOUnlinkWorkItemsFromWikiPage = create_model(
      page_name=(str, Field(description="Wiki page path to unlink the work items from", examples=["/TargetPage"]))
  )

-
- class AzureDevOpsApiWrapper(BaseToolApiWrapper):
+ class AzureDevOpsApiWrapper(BaseVectorStoreToolApiWrapper):
      organization_url: str
      project: str
      token: SecretStr
@@ -504,6 +508,35 @@ class AzureDevOpsApiWrapper(BaseToolApiWrapper):
              logger.error(f"Error unlinking work items from wiki page '{page_name}': {str(e)}")
              return ToolException(f"An unexpected error occurred while unlinking work items from wiki page '{page_name}': {str(e)}")

+     def _base_loader(self, wiql: str, **kwargs) -> Generator[Document, None, None]:
+         ref_items = self._client.query_by_wiql(Wiql(query=wiql)).work_items
+         for ref in ref_items:
+             wi = self._client.get_work_item(id=ref.id, project=self.project, expand='all')
+             yield Document(page_content=json.dumps(wi.fields), metadata={
+                 'id': str(wi.id),
+                 'type': wi.fields.get('System.WorkItemType', ''),
+                 'title': wi.fields.get('System.Title', ''),
+                 'state': wi.fields.get('System.State', ''),
+                 'area': wi.fields.get('System.AreaPath', ''),
+                 'reason': wi.fields.get('System.Reason', ''),
+                 'iteration': wi.fields.get('System.IterationPath', ''),
+                 'updated_on': wi.fields.get('System.ChangedDate', ''),
+                 'attachment_ids': [rel.url.split('/')[-1] for rel in wi.relations or [] if rel.rel == 'AttachedFile']
+             })
+
+     def _process_document(self, document: Document) -> Generator[Document, None, None]:
+         for attachment_id in document.metadata.get('attachment_ids', []):
+             content_generator = self._client.get_attachment_content(id=attachment_id, download=True)
+             content = ''.join(str(item) for item in content_generator)
+             yield Document(page_content=content, metadata={'id': attachment_id})
+
+     def _index_tool_params(self):
+         """Return the parameters for indexing data."""
+         return {
+             "wiql": (str, Field(description="WIQL (Work Item Query Language) query string to select and filter Azure DevOps work items."))
+         }
+
+     @extend_with_vector_tools
      def get_available_tools(self):
          """Return a list of available tools."""
          return [
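Note: with these hooks, AzureDevOpsApiWrapper plugs into the shared vector-store indexing flow from elitea_base: _base_loader turns a WIQL result set into Document objects, _process_document expands 'AttachedFile' relations into additional documents, and _index_tool_params declares the wiql argument, while get_available_tools is now decorated with @extend_with_vector_tools. A minimal sketch of driving the two loader hooks directly; the wrapper instance and the WIQL text are illustrative, not taken from this diff:

# Sketch only: assumes `wrapper` is an already-configured AzureDevOpsApiWrapper.
wiql = "SELECT [System.Id] FROM WorkItems WHERE [System.State] = 'Active'"  # illustrative query
for work_item_doc in wrapper._base_loader(wiql=wiql):
    # page_content is the JSON-dumped work item fields; metadata carries id, title, state, etc.
    print(work_item_doc.metadata['id'], work_item_doc.metadata['title'])
    for attachment_doc in wrapper._process_document(work_item_doc):
        # each attachment referenced by the work item becomes its own Document
        print('  attachment:', attachment_doc.metadata['id'])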
alita_sdk/tools/aws/delta_lake/__init__.py
@@ -57,7 +57,7 @@ class DeltaLakeToolkitConfig(BaseModel):
      aws_secret_access_key: Optional[SecretStr] = Field(default=None, description="AWS secret access key", json_schema_extra={"secret": True, "configuration": True})
      aws_session_token: Optional[SecretStr] = Field(default=None, description="AWS session token (optional)", json_schema_extra={"secret": True, "configuration": True})
      aws_region: Optional[str] = Field(default=None, description="AWS region for Delta Lake storage", json_schema_extra={"configuration": True})
-     s3_path: Optional[str] = Field(default=None, description="S3 path to Delta Lake data (e.g., s3://bucket/path)", json_schema_extra={"configuration": True})
+     s3_path: Optional[str] = Field(default=None, description="S3 path to Delta Lake data (e.g., s3://bucket/path)", json_schema_extra={"configuration": True, "configuration_title": True})
      table_path: Optional[str] = Field(default=None, description="Delta Lake table path (if not using s3_path)", json_schema_extra={"configuration": True})
      selected_tools: List[str] = Field(default=[], description="Selected tools", json_schema_extra={"args_schemas": get_available_tools()})

alita_sdk/tools/bitbucket/__init__.py
@@ -24,7 +24,15 @@ def get_tools(tool):
          password=tool['settings']['password'],
          branch=tool['settings']['branch'],
          cloud=tool['settings'].get('cloud'),
-         toolkit_name=tool.get('toolkit_name'),
+         llm=tool['settings'].get('llm', None),
+         alita=tool['settings'].get('alita', None),
+         connection_string=tool['settings'].get('connection_string', None),
+         collection_name=str(tool['id']),
+         doctype='code',
+         embedding_model="HuggingFaceEmbeddings",
+         embedding_model_params={"model_name": "sentence-transformers/all-MiniLM-L6-v2"},
+         vectorstore_type="PGVector",
+         toolkit_name=tool.get('toolkit_name')
      ).get_tools()

@@ -48,6 +56,10 @@ class AlitaBitbucketToolkit(BaseToolkit):
              username=(str, Field(description="Username", json_schema_extra={'configuration': True})),
              password=(SecretStr, Field(description="GitLab private token", json_schema_extra={'secret': True, 'configuration': True})),
              cloud=(Optional[bool], Field(description="Hosting Option", default=None)),
+             # indexer settings
+             connection_string=(Optional[SecretStr], Field(description="Connection string for vectorstore",
+                                                           default=None,
+                                                           json_schema_extra={'secret': True})),
              selected_tools=(List[Literal[tuple(selected_tools)]], Field(default=[], json_schema_extra={'args_schemas': selected_tools})),
              __config__=ConfigDict(json_schema_extra=
              {
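Note: get_tools now forwards vector-store indexer settings into BitbucketAPIWrapper, and the toolkit schema gains a connection_string field. A sketch of the settings dict get_tools appears to expect; only the keys visible in these hunks (password, branch, cloud, llm, alita, connection_string, id, toolkit_name) are confirmed, the others are assumed placeholders:

# Sketch only: illustrative toolkit entry for get_tools(); all values are placeholders.
tool = {
    "id": 42,                        # becomes the vector-store collection_name
    "toolkit_name": "my_bitbucket",
    "settings": {
        "url": "https://bitbucket.example.com",  # assumed key, not shown in this hunk
        "project": "PROJ",                       # assumed key
        "repository": "repo",                    # assumed key
        "username": "user",                      # assumed key
        "password": "app-password",
        "branch": "main",
        "cloud": False,
        # new in 0.3.210: optional indexer wiring
        "llm": None,
        "alita": None,
        "connection_string": "postgresql+psycopg://user:pass@host:5432/vectordb",
    },
}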
alita_sdk/tools/bitbucket/api_wrapper.py
@@ -41,6 +41,18 @@ class BitbucketAPIWrapper(BaseCodeToolApiWrapper):
      """Bitbucket installation type: true for cloud, false for server.
      """

+     llm: Optional[Any] = None
+     # Alita instance
+     alita: Optional[Any] = None
+
+     # Vector store configuration
+     connection_string: Optional[SecretStr] = None
+     collection_name: Optional[str] = None
+     doctype: Optional[str] = 'code'
+     embedding_model: Optional[str] = "HuggingFaceEmbeddings"
+     embedding_model_params: Optional[Dict[str, Any]] = {"model_name": "sentence-transformers/all-MiniLM-L6-v2"}
+     vectorstore_type: Optional[str] = "PGVector"
+
      @model_validator(mode='before')
      @classmethod
      def validate_env(cls, values: Dict) -> Dict:
@@ -59,7 +71,7 @@ class BitbucketAPIWrapper(BaseCodeToolApiWrapper):
              password=values['password'],
              workspace=values['project'],
              repository=values['repository']
-         ) if values['cloud'] else BitbucketServerApi(
+         ) if values.get('cloud') else BitbucketServerApi(
              url=values['url'],
              username=values['username'],
              password=values['password'],
@@ -213,16 +225,31 @@ class BitbucketAPIWrapper(BaseCodeToolApiWrapper):
          except Exception as e:
              return ToolException(f"Can't add comment to pull request `{pr_id}` due to error:\n{str(e)}")

-     def _get_files(self, file_path: str, branch: str) -> str:
+     def _get_files(self, path: str, branch: str) -> str:
          """
          Get files from the bitbucket repo
          Parameters:
-             file_path(str): the file path
+             path(str): the file path
              branch(str): branch name (by default: active_branch)
          Returns:
              str: List of the files
          """
-         return str(self._bitbucket.get_files_list(file_path=file_path if file_path else '', branch=branch if branch else self._active_branch))
+         return str(self._bitbucket.get_files_list(file_path=path if path else '', branch=branch if branch else self._active_branch))
+
+     # TODO: review this method, it may not work as expected
+     # def _file_commit_hash(self, file_path: str, branch: str):
+     #     """
+     #     Get the commit hash of a file in the gitlab repo
+     #     Parameters:
+     #         file_path(str): the file path
+     #         branch(str): branch name (by default: active_branch)
+     #     Returns:
+     #         str: The commit hash of the file
+     #     """
+     #     try:
+     #         return self._bitbucket.get_file_commit_hash(file_path=file_path, branch=branch)
+     #     except Exception as e:
+     #         raise ToolException(f"Can't extract file commit hash (`{file_path}`) due to error:\n{str(e)}")

      def _read_file(self, file_path: str, branch: str) -> str:
          """
alita_sdk/tools/bitbucket/cloud_api_wrapper.py
@@ -104,6 +104,22 @@ class BitbucketServerApi(BitbucketApiAbstract):
              data=json.loads(pr_json_data)
          )

+     # TODO: review this method, it may not work as expected
+     def get_file_commit_hash(self, file_path: str, branch: str):
+         """
+         Get the commit hash of a file in a specific branch.
+         Parameters:
+             file_path (str): The path to the file.
+             branch (str): The branch name.
+         Returns:
+             str: The commit hash of the file.
+         """
+         commits = self.api_client.get_commits(project_key=self.project, repository_slug=self.repository,
+                                               filename=file_path, at=branch, limit=1)
+         if commits:
+             return commits[0]['id']
+         return None
+
      def get_file(self, file_path: str, branch: str) -> str:
          return self.api_client.get_content_of_file(project_key=self.project, repository_slug=self.repository, at=branch,
                                                     filename=file_path).decode('utf-8')
@@ -262,6 +278,21 @@ class BitbucketCloudApi(BitbucketApiAbstract):
          response = self.repository.pullrequests.post(None, data=json.loads(pr_json_data))
          return response['links']['self']['href']

+     # TODO: review this method, it may not work as expected
+     def get_file_commit_hash(self, file_path: str, branch: str):
+         """
+         Get the commit hash of a file in a specific branch.
+         Parameters:
+             file_path (str): The path to the file.
+             branch (str): The branch name.
+         Returns:
+             str: The commit hash of the file.
+         """
+         commits = self.repository.commits.get(path=file_path, branch=branch, pagelen=1)
+         if commits['values']:
+             return commits['values'][0]['hash']
+         return None
+
      def get_file(self, file_path: str, branch: str) -> str:
          return self.repository.get(path=f'src/{branch}/{file_path}')

alita_sdk/tools/chunkers/code/codeparser.py
@@ -39,13 +39,17 @@ def parse_code_files_for_db(file_content_generator: Generator[str, None, None],
      if programming_language == Language.UNKNOWN:
          documents = TokenTextSplitter(encoding_name="gpt2", chunk_size=256, chunk_overlap=30).split_text(file_content)
          for document in documents:
+             metadata = {
+                 "filename": file_name,
+                 "method_name": node.name,
+                 "language": programming_language.value,
+             }
+             commit_hash = data.get("commit_hash")
+             if commit_hash is not None:
+                 metadata["commit_hash"] = commit_hash
              document = Document(
                  page_content=document,
-                 metadata={
-                     "filename": file_name,
-                     "method_name": 'text',
-                     "language": programming_language.value,
-                 },
+                 metadata=metadata,
              )
              yield document
      else:
@@ -73,13 +77,17 @@ def parse_code_files_for_db(file_content_generator: Generator[str, None, None],
              splitted_documents = code_splitter.split_text(method_source_code)

              for splitted_document in splitted_documents:
+                 metadata = {
+                     "filename": file_name,
+                     "method_name": node.name,
+                     "language": programming_language.value,
+                 }
+                 commit_hash = data.get("commit_hash")
+                 if commit_hash is not None:
+                     metadata["commit_hash"] = commit_hash
                  document = Document(
                      page_content=splitted_document,
-                     metadata={
-                         "filename": file_name,
-                         "method_name": node.name,
-                         "language": programming_language.value,
-                     },
+                     metadata=metadata,
                  )
                  yield document
          except Exception as e:
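Note: the chunker now copies an optional commit_hash from the incoming file record into every chunk's metadata. A sketch of what one item consumed by parse_code_files_for_db might look like; only the commit_hash lookup is visible in this diff, the other keys are assumed for illustration:

# Sketch only: hypothetical file record; commit_hash is optional and, when present,
# is propagated into the metadata of every generated chunk.
data = {
    "file_name": "src/utils/helpers.py",
    "file_content": "def add(a, b):\n    return a + b\n",
    "commit_hash": "9fceb02d0ae598e95dc970b74767f19372d61af8",
}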
alita_sdk/tools/confluence/api_wrapper.py
@@ -1,4 +1,3 @@
- import shortuuid
  import re
  import logging
  import requests
@@ -17,7 +16,7 @@ from langchain_core.messages import HumanMessage
  from markdownify import markdownify
  from langchain_community.document_loaders.confluence import ContentFormat

- from ..elitea_base import BaseVectorStoreToolApiWrapper, BaseIndexParams
+ from ..elitea_base import BaseVectorStoreToolApiWrapper, extend_with_vector_tools
  from ..llm.img_utils import ImageDescriptionCache
  from ..utils import is_cookie_token, parse_cookie_string

@@ -141,50 +140,6 @@ pageId = create_model(
          description="Optional JSON of parameters to be sent in request body or query params. MUST be string with valid JSON. For search/read operations, you MUST always get minimum fields and set max results, until users ask explicitly for more fields. For search/read operations you must generate CQL query string and pass it as params."))
  )

- # loaderParams = create_model(
- #     "LoaderParams",
- #     content_format=(str, Field(description="The format of the content to be retrieved.")),
- #     page_ids=(Optional[List[str]], Field(description="List of page IDs to retrieve.", default=None)),
- #     label=(Optional[str], Field(description="Label to filter pages.", default=None)),
- #     cql=(Optional[str], Field(description="CQL query to filter pages.", default=None)),
- #     include_restricted_content=(Optional[bool], Field(description="Include restricted content.", default=False)),
- #     include_archived_content=(Optional[bool], Field(description="Include archived content.", default=False)),
- #     include_attachments=(Optional[bool], Field(description="Include attachments.", default=False)),
- #     include_comments=(Optional[bool], Field(description="Include comments.", default=False)),
- #     include_labels=(Optional[bool], Field(description="Include labels.", default=False)),
- #     limit=(Optional[int], Field(description="Limit the number of results.", default=10)),
- #     max_pages=(Optional[int], Field(description="Maximum number of pages to retrieve.", default=1000)),
- #     ocr_languages=(Optional[str], Field(description="OCR languages for processing attachments.", default=None)),
- #     keep_markdown_format=(Optional[bool], Field(description="Keep the markdown format.", default=True)),
- #     keep_newlines=(Optional[bool], Field(description="Keep newlines in the content.", default=True)),
- #     bins_with_llm=(Optional[bool], Field(description="Use LLM for processing binary files.", default=False)),
- # )
-
- indexPagesParams = create_model(
-     "indexPagesParams",
-     __base__=BaseIndexParams,
-     content_format=(Literal['view', 'storage', 'export_view', 'editor', 'anonymous'],
-                     Field(description="The format of the content to be retrieved.")),
-     ### Loader Parameters
-     page_ids=(Optional[List[str]], Field(description="List of page IDs to retrieve.", default=None)),
-     label=(Optional[str], Field(description="Label to filter pages.", default=None)),
-     cql=(Optional[str], Field(description="CQL query to filter pages.", default=None)),
-     limit=(Optional[int], Field(description="Limit the number of results.", default=10)),
-     max_pages=(Optional[int], Field(description="Maximum number of pages to retrieve.", default=1000)),
-     include_restricted_content=(Optional[bool], Field(description="Include restricted content.", default=False)),
-     include_archived_content=(Optional[bool], Field(description="Include archived content.", default=False)),
-     include_attachments=(Optional[bool], Field(description="Include attachments.", default=False)),
-     include_comments=(Optional[bool], Field(description="Include comments.", default=False)),
-     include_labels=(Optional[bool], Field(description="Include labels.", default=True)),
-     ocr_languages=(Optional[str], Field(description="OCR languages for processing attachments.", default='eng')),
-     keep_markdown_format=(Optional[bool], Field(description="Keep the markdown format.", default=True)),
-     keep_newlines=(Optional[bool], Field(description="Keep newlines in the content.", default=True)),
-     bins_with_llm=(Optional[bool], Field(description="Use LLM for processing binary files.", default=False)),
-     ### Chunking Parameters
-     chunking_tool=(Literal['markdown', 'statistical', 'proposal'], Field(description="Name of chunking tool", default="markdown")),
-     chunking_config=(Optional[dict], Field(description="Chunking tool configuration", default_factory=dict)),
- )
-
  GetPageWithImageDescriptions = create_model(
      "GetPageWithImageDescriptionsModel",
      page_id=(str, Field(description="Confluence page ID from which content with images will be extracted")),
@@ -849,7 +804,7 @@ class ConfluenceAPIWrapper(BaseVectorStoreToolApiWrapper):
              docs.extend(batch)
          return docs[:max_pages]

-     def _loader(self, **kwargs) -> Generator[str, None, None]:
+     def _base_loader(self, **kwargs) -> Generator[Document, None, None]:
          """
          Loads content from Confluence based on parameters.
          Returns:
@@ -858,7 +813,15 @@ class ConfluenceAPIWrapper(BaseVectorStoreToolApiWrapper):
          from .loader import AlitaConfluenceLoader
          from copy import copy
          content_format = kwargs.get('content_format', 'view').lower()
+         base_params = {
+             'url': self.base_url,
+             'space_key': self.space,
+             'min_retry_seconds': self.min_retry_seconds,
+             'max_retry_seconds': self.max_retry_seconds,
+             'number_of_retries': self.number_of_retries
+         }
          confluence_loader_params = copy(kwargs)
+         confluence_loader_params.update(base_params)
          mapping = {
              'view': ContentFormat.VIEW,
              'storage': ContentFormat.STORAGE,
@@ -878,86 +841,9 @@ class ConfluenceAPIWrapper(BaseVectorStoreToolApiWrapper):
          for document in loader._lazy_load(kwargs={}):
              yield document

-     def index_data(self, content_format: str,
-                    collection_suffix: str = "",
-                    page_ids: Optional[List[str]] = None,
-                    label: Optional[str] = None,
-                    cql: Optional[str] = None,
-                    include_restricted_content: Optional[bool] = False,
-                    include_archived_content: Optional[bool] = False,
-                    include_attachments: Optional[bool] = False,
-                    include_comments: Optional[bool] = False,
-                    include_labels: Optional[bool] = False,
-                    limit: Optional[int] = 10,
-                    max_pages: Optional[int] = 10,
-                    keep_markdown_format: Optional[bool] = True,
-                    keep_newlines: Optional[bool] = True,
-                    bins_with_llm: bool = False,
-                    chunking_tool: str = "markdown",
-                    chunking_config: Optional[Dict[str, Any]] = None,
-                    **kwargs) -> Generator[str, None, None]:
-         """Load Confluence pages and index them in the vector store."""
-
-         from alita_sdk.tools.chunkers import __confluence_chunkers__ as chunkers, __confluence_models__ as models
-         try:
-             from alita_sdk.langchain.interfaces.llm_processor import get_embeddings
-         except ImportError:
-             from alita_sdk.runtime.langchain.interfaces.llm_processor import get_embeddings
-
-         loader_params = {
-             'url': self.base_url,
-             'space_key': self.space,
-             'content_format': content_format,
-             'page_ids': page_ids,
-             'label': label,
-             'cql': cql,
-             'include_restricted_content': include_restricted_content,
-             'include_archived_content': include_archived_content,
-             'include_attachments': include_attachments,
-             'include_comments': include_comments,
-             'include_labels': include_labels,
-             'limit': limit,
-             'max_pages': max_pages,
-             'keep_markdown_format': keep_markdown_format,
-             'keep_newlines': keep_newlines,
-             'bins_with_llm': bins_with_llm,
-             'min_retry_seconds': self.min_retry_seconds,
-             'max_retry_seconds': self.max_retry_seconds,
-             'number_of_retries': self.number_of_retries
-         }
-         documents = self._loader(**loader_params)
-         embedding = get_embeddings(self.embedding_model, self.embedding_model_params)
-
-         chunker = chunkers.get(chunking_tool)
-
-         chunking_config = chunking_config or {}
-
-         if chunker:
-             # Validate and prepare chunking configuration using Pydantic models
-             config_model = models.get(chunking_tool)
-             if config_model:
-                 # Set required fields that should come from the instance
-                 chunking_config['embedding'] = embedding
-                 chunking_config['llm'] = self.llm
-
-                 try:
-                     # Validate the configuration using the appropriate Pydantic model
-                     validated_config = config_model(**chunking_config)
-                     chunking_config = validated_config.model_dump()
-                 except Exception as e:
-                     logger.error(f"Invalid chunking configuration for {chunking_tool}: {e}")
-                     raise ToolException(f"Invalid chunking configuration: {e}")
-             else:
-                 # Fallback for chunkers without models
-                 chunking_config['embedding'] = embedding
-                 chunking_config['llm'] = self.llm
-
-             documents = chunker(documents, chunking_config)
-
-         # passing embedding to avoid re-initialization
-         vectorstore = self._init_vector_store(collection_suffix, embeddings=embedding)
-         return vectorstore.index_documents(documents)
-
+     def _process_document(self, document: Document) -> Generator[Document, None, None]:
+         for attachment in self.get_page_attachments(document.metadata.get('id')):
+             yield Document(page_content=attachment.get('content', ''), metadata=attachment.get('metadata', {}))

      def _download_image(self, image_url):
          """
@@ -1685,6 +1571,28 @@ class ConfluenceAPIWrapper(BaseVectorStoreToolApiWrapper):
              logger.error(f"Error retrieving attachments for page {page_id}: {str(e)}")
              return f"Error retrieving attachments: {str(e)}"

+     def _index_tool_params(self):
+         """Return the parameters for indexing data."""
+         return {
+             "content_format": (Literal['view', 'storage', 'export_view', 'editor', 'anonymous'],
+                                Field(description="The format of the content to be retrieved.")),
+             "page_ids": (Optional[List[str]], Field(description="List of page IDs to retrieve.", default=None)),
+             "label": (Optional[str], Field(description="Label to filter pages.", default=None)),
+             "cql": (Optional[str], Field(description="CQL query to filter pages.", default=None)),
+             "limit": (Optional[int], Field(description="Limit the number of results.", default=10)),
+             "max_pages": (Optional[int], Field(description="Maximum number of pages to retrieve.", default=1000)),
+             "include_restricted_content": (Optional[bool], Field(description="Include restricted content.", default=False)),
+             "include_archived_content": (Optional[bool], Field(description="Include archived content.", default=False)),
+             "include_attachments": (Optional[bool], Field(description="Include attachments.", default=False)),
+             "include_comments": (Optional[bool], Field(description="Include comments.", default=False)),
+             "include_labels": (Optional[bool], Field(description="Include labels.", default=True)),
+             "ocr_languages": (Optional[str], Field(description="OCR languages for processing attachments.", default='eng')),
+             "keep_markdown_format": (Optional[bool], Field(description="Keep the markdown format.", default=True)),
+             "keep_newlines": (Optional[bool], Field(description="Keep newlines in the content.", default=True)),
+             "bins_with_llm": (Optional[bool], Field(description="Use LLM for processing binary files.", default=False)),
+         }
+
+     @extend_with_vector_tools
      def get_available_tools(self):
          # Confluence-specific tools
          confluence_tools = [
@@ -1796,13 +1704,6 @@ class ConfluenceAPIWrapper(BaseVectorStoreToolApiWrapper):
                  "description": self.get_page_id_by_title.__doc__,
                  "args_schema": getPageIdByTitleInput,
              },
-             # Confluence-specific vector store indexing
-             {
-                 "name": "index_data",
-                 "ref": self.index_data,
-                 "description": self.index_data.__doc__,
-                 "args_schema": indexPagesParams,
-             },
              {
                  "name": "get_page_attachments",
                  "ref": self.get_page_attachments,
alita_sdk/tools/confluence/loader.py
@@ -10,8 +10,8 @@ from langchain_community.document_loaders import ConfluenceLoader
  from langchain_community.document_loaders.confluence import ContentFormat
  from langchain_core.messages import HumanMessage
  from pdf2image import convert_from_bytes
- from reportlab.graphics import renderPM
- from svglib.svglib import svg2rlg
+ # from reportlab.graphics import renderPM
+ # from svglib.svglib import svg2rlg

  from .utils import image_to_byte_array, bytes_to_base64

@@ -125,6 +125,7 @@ class AlitaConfluenceLoader(ConfluenceLoader):
              text = title + self.process_doc(absolute_url)
          elif media_type == "application/vnd.ms-excel":
              text = title + self.process_xls(absolute_url)
+         # TODO review usage
          # elif media_type == "image/svg+xml":
          #     text = title + self.process_svg(absolute_url, ocr_languages)
          else:
@@ -192,29 +193,30 @@ class AlitaConfluenceLoader(ConfluenceLoader):
          else:
              return super().process_image(link, ocr_languages)

-     def process_svg(
-         self,
-         link: str,
-         ocr_languages: Optional[str] = None,
-     ) -> str:
-         if self.bins_with_llm and self.llm:
-             response = self.confluence.request(path=link, absolute=True)
-             text = ""
-
-             if (
-                 response.status_code != 200
-                 or response.content == b""
-                 or response.content is None
-             ):
-                 return text
-
-             drawing = svg2rlg(BytesIO(response.content))
-
-             img_data = BytesIO()
-             renderPM.drawToFile(drawing, img_data, fmt="PNG")
-             img_data.seek(0)
-             image = Image.open(img_data)
-             result = self.__perform_llm_prediction_for_image(image)
-             return result
-         else:
-             return super().process_svg(link, ocr_languages)
+     # TODO review usage
+     # def process_svg(
+     #     self,
+     #     link: str,
+     #     ocr_languages: Optional[str] = None,
+     # ) -> str:
+     #     if self.bins_with_llm and self.llm:
+     #         response = self.confluence.request(path=link, absolute=True)
+     #         text = ""
+     #
+     #         if (
+     #             response.status_code != 200
+     #             or response.content == b""
+     #             or response.content is None
+     #         ):
+     #             return text
+     #
+     #         drawing = svg2rlg(BytesIO(response.content))
+     #
+     #         img_data = BytesIO()
+     #         renderPM.drawToFile(drawing, img_data, fmt="PNG")
+     #         img_data.seek(0)
+     #         image = Image.open(img_data)
+     #         result = self.__perform_llm_prediction_for_image(image)
+     #         return result
+     #     else:
+     #         return super().process_svg(link, ocr_languages)