alita-sdk 0.3.376__py3-none-any.whl → 0.3.435__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of alita-sdk might be problematic.

Files changed (60)
  1. alita_sdk/configurations/bitbucket.py +95 -0
  2. alita_sdk/configurations/confluence.py +96 -1
  3. alita_sdk/configurations/gitlab.py +79 -0
  4. alita_sdk/configurations/jira.py +103 -0
  5. alita_sdk/configurations/testrail.py +88 -0
  6. alita_sdk/configurations/xray.py +93 -0
  7. alita_sdk/configurations/zephyr_enterprise.py +93 -0
  8. alita_sdk/configurations/zephyr_essential.py +75 -0
  9. alita_sdk/runtime/clients/client.py +9 -4
  10. alita_sdk/runtime/clients/mcp_discovery.py +342 -0
  11. alita_sdk/runtime/clients/mcp_manager.py +262 -0
  12. alita_sdk/runtime/clients/sandbox_client.py +8 -0
  13. alita_sdk/runtime/langchain/assistant.py +41 -38
  14. alita_sdk/runtime/langchain/constants.py +5 -1
  15. alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
  16. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +4 -1
  17. alita_sdk/runtime/langchain/document_loaders/constants.py +28 -12
  18. alita_sdk/runtime/langchain/langraph_agent.py +91 -27
  19. alita_sdk/runtime/langchain/utils.py +24 -4
  20. alita_sdk/runtime/models/mcp_models.py +57 -0
  21. alita_sdk/runtime/toolkits/__init__.py +24 -0
  22. alita_sdk/runtime/toolkits/application.py +8 -1
  23. alita_sdk/runtime/toolkits/mcp.py +787 -0
  24. alita_sdk/runtime/toolkits/tools.py +98 -50
  25. alita_sdk/runtime/tools/__init__.py +7 -2
  26. alita_sdk/runtime/tools/application.py +7 -0
  27. alita_sdk/runtime/tools/function.py +20 -28
  28. alita_sdk/runtime/tools/graph.py +10 -4
  29. alita_sdk/runtime/tools/image_generation.py +104 -8
  30. alita_sdk/runtime/tools/llm.py +146 -114
  31. alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
  32. alita_sdk/runtime/tools/mcp_server_tool.py +79 -10
  33. alita_sdk/runtime/tools/sandbox.py +166 -63
  34. alita_sdk/runtime/tools/vectorstore.py +3 -2
  35. alita_sdk/runtime/tools/vectorstore_base.py +4 -3
  36. alita_sdk/runtime/utils/streamlit.py +34 -3
  37. alita_sdk/runtime/utils/toolkit_utils.py +5 -2
  38. alita_sdk/runtime/utils/utils.py +1 -0
  39. alita_sdk/tools/__init__.py +48 -31
  40. alita_sdk/tools/ado/work_item/ado_wrapper.py +17 -8
  41. alita_sdk/tools/base_indexer_toolkit.py +75 -66
  42. alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
  43. alita_sdk/tools/code_indexer_toolkit.py +13 -3
  44. alita_sdk/tools/confluence/api_wrapper.py +29 -7
  45. alita_sdk/tools/confluence/loader.py +10 -0
  46. alita_sdk/tools/elitea_base.py +7 -7
  47. alita_sdk/tools/gitlab/api_wrapper.py +11 -7
  48. alita_sdk/tools/jira/api_wrapper.py +1 -1
  49. alita_sdk/tools/openapi/__init__.py +10 -1
  50. alita_sdk/tools/qtest/api_wrapper.py +522 -74
  51. alita_sdk/tools/sharepoint/api_wrapper.py +104 -33
  52. alita_sdk/tools/sharepoint/authorization_helper.py +175 -1
  53. alita_sdk/tools/sharepoint/utils.py +8 -2
  54. alita_sdk/tools/utils/content_parser.py +27 -16
  55. alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +19 -6
  56. {alita_sdk-0.3.376.dist-info → alita_sdk-0.3.435.dist-info}/METADATA +1 -1
  57. {alita_sdk-0.3.376.dist-info → alita_sdk-0.3.435.dist-info}/RECORD +60 -55
  58. {alita_sdk-0.3.376.dist-info → alita_sdk-0.3.435.dist-info}/WHEEL +0 -0
  59. {alita_sdk-0.3.376.dist-info → alita_sdk-0.3.435.dist-info}/licenses/LICENSE +0 -0
  60. {alita_sdk-0.3.376.dist-info → alita_sdk-0.3.435.dist-info}/top_level.txt +0 -0
@@ -90,62 +90,79 @@ available_count = len(AVAILABLE_TOOLS)
 total_attempted = len(AVAILABLE_TOOLS) + len(FAILED_IMPORTS)
 logger.info(f"Tool imports completed: {available_count}/{total_attempted} successful")
 
+
 def get_tools(tools_list, alita, llm, store: Optional[BaseStore] = None, *args, **kwargs):
     tools = []
+
     for tool in tools_list:
-        # validate tool name syntax - it cannot be started with _
-        for tool_name in tool.get('settings', {}).get('selected_tools', []):
-            if isinstance(tool_name, str) and tool_name.startswith('_'):
-                raise ValueError(f"Tool name '{tool_name}' from toolkit '{tool.get('type', '')}' cannot start with '_'")
-
-        tool['settings']['alita'] = alita
-        tool['settings']['llm'] = llm
-        tool['settings']['store'] = store
+        settings = tool.get('settings')
+
+        # Skip tools without settings early
+        if not settings:
+            logger.warning(f"Tool '{tool.get('type', '')}' has no settings, skipping...")
+            continue
+
+        # Validate tool names once
+        selected_tools = settings.get('selected_tools', [])
+        invalid_tools = [name for name in selected_tools if isinstance(name, str) and name.startswith('_')]
+        if invalid_tools:
+            raise ValueError(f"Tool names {invalid_tools} from toolkit '{tool.get('type', '')}' cannot start with '_'")
+
+        # Cache tool type and add common settings
         tool_type = tool['type']
+        settings['alita'] = alita
+        settings['llm'] = llm
+        settings['store'] = store
+
+        # Set pgvector collection schema if present
+        if settings.get('pgvector_configuration'):
+            settings['pgvector_configuration']['collection_schema'] = str(tool['id'])
 
-        # Handle special cases for ADO tools
+        # Handle ADO special cases
         if tool_type in ['ado_boards', 'ado_wiki', 'ado_plans']:
             tools.extend(AVAILABLE_TOOLS['ado']['get_tools'](tool_type, tool))
+            continue
 
-        # Check if tool is available and has get_tools function
-        elif tool_type in AVAILABLE_TOOLS and 'get_tools' in AVAILABLE_TOOLS[tool_type]:
+        # Handle ADO repos aliases
+        if tool_type in ['ado_repos', 'azure_devops_repos'] and 'ado_repos' in AVAILABLE_TOOLS:
             try:
-                get_tools_func = AVAILABLE_TOOLS[tool_type]['get_tools']
-                tools.extend(get_tools_func(tool))
-
+                tools.extend(AVAILABLE_TOOLS['ado_repos']['get_tools'](tool))
             except Exception as e:
-                logger.error(f"Error getting tools for {tool_type}: {e}")
-                raise ToolException(f"Error getting tools for {tool_type}: {e}")
+                logger.error(f"Error getting ADO repos tools: {e}")
+            continue
 
-        # Handle ADO repos special case (it might be requested as azure_devops_repos)
-        elif tool_type in ['ado_repos', 'azure_devops_repos'] and 'ado_repos' in AVAILABLE_TOOLS:
+        # Skip MCP toolkit - it's handled by runtime/toolkits/tools.py to avoid duplicate loading
+        if tool_type == 'mcp':
+            logger.debug(f"Skipping MCP toolkit '{tool.get('toolkit_name')}' - handled by runtime toolkit system")
+            continue
+
+        # Handle standard tools
+        if tool_type in AVAILABLE_TOOLS and 'get_tools' in AVAILABLE_TOOLS[tool_type]:
             try:
-                get_tools_func = AVAILABLE_TOOLS['ado_repos']['get_tools']
-                tools.extend(get_tools_func(tool))
+                tools.extend(AVAILABLE_TOOLS[tool_type]['get_tools'](tool))
             except Exception as e:
-                logger.error(f"Error getting ADO repos tools: {e}")
+                logger.error(f"Error getting tools for {tool_type}: {e}")
+                raise ToolException(f"Error getting tools for {tool_type}: {e}")
+            continue
 
         # Handle custom modules
-        elif tool.get("settings", {}).get("module"):
+        if settings.get("module"):
             try:
-                settings = tool.get("settings", {})
                 mod = import_module(settings.pop("module"))
                 tkitclass = getattr(mod, settings.pop("class"))
-                #
-                get_toolkit_params = tool["settings"].copy()
+                get_toolkit_params = settings.copy()
                 get_toolkit_params["name"] = tool.get("name")
-                #
                 toolkit = tkitclass.get_toolkit(**get_toolkit_params)
                 tools.extend(toolkit.get_tools())
             except Exception as e:
                 logger.error(f"Error in getting custom toolkit: {e}")
+            continue
 
+        # Tool not available
+        if tool_type in FAILED_IMPORTS:
+            logger.warning(f"Tool '{tool_type}' is not available: {FAILED_IMPORTS[tool_type]}")
         else:
-            # Tool not available or not found
-            if tool_type in FAILED_IMPORTS:
-                logger.warning(f"Tool '{tool_type}' is not available: {FAILED_IMPORTS[tool_type]}")
-            else:
-                logger.warning(f"Unknown tool type: {tool_type}")
+            logger.warning(f"Unknown tool type: {tool_type}")
 
     return tools
 
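For orientation, here is a minimal sketch of the guard-clause dispatch pattern the hunk above moves to: each toolkit entry is expected to carry a type, an id, and a settings dict, and entries without settings are skipped instead of failing. The AVAILABLE_TOOLS/FAILED_IMPORTS stand-ins and the entry shape below are illustrative only, not the SDK's real registries.

# Hypothetical, simplified stand-in for the dispatch loop above.
import logging

logger = logging.getLogger(__name__)

AVAILABLE_TOOLS = {"jira": {"get_tools": lambda tool: [f"jira tools for toolkit {tool['id']}"]}}
FAILED_IMPORTS = {"qtest": "ModuleNotFoundError: qtest client missing"}

def get_tools_sketch(tools_list):
    tools = []
    for tool in tools_list:
        settings = tool.get("settings")
        if not settings:  # entries without settings are skipped, not fatal
            logger.warning("Tool '%s' has no settings, skipping", tool.get("type", ""))
            continue
        tool_type = tool["type"]
        if tool_type in AVAILABLE_TOOLS:  # standard path
            tools.extend(AVAILABLE_TOOLS[tool_type]["get_tools"](tool))
            continue
        if tool_type in FAILED_IMPORTS:  # import failed at startup
            logger.warning("Tool '%s' is not available: %s", tool_type, FAILED_IMPORTS[tool_type])
        else:
            logger.warning("Unknown tool type: %s", tool_type)
    return tools

print(get_tools_sketch([{"id": 7, "type": "jira", "settings": {"selected_tools": []}}]))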
@@ -329,11 +329,14 @@ class AzureDevOpsApiWrapper(NonCodeIndexerToolkit):
         parsed_item.update(fields_data)
 
         # extract relations if any
-        relations_data = work_item.relations
+        relations_data = None
+        if expand and str(expand).lower() in ("relations", "all"):
+            try:
+                relations_data = getattr(work_item, 'relations', None)
+            except KeyError:
+                relations_data = None
         if relations_data:
-            parsed_item['relations'] = []
-            for relation in relations_data:
-                parsed_item['relations'].append(relation.as_dict())
+            parsed_item['relations'] = [relation.as_dict() for relation in relations_data]
 
         if parse_attachments:
             # describe images in work item fields if present
@@ -344,13 +347,19 @@ class AzureDevOpsApiWrapper(NonCodeIndexerToolkit):
                 for img in images:
                     src = img.get('src')
                     if src:
-                        description = self.parse_attachment_by_url(src, image_description_prompt)
+                        description = self.parse_attachment_by_url(src, image_description_prompt=image_description_prompt)
                         img['image-description'] = description
                 parsed_item[field_name] = str(soup)
             # parse attached documents if present
-            if parsed_item['relations']:
-                for attachment in parsed_item['relations']:
-                    attachment['content'] = self.parse_attachment_by_url(attachment['url'], attachment['attributes']['name'], image_description_prompt)
+            for relation in parsed_item.get('relations', []):
+                # Only process actual file attachments
+                if relation.get('rel') == 'AttachedFile':
+                    file_name = relation.get('attributes', {}).get('name')
+                    if file_name:
+                        try:
+                            relation['content'] = self.parse_attachment_by_url(relation['url'], file_name, image_description_prompt=image_description_prompt)
+                        except Exception as att_e:
+                            logger.warning(f"Failed to parse attachment {file_name}: {att_e}")
 
 
         return parsed_item
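The hunks above make AzureDevOpsApiWrapper fetch relations only when the expand argument asks for them, and parse content only for relations whose rel is 'AttachedFile', so ordinary work-item links are left alone and one broken attachment no longer fails the whole item. A standalone sketch of that filtering, with hypothetical relation dicts and a stubbed parser in place of parse_attachment_by_url:

# Sketch only: relation dicts and the parser callable are illustrative stand-ins.
def describe_attachments(parsed_item, parse_attachment_by_url, image_description_prompt=None):
    for relation in parsed_item.get('relations', []):
        # Only 'AttachedFile' relations point at downloadable attachments;
        # hierarchy/related links are skipped.
        if relation.get('rel') != 'AttachedFile':
            continue
        file_name = relation.get('attributes', {}).get('name')
        if not file_name:
            continue
        try:
            relation['content'] = parse_attachment_by_url(
                relation['url'], file_name, image_description_prompt=image_description_prompt)
        except Exception as exc:  # a single bad attachment does not abort the item
            relation['content'] = f"failed to parse: {exc}"

item = {'relations': [
    {'rel': 'AttachedFile', 'url': 'https://dev.azure.com/att/1', 'attributes': {'name': 'spec.docx'}},
    {'rel': 'System.LinkTypes.Related', 'url': 'https://dev.azure.com/wit/2', 'attributes': {}},
]}
describe_attachments(item, lambda url, name, **_: f"parsed {name} from {url}")
print(item['relations'][0]['content'])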
@@ -7,7 +7,6 @@ from typing import Any, Optional, List, Dict, Generator
 from langchain_core.documents import Document
 from pydantic import create_model, Field, SecretStr
 
-from .utils import make_json_serializable
 from .utils.content_parser import file_extension_by_chunker, process_document_by_type
 from .vector_adapters.VectorStoreAdapter import VectorStoreAdapterFactory
 from ..runtime.langchain.document_loaders.constants import loaders_allowed_to_override
@@ -111,7 +110,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def __init__(self, **kwargs):
         conn = kwargs.get('connection_string', None)
         connection_string = conn.get_secret_value() if isinstance(conn, SecretStr) else conn
-        collection_name = kwargs.get('collection_name')
+        collection_name = kwargs.get('collection_schema')
 
         if 'vectorstore_type' not in kwargs:
             kwargs['vectorstore_type'] = 'PGVector'
@@ -152,39 +151,45 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
 
     def index_data(self, **kwargs):
         index_name = kwargs.get("index_name")
-        progress_step = kwargs.get("progress_step")
         clean_index = kwargs.get("clean_index")
         chunking_tool = kwargs.get("chunking_tool")
         chunking_config = kwargs.get("chunking_config")
+        result = {"count": 0}
         #
-        if clean_index:
-            self._clean_index(index_name)
-        #
-        self.index_meta_init(index_name, kwargs)
-        #
-        self._log_tool_event(f"Indexing data into collection with suffix '{index_name}'. It can take some time...")
-        self._log_tool_event(f"Loading the documents to index...{kwargs}")
-        documents = self._base_loader(**kwargs)
-        documents = list(documents) # consume/exhaust generator to count items
-        documents_count = len(documents)
-        documents = (doc for doc in documents)
-        self._log_tool_event(f"Base documents were pre-loaded. "
-                             f"Search for possible document duplicates and remove them from the indexing list...")
-        documents = self._reduce_duplicates(documents, index_name)
-        self._log_tool_event(f"Duplicates were removed. "
-                             f"Processing documents to collect dependencies and prepare them for indexing...")
-        result = self._save_index_generator(documents, documents_count, chunking_tool, chunking_config, index_name=index_name, progress_step=progress_step)
-        #
-        self.index_meta_update(index_name, IndexerKeywords.INDEX_META_COMPLETED.value, result)
-        #
-        return {"status": "ok", "message": f"successfully indexed {result} documents"}
-
-    def _save_index_generator(self, base_documents: Generator[Document, None, None], base_total: int, chunking_tool, chunking_config, index_name: Optional[str] = None, progress_step: int = 20):
+        try:
+            if clean_index:
+                self._clean_index(index_name)
+            #
+            self.index_meta_init(index_name, kwargs)
+            #
+            self._log_tool_event(f"Indexing data into collection with suffix '{index_name}'. It can take some time...")
+            self._log_tool_event(f"Loading the documents to index...{kwargs}")
+            documents = self._base_loader(**kwargs)
+            documents = list(documents) # consume/exhaust generator to count items
+            documents_count = len(documents)
+            documents = (doc for doc in documents)
+            self._log_tool_event(f"Base documents were pre-loaded. "
+                                 f"Search for possible document duplicates and remove them from the indexing list...")
+            documents = self._reduce_duplicates(documents, index_name)
+            self._log_tool_event(f"Duplicates were removed. "
+                                 f"Processing documents to collect dependencies and prepare them for indexing...")
+            self._save_index_generator(documents, documents_count, chunking_tool, chunking_config, index_name=index_name, result=result)
+            #
+            results_count = result["count"]
+            self.index_meta_update(index_name, IndexerKeywords.INDEX_META_COMPLETED.value, results_count)
+            #
+            return {"status": "ok", "message": f"successfully indexed {results_count} documents" if results_count > 0
+                    else "no new documents to index"}
+        except Exception as e:
+            self.index_meta_update(index_name, IndexerKeywords.INDEX_META_FAILED.value, result["count"])
+            raise e
+
+
+    def _save_index_generator(self, base_documents: Generator[Document, None, None], base_total: int, chunking_tool, chunking_config, result, index_name: Optional[str] = None):
         self._log_tool_event(f"Base documents are ready for indexing. {base_total} base documents in total to index.")
         from ..runtime.langchain.interfaces.llm_processor import add_documents
         #
         base_doc_counter = 0
-        total_counter = 0
         pg_vector_add_docs_chunk = []
         for base_doc in base_documents:
             base_doc_counter += 1
@@ -232,10 +237,9 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             msg = f"Indexed base document #{base_doc_counter} out of {base_total} (with {dependent_docs_counter} dependencies)."
             logger.debug(msg)
             self._log_tool_event(msg)
-            total_counter += dependent_docs_counter
+            result["count"] += dependent_docs_counter
         if pg_vector_add_docs_chunk:
             add_documents(vectorstore=self.vectorstore, documents=pg_vector_add_docs_chunk)
-        return total_counter
 
     def _apply_loaders_chunkers(self, documents: Generator[Document, None, None], chunking_tool: str=None, chunking_config=None) -> Generator[Document, None, None]:
         from ..tools.chunkers import __all__ as chunkers
@@ -343,7 +347,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         """Cleans the indexed data in the collection."""
         super()._clean_collection(index_name=index_name)
         return (f"Collection '{index_name}' has been removed from the vector store.\n"
-                f"Available collections: {self.list_indexes()}") if index_name \
+                f"Available collections: {self.list_collections()}") if index_name \
             else "All collections have been removed from the vector store."
 
     def _build_collection_filter(self, filter: dict | str, index_name: str = "") -> dict:
@@ -385,7 +389,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         """ Searches indexed documents in the vector store."""
         # build filter on top of index_name
 
-        available_collections = super().list_indexes()
+        available_collections = super().list_collections()
         if index_name and index_name not in available_collections:
             return f"Collection '{index_name}' not found. Available collections: {available_collections}"
 
@@ -454,37 +458,28 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         )
 
     def index_meta_init(self, index_name: str, index_configuration: dict[str, Any]):
-        index_meta_raw = super().get_index_meta(index_name)
-        from ..runtime.langchain.interfaces.llm_processor import add_documents
-        created_on = time.time()
-        metadata = {
-            "collection": index_name,
-            "type": IndexerKeywords.INDEX_META_TYPE.value,
-            "indexed": 0,
-            "state": IndexerKeywords.INDEX_META_IN_PROGRESS.value,
-            "index_configuration": index_configuration,
-            "created_on": created_on,
-            "updated_on": created_on,
-            "history": "[]",
-        }
-        index_meta_ids = None
-        #
-        if index_meta_raw:
-            history_raw = index_meta_raw.get("metadata", {}).get("history", "[]")
-            if isinstance(history_raw, str) and history_raw.strip():
-                try:
-                    history = json.loads(history_raw)
-                except (json.JSONDecodeError, TypeError):
-                    history = []
-            else:
-                history = []
-            new_history_item = {k: v for k, v in index_meta_raw.get("metadata", {}).items() if k != "history"}
-            history.append(new_history_item)
-            metadata["history"] = json.dumps(history)
-            index_meta_ids = [index_meta_raw.get("id")]
-        #
-        index_meta_doc = Document(page_content=f"{IndexerKeywords.INDEX_META_TYPE.value}_{index_name}", metadata=metadata)
-        add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc], ids=index_meta_ids)
+        index_meta = super().get_index_meta(index_name)
+        if not index_meta:
+            self._log_tool_event(
+                f"There is no existing index_meta for collection '{index_name}'. Initializing it.",
+                tool_name="index_data"
+            )
+            from ..runtime.langchain.interfaces.llm_processor import add_documents
+            created_on = time.time()
+            metadata = {
+                "collection": index_name,
+                "type": IndexerKeywords.INDEX_META_TYPE.value,
+                "indexed": 0,
+                "state": IndexerKeywords.INDEX_META_IN_PROGRESS.value,
+                "index_configuration": index_configuration,
+                "created_on": created_on,
+                "updated_on": created_on,
+                "task_id": None,
+                "conversation_id": None,
+            }
+            metadata["history"] = json.dumps([metadata])
+            index_meta_doc = Document(page_content=f"{IndexerKeywords.INDEX_META_TYPE.value}_{index_name}", metadata=metadata)
+            add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc])
 
     def index_meta_update(self, index_name: str, state: str, result: int):
         index_meta_raw = super().get_index_meta(index_name)
@@ -495,6 +490,20 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         metadata["indexed"] = result
         metadata["state"] = state
         metadata["updated_on"] = time.time()
+        #
+        history_raw = metadata.pop("history", "[]")
+        try:
+            history = json.loads(history_raw) if history_raw.strip() else []
+            # replace the last history item with updated metadata
+            if history and isinstance(history, list):
+                history[-1] = metadata
+            else:
+                history = [metadata]
+        except (json.JSONDecodeError, TypeError):
+            logger.warning(f"Failed to load index history: {history_raw}. Create new with only current item.")
+            history = [metadata]
+        #
+        metadata["history"] = json.dumps(history)
         index_meta_doc = Document(page_content=index_meta_raw.get("content", ""), metadata=metadata)
         add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc], ids=[index_meta_raw.get("id")])
 
@@ -547,10 +556,10 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
                 "args_schema": RemoveIndexParams
             },
             {
-                "name": "list_indexes",
-                "mode": "list_indexes",
-                "ref": self.list_indexes,
-                "description": self.list_indexes.__doc__,
+                "name": "list_collections",
+                "mode": "list_collections",
+                "ref": self.list_collections,
+                "description": self.list_collections.__doc__,
                 "args_schema": create_model("ListCollectionsParams") # No parameters
             },
         ]
@@ -6,7 +6,7 @@ from langchain_core.prompts import ChatPromptTemplate
 from langchain.text_splitter import TokenTextSplitter
 
 from typing import Optional, List
-from langchain_core.pydantic_v1 import BaseModel
+from pydantic import BaseModel
 from ..utils import tiktoken_length
 
 logger = getLogger(__name__)
@@ -1,5 +1,6 @@
 import ast
 import fnmatch
+import json
 import logging
 from typing import Optional, List, Generator
 
@@ -21,7 +22,7 @@ class CodeIndexerToolkit(BaseIndexerToolkit):
         return self.vector_adapter.get_code_indexed_data(self, index_name)
 
     def key_fn(self, document: Document):
-        return document.metadata.get('id')
+        return document.metadata.get("filename")
 
     def compare_fn(self, document: Document, idx_data):
         return (document.metadata.get('commit_hash') and
@@ -46,7 +47,7 @@ class CodeIndexerToolkit(BaseIndexerToolkit):
         )
 
     def _extend_data(self, documents: Generator[Document, None, None]):
-        yield from parse_code_files_for_db(documents)
+        yield from documents
 
     def _index_tool_params(self):
         """Return the parameters for indexing data."""
@@ -117,6 +118,15 @@ class CodeIndexerToolkit(BaseIndexerToolkit):
                 if not file_content:
                     # empty file, skip
                     continue
+                #
+                # ensure file content is a string
+                if isinstance(file_content, bytes):
+                    file_content = file_content.decode("utf-8", errors="ignore")
+                elif isinstance(file_content, dict) and file.endswith('.json'):
+                    file_content = json.dumps(file_content)
+                elif not isinstance(file_content, str):
+                    file_content = str(file_content)
+                #
                 # hash the file content to ensure uniqueness
                 import hashlib
                 file_hash = hashlib.sha256(file_content.encode("utf-8")).hexdigest()
@@ -127,7 +137,7 @@ class CodeIndexerToolkit(BaseIndexerToolkit):
                 self._log_tool_event(message=f"{idx} out of {total_files} files have been read", tool_name="loader")
             self._log_tool_event(message=f"{len(_files)} have been read", tool_name="loader")
 
-        return file_content_generator()
+        return parse_code_files_for_db(file_content_generator())
 
     def __handle_get_files(self, path: str, branch: str):
         """
@@ -7,12 +7,14 @@ from json import JSONDecodeError
 from typing import Optional, List, Any, Dict, Callable, Generator, Literal
 
 import requests
+from atlassian.errors import ApiError
 from langchain_community.document_loaders.confluence import ContentFormat
 from langchain_core.documents import Document
 from langchain_core.messages import HumanMessage
 from langchain_core.tools import ToolException
 from markdownify import markdownify
 from pydantic import Field, PrivateAttr, model_validator, create_model, SecretStr
+from requests import HTTPError
 from tenacity import retry, stop_after_attempt, wait_exponential, before_sleep_log
 
 from alita_sdk.tools.non_code_indexer_toolkit import NonCodeIndexerToolkit
@@ -194,6 +196,7 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
     keep_markdown_format: Optional[bool] = True
     ocr_languages: Optional[str] = None
     keep_newlines: Optional[bool] = True
+    _errors: Optional[list[str]] = None
     _image_cache: ImageDescriptionCache = PrivateAttr(default_factory=ImageDescriptionCache)
 
     @model_validator(mode='before')
@@ -498,7 +501,9 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
         restrictions = self.client.get_all_restrictions_for_content(page["id"])
 
         return (
-            page["status"] == "current"
+            (page["status"] == "current"
+             # allow user to see archived content if needed
+             or page["status"] == "archived")
             and not restrictions["read"]["restrictions"]["user"]["results"]
             and not restrictions["read"]["restrictions"]["group"]["results"]
         )
@@ -518,18 +523,35 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
                 ),
                 before_sleep=before_sleep_log(logger, logging.WARNING),
             )(self.client.get_page_by_id)
-            page = get_page(
-                page_id=page_id, expand=f"{self.content_format.value},version"
-            )
-            if not self.include_restricted_content and not self.is_public_page(page):
-                continue
+            try:
+                page = get_page(
+                    page_id=page_id, expand=f"{self.content_format.value},version"
+                )
+            except (ApiError, HTTPError) as e:
+                logger.error(f"Error fetching page with ID {page_id}: {e}")
+                page_content_temp = f"Confluence API Error: cannot fetch the page with ID {page_id}: {e}"
+                # store errors
+                if self._errors is None:
+                    self._errors = []
+                self._errors.append(page_content_temp)
+                return Document(page_content=page_content_temp,
+                                metadata={})
+            # TODO: update on toolkit advanced settings level as a separate feature
+            # if not self.include_restricted_content and not self.is_public_page(page):
+            #     continue
             yield self.process_page(page, skip_images)
 
+    def _log_errors(self):
+        """ Log errors encountered during toolkit execution. """
+        if self._errors:
+            logger.info(f"Errors encountered during toolkit execution: {self._errors}")
+
     def read_page_by_id(self, page_id: str, skip_images: bool = False):
         """Reads a page by its id in the Confluence space. If id is not available, but there is a title - use get_page_id first."""
         result = list(self.get_pages_by_id([page_id], skip_images))
         if not result:
-            "Page not found"
+            return f"Pages not found. Errors: {self._errors}" if self._errors \
+                else "Pages not found or you do not have access to them."
         return result[0].page_content
         # return self._strip_base64_images(result[0].page_content) if skip_images else result[0].page_content
 
@@ -3,6 +3,7 @@ from typing import Optional, List
 from logging import getLogger
 
 import requests
+from langchain_core.documents import Document
 
 logger = getLogger(__name__)
 from PIL import Image
@@ -193,6 +194,15 @@ class AlitaConfluenceLoader(ConfluenceLoader):
         else:
             return super().process_image(link, ocr_languages)
 
+    def process_page(self, page: dict, include_attachments: bool, include_comments: bool, include_labels: bool,
+                     content_format: ContentFormat, ocr_languages: Optional[str] = None,
+                     keep_markdown_format: Optional[bool] = False, keep_newlines: bool = False) -> Document:
+        if not page.get("title"):
+            # if 'include_restricted_content' set to True, draft pages are loaded and can have no title
+            page["title"] = "Untitled"
+        return super().process_page(page, include_attachments, include_comments, include_labels, content_format,
+                                    ocr_languages, keep_markdown_format, keep_newlines)
+
     # TODO review usage
     # def process_svg(
     #     self,
@@ -17,7 +17,7 @@ from ..runtime.utils.utils import IndexerKeywords
 
 logger = logging.getLogger(__name__)
 
-INDEX_TOOL_NAMES = ['index_data', 'remove_index', 'list_indexes', 'search_index', 'stepback_search_index',
+INDEX_TOOL_NAMES = ['index_data', 'remove_index', 'list_collections', 'search_index', 'stepback_search_index',
                     'stepback_summary_index']
 
 LoaderSchema = create_model(
@@ -403,9 +403,9 @@ class BaseVectorStoreToolApiWrapper(BaseToolApiWrapper):
         """Cleans the indexed data in the collection."""
         self._init_vector_store()._clean_collection(index_name=index_name)
         return (f"Collection '{index_name}' has been removed from the vector store.\n"
-                f"Available collections: {self.list_indexes()}")
+                f"Available collections: {self.list_collections()}")
 
-    def list_indexes(self):
+    def list_collections(self):
         """Lists all collections in the vector store."""
         vectorstore_wrapper = self._init_vector_store()
         return vectorstore_wrapper.list_collections()
@@ -537,10 +537,10 @@ class BaseVectorStoreToolApiWrapper(BaseToolApiWrapper):
                 "args_schema": RemoveIndexParams
             },
             {
-                "name": "list_indexes",
-                "mode": "list_indexes",
-                "ref": self.list_indexes,
-                "description": self.list_indexes.__doc__,
+                "name": "list_collections",
+                "mode": "list_collections",
+                "ref": self.list_collections,
+                "description": self.list_collections.__doc__,
                 "args_schema": create_model("ListCollectionsParams") # No parameters
             },
 
@@ -117,7 +117,11 @@ class GitLabAPIWrapper(CodeIndexerToolkit):
 
     @model_validator(mode='before')
     @classmethod
-    def validate_toolkit(cls, values: Dict) -> Dict:
+    def validate_toolkit_before(cls, values: Dict) -> Dict:
+        return super().validate_toolkit(values)
+
+    @model_validator(mode='after')
+    def validate_toolkit(self):
         try:
             import gitlab
         except ImportError:
@@ -125,17 +129,17 @@ class GitLabAPIWrapper(CodeIndexerToolkit):
                 "python-gitlab is not installed. "
                 "Please install it with `pip install python-gitlab`"
             )
-        values['repository'] = cls._sanitize_url(values['repository'])
+        self.repository = self._sanitize_url(self.repository)
         g = gitlab.Gitlab(
-            url=cls._sanitize_url(values['url']),
-            private_token=values['private_token'],
+            url=self._sanitize_url(self.url),
+            private_token=self.private_token.get_secret_value(),
             keep_base_url=True,
         )
 
         g.auth()
-        cls._git = g
-        cls._active_branch = values.get('branch')
-        return super().validate_toolkit(values)
+        self._git = g
+        self._active_branch = self.branch
+        return self
 
     @property
     def repo_instance(self):
@@ -563,7 +563,7 @@ class JiraApiWrapper(NonCodeIndexerToolkit):
         Use the appropriate issue link type (e.g., "Test", "Relates", "Blocks").
         If we use "Test" linktype, the test is inward issue, the story/other issue is outward issue.."""
 
-        comment = "This test is linked to the story."
+        comment = f"Issue {inward_issue_key} was linked to {outward_issue_key}."
         comment_body = {"content": [{"content": [{"text": comment,"type": "text"}],"type": "paragraph"}],"type": "doc","version": 1} if self.api_version == "3" else comment
         link_data = {
             "type": {"name": f"{linktype}"},
@@ -1,6 +1,7 @@
 import json
 import re
 import logging
+import yaml
 from typing import List, Any, Optional, Dict
 from langchain_core.tools import BaseTool, BaseToolkit, ToolException
 from requests_openapi import Operation, Client, Server
@@ -101,7 +102,15 @@ class AlitaOpenAPIToolkit(BaseToolkit):
         else:
             tools_set = {}
         if isinstance(openapi_spec, str):
-            openapi_spec = json.loads(openapi_spec)
+            # Try to detect if it's YAML or JSON by attempting to parse as JSON first
+            try:
+                openapi_spec = json.loads(openapi_spec)
+            except json.JSONDecodeError:
+                # If JSON parsing fails, try YAML
+                try:
+                    openapi_spec = yaml.safe_load(openapi_spec)
+                except yaml.YAMLError as e:
+                    raise ToolException(f"Failed to parse OpenAPI spec as JSON or YAML: {e}")
         c = Client()
         c.load_spec(openapi_spec)
         if headers:
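With this change, AlitaOpenAPIToolkit accepts OpenAPI specs passed as either JSON or YAML text by trying json.loads first and falling back to yaml.safe_load. A standalone sketch of that fallback (toy spec strings, requires PyYAML; the helper name is illustrative):

# Sketch of the JSON-first, YAML-fallback spec parsing added above.
import json
import yaml

def load_spec(text: str) -> dict:
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        try:
            return yaml.safe_load(text)
        except yaml.YAMLError as e:
            raise ValueError(f"Failed to parse OpenAPI spec as JSON or YAML: {e}")

print(load_spec('{"openapi": "3.0.0"}')["openapi"])
print(load_spec("openapi: 3.0.0\ninfo:\n  title: demo\n")["info"]["title"])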