alita-sdk 0.3.351__py3-none-any.whl → 0.3.499__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (206)
  1. alita_sdk/cli/__init__.py +10 -0
  2. alita_sdk/cli/__main__.py +17 -0
  3. alita_sdk/cli/agent/__init__.py +5 -0
  4. alita_sdk/cli/agent/default.py +258 -0
  5. alita_sdk/cli/agent_executor.py +155 -0
  6. alita_sdk/cli/agent_loader.py +215 -0
  7. alita_sdk/cli/agent_ui.py +228 -0
  8. alita_sdk/cli/agents.py +3601 -0
  9. alita_sdk/cli/callbacks.py +647 -0
  10. alita_sdk/cli/cli.py +168 -0
  11. alita_sdk/cli/config.py +306 -0
  12. alita_sdk/cli/context/__init__.py +30 -0
  13. alita_sdk/cli/context/cleanup.py +198 -0
  14. alita_sdk/cli/context/manager.py +731 -0
  15. alita_sdk/cli/context/message.py +285 -0
  16. alita_sdk/cli/context/strategies.py +289 -0
  17. alita_sdk/cli/context/token_estimation.py +127 -0
  18. alita_sdk/cli/formatting.py +182 -0
  19. alita_sdk/cli/input_handler.py +419 -0
  20. alita_sdk/cli/inventory.py +1256 -0
  21. alita_sdk/cli/mcp_loader.py +315 -0
  22. alita_sdk/cli/toolkit.py +327 -0
  23. alita_sdk/cli/toolkit_loader.py +85 -0
  24. alita_sdk/cli/tools/__init__.py +43 -0
  25. alita_sdk/cli/tools/approval.py +224 -0
  26. alita_sdk/cli/tools/filesystem.py +1751 -0
  27. alita_sdk/cli/tools/planning.py +389 -0
  28. alita_sdk/cli/tools/terminal.py +414 -0
  29. alita_sdk/community/__init__.py +64 -8
  30. alita_sdk/community/inventory/__init__.py +224 -0
  31. alita_sdk/community/inventory/config.py +257 -0
  32. alita_sdk/community/inventory/enrichment.py +2137 -0
  33. alita_sdk/community/inventory/extractors.py +1469 -0
  34. alita_sdk/community/inventory/ingestion.py +3172 -0
  35. alita_sdk/community/inventory/knowledge_graph.py +1457 -0
  36. alita_sdk/community/inventory/parsers/__init__.py +218 -0
  37. alita_sdk/community/inventory/parsers/base.py +295 -0
  38. alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
  39. alita_sdk/community/inventory/parsers/go_parser.py +851 -0
  40. alita_sdk/community/inventory/parsers/html_parser.py +389 -0
  41. alita_sdk/community/inventory/parsers/java_parser.py +593 -0
  42. alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
  43. alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
  44. alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
  45. alita_sdk/community/inventory/parsers/python_parser.py +604 -0
  46. alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
  47. alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
  48. alita_sdk/community/inventory/parsers/text_parser.py +322 -0
  49. alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
  50. alita_sdk/community/inventory/patterns/__init__.py +61 -0
  51. alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
  52. alita_sdk/community/inventory/patterns/loader.py +348 -0
  53. alita_sdk/community/inventory/patterns/registry.py +198 -0
  54. alita_sdk/community/inventory/presets.py +535 -0
  55. alita_sdk/community/inventory/retrieval.py +1403 -0
  56. alita_sdk/community/inventory/toolkit.py +173 -0
  57. alita_sdk/community/inventory/visualize.py +1370 -0
  58. alita_sdk/configurations/bitbucket.py +94 -2
  59. alita_sdk/configurations/confluence.py +96 -1
  60. alita_sdk/configurations/gitlab.py +79 -0
  61. alita_sdk/configurations/jira.py +103 -0
  62. alita_sdk/configurations/testrail.py +88 -0
  63. alita_sdk/configurations/xray.py +93 -0
  64. alita_sdk/configurations/zephyr_enterprise.py +93 -0
  65. alita_sdk/configurations/zephyr_essential.py +75 -0
  66. alita_sdk/runtime/clients/artifact.py +1 -1
  67. alita_sdk/runtime/clients/client.py +214 -42
  68. alita_sdk/runtime/clients/mcp_discovery.py +342 -0
  69. alita_sdk/runtime/clients/mcp_manager.py +262 -0
  70. alita_sdk/runtime/clients/sandbox_client.py +373 -0
  71. alita_sdk/runtime/langchain/assistant.py +118 -30
  72. alita_sdk/runtime/langchain/constants.py +8 -1
  73. alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
  74. alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +103 -60
  75. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +4 -1
  76. alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +41 -12
  77. alita_sdk/runtime/langchain/document_loaders/AlitaTableLoader.py +1 -1
  78. alita_sdk/runtime/langchain/document_loaders/constants.py +116 -99
  79. alita_sdk/runtime/langchain/interfaces/llm_processor.py +2 -2
  80. alita_sdk/runtime/langchain/langraph_agent.py +307 -71
  81. alita_sdk/runtime/langchain/utils.py +48 -8
  82. alita_sdk/runtime/llms/preloaded.py +2 -6
  83. alita_sdk/runtime/models/mcp_models.py +61 -0
  84. alita_sdk/runtime/toolkits/__init__.py +26 -0
  85. alita_sdk/runtime/toolkits/application.py +9 -2
  86. alita_sdk/runtime/toolkits/artifact.py +18 -6
  87. alita_sdk/runtime/toolkits/datasource.py +13 -6
  88. alita_sdk/runtime/toolkits/mcp.py +780 -0
  89. alita_sdk/runtime/toolkits/planning.py +178 -0
  90. alita_sdk/runtime/toolkits/tools.py +205 -55
  91. alita_sdk/runtime/toolkits/vectorstore.py +9 -4
  92. alita_sdk/runtime/tools/__init__.py +11 -3
  93. alita_sdk/runtime/tools/application.py +7 -0
  94. alita_sdk/runtime/tools/artifact.py +225 -12
  95. alita_sdk/runtime/tools/function.py +95 -5
  96. alita_sdk/runtime/tools/graph.py +10 -4
  97. alita_sdk/runtime/tools/image_generation.py +212 -0
  98. alita_sdk/runtime/tools/llm.py +494 -102
  99. alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
  100. alita_sdk/runtime/tools/mcp_remote_tool.py +181 -0
  101. alita_sdk/runtime/tools/mcp_server_tool.py +4 -4
  102. alita_sdk/runtime/tools/planning/__init__.py +36 -0
  103. alita_sdk/runtime/tools/planning/models.py +246 -0
  104. alita_sdk/runtime/tools/planning/wrapper.py +607 -0
  105. alita_sdk/runtime/tools/router.py +2 -1
  106. alita_sdk/runtime/tools/sandbox.py +180 -79
  107. alita_sdk/runtime/tools/vectorstore.py +22 -21
  108. alita_sdk/runtime/tools/vectorstore_base.py +125 -52
  109. alita_sdk/runtime/utils/AlitaCallback.py +106 -20
  110. alita_sdk/runtime/utils/mcp_client.py +465 -0
  111. alita_sdk/runtime/utils/mcp_oauth.py +244 -0
  112. alita_sdk/runtime/utils/mcp_sse_client.py +405 -0
  113. alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
  114. alita_sdk/runtime/utils/streamlit.py +40 -13
  115. alita_sdk/runtime/utils/toolkit_utils.py +28 -9
  116. alita_sdk/runtime/utils/utils.py +12 -0
  117. alita_sdk/tools/__init__.py +77 -33
  118. alita_sdk/tools/ado/repos/__init__.py +7 -6
  119. alita_sdk/tools/ado/repos/repos_wrapper.py +11 -11
  120. alita_sdk/tools/ado/test_plan/__init__.py +7 -7
  121. alita_sdk/tools/ado/wiki/__init__.py +7 -11
  122. alita_sdk/tools/ado/wiki/ado_wrapper.py +89 -15
  123. alita_sdk/tools/ado/work_item/__init__.py +7 -11
  124. alita_sdk/tools/ado/work_item/ado_wrapper.py +17 -8
  125. alita_sdk/tools/advanced_jira_mining/__init__.py +8 -7
  126. alita_sdk/tools/aws/delta_lake/__init__.py +11 -9
  127. alita_sdk/tools/azure_ai/search/__init__.py +7 -6
  128. alita_sdk/tools/base_indexer_toolkit.py +345 -70
  129. alita_sdk/tools/bitbucket/__init__.py +9 -8
  130. alita_sdk/tools/bitbucket/api_wrapper.py +50 -6
  131. alita_sdk/tools/browser/__init__.py +4 -4
  132. alita_sdk/tools/carrier/__init__.py +4 -6
  133. alita_sdk/tools/chunkers/__init__.py +3 -1
  134. alita_sdk/tools/chunkers/sematic/json_chunker.py +1 -0
  135. alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
  136. alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
  137. alita_sdk/tools/chunkers/universal_chunker.py +270 -0
  138. alita_sdk/tools/cloud/aws/__init__.py +7 -6
  139. alita_sdk/tools/cloud/azure/__init__.py +7 -6
  140. alita_sdk/tools/cloud/gcp/__init__.py +7 -6
  141. alita_sdk/tools/cloud/k8s/__init__.py +7 -6
  142. alita_sdk/tools/code/linter/__init__.py +7 -7
  143. alita_sdk/tools/code/loaders/codesearcher.py +3 -2
  144. alita_sdk/tools/code/sonar/__init__.py +8 -7
  145. alita_sdk/tools/code_indexer_toolkit.py +199 -0
  146. alita_sdk/tools/confluence/__init__.py +9 -8
  147. alita_sdk/tools/confluence/api_wrapper.py +171 -75
  148. alita_sdk/tools/confluence/loader.py +10 -0
  149. alita_sdk/tools/custom_open_api/__init__.py +9 -4
  150. alita_sdk/tools/elastic/__init__.py +8 -7
  151. alita_sdk/tools/elitea_base.py +492 -52
  152. alita_sdk/tools/figma/__init__.py +7 -7
  153. alita_sdk/tools/figma/api_wrapper.py +2 -1
  154. alita_sdk/tools/github/__init__.py +9 -9
  155. alita_sdk/tools/github/api_wrapper.py +9 -26
  156. alita_sdk/tools/github/github_client.py +62 -2
  157. alita_sdk/tools/gitlab/__init__.py +8 -8
  158. alita_sdk/tools/gitlab/api_wrapper.py +135 -33
  159. alita_sdk/tools/gitlab_org/__init__.py +7 -8
  160. alita_sdk/tools/google/bigquery/__init__.py +11 -12
  161. alita_sdk/tools/google_places/__init__.py +8 -7
  162. alita_sdk/tools/jira/__init__.py +9 -7
  163. alita_sdk/tools/jira/api_wrapper.py +100 -52
  164. alita_sdk/tools/keycloak/__init__.py +8 -7
  165. alita_sdk/tools/localgit/local_git.py +56 -54
  166. alita_sdk/tools/memory/__init__.py +1 -1
  167. alita_sdk/tools/non_code_indexer_toolkit.py +3 -2
  168. alita_sdk/tools/ocr/__init__.py +8 -7
  169. alita_sdk/tools/openapi/__init__.py +10 -1
  170. alita_sdk/tools/pandas/__init__.py +8 -7
  171. alita_sdk/tools/postman/__init__.py +7 -8
  172. alita_sdk/tools/postman/api_wrapper.py +19 -8
  173. alita_sdk/tools/postman/postman_analysis.py +8 -1
  174. alita_sdk/tools/pptx/__init__.py +8 -9
  175. alita_sdk/tools/qtest/__init__.py +16 -11
  176. alita_sdk/tools/qtest/api_wrapper.py +1784 -88
  177. alita_sdk/tools/rally/__init__.py +7 -8
  178. alita_sdk/tools/report_portal/__init__.py +9 -7
  179. alita_sdk/tools/salesforce/__init__.py +7 -7
  180. alita_sdk/tools/servicenow/__init__.py +10 -10
  181. alita_sdk/tools/sharepoint/__init__.py +7 -6
  182. alita_sdk/tools/sharepoint/api_wrapper.py +127 -36
  183. alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
  184. alita_sdk/tools/sharepoint/utils.py +8 -2
  185. alita_sdk/tools/slack/__init__.py +7 -6
  186. alita_sdk/tools/sql/__init__.py +8 -7
  187. alita_sdk/tools/sql/api_wrapper.py +71 -23
  188. alita_sdk/tools/testio/__init__.py +7 -6
  189. alita_sdk/tools/testrail/__init__.py +8 -9
  190. alita_sdk/tools/utils/__init__.py +26 -4
  191. alita_sdk/tools/utils/content_parser.py +88 -60
  192. alita_sdk/tools/utils/text_operations.py +254 -0
  193. alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +76 -26
  194. alita_sdk/tools/xray/__init__.py +9 -7
  195. alita_sdk/tools/zephyr/__init__.py +7 -6
  196. alita_sdk/tools/zephyr_enterprise/__init__.py +8 -6
  197. alita_sdk/tools/zephyr_essential/__init__.py +7 -6
  198. alita_sdk/tools/zephyr_essential/api_wrapper.py +12 -13
  199. alita_sdk/tools/zephyr_scale/__init__.py +7 -6
  200. alita_sdk/tools/zephyr_squad/__init__.py +7 -6
  201. {alita_sdk-0.3.351.dist-info → alita_sdk-0.3.499.dist-info}/METADATA +147 -2
  202. {alita_sdk-0.3.351.dist-info → alita_sdk-0.3.499.dist-info}/RECORD +206 -130
  203. alita_sdk-0.3.499.dist-info/entry_points.txt +2 -0
  204. {alita_sdk-0.3.351.dist-info → alita_sdk-0.3.499.dist-info}/WHEEL +0 -0
  205. {alita_sdk-0.3.351.dist-info → alita_sdk-0.3.499.dist-info}/licenses/LICENSE +0 -0
  206. {alita_sdk-0.3.351.dist-info → alita_sdk-0.3.499.dist-info}/top_level.txt +0 -0
@@ -7,12 +7,14 @@ from json import JSONDecodeError
7
7
  from typing import Optional, List, Any, Dict, Callable, Generator, Literal
8
8
 
9
9
  import requests
10
+ from atlassian.errors import ApiError
10
11
  from langchain_community.document_loaders.confluence import ContentFormat
11
12
  from langchain_core.documents import Document
12
13
  from langchain_core.messages import HumanMessage
13
14
  from langchain_core.tools import ToolException
14
15
  from markdownify import markdownify
15
16
  from pydantic import Field, PrivateAttr, model_validator, create_model, SecretStr
17
+ from requests import HTTPError
16
18
  from tenacity import retry, stop_after_attempt, wait_exponential, before_sleep_log
17
19
 
18
20
  from alita_sdk.tools.non_code_indexer_toolkit import NonCodeIndexerToolkit
@@ -194,6 +196,7 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
194
196
  keep_markdown_format: Optional[bool] = True
195
197
  ocr_languages: Optional[str] = None
196
198
  keep_newlines: Optional[bool] = True
199
+ _errors: Optional[list[str]] = None
197
200
  _image_cache: ImageDescriptionCache = PrivateAttr(default_factory=ImageDescriptionCache)
198
201
 
199
202
  @model_validator(mode='before')
@@ -477,28 +480,78 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
477
480
  """Gets pages with specific label in the Confluence space."""
478
481
 
479
482
  start = 0
480
- pages_info = []
481
- for _ in range((self.max_pages + self.limit - 1) // self.limit):
482
- pages = self.client.get_all_pages_by_label(label, start=start,
483
- limit=self.limit) # , expand="body.view.value"
483
+ pages_info: List[Dict[str, Any]] = []
484
+ seen_ids: set[str] = set()
485
+
486
+ # Use a while-loop driven by unique pages collected and
487
+ # presence of additional results instead of a fixed number
488
+ # of iterations based purely on max_pages/limit.
489
+ while len(pages_info) < (self.max_pages or 0):
490
+ pages = self.client.get_all_pages_by_label(
491
+ label,
492
+ start=start,
493
+ limit=self.limit,
494
+ ) # , expand="body.view.value"
484
495
  if not pages:
485
496
  break
486
497
 
487
- pages_info += [{
488
- 'page_id': page.metadata['id'],
489
- 'page_title': page.metadata['title'],
490
- 'page_url': page.metadata['source'],
491
- 'content': page.page_content
492
- } for page in self.get_pages_by_id([page["id"] for page in pages])]
498
+ # Collect only ids we haven't processed yet to avoid
499
+ # calling get_page_by_id multiple times for the same
500
+ # Confluence page.
501
+ new_ids: List[str] = []
502
+ for p in pages:
503
+ page_id = p["id"] if isinstance(p, dict) else getattr(p, "id", None)
504
+ if page_id is None:
505
+ continue
506
+ if page_id in seen_ids:
507
+ continue
508
+ seen_ids.add(page_id)
509
+ new_ids.append(page_id)
510
+
511
+ if new_ids:
512
+ for page in self.get_pages_by_id(new_ids):
513
+ meta = getattr(page, "metadata", {}) or {}
514
+ page_id = meta.get("id")
515
+ page_title = meta.get("title")
516
+ page_url = meta.get("source")
517
+ content = getattr(page, "page_content", None)
518
+
519
+ if page_id is None:
520
+ continue
521
+
522
+ pages_info.append(
523
+ {
524
+ "page_id": page_id,
525
+ "page_title": page_title,
526
+ "page_url": page_url,
527
+ "content": content,
528
+ }
529
+ )
530
+
531
+ # Respect max_pages on unique pages collected.
532
+ if len(pages_info) >= (self.max_pages or 0):
533
+ break
534
+
535
+ # Advance the offset by the requested page size.
493
536
  start += self.limit
494
- return pages_info
537
+
538
+ # Defensive break: if the API returns fewer items than
539
+ # requested, there are likely no more pages to fetch.
540
+ if len(pages) < self.limit:
541
+ break
542
+
543
+ # Slice as an extra safety net in case of any race conditions
544
+ # around the max_pages guard in the loop above.
545
+ return pages_info[: (self.max_pages or len(pages_info))]
495
546
 
496
547
  def is_public_page(self, page: dict) -> bool:
497
548
  """Check if a page is publicly accessible."""
498
549
  restrictions = self.client.get_all_restrictions_for_content(page["id"])
499
550
 
500
551
  return (
501
- page["status"] == "current"
552
+ (page["status"] == "current"
553
+ # allow user to see archived content if needed
554
+ or page["status"] == "archived")
502
555
  and not restrictions["read"]["restrictions"]["user"]["results"]
503
556
  and not restrictions["read"]["restrictions"]["group"]["results"]
504
557
  )
@@ -518,18 +571,35 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
518
571
  ),
519
572
  before_sleep=before_sleep_log(logger, logging.WARNING),
520
573
  )(self.client.get_page_by_id)
521
- page = get_page(
522
- page_id=page_id, expand=f"{self.content_format.value},version"
523
- )
524
- if not self.include_restricted_content and not self.is_public_page(page):
525
- continue
574
+ try:
575
+ page = get_page(
576
+ page_id=page_id, expand=f"{self.content_format.value},version"
577
+ )
578
+ except (ApiError, HTTPError) as e:
579
+ logger.error(f"Error fetching page with ID {page_id}: {e}")
580
+ page_content_temp = f"Confluence API Error: cannot fetch the page with ID {page_id}: {e}"
581
+ # store errors
582
+ if self._errors is None:
583
+ self._errors = []
584
+ self._errors.append(page_content_temp)
585
+ return Document(page_content=page_content_temp,
586
+ metadata={})
587
+ # TODO: update on toolkit advanced settings level as a separate feature
588
+ # if not self.include_restricted_content and not self.is_public_page(page):
589
+ # continue
526
590
  yield self.process_page(page, skip_images)
527
591
 
592
+ def _log_errors(self):
593
+ """ Log errors encountered during toolkit execution. """
594
+ if self._errors:
595
+ logger.info(f"Errors encountered during toolkit execution: {self._errors}")
596
+
528
597
  def read_page_by_id(self, page_id: str, skip_images: bool = False):
529
598
  """Reads a page by its id in the Confluence space. If id is not available, but there is a title - use get_page_id first."""
530
599
  result = list(self.get_pages_by_id([page_id], skip_images))
531
600
  if not result:
532
- "Page not found"
601
+ return f"Pages not found. Errors: {self._errors}" if self._errors \
602
+ else "Pages not found or you do not have access to them."
533
603
  return result[0].page_content
534
604
  # return self._strip_base64_images(result[0].page_content) if skip_images else result[0].page_content
535
605
 
@@ -815,6 +885,10 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
815
885
  from .loader import AlitaConfluenceLoader
816
886
  from copy import copy
817
887
  content_format = kwargs.get('content_format', 'view').lower()
888
+
889
+ self._index_include_attachments = kwargs.get('include_attachments', False)
890
+ self._include_extensions = kwargs.get('include_extensions', [])
891
+ self._skip_extensions = kwargs.get('skip_extensions', [])
818
892
  base_params = {
819
893
  'url': self.base_url,
820
894
  'space_key': self.space,
@@ -847,65 +921,79 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
847
921
 
848
922
  def _process_document(self, document: Document) -> Generator[Document, None, None]:
849
923
  try:
850
- page_id = document.metadata.get('id')
851
- attachments = self.client.get_attachments_from_content(page_id)
852
- if not attachments or not attachments.get('results'):
853
- return f"No attachments found for page ID {page_id}."
854
-
855
- # Get attachment history for created/updated info
856
- history_map = {}
857
- for attachment in attachments['results']:
858
- try:
859
- hist = self.client.history(attachment['id'])
860
- history_map[attachment['id']] = hist
861
- except Exception as e:
862
- logger.warning(f"Failed to fetch history for attachment {attachment.get('title', '')}: {str(e)}")
863
- history_map[attachment['id']] = None
864
-
865
- import re
866
- for attachment in attachments['results']:
867
- title = attachment.get('title', '')
868
- file_ext = title.lower().split('.')[-1] if '.' in title else ''
869
-
870
- media_type = attachment.get('metadata', {}).get('mediaType', '')
871
- # Core metadata extraction with history
872
- hist = history_map.get(attachment['id']) or {}
873
- created_by = hist.get('createdBy', {}).get('displayName', '') if hist else attachment.get('creator', {}).get('displayName', '')
874
- created_date = hist.get('createdDate', '') if hist else attachment.get('created', '')
875
- last_updated = hist.get('lastUpdated', {}).get('when', '') if hist else ''
924
+ if self._index_include_attachments:
925
+ page_id = document.metadata.get('id')
926
+ attachments = self.client.get_attachments_from_content(page_id)
927
+ if not attachments or not attachments.get('results'):
928
+ return f"No attachments found for page ID {page_id}."
929
+
930
+ # Get attachment history for created/updated info
931
+ history_map = {}
932
+ for attachment in attachments['results']:
933
+ try:
934
+ hist = self.client.history(attachment['id'])
935
+ history_map[attachment['id']] = hist
936
+ except Exception as e:
937
+ logger.warning(f"Failed to fetch history for attachment {attachment.get('title', '')}: {str(e)}")
938
+ history_map[attachment['id']] = None
939
+
940
+ import re
941
+ for attachment in attachments['results']:
942
+ title = attachment.get('title', '')
943
+ file_ext = title.lower().split('.')[-1] if '.' in title else ''
944
+
945
+ # Re-verify extension filters
946
+ # Check if file should be skipped based on skip_extensions
947
+ if any(re.match(re.escape(pattern).replace(r'\*', '.*') + '$', title, re.IGNORECASE)
948
+ for pattern in self._skip_extensions):
949
+ continue
950
+
951
+ # Check if file should be included based on include_extensions
952
+ # If include_extensions is empty, process all files (that weren't skipped)
953
+ if self._include_extensions and not (
954
+ any(re.match(re.escape(pattern).replace(r'\*', '.*') + '$', title, re.IGNORECASE)
955
+ for pattern in self._include_extensions)):
956
+ continue
957
+
958
+ media_type = attachment.get('metadata', {}).get('mediaType', '')
959
+ # Core metadata extraction with history
960
+ hist = history_map.get(attachment['id']) or {}
961
+ created_by = hist.get('createdBy', {}).get('displayName', '') if hist else attachment.get('creator', {}).get('displayName', '')
962
+ created_date = hist.get('createdDate', '') if hist else attachment.get('created', '')
963
+ last_updated = hist.get('lastUpdated', {}).get('when', '') if hist else ''
964
+
965
+ metadata = {
966
+ 'name': title,
967
+ 'size': attachment.get('extensions', {}).get('fileSize', None),
968
+ 'creator': created_by,
969
+ 'created': created_date,
970
+ 'updated': last_updated,
971
+ 'media_type': media_type,
972
+ 'labels': [label['name'] for label in
973
+ attachment.get('metadata', {}).get('labels', {}).get('results', [])],
974
+ 'download_url': self.base_url.rstrip('/') + attachment['_links']['download'] if attachment.get(
975
+ '_links', {}).get('download') else None
976
+ }
876
977
 
877
- metadata = {
878
- 'name': title,
879
- 'size': attachment.get('extensions', {}).get('fileSize', None),
880
- 'creator': created_by,
881
- 'created': created_date,
882
- 'updated': last_updated,
883
- 'media_type': media_type,
884
- 'labels': [label['name'] for label in
885
- attachment.get('metadata', {}).get('labels', {}).get('results', [])],
886
- 'download_url': self.base_url.rstrip('/') + attachment['_links']['download'] if attachment.get(
887
- '_links', {}).get('download') else None
888
- }
978
+ download_url = self.base_url.rstrip('/') + attachment['_links']['download']
889
979
 
890
- download_url = self.base_url.rstrip('/') + attachment['_links']['download']
980
+ try:
981
+ resp = self.client.request(method="GET", path=download_url[len(self.base_url):], advanced_mode=True)
982
+ if resp.status_code == 200:
983
+ content = resp.content
984
+ else:
985
+ content = f"[Failed to download {download_url}: HTTP status code {resp.status_code}]"
986
+ except Exception as e:
987
+ content = f"[Error downloading content: {str(e)}]"
891
988
 
892
- try:
893
- resp = self.client.request(method="GET", path=download_url[len(self.base_url):], advanced_mode=True)
894
- if resp.status_code == 200:
895
- content = resp.content
989
+ if isinstance(content, str):
990
+ yield Document(page_content=content, metadata=metadata)
896
991
  else:
897
- content = f"[Failed to download {download_url}: HTTP status code {resp.status_code}]"
898
- except Exception as e:
899
- content = f"[Error downloading content: {str(e)}]"
900
-
901
- if isinstance(content, str):
902
- yield Document(page_content=content, metadata=metadata)
903
- else:
904
- yield Document(page_content="", metadata={
905
- **metadata,
906
- IndexerKeywords.CONTENT_FILE_NAME.value: f".{file_ext}",
907
- IndexerKeywords.CONTENT_IN_BYTES.value: content
908
- })
992
+ yield Document(page_content="", metadata={
993
+ **metadata,
994
+ IndexerKeywords.CONTENT_FILE_NAME.value: f".{file_ext}",
995
+ IndexerKeywords.CONTENT_IN_BYTES.value: content
996
+ })
909
997
  except Exception as e:
910
998
  yield from ()
911
999
 
@@ -1648,8 +1736,15 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
1648
1736
  "include_restricted_content": (Optional[bool], Field(description="Include restricted content.", default=False)),
1649
1737
  "include_archived_content": (Optional[bool], Field(description="Include archived content.", default=False)),
1650
1738
  "include_attachments": (Optional[bool], Field(description="Include attachments.", default=False)),
1739
+ 'include_extensions': (Optional[List[str]], Field(
1740
+ description="List of file extensions to include when processing attachments: i.e. ['*.png', '*.jpg']. "
1741
+ "If empty, all files will be processed (except skip_extensions).",
1742
+ default=[])),
1743
+ 'skip_extensions': (Optional[List[str]], Field(
1744
+ description="List of file extensions to skip when processing attachments: i.e. ['*.png', '*.jpg']",
1745
+ default=[])),
1651
1746
  "include_comments": (Optional[bool], Field(description="Include comments.", default=False)),
1652
- "include_labels": (Optional[bool], Field(description="Include labels.", default=True)),
1747
+ "include_labels": (Optional[bool], Field(description="Include labels.", default=False)),
1653
1748
  "ocr_languages": (Optional[str], Field(description="OCR languages for processing attachments.", default='eng')),
1654
1749
  "keep_markdown_format": (Optional[bool], Field(description="Keep the markdown format.", default=True)),
1655
1750
  "keep_newlines": (Optional[bool], Field(description="Keep newlines in the content.", default=True)),
@@ -1773,4 +1868,5 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
1773
1868
  "description": self.get_page_attachments.__doc__,
1774
1869
  "args_schema": GetPageAttachmentsInput,
1775
1870
  }
1776
- ]
1871
+ ]
1872
+
@@ -3,6 +3,7 @@ from typing import Optional, List
3
3
  from logging import getLogger
4
4
 
5
5
  import requests
6
+ from langchain_core.documents import Document
6
7
 
7
8
  logger = getLogger(__name__)
8
9
  from PIL import Image
@@ -193,6 +194,15 @@ class AlitaConfluenceLoader(ConfluenceLoader):
193
194
  else:
194
195
  return super().process_image(link, ocr_languages)
195
196
 
197
+ def process_page(self, page: dict, include_attachments: bool, include_comments: bool, include_labels: bool,
198
+ content_format: ContentFormat, ocr_languages: Optional[str] = None,
199
+ keep_markdown_format: Optional[bool] = False, keep_newlines: bool = False) -> Document:
200
+ if not page.get("title"):
201
+ # if 'include_restricted_content' set to True, draft pages are loaded and can have no title
202
+ page["title"] = "Untitled"
203
+ return super().process_page(page, include_attachments, include_comments, include_labels, content_format,
204
+ ocr_languages, keep_markdown_format, keep_newlines)
205
+
196
206
  # TODO review usage
197
207
  # def process_svg(
198
208
  # self,
@@ -5,7 +5,7 @@ from pydantic import create_model, BaseModel, ConfigDict, Field
5
5
 
6
6
  from .api_wrapper import OpenApiWrapper
7
7
  from ..base.tool import BaseAction
8
- from ..utils import clean_string, TOOLKIT_SPLITTER
8
+ from ..utils import clean_string
9
9
 
10
10
  name = "openapi"
11
11
 
@@ -43,14 +43,19 @@ class OpenApiToolkit(BaseToolkit):
43
43
  openapi_api_wrapper = OpenApiWrapper(**kwargs)
44
44
  available_tools = openapi_api_wrapper.get_available_tools()
45
45
  tools = []
46
- prefix = clean_string(toolkit_name + TOOLKIT_SPLITTER) if toolkit_name else ''
46
+ # Use clean toolkit name for context (max 1000 chars in description)
47
+ toolkit_context = f" [Toolkit: {clean_string(toolkit_name)}]" if toolkit_name else ''
47
48
  for tool in available_tools:
48
49
  if selected_tools and tool["name"] not in selected_tools:
49
50
  continue
51
+ # Add toolkit context to description with character limit
52
+ description = tool["description"]
53
+ if toolkit_context and len(description + toolkit_context) <= 1000:
54
+ description = description + toolkit_context
50
55
  tools.append(BaseAction(
51
56
  api_wrapper=openapi_api_wrapper,
52
- name=prefix + tool["name"],
53
- description=tool["description"],
57
+ name=tool["name"],
58
+ description=description,
54
59
  args_schema=tool["args_schema"]
55
60
  ))
56
61
  return cls(tools=tools)
@@ -5,7 +5,7 @@ from pydantic import BaseModel, ConfigDict, create_model, Field, SecretStr
5
5
 
6
6
  from .api_wrapper import ELITEAElasticApiWrapper
7
7
  from ..base.tool import BaseAction
8
- from ..utils import clean_string, TOOLKIT_SPLITTER, get_max_toolkit_length
8
+ from ..utils import clean_string, get_max_toolkit_length
9
9
 
10
10
  name = "elastic"
11
11
 
@@ -19,15 +19,13 @@ def get_tools(tool):
19
19
 
20
20
  class ElasticToolkit(BaseToolkit):
21
21
  tools: list[BaseTool] = []
22
- toolkit_max_length: int = 0
23
22
 
24
23
  @staticmethod
25
24
  def toolkit_config_schema() -> BaseModel:
26
25
  selected_tools = {x['name']: x['args_schema'].schema() for x in ELITEAElasticApiWrapper.model_construct().get_available_tools()}
27
- ElasticToolkit.toolkit_max_length = get_max_toolkit_length(selected_tools)
28
26
  return create_model(
29
27
  name,
30
- url=(str, Field(default=None, title="Elasticsearch URL", description="Elasticsearch URL", json_schema_extra={'toolkit_name': True, 'max_toolkit_length': ElasticToolkit.toolkit_max_length})),
28
+ url=(str, Field(default=None, title="Elasticsearch URL", description="Elasticsearch URL", json_schema_extra={'toolkit_name': True})),
31
29
  api_key=(
32
30
  Optional[SecretStr],
33
31
  Field(
@@ -48,14 +46,17 @@ class ElasticToolkit(BaseToolkit):
48
46
  elastic_api_wrapper = ELITEAElasticApiWrapper(**kwargs)
49
47
  available_tools = elastic_api_wrapper.get_available_tools()
50
48
  tools = []
51
- prefix = clean_string(toolkit_name, ElasticToolkit.toolkit_max_length) + TOOLKIT_SPLITTER if toolkit_name else ''
52
49
  for tool in available_tools:
53
50
  if selected_tools and tool["name"] not in selected_tools:
54
51
  continue
52
+ description = tool["description"]
53
+ if toolkit_name:
54
+ description = f"Toolkit: {toolkit_name}\n{description}"
55
+ description = description[:1000]
55
56
  tools.append(BaseAction(
56
57
  api_wrapper=elastic_api_wrapper,
57
- name=prefix + tool["name"],
58
- description=tool["description"],
58
+ name=tool["name"],
59
+ description=description,
59
60
  args_schema=tool["args_schema"]
60
61
  ))
61
62
  return cls(tools=tools)