alita-sdk 0.3.351__py3-none-any.whl → 0.3.499__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (206)
  1. alita_sdk/cli/__init__.py +10 -0
  2. alita_sdk/cli/__main__.py +17 -0
  3. alita_sdk/cli/agent/__init__.py +5 -0
  4. alita_sdk/cli/agent/default.py +258 -0
  5. alita_sdk/cli/agent_executor.py +155 -0
  6. alita_sdk/cli/agent_loader.py +215 -0
  7. alita_sdk/cli/agent_ui.py +228 -0
  8. alita_sdk/cli/agents.py +3601 -0
  9. alita_sdk/cli/callbacks.py +647 -0
  10. alita_sdk/cli/cli.py +168 -0
  11. alita_sdk/cli/config.py +306 -0
  12. alita_sdk/cli/context/__init__.py +30 -0
  13. alita_sdk/cli/context/cleanup.py +198 -0
  14. alita_sdk/cli/context/manager.py +731 -0
  15. alita_sdk/cli/context/message.py +285 -0
  16. alita_sdk/cli/context/strategies.py +289 -0
  17. alita_sdk/cli/context/token_estimation.py +127 -0
  18. alita_sdk/cli/formatting.py +182 -0
  19. alita_sdk/cli/input_handler.py +419 -0
  20. alita_sdk/cli/inventory.py +1256 -0
  21. alita_sdk/cli/mcp_loader.py +315 -0
  22. alita_sdk/cli/toolkit.py +327 -0
  23. alita_sdk/cli/toolkit_loader.py +85 -0
  24. alita_sdk/cli/tools/__init__.py +43 -0
  25. alita_sdk/cli/tools/approval.py +224 -0
  26. alita_sdk/cli/tools/filesystem.py +1751 -0
  27. alita_sdk/cli/tools/planning.py +389 -0
  28. alita_sdk/cli/tools/terminal.py +414 -0
  29. alita_sdk/community/__init__.py +64 -8
  30. alita_sdk/community/inventory/__init__.py +224 -0
  31. alita_sdk/community/inventory/config.py +257 -0
  32. alita_sdk/community/inventory/enrichment.py +2137 -0
  33. alita_sdk/community/inventory/extractors.py +1469 -0
  34. alita_sdk/community/inventory/ingestion.py +3172 -0
  35. alita_sdk/community/inventory/knowledge_graph.py +1457 -0
  36. alita_sdk/community/inventory/parsers/__init__.py +218 -0
  37. alita_sdk/community/inventory/parsers/base.py +295 -0
  38. alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
  39. alita_sdk/community/inventory/parsers/go_parser.py +851 -0
  40. alita_sdk/community/inventory/parsers/html_parser.py +389 -0
  41. alita_sdk/community/inventory/parsers/java_parser.py +593 -0
  42. alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
  43. alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
  44. alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
  45. alita_sdk/community/inventory/parsers/python_parser.py +604 -0
  46. alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
  47. alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
  48. alita_sdk/community/inventory/parsers/text_parser.py +322 -0
  49. alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
  50. alita_sdk/community/inventory/patterns/__init__.py +61 -0
  51. alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
  52. alita_sdk/community/inventory/patterns/loader.py +348 -0
  53. alita_sdk/community/inventory/patterns/registry.py +198 -0
  54. alita_sdk/community/inventory/presets.py +535 -0
  55. alita_sdk/community/inventory/retrieval.py +1403 -0
  56. alita_sdk/community/inventory/toolkit.py +173 -0
  57. alita_sdk/community/inventory/visualize.py +1370 -0
  58. alita_sdk/configurations/bitbucket.py +94 -2
  59. alita_sdk/configurations/confluence.py +96 -1
  60. alita_sdk/configurations/gitlab.py +79 -0
  61. alita_sdk/configurations/jira.py +103 -0
  62. alita_sdk/configurations/testrail.py +88 -0
  63. alita_sdk/configurations/xray.py +93 -0
  64. alita_sdk/configurations/zephyr_enterprise.py +93 -0
  65. alita_sdk/configurations/zephyr_essential.py +75 -0
  66. alita_sdk/runtime/clients/artifact.py +1 -1
  67. alita_sdk/runtime/clients/client.py +214 -42
  68. alita_sdk/runtime/clients/mcp_discovery.py +342 -0
  69. alita_sdk/runtime/clients/mcp_manager.py +262 -0
  70. alita_sdk/runtime/clients/sandbox_client.py +373 -0
  71. alita_sdk/runtime/langchain/assistant.py +118 -30
  72. alita_sdk/runtime/langchain/constants.py +8 -1
  73. alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
  74. alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +103 -60
  75. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +4 -1
  76. alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +41 -12
  77. alita_sdk/runtime/langchain/document_loaders/AlitaTableLoader.py +1 -1
  78. alita_sdk/runtime/langchain/document_loaders/constants.py +116 -99
  79. alita_sdk/runtime/langchain/interfaces/llm_processor.py +2 -2
  80. alita_sdk/runtime/langchain/langraph_agent.py +307 -71
  81. alita_sdk/runtime/langchain/utils.py +48 -8
  82. alita_sdk/runtime/llms/preloaded.py +2 -6
  83. alita_sdk/runtime/models/mcp_models.py +61 -0
  84. alita_sdk/runtime/toolkits/__init__.py +26 -0
  85. alita_sdk/runtime/toolkits/application.py +9 -2
  86. alita_sdk/runtime/toolkits/artifact.py +18 -6
  87. alita_sdk/runtime/toolkits/datasource.py +13 -6
  88. alita_sdk/runtime/toolkits/mcp.py +780 -0
  89. alita_sdk/runtime/toolkits/planning.py +178 -0
  90. alita_sdk/runtime/toolkits/tools.py +205 -55
  91. alita_sdk/runtime/toolkits/vectorstore.py +9 -4
  92. alita_sdk/runtime/tools/__init__.py +11 -3
  93. alita_sdk/runtime/tools/application.py +7 -0
  94. alita_sdk/runtime/tools/artifact.py +225 -12
  95. alita_sdk/runtime/tools/function.py +95 -5
  96. alita_sdk/runtime/tools/graph.py +10 -4
  97. alita_sdk/runtime/tools/image_generation.py +212 -0
  98. alita_sdk/runtime/tools/llm.py +494 -102
  99. alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
  100. alita_sdk/runtime/tools/mcp_remote_tool.py +181 -0
  101. alita_sdk/runtime/tools/mcp_server_tool.py +4 -4
  102. alita_sdk/runtime/tools/planning/__init__.py +36 -0
  103. alita_sdk/runtime/tools/planning/models.py +246 -0
  104. alita_sdk/runtime/tools/planning/wrapper.py +607 -0
  105. alita_sdk/runtime/tools/router.py +2 -1
  106. alita_sdk/runtime/tools/sandbox.py +180 -79
  107. alita_sdk/runtime/tools/vectorstore.py +22 -21
  108. alita_sdk/runtime/tools/vectorstore_base.py +125 -52
  109. alita_sdk/runtime/utils/AlitaCallback.py +106 -20
  110. alita_sdk/runtime/utils/mcp_client.py +465 -0
  111. alita_sdk/runtime/utils/mcp_oauth.py +244 -0
  112. alita_sdk/runtime/utils/mcp_sse_client.py +405 -0
  113. alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
  114. alita_sdk/runtime/utils/streamlit.py +40 -13
  115. alita_sdk/runtime/utils/toolkit_utils.py +28 -9
  116. alita_sdk/runtime/utils/utils.py +12 -0
  117. alita_sdk/tools/__init__.py +77 -33
  118. alita_sdk/tools/ado/repos/__init__.py +7 -6
  119. alita_sdk/tools/ado/repos/repos_wrapper.py +11 -11
  120. alita_sdk/tools/ado/test_plan/__init__.py +7 -7
  121. alita_sdk/tools/ado/wiki/__init__.py +7 -11
  122. alita_sdk/tools/ado/wiki/ado_wrapper.py +89 -15
  123. alita_sdk/tools/ado/work_item/__init__.py +7 -11
  124. alita_sdk/tools/ado/work_item/ado_wrapper.py +17 -8
  125. alita_sdk/tools/advanced_jira_mining/__init__.py +8 -7
  126. alita_sdk/tools/aws/delta_lake/__init__.py +11 -9
  127. alita_sdk/tools/azure_ai/search/__init__.py +7 -6
  128. alita_sdk/tools/base_indexer_toolkit.py +345 -70
  129. alita_sdk/tools/bitbucket/__init__.py +9 -8
  130. alita_sdk/tools/bitbucket/api_wrapper.py +50 -6
  131. alita_sdk/tools/browser/__init__.py +4 -4
  132. alita_sdk/tools/carrier/__init__.py +4 -6
  133. alita_sdk/tools/chunkers/__init__.py +3 -1
  134. alita_sdk/tools/chunkers/sematic/json_chunker.py +1 -0
  135. alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
  136. alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
  137. alita_sdk/tools/chunkers/universal_chunker.py +270 -0
  138. alita_sdk/tools/cloud/aws/__init__.py +7 -6
  139. alita_sdk/tools/cloud/azure/__init__.py +7 -6
  140. alita_sdk/tools/cloud/gcp/__init__.py +7 -6
  141. alita_sdk/tools/cloud/k8s/__init__.py +7 -6
  142. alita_sdk/tools/code/linter/__init__.py +7 -7
  143. alita_sdk/tools/code/loaders/codesearcher.py +3 -2
  144. alita_sdk/tools/code/sonar/__init__.py +8 -7
  145. alita_sdk/tools/code_indexer_toolkit.py +199 -0
  146. alita_sdk/tools/confluence/__init__.py +9 -8
  147. alita_sdk/tools/confluence/api_wrapper.py +171 -75
  148. alita_sdk/tools/confluence/loader.py +10 -0
  149. alita_sdk/tools/custom_open_api/__init__.py +9 -4
  150. alita_sdk/tools/elastic/__init__.py +8 -7
  151. alita_sdk/tools/elitea_base.py +492 -52
  152. alita_sdk/tools/figma/__init__.py +7 -7
  153. alita_sdk/tools/figma/api_wrapper.py +2 -1
  154. alita_sdk/tools/github/__init__.py +9 -9
  155. alita_sdk/tools/github/api_wrapper.py +9 -26
  156. alita_sdk/tools/github/github_client.py +62 -2
  157. alita_sdk/tools/gitlab/__init__.py +8 -8
  158. alita_sdk/tools/gitlab/api_wrapper.py +135 -33
  159. alita_sdk/tools/gitlab_org/__init__.py +7 -8
  160. alita_sdk/tools/google/bigquery/__init__.py +11 -12
  161. alita_sdk/tools/google_places/__init__.py +8 -7
  162. alita_sdk/tools/jira/__init__.py +9 -7
  163. alita_sdk/tools/jira/api_wrapper.py +100 -52
  164. alita_sdk/tools/keycloak/__init__.py +8 -7
  165. alita_sdk/tools/localgit/local_git.py +56 -54
  166. alita_sdk/tools/memory/__init__.py +1 -1
  167. alita_sdk/tools/non_code_indexer_toolkit.py +3 -2
  168. alita_sdk/tools/ocr/__init__.py +8 -7
  169. alita_sdk/tools/openapi/__init__.py +10 -1
  170. alita_sdk/tools/pandas/__init__.py +8 -7
  171. alita_sdk/tools/postman/__init__.py +7 -8
  172. alita_sdk/tools/postman/api_wrapper.py +19 -8
  173. alita_sdk/tools/postman/postman_analysis.py +8 -1
  174. alita_sdk/tools/pptx/__init__.py +8 -9
  175. alita_sdk/tools/qtest/__init__.py +16 -11
  176. alita_sdk/tools/qtest/api_wrapper.py +1784 -88
  177. alita_sdk/tools/rally/__init__.py +7 -8
  178. alita_sdk/tools/report_portal/__init__.py +9 -7
  179. alita_sdk/tools/salesforce/__init__.py +7 -7
  180. alita_sdk/tools/servicenow/__init__.py +10 -10
  181. alita_sdk/tools/sharepoint/__init__.py +7 -6
  182. alita_sdk/tools/sharepoint/api_wrapper.py +127 -36
  183. alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
  184. alita_sdk/tools/sharepoint/utils.py +8 -2
  185. alita_sdk/tools/slack/__init__.py +7 -6
  186. alita_sdk/tools/sql/__init__.py +8 -7
  187. alita_sdk/tools/sql/api_wrapper.py +71 -23
  188. alita_sdk/tools/testio/__init__.py +7 -6
  189. alita_sdk/tools/testrail/__init__.py +8 -9
  190. alita_sdk/tools/utils/__init__.py +26 -4
  191. alita_sdk/tools/utils/content_parser.py +88 -60
  192. alita_sdk/tools/utils/text_operations.py +254 -0
  193. alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +76 -26
  194. alita_sdk/tools/xray/__init__.py +9 -7
  195. alita_sdk/tools/zephyr/__init__.py +7 -6
  196. alita_sdk/tools/zephyr_enterprise/__init__.py +8 -6
  197. alita_sdk/tools/zephyr_essential/__init__.py +7 -6
  198. alita_sdk/tools/zephyr_essential/api_wrapper.py +12 -13
  199. alita_sdk/tools/zephyr_scale/__init__.py +7 -6
  200. alita_sdk/tools/zephyr_squad/__init__.py +7 -6
  201. {alita_sdk-0.3.351.dist-info → alita_sdk-0.3.499.dist-info}/METADATA +147 -2
  202. {alita_sdk-0.3.351.dist-info → alita_sdk-0.3.499.dist-info}/RECORD +206 -130
  203. alita_sdk-0.3.499.dist-info/entry_points.txt +2 -0
  204. {alita_sdk-0.3.351.dist-info → alita_sdk-0.3.499.dist-info}/WHEEL +0 -0
  205. {alita_sdk-0.3.351.dist-info → alita_sdk-0.3.499.dist-info}/licenses/LICENSE +0 -0
  206. {alita_sdk-0.3.351.dist-info → alita_sdk-0.3.499.dist-info}/top_level.txt +0 -0
--- a/alita_sdk/tools/bitbucket/__init__.py
+++ b/alita_sdk/tools/bitbucket/__init__.py
@@ -9,7 +9,7 @@ from pydantic import BaseModel, Field, ConfigDict, create_model
 
 from ..base.tool import BaseAction
 from ..elitea_base import filter_missconfigured_index_tools
-from ..utils import clean_string, TOOLKIT_SPLITTER, get_max_toolkit_length, check_connection_response
+from ..utils import clean_string, get_max_toolkit_length, check_connection_response
 from ...configurations.bitbucket import BitbucketConfiguration
 from ...configurations.pgvector import PgVectorConfiguration
 import requests
@@ -38,17 +38,15 @@ def get_tools(tool):
 
 class AlitaBitbucketToolkit(BaseToolkit):
     tools: List[BaseTool] = []
-    toolkit_max_length: int = 0
 
     @staticmethod
     def toolkit_config_schema() -> BaseModel:
         selected_tools = {x['name']: x['args_schema'].schema() for x in
                           BitbucketAPIWrapper.model_construct().get_available_tools()}
-        AlitaBitbucketToolkit.toolkit_max_length = get_max_toolkit_length(selected_tools)
         m = create_model(
             name,
-            project=(str, Field(description="Project/Workspace", json_schema_extra={'configuration': True})),
-            repository=(str, Field(description="Repository", json_schema_extra={'max_toolkit_length': AlitaBitbucketToolkit.toolkit_max_length, 'configuration': True})),
+            project=(str, Field(description="Project/Workspace")),
+            repository=(str, Field(description="Repository")),
             branch=(str, Field(description="Main branch", default="main")),
             cloud=(Optional[bool], Field(description="Hosting Option", default=None)),
             bitbucket_configuration=(BitbucketConfiguration, Field(description="Bitbucket Configuration", json_schema_extra={'configuration_types': ['bitbucket']})),
@@ -99,16 +97,19 @@ class AlitaBitbucketToolkit(BaseToolkit):
         }
         bitbucket_api_wrapper = BitbucketAPIWrapper(**wrapper_payload)
         available_tools: List[Dict] = bitbucket_api_wrapper.get_available_tools()
-        prefix = clean_string(toolkit_name, cls.toolkit_max_length) + TOOLKIT_SPLITTER if toolkit_name else ''
         tools = []
         for tool in available_tools:
             if selected_tools:
                 if tool['name'] not in selected_tools:
                     continue
+            description = tool["description"] + f"\nrepo: {bitbucket_api_wrapper.repository}"
+            if toolkit_name:
+                description = f"{description}\nToolkit: {toolkit_name}"
+            description = description[:1000]
             tools.append(BaseAction(
                 api_wrapper=bitbucket_api_wrapper,
-                name=prefix + tool["name"],
-                description=tool["description"] + f"\nrepo: {bitbucket_api_wrapper.repository}",
+                name=tool["name"],
+                description=description,
                 args_schema=tool["args_schema"]
             ))
         return cls(tools=tools)
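
Note: this release drops the per-tool name prefix (clean_string(toolkit_name) + TOOLKIT_SPLITTER) in favor of appending the toolkit name to each tool's description and capping it at 1000 characters; the browser and carrier toolkits below get the same treatment. A minimal sketch of the resulting description, with made-up tool and toolkit names:

    tool = {"name": "create_file", "description": "Create a file in the repository."}
    toolkit_name = "my_bitbucket"

    # 0.3.351 exposed the tool as "<prefix>create_file"; 0.3.499 keeps the bare name
    description = tool["description"] + "\nrepo: my-repo"
    if toolkit_name:
        description = f"{description}\nToolkit: {toolkit_name}"
    description = description[:1000]  # hard cap applied to every tool description
    print(description)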
--- a/alita_sdk/tools/bitbucket/api_wrapper.py
+++ b/alita_sdk/tools/bitbucket/api_wrapper.py
@@ -11,7 +11,9 @@ from .bitbucket_constants import create_pr_data
 from .cloud_api_wrapper import BitbucketCloudApi, BitbucketServerApi
 from pydantic.fields import PrivateAttr
 
-from ..elitea_base import BaseCodeToolApiWrapper
+from ..code_indexer_toolkit import CodeIndexerToolkit
+from ..utils.available_tools_decorator import extend_with_parent_available_tools
+from ..elitea_base import extend_with_file_operations
 
 logger = logging.getLogger(__name__)
 
@@ -117,7 +119,7 @@ CommentOnIssueModel = create_model(
 )
 
 
-class BitbucketAPIWrapper(BaseCodeToolApiWrapper):
+class BitbucketAPIWrapper(CodeIndexerToolkit):
     """Wrapper for Bitbucket API."""
 
     _bitbucket: Any = PrivateAttr()
@@ -167,7 +169,7 @@ class BitbucketAPIWrapper(BaseCodeToolApiWrapper):
             repository=values['repository']
         )
         cls._active_branch = values.get('branch')
-        return values
+        return super().validate_toolkit(values)
 
     def set_active_branch(self, branch_name: str) -> str:
         """Set the active branch for the bot."""
@@ -359,12 +361,15 @@ class BitbucketAPIWrapper(BaseCodeToolApiWrapper):
         # except Exception as e:
         #     raise ToolException(f"Can't extract file commit hash (`{file_path}`) due to error:\n{str(e)}")
 
-    def _read_file(self, file_path: str, branch: str) -> str:
+    def _read_file(self, file_path: str, branch: str, **kwargs) -> str:
         """
-        Reads a file from the gitlab repo
+        Reads a file from the bitbucket repo with optional partial read support.
+
         Parameters:
             file_path(str): the file path
             branch(str): branch name (by default: active_branch)
+            **kwargs: Additional parameters (offset, limit, head, tail) - currently ignored,
+                      partial read handled client-side by base class methods
         Returns:
             str: The file decoded as a string
         """
@@ -398,7 +403,46 @@ class BitbucketAPIWrapper(BaseCodeToolApiWrapper):
             return self._read_file(file_path, branch)
         except Exception as e:
             return f"Failed to read file {file_path}: {str(e)}"
+
+    def _write_file(
+        self,
+        file_path: str,
+        content: str,
+        branch: str = None,
+        commit_message: str = None
+    ) -> str:
+        """
+        Write content to a file (create or update).
+
+        Parameters:
+            file_path: Path to the file
+            content: New file content
+            branch: Branch name (uses active branch if None)
+            commit_message: Commit message (not used by Bitbucket API)
+
+        Returns:
+            Success message
+        """
+        try:
+            branch = branch or self._active_branch
+
+            # Check if file exists by attempting to read it
+            try:
+                self._read_file(file_path, branch)
+                # File exists, update it using OLD/NEW format
+                old_content = self._read_file(file_path, branch)
+                update_query = f"OLD <<<<\n{old_content}\n>>>> OLD\nNEW <<<<\n{content}\n>>>> NEW"
+                self._bitbucket.update_file(file_path=file_path, update_query=update_query, branch=branch)
+                return f"Updated file {file_path}"
+            except:
+                # File doesn't exist, create it
+                self._bitbucket.create_file(file_path=file_path, file_contents=content, branch=branch)
+                return f"Created file {file_path}"
+        except Exception as e:
+            raise ToolException(f"Unable to write file {file_path}: {str(e)}")
 
+    @extend_with_parent_available_tools
+    @extend_with_file_operations
     def get_available_tools(self):
         return [
             {
@@ -473,4 +517,4 @@ class BitbucketAPIWrapper(BaseCodeToolApiWrapper):
                 "description": self.add_pull_request_comment.__doc__ or "Add a comment to a pull request in the repository.",
                 "args_schema": AddPullRequestCommentModel,
             }
-        ] + self._get_vector_search_tools()
+        ]
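
The new _write_file helper funnels updates through the wrapper's existing update_file call, which expects both versions of the file packed into a single OLD/NEW-delimited query string. A standalone sketch of that framing, using invented file contents:

    old_content = "version: 1\n"
    new_content = "version: 2\n"

    # Same OLD <<<< ... >>>> OLD / NEW <<<< ... >>>> NEW framing _write_file builds
    update_query = f"OLD <<<<\n{old_content}\n>>>> OLD\nNEW <<<<\n{new_content}\n>>>> NEW"
    print(update_query)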
--- a/alita_sdk/tools/browser/__init__.py
+++ b/alita_sdk/tools/browser/__init__.py
@@ -8,7 +8,7 @@ from langchain_community.utilities.wikipedia import WikipediaAPIWrapper
 from .google_search_rag import GoogleSearchResults
 from .crawler import SingleURLCrawler, MultiURLCrawler, GetHTMLContent, GetPDFContent
 from .wiki import WikipediaQueryRun
-from ..utils import get_max_toolkit_length, clean_string, TOOLKIT_SPLITTER
+from ..utils import get_max_toolkit_length, clean_string
 from ...configurations.browser import BrowserConfiguration
 from logging import getLogger
 
@@ -42,7 +42,6 @@ class BrowserToolkit(BaseToolkit):
             'google': GoogleSearchResults.__pydantic_fields__['args_schema'].default.schema(),
             'wiki': WikipediaQueryRun.__pydantic_fields__['args_schema'].default.schema()
         }
-        BrowserToolkit.toolkit_max_length = get_max_toolkit_length(selected_tools)
 
         def validate_google_fields(cls, values):
             if 'google' in values.get('selected_tools', []):
@@ -90,7 +89,6 @@ class BrowserToolkit(BaseToolkit):
         }
 
         tools = []
-        prefix = clean_string(toolkit_name, cls.toolkit_max_length) + TOOLKIT_SPLITTER if toolkit_name else ''
         if not selected_tools:
             selected_tools = [
                 'single_url_crawler',
@@ -127,7 +125,9 @@ class BrowserToolkit(BaseToolkit):
 
             # Only add the tool if it was successfully created
             if tool_entry is not None:
-                tool_entry.name = f"{prefix}{tool_entry.name}"
+                if toolkit_name:
+                    tool_entry.description = f"{tool_entry.description}\nToolkit: {toolkit_name}"
+                tool_entry.description = tool_entry.description[:1000]
                 tools.append(tool_entry)
         return cls(tools=tools)
 
--- a/alita_sdk/tools/carrier/__init__.py
+++ b/alita_sdk/tools/carrier/__init__.py
@@ -7,7 +7,7 @@ from functools import lru_cache
 from .api_wrapper import CarrierAPIWrapper
 from .tools import __all__
 from ..elitea_base import filter_missconfigured_index_tools
-from ..utils import clean_string, TOOLKIT_SPLITTER, get_max_toolkit_length
+from ..utils import clean_string, get_max_toolkit_length
 from ...configurations.carrier import CarrierConfiguration
 
 logger = logging.getLogger(__name__)
@@ -17,7 +17,6 @@ name = 'carrier'
 
 class AlitaCarrierToolkit(BaseToolkit):
     tools: List[BaseTool] = []
-    toolkit_max_length: int = 100
 
     @classmethod
     @lru_cache(maxsize=32)
@@ -26,7 +25,6 @@ class AlitaCarrierToolkit(BaseToolkit):
         for t in __all__:
             default = t['tool'].__pydantic_fields__['args_schema'].default
             selected_tools[t['name']] = default.schema() if default else default
-        cls.toolkit_max_length = get_max_toolkit_length(selected_tools)
         return create_model(
             name,
             project_id=(Optional[str], Field(None, description="Optional project ID for scoped operations")),
@@ -70,15 +68,15 @@ class AlitaCarrierToolkit(BaseToolkit):
             logger.exception(f"[AlitaCarrierToolkit] Error initializing CarrierAPIWrapper: {e}")
             raise ValueError(f"CarrierAPIWrapper initialization error: {e}")
 
-        prefix = clean_string(toolkit_name, cls.toolkit_max_length) + TOOLKIT_SPLITTER if toolkit_name else ''
-
         tools = []
         for tool_def in __all__:
             if selected_tools and tool_def['name'] not in selected_tools:
                 continue
             try:
                 tool_instance = tool_def['tool'](api_wrapper=carrier_api_wrapper)
-                tool_instance.name = prefix + tool_instance.name
+                if toolkit_name:
+                    tool_instance.description = f"{tool_instance.description}\nToolkit: {toolkit_name}"
+                tool_instance.description = tool_instance.description[:1000]
                 tools.append(tool_instance)
                 logger.info(f"[AlitaCarrierToolkit] Successfully initialized tool '{tool_instance.name}'")
             except Exception as e:
--- a/alita_sdk/tools/chunkers/__init__.py
+++ b/alita_sdk/tools/chunkers/__init__.py
@@ -3,6 +3,7 @@ from .sematic.statistical_chunker import statistical_chunker
 from .sematic.markdown_chunker import markdown_chunker
 from .sematic.proposal_chunker import proposal_chunker
 from .sematic.json_chunker import json_chunker
+from .universal_chunker import universal_chunker, chunk_single_document, get_file_type
 from .models import StatisticalChunkerConfig, MarkdownChunkerConfig, ProposalChunkerConfig
 
 __all__ = {
@@ -10,7 +11,8 @@ __all__ = {
     'statistical': statistical_chunker,
     'markdown': markdown_chunker,
     'proposal': proposal_chunker,
-    'json': json_chunker
+    'json': json_chunker,
+    'universal': universal_chunker,
 }
 
 __confluence_chunkers__ = {
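
With this registration, callers that resolve a chunker by key from the __all__ mapping can now pick the new router as well; a minimal lookup sketch:

    from alita_sdk.tools.chunkers import __all__ as chunkers

    universal = chunkers['universal']  # the router added in this release
    # universal(document_generator, config) dispatches by file extension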
--- a/alita_sdk/tools/chunkers/sematic/json_chunker.py
+++ b/alita_sdk/tools/chunkers/sematic/json_chunker.py
@@ -17,6 +17,7 @@ def json_chunker(file_content_generator: Generator[Document, None, None], config
         for chunk in chunks:
             metadata = doc.metadata.copy()
             metadata['chunk_id'] = chunk_id
+            metadata['method_name'] = 'json'
             chunk_id += 1
             yield Document(page_content=json.dumps(chunk), metadata=metadata)
     except Exception as e:
--- a/alita_sdk/tools/chunkers/sematic/markdown_chunker.py
+++ b/alita_sdk/tools/chunkers/sematic/markdown_chunker.py
@@ -1,4 +1,4 @@
-from typing import Generator
+from typing import Generator, List
 from langchain_core.documents import Document
 from langchain_text_splitters import MarkdownHeaderTextSplitter, ExperimentalMarkdownSyntaxTextSplitter
 from langchain.text_splitter import TokenTextSplitter
@@ -7,34 +7,60 @@ from copy import deepcopy as copy
 
 
 def markdown_chunker(file_content_generator: Generator[Document, None, None], config: dict, *args, **kwargs) -> Generator[Document, None, None]:
+    """
+    Chunks markdown documents by headers, with support for:
+    - Minimum chunk size to avoid tiny fragments
+    - Maximum token limit with overflow splitting
+    - Header metadata preservation
+
+    Config options:
+        strip_header (bool): Remove headers from content. Default: False
+        return_each_line (bool): Split on every line. Default: False
+        headers_to_split_on (list): Headers to split on, e.g. [('#', 'H1'), ('##', 'H2')]
+        max_tokens (int): Maximum tokens per chunk. Default: 512
+        token_overlap (int): Token overlap for large chunk splitting. Default: 10
+        min_chunk_chars (int): Minimum characters per chunk. Default: 100
+            Chunks smaller than this will be merged with the next chunk.
+    """
     strip_header = config.get("strip_header", False)
     return_each_line = config.get("return_each_line", False)
     headers_to_split_on = config.get("headers_to_split_on", [])
     max_tokens = config.get("max_tokens", 512)
     tokens_overlapping = config.get("token_overlap", 10)
+    min_chunk_chars = config.get("min_chunk_chars", 100)  # Minimum characters per chunk
+
     headers_to_split_on = [tuple(header) for header in headers_to_split_on]
+
     for doc in file_content_generator:
         doc_metadata = doc.metadata
         doc_content = doc.page_content
         chunk_id = 0
+
         markdown_splitter = MarkdownHeaderTextSplitter(
             headers_to_split_on=headers_to_split_on,
             strip_headers=strip_header,
             return_each_line=return_each_line
         )
         md_header_splits = markdown_splitter.split_text(doc_content)
-        for chunk in md_header_splits:
+
+        # Merge small chunks with the next one
+        merged_chunks = _merge_small_chunks(md_header_splits, min_chunk_chars)
+
+        for chunk in merged_chunks:
             if tiktoken_length(chunk.page_content) > max_tokens:
-                for subchunk in TokenTextSplitter(encoding_name="cl100k_base",
-                                                  chunk_size=max_tokens,
-                                                  chunk_overlap=tokens_overlapping
-                                                  ).split_text(chunk.page_content):
+                # Split large chunks into smaller ones
+                for subchunk in TokenTextSplitter(
+                    encoding_name="cl100k_base",
+                    chunk_size=max_tokens,
+                    chunk_overlap=tokens_overlapping
+                ).split_text(chunk.page_content):
                     chunk_id += 1
                     headers_meta = list(chunk.metadata.values())
                     docmeta = copy(doc_metadata)
                     docmeta.update({"headers": "; ".join(headers_meta)})
                     docmeta['chunk_id'] = chunk_id
                     docmeta['chunk_type'] = "document"
+                    docmeta['method_name'] = 'markdown'
                     yield Document(
                         page_content=subchunk,
                         metadata=docmeta
@@ -46,12 +72,77 @@ def markdown_chunker(file_content_generator: Generator[Document, None, None], config: dict, *args, **kwargs) -> Generator[Document, None, None]:
                 docmeta.update({"headers": "; ".join(headers_meta)})
                 docmeta['chunk_id'] = chunk_id
                 docmeta['chunk_type'] = "document"
+                docmeta['method_name'] = 'text'
                 yield Document(
                     page_content=chunk.page_content,
                     metadata=docmeta
                 )
 
 
+def _merge_small_chunks(chunks: List[Document], min_chars: int) -> List[Document]:
+    """
+    Merge chunks that are smaller than min_chars with the next chunk.
+
+    This prevents tiny fragments (like standalone headers or short notes)
+    from becoming separate chunks.
+
+    Args:
+        chunks: List of Document chunks from markdown splitter
+        min_chars: Minimum character count for a chunk
+
+    Returns:
+        List of merged Document chunks
+    """
+    if not chunks:
+        return chunks
+
+    merged = []
+    pending_content = ""
+    pending_metadata = {}
+
+    for i, chunk in enumerate(chunks):
+        content = chunk.page_content.strip()
+
+        if pending_content:
+            # Merge pending content with current chunk
+            combined_content = pending_content + "\n\n" + content
+            # Use the pending metadata (from the header) but can be extended
+            combined_metadata = {**pending_metadata}
+            # Add any new header info from current chunk
+            for key, value in chunk.metadata.items():
+                if key not in combined_metadata or not combined_metadata[key]:
+                    combined_metadata[key] = value
+
+            if len(combined_content) >= min_chars:
+                # Combined is big enough, emit it
+                merged.append(Document(
+                    page_content=combined_content,
+                    metadata=combined_metadata
+                ))
+                pending_content = ""
+                pending_metadata = {}
+            else:
+                # Still too small, keep accumulating
+                pending_content = combined_content
+                pending_metadata = combined_metadata
+        elif len(content) < min_chars:
+            # Current chunk is too small, start pending
+            pending_content = content
+            pending_metadata = dict(chunk.metadata)
+        else:
+            # Current chunk is big enough
+            merged.append(chunk)
+
+    # Don't forget any remaining pending content
+    if pending_content:
+        merged.append(Document(
+            page_content=pending_content,
+            metadata=pending_metadata
+        ))
+
+    return merged
+
+
 def markdown_by_headers_chunker(file_content_generator: Generator[Document, None, None], config: dict, *args, **kwargs) -> Generator[Document, None, None]:
     strip_header = config.get("strip_header", False)
     return_each_line = config.get("return_each_line", False)
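
A short usage sketch of the reworked chunker under the config options documented above; the sample document and file name are invented:

    from langchain_core.documents import Document
    from alita_sdk.tools.chunkers.sematic.markdown_chunker import markdown_chunker

    def docs():
        yield Document(
            page_content="# Intro\nShort note.\n\n## Details\n" + "Body text. " * 80,
            metadata={"file_path": "README.md"},
        )

    config = {
        "headers_to_split_on": [("#", "H1"), ("##", "H2")],
        "max_tokens": 512,
        "token_overlap": 10,
        "min_chunk_chars": 100,  # the short "# Intro" section merges into the next chunk
    }

    for chunk in markdown_chunker(docs(), config):
        print(chunk.metadata["chunk_id"], chunk.metadata.get("headers"))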
--- a/alita_sdk/tools/chunkers/sematic/proposal_chunker.py
+++ b/alita_sdk/tools/chunkers/sematic/proposal_chunker.py
@@ -6,7 +6,7 @@ from langchain_core.prompts import ChatPromptTemplate
 from langchain.text_splitter import TokenTextSplitter
 
 from typing import Optional, List
-from langchain_core.pydantic_v1 import BaseModel
+from pydantic import BaseModel
 from ..utils import tiktoken_length
 
 logger = getLogger(__name__)
--- /dev/null
+++ b/alita_sdk/tools/chunkers/universal_chunker.py
@@ -0,0 +1,270 @@
+"""
+Universal Chunker - Routes documents to appropriate chunkers based on file type.
+
+This module provides a universal chunking interface that automatically selects
+the appropriate chunking strategy based on the file extension:
+
+- .md, .markdown → Markdown chunker (header-based splitting)
+- .py, .js, .ts, .java, etc. → TreeSitter code chunker
+- .json → JSON chunker
+- other → Default text chunker
+
+Usage:
+    from alita_sdk.tools.chunkers.universal_chunker import universal_chunker
+
+    # Chunk documents from a loader
+    for chunk in universal_chunker(document_generator, config):
+        print(chunk.page_content)
+"""
+
+import logging
+import os
+from typing import Generator, Dict, Any, Optional
+from langchain_core.documents import Document
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+from .code.codeparser import parse_code_files_for_db
+from .sematic.markdown_chunker import markdown_chunker
+from .sematic.json_chunker import json_chunker
+
+logger = logging.getLogger(__name__)
+
+
+# File extension mappings
+MARKDOWN_EXTENSIONS = {'.md', '.markdown', '.mdown', '.mkd', '.mdx'}
+JSON_EXTENSIONS = {'.json', '.jsonl', '.jsonc'}
+CODE_EXTENSIONS = {
+    '.py', '.js', '.jsx', '.mjs', '.cjs', '.ts', '.tsx',
+    '.java', '.kt', '.rs', '.go', '.cpp', '.c', '.cs',
+    '.hs', '.rb', '.scala', '.lua'
+}
+
+
+def get_file_extension(file_path: str) -> str:
+    """Extract file extension from path."""
+    return os.path.splitext(file_path)[-1].lower()
+
+
+def get_file_type(file_path: str) -> str:
+    """
+    Determine the file type category for chunking.
+
+    Returns:
+        'markdown', 'json', 'code', or 'text'
+    """
+    ext = get_file_extension(file_path)
+
+    if ext in MARKDOWN_EXTENSIONS:
+        return 'markdown'
+    elif ext in JSON_EXTENSIONS:
+        return 'json'
+    elif ext in CODE_EXTENSIONS:
+        return 'code'
+    else:
+        return 'text'
+
+
+def _default_text_chunker(
+    documents: Generator[Document, None, None],
+    config: Dict[str, Any]
+) -> Generator[Document, None, None]:
+    """
+    Default text chunker for unknown file types.
+    Uses recursive character splitting.
+    """
+    chunk_size = config.get('chunk_size', 1000)
+    chunk_overlap = config.get('chunk_overlap', 100)
+
+    splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+        length_function=len,
+    )
+
+    for doc in documents:
+        chunks = splitter.split_documents([doc])
+        for idx, chunk in enumerate(chunks, 1):
+            chunk.metadata['chunk_id'] = idx
+            chunk.metadata['chunk_type'] = 'text'
+            chunk.metadata['method_name'] = 'text'
+            yield chunk
+
+
+def _code_chunker_from_documents(
+    documents: Generator[Document, None, None],
+    config: Dict[str, Any]
+) -> Generator[Document, None, None]:
+    """
+    Adapter to convert Document generator to code parser format.
+    """
+    def file_content_generator():
+        for doc in documents:
+            yield {
+                'file_name': doc.metadata.get('file_path', doc.metadata.get('filename', 'unknown')),
+                'file_content': doc.page_content,
+                'commit_hash': doc.metadata.get('commit_hash', ''),
+            }
+
+    # parse_code_files_for_db returns chunks with proper metadata
+    for chunk in parse_code_files_for_db(file_content_generator()):
+        # Ensure file_path is preserved
+        if 'file_path' not in chunk.metadata and 'filename' in chunk.metadata:
+            chunk.metadata['file_path'] = chunk.metadata['filename']
+        yield chunk
+
+
+def universal_chunker(
+    documents: Generator[Document, None, None],
+    config: Optional[Dict[str, Any]] = None
+) -> Generator[Document, None, None]:
+    """
+    Universal chunker that routes documents to appropriate chunkers based on file type.
+
+    Each document is inspected for its file extension (from metadata.file_path or
+    metadata.file_name) and routed to the appropriate chunker:
+
+    - Markdown files → markdown_chunker (header-based splitting)
+    - JSON files → json_chunker (recursive JSON splitting)
+    - Code files → code parser (TreeSitter-based parsing)
+    - Other files → default text chunker (recursive character splitting)
+
+    Args:
+        documents: Generator yielding Document objects with file content
+        config: Optional configuration dict with:
+            - markdown_config: Config for markdown chunker
+            - json_config: Config for JSON chunker
+            - code_config: Config for code chunker
+            - text_config: Config for default text chunker
+
+    Yields:
+        Document objects with chunked content and preserved metadata
+    """
+    if config is None:
+        config = {}
+
+    # Default configs for each chunker type
+    markdown_config = config.get('markdown_config', {
+        'strip_header': False,
+        'return_each_line': False,
+        'headers_to_split_on': [
+            ('#', 'Header 1'),
+            ('##', 'Header 2'),
+            ('###', 'Header 3'),
+            ('####', 'Header 4'),
+        ],
+        'max_tokens': 1024,
+        'token_overlap': 50,
+        'min_chunk_chars': 100,  # Merge chunks smaller than this
+    })
+
+    json_config = config.get('json_config', {
+        'max_tokens': 512,
+    })
+
+    code_config = config.get('code_config', {})
+
+    text_config = config.get('text_config', {
+        'chunk_size': 1000,
+        'chunk_overlap': 100,
+    })
+
+    # Buffer documents by type for batch processing
+    # This is more efficient than processing one at a time
+    markdown_docs = []
+    json_docs = []
+    code_docs = []
+    text_docs = []
+
+    # Buffer size before flushing
+    BUFFER_SIZE = 10
+
+    def flush_markdown():
+        if markdown_docs:
+            def gen():
+                for d in markdown_docs:
+                    yield d
+            for chunk in markdown_chunker(gen(), markdown_config):
+                yield chunk
+            markdown_docs.clear()
+
+    def flush_json():
+        if json_docs:
+            def gen():
+                for d in json_docs:
+                    yield d
+            for chunk in json_chunker(gen(), json_config):
+                yield chunk
+            json_docs.clear()
+
+    def flush_code():
+        if code_docs:
+            def gen():
+                for d in code_docs:
+                    yield d
+            for chunk in _code_chunker_from_documents(gen(), code_config):
+                yield chunk
+            code_docs.clear()
+
+    def flush_text():
+        if text_docs:
+            def gen():
+                for d in text_docs:
+                    yield d
+            for chunk in _default_text_chunker(gen(), text_config):
+                yield chunk
+            text_docs.clear()
+
+    for doc in documents:
+        # Get file path from metadata
+        file_path = (doc.metadata.get('file_path') or
+                     doc.metadata.get('file_name') or
+                     doc.metadata.get('source') or
+                     'unknown')
+
+        # Ensure file_path is in metadata for downstream use
+        doc.metadata['file_path'] = file_path
+
+        file_type = get_file_type(file_path)
+
+        if file_type == 'markdown':
+            markdown_docs.append(doc)
+            if len(markdown_docs) >= BUFFER_SIZE:
+                yield from flush_markdown()
+        elif file_type == 'json':
+            json_docs.append(doc)
+            if len(json_docs) >= BUFFER_SIZE:
+                yield from flush_json()
+        elif file_type == 'code':
+            code_docs.append(doc)
+            if len(code_docs) >= BUFFER_SIZE:
+                yield from flush_code()
+        else:
+            text_docs.append(doc)
+            if len(text_docs) >= BUFFER_SIZE:
+                yield from flush_text()
+
+    # Flush remaining documents
+    yield from flush_markdown()
+    yield from flush_json()
+    yield from flush_code()
+    yield from flush_text()
+
+
+def chunk_single_document(
+    doc: Document,
+    config: Optional[Dict[str, Any]] = None
+) -> Generator[Document, None, None]:
+    """
+    Convenience function to chunk a single document.
+
+    Args:
+        doc: Single Document to chunk
+        config: Optional chunker configuration
+
+    Yields:
+        Chunked Document objects
+    """
+    def single_doc_gen():
+        yield doc
+
+    yield from universal_chunker(single_doc_gen(), config)
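
An end-to-end sketch of the routing described in the module docstring; the documents and paths are invented:

    from langchain_core.documents import Document
    from alita_sdk.tools.chunkers.universal_chunker import universal_chunker, get_file_type

    docs = [
        Document(page_content="# Title\n\nSome prose.", metadata={"file_path": "notes.md"}),
        Document(page_content='{"a": 1, "b": 2}', metadata={"file_path": "data.json"}),
        Document(page_content="plain text " * 300, metadata={"source": "readme.txt"}),
    ]

    assert get_file_type("notes.md") == "markdown"
    assert get_file_type("data.json") == "json"

    # unknown extensions fall back to the recursive text splitter
    for chunk in universal_chunker(iter(docs), {"text_config": {"chunk_size": 500}}):
        print(chunk.metadata["file_path"], chunk.metadata.get("method_name"))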