alita-sdk 0.3.351__py3-none-any.whl → 0.3.499__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alita_sdk/cli/__init__.py +10 -0
- alita_sdk/cli/__main__.py +17 -0
- alita_sdk/cli/agent/__init__.py +5 -0
- alita_sdk/cli/agent/default.py +258 -0
- alita_sdk/cli/agent_executor.py +155 -0
- alita_sdk/cli/agent_loader.py +215 -0
- alita_sdk/cli/agent_ui.py +228 -0
- alita_sdk/cli/agents.py +3601 -0
- alita_sdk/cli/callbacks.py +647 -0
- alita_sdk/cli/cli.py +168 -0
- alita_sdk/cli/config.py +306 -0
- alita_sdk/cli/context/__init__.py +30 -0
- alita_sdk/cli/context/cleanup.py +198 -0
- alita_sdk/cli/context/manager.py +731 -0
- alita_sdk/cli/context/message.py +285 -0
- alita_sdk/cli/context/strategies.py +289 -0
- alita_sdk/cli/context/token_estimation.py +127 -0
- alita_sdk/cli/formatting.py +182 -0
- alita_sdk/cli/input_handler.py +419 -0
- alita_sdk/cli/inventory.py +1256 -0
- alita_sdk/cli/mcp_loader.py +315 -0
- alita_sdk/cli/toolkit.py +327 -0
- alita_sdk/cli/toolkit_loader.py +85 -0
- alita_sdk/cli/tools/__init__.py +43 -0
- alita_sdk/cli/tools/approval.py +224 -0
- alita_sdk/cli/tools/filesystem.py +1751 -0
- alita_sdk/cli/tools/planning.py +389 -0
- alita_sdk/cli/tools/terminal.py +414 -0
- alita_sdk/community/__init__.py +64 -8
- alita_sdk/community/inventory/__init__.py +224 -0
- alita_sdk/community/inventory/config.py +257 -0
- alita_sdk/community/inventory/enrichment.py +2137 -0
- alita_sdk/community/inventory/extractors.py +1469 -0
- alita_sdk/community/inventory/ingestion.py +3172 -0
- alita_sdk/community/inventory/knowledge_graph.py +1457 -0
- alita_sdk/community/inventory/parsers/__init__.py +218 -0
- alita_sdk/community/inventory/parsers/base.py +295 -0
- alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
- alita_sdk/community/inventory/parsers/go_parser.py +851 -0
- alita_sdk/community/inventory/parsers/html_parser.py +389 -0
- alita_sdk/community/inventory/parsers/java_parser.py +593 -0
- alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
- alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
- alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
- alita_sdk/community/inventory/parsers/python_parser.py +604 -0
- alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
- alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
- alita_sdk/community/inventory/parsers/text_parser.py +322 -0
- alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
- alita_sdk/community/inventory/patterns/__init__.py +61 -0
- alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
- alita_sdk/community/inventory/patterns/loader.py +348 -0
- alita_sdk/community/inventory/patterns/registry.py +198 -0
- alita_sdk/community/inventory/presets.py +535 -0
- alita_sdk/community/inventory/retrieval.py +1403 -0
- alita_sdk/community/inventory/toolkit.py +173 -0
- alita_sdk/community/inventory/visualize.py +1370 -0
- alita_sdk/configurations/bitbucket.py +94 -2
- alita_sdk/configurations/confluence.py +96 -1
- alita_sdk/configurations/gitlab.py +79 -0
- alita_sdk/configurations/jira.py +103 -0
- alita_sdk/configurations/testrail.py +88 -0
- alita_sdk/configurations/xray.py +93 -0
- alita_sdk/configurations/zephyr_enterprise.py +93 -0
- alita_sdk/configurations/zephyr_essential.py +75 -0
- alita_sdk/runtime/clients/artifact.py +1 -1
- alita_sdk/runtime/clients/client.py +214 -42
- alita_sdk/runtime/clients/mcp_discovery.py +342 -0
- alita_sdk/runtime/clients/mcp_manager.py +262 -0
- alita_sdk/runtime/clients/sandbox_client.py +373 -0
- alita_sdk/runtime/langchain/assistant.py +118 -30
- alita_sdk/runtime/langchain/constants.py +8 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
- alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +103 -60
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +4 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +41 -12
- alita_sdk/runtime/langchain/document_loaders/AlitaTableLoader.py +1 -1
- alita_sdk/runtime/langchain/document_loaders/constants.py +116 -99
- alita_sdk/runtime/langchain/interfaces/llm_processor.py +2 -2
- alita_sdk/runtime/langchain/langraph_agent.py +307 -71
- alita_sdk/runtime/langchain/utils.py +48 -8
- alita_sdk/runtime/llms/preloaded.py +2 -6
- alita_sdk/runtime/models/mcp_models.py +61 -0
- alita_sdk/runtime/toolkits/__init__.py +26 -0
- alita_sdk/runtime/toolkits/application.py +9 -2
- alita_sdk/runtime/toolkits/artifact.py +18 -6
- alita_sdk/runtime/toolkits/datasource.py +13 -6
- alita_sdk/runtime/toolkits/mcp.py +780 -0
- alita_sdk/runtime/toolkits/planning.py +178 -0
- alita_sdk/runtime/toolkits/tools.py +205 -55
- alita_sdk/runtime/toolkits/vectorstore.py +9 -4
- alita_sdk/runtime/tools/__init__.py +11 -3
- alita_sdk/runtime/tools/application.py +7 -0
- alita_sdk/runtime/tools/artifact.py +225 -12
- alita_sdk/runtime/tools/function.py +95 -5
- alita_sdk/runtime/tools/graph.py +10 -4
- alita_sdk/runtime/tools/image_generation.py +212 -0
- alita_sdk/runtime/tools/llm.py +494 -102
- alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
- alita_sdk/runtime/tools/mcp_remote_tool.py +181 -0
- alita_sdk/runtime/tools/mcp_server_tool.py +4 -4
- alita_sdk/runtime/tools/planning/__init__.py +36 -0
- alita_sdk/runtime/tools/planning/models.py +246 -0
- alita_sdk/runtime/tools/planning/wrapper.py +607 -0
- alita_sdk/runtime/tools/router.py +2 -1
- alita_sdk/runtime/tools/sandbox.py +180 -79
- alita_sdk/runtime/tools/vectorstore.py +22 -21
- alita_sdk/runtime/tools/vectorstore_base.py +125 -52
- alita_sdk/runtime/utils/AlitaCallback.py +106 -20
- alita_sdk/runtime/utils/mcp_client.py +465 -0
- alita_sdk/runtime/utils/mcp_oauth.py +244 -0
- alita_sdk/runtime/utils/mcp_sse_client.py +405 -0
- alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
- alita_sdk/runtime/utils/streamlit.py +40 -13
- alita_sdk/runtime/utils/toolkit_utils.py +28 -9
- alita_sdk/runtime/utils/utils.py +12 -0
- alita_sdk/tools/__init__.py +77 -33
- alita_sdk/tools/ado/repos/__init__.py +7 -6
- alita_sdk/tools/ado/repos/repos_wrapper.py +11 -11
- alita_sdk/tools/ado/test_plan/__init__.py +7 -7
- alita_sdk/tools/ado/wiki/__init__.py +7 -11
- alita_sdk/tools/ado/wiki/ado_wrapper.py +89 -15
- alita_sdk/tools/ado/work_item/__init__.py +7 -11
- alita_sdk/tools/ado/work_item/ado_wrapper.py +17 -8
- alita_sdk/tools/advanced_jira_mining/__init__.py +8 -7
- alita_sdk/tools/aws/delta_lake/__init__.py +11 -9
- alita_sdk/tools/azure_ai/search/__init__.py +7 -6
- alita_sdk/tools/base_indexer_toolkit.py +345 -70
- alita_sdk/tools/bitbucket/__init__.py +9 -8
- alita_sdk/tools/bitbucket/api_wrapper.py +50 -6
- alita_sdk/tools/browser/__init__.py +4 -4
- alita_sdk/tools/carrier/__init__.py +4 -6
- alita_sdk/tools/chunkers/__init__.py +3 -1
- alita_sdk/tools/chunkers/sematic/json_chunker.py +1 -0
- alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
- alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
- alita_sdk/tools/chunkers/universal_chunker.py +270 -0
- alita_sdk/tools/cloud/aws/__init__.py +7 -6
- alita_sdk/tools/cloud/azure/__init__.py +7 -6
- alita_sdk/tools/cloud/gcp/__init__.py +7 -6
- alita_sdk/tools/cloud/k8s/__init__.py +7 -6
- alita_sdk/tools/code/linter/__init__.py +7 -7
- alita_sdk/tools/code/loaders/codesearcher.py +3 -2
- alita_sdk/tools/code/sonar/__init__.py +8 -7
- alita_sdk/tools/code_indexer_toolkit.py +199 -0
- alita_sdk/tools/confluence/__init__.py +9 -8
- alita_sdk/tools/confluence/api_wrapper.py +171 -75
- alita_sdk/tools/confluence/loader.py +10 -0
- alita_sdk/tools/custom_open_api/__init__.py +9 -4
- alita_sdk/tools/elastic/__init__.py +8 -7
- alita_sdk/tools/elitea_base.py +492 -52
- alita_sdk/tools/figma/__init__.py +7 -7
- alita_sdk/tools/figma/api_wrapper.py +2 -1
- alita_sdk/tools/github/__init__.py +9 -9
- alita_sdk/tools/github/api_wrapper.py +9 -26
- alita_sdk/tools/github/github_client.py +62 -2
- alita_sdk/tools/gitlab/__init__.py +8 -8
- alita_sdk/tools/gitlab/api_wrapper.py +135 -33
- alita_sdk/tools/gitlab_org/__init__.py +7 -8
- alita_sdk/tools/google/bigquery/__init__.py +11 -12
- alita_sdk/tools/google_places/__init__.py +8 -7
- alita_sdk/tools/jira/__init__.py +9 -7
- alita_sdk/tools/jira/api_wrapper.py +100 -52
- alita_sdk/tools/keycloak/__init__.py +8 -7
- alita_sdk/tools/localgit/local_git.py +56 -54
- alita_sdk/tools/memory/__init__.py +1 -1
- alita_sdk/tools/non_code_indexer_toolkit.py +3 -2
- alita_sdk/tools/ocr/__init__.py +8 -7
- alita_sdk/tools/openapi/__init__.py +10 -1
- alita_sdk/tools/pandas/__init__.py +8 -7
- alita_sdk/tools/postman/__init__.py +7 -8
- alita_sdk/tools/postman/api_wrapper.py +19 -8
- alita_sdk/tools/postman/postman_analysis.py +8 -1
- alita_sdk/tools/pptx/__init__.py +8 -9
- alita_sdk/tools/qtest/__init__.py +16 -11
- alita_sdk/tools/qtest/api_wrapper.py +1784 -88
- alita_sdk/tools/rally/__init__.py +7 -8
- alita_sdk/tools/report_portal/__init__.py +9 -7
- alita_sdk/tools/salesforce/__init__.py +7 -7
- alita_sdk/tools/servicenow/__init__.py +10 -10
- alita_sdk/tools/sharepoint/__init__.py +7 -6
- alita_sdk/tools/sharepoint/api_wrapper.py +127 -36
- alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
- alita_sdk/tools/sharepoint/utils.py +8 -2
- alita_sdk/tools/slack/__init__.py +7 -6
- alita_sdk/tools/sql/__init__.py +8 -7
- alita_sdk/tools/sql/api_wrapper.py +71 -23
- alita_sdk/tools/testio/__init__.py +7 -6
- alita_sdk/tools/testrail/__init__.py +8 -9
- alita_sdk/tools/utils/__init__.py +26 -4
- alita_sdk/tools/utils/content_parser.py +88 -60
- alita_sdk/tools/utils/text_operations.py +254 -0
- alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +76 -26
- alita_sdk/tools/xray/__init__.py +9 -7
- alita_sdk/tools/zephyr/__init__.py +7 -6
- alita_sdk/tools/zephyr_enterprise/__init__.py +8 -6
- alita_sdk/tools/zephyr_essential/__init__.py +7 -6
- alita_sdk/tools/zephyr_essential/api_wrapper.py +12 -13
- alita_sdk/tools/zephyr_scale/__init__.py +7 -6
- alita_sdk/tools/zephyr_squad/__init__.py +7 -6
- {alita_sdk-0.3.351.dist-info → alita_sdk-0.3.499.dist-info}/METADATA +147 -2
- {alita_sdk-0.3.351.dist-info → alita_sdk-0.3.499.dist-info}/RECORD +206 -130
- alita_sdk-0.3.499.dist-info/entry_points.txt +2 -0
- {alita_sdk-0.3.351.dist-info → alita_sdk-0.3.499.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.351.dist-info → alita_sdk-0.3.499.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.351.dist-info → alita_sdk-0.3.499.dist-info}/top_level.txt +0 -0
alita_sdk/tools/bitbucket/__init__.py

```diff
@@ -9,7 +9,7 @@ from pydantic import BaseModel, Field, ConfigDict, create_model
 
 from ..base.tool import BaseAction
 from ..elitea_base import filter_missconfigured_index_tools
-from ..utils import clean_string,
+from ..utils import clean_string, get_max_toolkit_length, check_connection_response
 from ...configurations.bitbucket import BitbucketConfiguration
 from ...configurations.pgvector import PgVectorConfiguration
 import requests
@@ -38,17 +38,15 @@ def get_tools(tool):
 
 class AlitaBitbucketToolkit(BaseToolkit):
     tools: List[BaseTool] = []
-    toolkit_max_length: int = 0
 
     @staticmethod
     def toolkit_config_schema() -> BaseModel:
         selected_tools = {x['name']: x['args_schema'].schema() for x in
                           BitbucketAPIWrapper.model_construct().get_available_tools()}
-        AlitaBitbucketToolkit.toolkit_max_length = get_max_toolkit_length(selected_tools)
         m = create_model(
             name,
-            project=(str, Field(description="Project/Workspace"
-            repository=(str, Field(description="Repository"
+            project=(str, Field(description="Project/Workspace")),
+            repository=(str, Field(description="Repository")),
             branch=(str, Field(description="Main branch", default="main")),
             cloud=(Optional[bool], Field(description="Hosting Option", default=None)),
             bitbucket_configuration=(BitbucketConfiguration, Field(description="Bitbucket Configuration", json_schema_extra={'configuration_types': ['bitbucket']})),
@@ -99,16 +97,19 @@ class AlitaBitbucketToolkit(BaseToolkit):
         }
         bitbucket_api_wrapper = BitbucketAPIWrapper(**wrapper_payload)
         available_tools: List[Dict] = bitbucket_api_wrapper.get_available_tools()
-        prefix = clean_string(toolkit_name, cls.toolkit_max_length) + TOOLKIT_SPLITTER if toolkit_name else ''
         tools = []
         for tool in available_tools:
             if selected_tools:
                 if tool['name'] not in selected_tools:
                     continue
+            description = tool["description"] + f"\nrepo: {bitbucket_api_wrapper.repository}"
+            if toolkit_name:
+                description = f"{description}\nToolkit: {toolkit_name}"
+            description = description[:1000]
             tools.append(BaseAction(
                 api_wrapper=bitbucket_api_wrapper,
-                name=
-                description=
+                name=tool["name"],
+                description=description,
                 args_schema=tool["args_schema"]
             ))
         return cls(tools=tools)
```
alita_sdk/tools/bitbucket/api_wrapper.py

```diff
@@ -11,7 +11,9 @@ from .bitbucket_constants import create_pr_data
 from .cloud_api_wrapper import BitbucketCloudApi, BitbucketServerApi
 from pydantic.fields import PrivateAttr
 
-from ..
+from ..code_indexer_toolkit import CodeIndexerToolkit
+from ..utils.available_tools_decorator import extend_with_parent_available_tools
+from ..elitea_base import extend_with_file_operations
 
 logger = logging.getLogger(__name__)
 
@@ -117,7 +119,7 @@ CommentOnIssueModel = create_model(
 )
 
 
-class BitbucketAPIWrapper(
+class BitbucketAPIWrapper(CodeIndexerToolkit):
     """Wrapper for Bitbucket API."""
 
     _bitbucket: Any = PrivateAttr()
@@ -167,7 +169,7 @@ class BitbucketAPIWrapper(BaseCodeToolApiWrapper):
             repository=values['repository']
         )
         cls._active_branch = values.get('branch')
-        return values
+        return super().validate_toolkit(values)
 
     def set_active_branch(self, branch_name: str) -> str:
         """Set the active branch for the bot."""
@@ -359,12 +361,15 @@ class BitbucketAPIWrapper(BaseCodeToolApiWrapper):
         # except Exception as e:
         #     raise ToolException(f"Can't extract file commit hash (`{file_path}`) due to error:\n{str(e)}")
 
-    def _read_file(self, file_path: str, branch: str) -> str:
+    def _read_file(self, file_path: str, branch: str, **kwargs) -> str:
         """
-        Reads a file from the
+        Reads a file from the bitbucket repo with optional partial read support.
+
         Parameters:
             file_path(str): the file path
             branch(str): branch name (by default: active_branch)
+            **kwargs: Additional parameters (offset, limit, head, tail) - currently ignored,
+                      partial read handled client-side by base class methods
         Returns:
             str: The file decoded as a string
         """
@@ -398,7 +403,46 @@ class BitbucketAPIWrapper(BaseCodeToolApiWrapper):
             return self._read_file(file_path, branch)
         except Exception as e:
             return f"Failed to read file {file_path}: {str(e)}"
+
+    def _write_file(
+        self,
+        file_path: str,
+        content: str,
+        branch: str = None,
+        commit_message: str = None
+    ) -> str:
+        """
+        Write content to a file (create or update).
+
+        Parameters:
+            file_path: Path to the file
+            content: New file content
+            branch: Branch name (uses active branch if None)
+            commit_message: Commit message (not used by Bitbucket API)
+
+        Returns:
+            Success message
+        """
+        try:
+            branch = branch or self._active_branch
+
+            # Check if file exists by attempting to read it
+            try:
+                self._read_file(file_path, branch)
+                # File exists, update it using OLD/NEW format
+                old_content = self._read_file(file_path, branch)
+                update_query = f"OLD <<<<\n{old_content}\n>>>> OLD\nNEW <<<<\n{content}\n>>>> NEW"
+                self._bitbucket.update_file(file_path=file_path, update_query=update_query, branch=branch)
+                return f"Updated file {file_path}"
+            except:
+                # File doesn't exist, create it
+                self._bitbucket.create_file(file_path=file_path, file_contents=content, branch=branch)
+                return f"Created file {file_path}"
+        except Exception as e:
+            raise ToolException(f"Unable to write file {file_path}: {str(e)}")
 
+    @extend_with_parent_available_tools
+    @extend_with_file_operations
     def get_available_tools(self):
         return [
             {
@@ -473,4 +517,4 @@ class BitbucketAPIWrapper(BaseCodeToolApiWrapper):
                 "description": self.add_pull_request_comment.__doc__ or "Add a comment to a pull request in the repository.",
                 "args_schema": AddPullRequestCommentModel,
             }
-        ]
+        ]
```
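The new `_write_file` helper routes updates through the existing `update_file` call, which takes an OLD/NEW block query rather than raw replacement content. A minimal sketch of the `update_query` string it builds, using made-up file contents purely for illustration:

```python
# Hypothetical illustration of the OLD/NEW update_query assembled by _write_file.
old_content = "version = 1"   # what _read_file returned for the existing file
new_content = "version = 2"   # the content passed to _write_file

update_query = f"OLD <<<<\n{old_content}\n>>>> OLD\nNEW <<<<\n{new_content}\n>>>> NEW"
print(update_query)
# OLD <<<<
# version = 1
# >>>> OLD
# NEW <<<<
# version = 2
# >>>> NEW
```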
alita_sdk/tools/browser/__init__.py

```diff
@@ -8,7 +8,7 @@ from langchain_community.utilities.wikipedia import WikipediaAPIWrapper
 from .google_search_rag import GoogleSearchResults
 from .crawler import SingleURLCrawler, MultiURLCrawler, GetHTMLContent, GetPDFContent
 from .wiki import WikipediaQueryRun
-from ..utils import get_max_toolkit_length, clean_string
+from ..utils import get_max_toolkit_length, clean_string
 from ...configurations.browser import BrowserConfiguration
 from logging import getLogger
 
@@ -42,7 +42,6 @@ class BrowserToolkit(BaseToolkit):
             'google': GoogleSearchResults.__pydantic_fields__['args_schema'].default.schema(),
             'wiki': WikipediaQueryRun.__pydantic_fields__['args_schema'].default.schema()
         }
-        BrowserToolkit.toolkit_max_length = get_max_toolkit_length(selected_tools)
 
     def validate_google_fields(cls, values):
         if 'google' in values.get('selected_tools', []):
@@ -90,7 +89,6 @@
         }
 
         tools = []
-        prefix = clean_string(toolkit_name, cls.toolkit_max_length) + TOOLKIT_SPLITTER if toolkit_name else ''
         if not selected_tools:
             selected_tools = [
                 'single_url_crawler',
@@ -127,7 +125,9 @@
 
             # Only add the tool if it was successfully created
             if tool_entry is not None:
-
+                if toolkit_name:
+                    tool_entry.description = f"{tool_entry.description}\nToolkit: {toolkit_name}"
+                tool_entry.description = tool_entry.description[:1000]
                 tools.append(tool_entry)
         return cls(tools=tools)
 
```
alita_sdk/tools/carrier/__init__.py

```diff
@@ -7,7 +7,7 @@ from functools import lru_cache
 from .api_wrapper import CarrierAPIWrapper
 from .tools import __all__
 from ..elitea_base import filter_missconfigured_index_tools
-from ..utils import clean_string,
+from ..utils import clean_string, get_max_toolkit_length
 from ...configurations.carrier import CarrierConfiguration
 
 logger = logging.getLogger(__name__)
@@ -17,7 +17,6 @@ name = 'carrier'
 
 class AlitaCarrierToolkit(BaseToolkit):
     tools: List[BaseTool] = []
-    toolkit_max_length: int = 100
 
     @classmethod
     @lru_cache(maxsize=32)
@@ -26,7 +25,6 @@ class AlitaCarrierToolkit(BaseToolkit):
         for t in __all__:
             default = t['tool'].__pydantic_fields__['args_schema'].default
             selected_tools[t['name']] = default.schema() if default else default
-        cls.toolkit_max_length = get_max_toolkit_length(selected_tools)
         return create_model(
             name,
             project_id=(Optional[str], Field(None, description="Optional project ID for scoped operations")),
@@ -70,15 +68,15 @@
             logger.exception(f"[AlitaCarrierToolkit] Error initializing CarrierAPIWrapper: {e}")
             raise ValueError(f"CarrierAPIWrapper initialization error: {e}")
 
-        prefix = clean_string(toolkit_name, cls.toolkit_max_length) + TOOLKIT_SPLITTER if toolkit_name else ''
-
         tools = []
         for tool_def in __all__:
             if selected_tools and tool_def['name'] not in selected_tools:
                 continue
             try:
                 tool_instance = tool_def['tool'](api_wrapper=carrier_api_wrapper)
-
+                if toolkit_name:
+                    tool_instance.description = f"{tool_instance.description}\nToolkit: {toolkit_name}"
+                tool_instance.description = tool_instance.description[:1000]
                 tools.append(tool_instance)
                 logger.info(f"[AlitaCarrierToolkit] Successfully initialized tool '{tool_instance.name}'")
             except Exception as e:
```
alita_sdk/tools/chunkers/__init__.py

```diff
@@ -3,6 +3,7 @@ from .sematic.statistical_chunker import statistical_chunker
 from .sematic.markdown_chunker import markdown_chunker
 from .sematic.proposal_chunker import proposal_chunker
 from .sematic.json_chunker import json_chunker
+from .universal_chunker import universal_chunker, chunk_single_document, get_file_type
 from .models import StatisticalChunkerConfig, MarkdownChunkerConfig, ProposalChunkerConfig
 
 __all__ = {
@@ -10,7 +11,8 @@ __all__ = {
     'statistical': statistical_chunker,
     'markdown': markdown_chunker,
     'proposal': proposal_chunker,
-    'json': json_chunker
+    'json': json_chunker,
+    'universal': universal_chunker,
 }
 
 __confluence_chunkers__ = {
```
alita_sdk/tools/chunkers/sematic/json_chunker.py

```diff
@@ -17,6 +17,7 @@ def json_chunker(file_content_generator: Generator[Document, None, None], config
             for chunk in chunks:
                 metadata = doc.metadata.copy()
                 metadata['chunk_id'] = chunk_id
+                metadata['method_name'] = 'json'
                 chunk_id += 1
                 yield Document(page_content=json.dumps(chunk), metadata=metadata)
     except Exception as e:
```
alita_sdk/tools/chunkers/sematic/markdown_chunker.py

```diff
@@ -1,4 +1,4 @@
-from typing import Generator
+from typing import Generator, List
 from langchain_core.documents import Document
 from langchain_text_splitters import MarkdownHeaderTextSplitter, ExperimentalMarkdownSyntaxTextSplitter
 from langchain.text_splitter import TokenTextSplitter
@@ -7,34 +7,60 @@ from copy import deepcopy as copy
 
 
 def markdown_chunker(file_content_generator: Generator[Document, None, None], config: dict, *args, **kwargs) -> Generator[Document, None, None]:
+    """
+    Chunks markdown documents by headers, with support for:
+    - Minimum chunk size to avoid tiny fragments
+    - Maximum token limit with overflow splitting
+    - Header metadata preservation
+
+    Config options:
+        strip_header (bool): Remove headers from content. Default: False
+        return_each_line (bool): Split on every line. Default: False
+        headers_to_split_on (list): Headers to split on, e.g. [('#', 'H1'), ('##', 'H2')]
+        max_tokens (int): Maximum tokens per chunk. Default: 512
+        token_overlap (int): Token overlap for large chunk splitting. Default: 10
+        min_chunk_chars (int): Minimum characters per chunk. Default: 100
+            Chunks smaller than this will be merged with the next chunk.
+    """
     strip_header = config.get("strip_header", False)
     return_each_line = config.get("return_each_line", False)
     headers_to_split_on = config.get("headers_to_split_on", [])
     max_tokens = config.get("max_tokens", 512)
     tokens_overlapping = config.get("token_overlap", 10)
+    min_chunk_chars = config.get("min_chunk_chars", 100)  # Minimum characters per chunk
+
     headers_to_split_on = [tuple(header) for header in headers_to_split_on]
+
     for doc in file_content_generator:
         doc_metadata = doc.metadata
         doc_content = doc.page_content
         chunk_id = 0
+
         markdown_splitter = MarkdownHeaderTextSplitter(
             headers_to_split_on=headers_to_split_on,
             strip_headers=strip_header,
             return_each_line=return_each_line
         )
         md_header_splits = markdown_splitter.split_text(doc_content)
-
+
+        # Merge small chunks with the next one
+        merged_chunks = _merge_small_chunks(md_header_splits, min_chunk_chars)
+
+        for chunk in merged_chunks:
             if tiktoken_length(chunk.page_content) > max_tokens:
-
-
-
-
+                # Split large chunks into smaller ones
+                for subchunk in TokenTextSplitter(
+                    encoding_name="cl100k_base",
+                    chunk_size=max_tokens,
+                    chunk_overlap=tokens_overlapping
+                ).split_text(chunk.page_content):
                     chunk_id += 1
                     headers_meta = list(chunk.metadata.values())
                     docmeta = copy(doc_metadata)
                     docmeta.update({"headers": "; ".join(headers_meta)})
                     docmeta['chunk_id'] = chunk_id
                     docmeta['chunk_type'] = "document"
+                    docmeta['method_name'] = 'markdown'
                     yield Document(
                         page_content=subchunk,
                         metadata=docmeta
@@ -46,12 +72,77 @@ def markdown_chunker(file_content_generator: Generator[Document, None, None], co
                 docmeta.update({"headers": "; ".join(headers_meta)})
                 docmeta['chunk_id'] = chunk_id
                 docmeta['chunk_type'] = "document"
+                docmeta['method_name'] = 'text'
                 yield Document(
                     page_content=chunk.page_content,
                     metadata=docmeta
                 )
 
 
+def _merge_small_chunks(chunks: List[Document], min_chars: int) -> List[Document]:
+    """
+    Merge chunks that are smaller than min_chars with the next chunk.
+
+    This prevents tiny fragments (like standalone headers or short notes)
+    from becoming separate chunks.
+
+    Args:
+        chunks: List of Document chunks from markdown splitter
+        min_chars: Minimum character count for a chunk
+
+    Returns:
+        List of merged Document chunks
+    """
+    if not chunks:
+        return chunks
+
+    merged = []
+    pending_content = ""
+    pending_metadata = {}
+
+    for i, chunk in enumerate(chunks):
+        content = chunk.page_content.strip()
+
+        if pending_content:
+            # Merge pending content with current chunk
+            combined_content = pending_content + "\n\n" + content
+            # Use the pending metadata (from the header) but can be extended
+            combined_metadata = {**pending_metadata}
+            # Add any new header info from current chunk
+            for key, value in chunk.metadata.items():
+                if key not in combined_metadata or not combined_metadata[key]:
+                    combined_metadata[key] = value
+
+            if len(combined_content) >= min_chars:
+                # Combined is big enough, emit it
+                merged.append(Document(
+                    page_content=combined_content,
+                    metadata=combined_metadata
+                ))
+                pending_content = ""
+                pending_metadata = {}
+            else:
+                # Still too small, keep accumulating
+                pending_content = combined_content
+                pending_metadata = combined_metadata
+        elif len(content) < min_chars:
+            # Current chunk is too small, start pending
+            pending_content = content
+            pending_metadata = dict(chunk.metadata)
+        else:
+            # Current chunk is big enough
+            merged.append(chunk)
+
+    # Don't forget any remaining pending content
+    if pending_content:
+        merged.append(Document(
+            page_content=pending_content,
+            metadata=pending_metadata
+        ))
+
+    return merged
+
+
 def markdown_by_headers_chunker(file_content_generator: Generator[Document, None, None], config: dict, *args, **kwargs) -> Generator[Document, None, None]:
     strip_header = config.get("strip_header", False)
     return_each_line = config.get("return_each_line", False)
```
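For orientation, here is a minimal driver for the updated `markdown_chunker`, assuming the config keys documented in the diff above; the sample document and values are illustrative only, not part of the package:

```python
from langchain_core.documents import Document
from alita_sdk.tools.chunkers.sematic.markdown_chunker import markdown_chunker

# Illustrative config mirroring the docstring in the diff above.
config = {
    "headers_to_split_on": [("#", "Header 1"), ("##", "Header 2")],
    "max_tokens": 512,
    "token_overlap": 10,
    "min_chunk_chars": 100,  # short header-only sections get merged into the next chunk
}

def docs():
    # One synthetic markdown document with a tiny intro and a longer body.
    yield Document(
        page_content="# Title\n\nShort intro.\n\n## Details\n\n" + "Body text. " * 50,
        metadata={"file_path": "README.md"},
    )

for chunk in markdown_chunker(docs(), config):
    print(chunk.metadata["chunk_id"], chunk.metadata.get("headers"), len(chunk.page_content))
```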
alita_sdk/tools/chunkers/sematic/proposal_chunker.py

```diff
@@ -6,7 +6,7 @@ from langchain_core.prompts import ChatPromptTemplate
 from langchain.text_splitter import TokenTextSplitter
 
 from typing import Optional, List
-from
+from pydantic import BaseModel
 from ..utils import tiktoken_length
 
 logger = getLogger(__name__)
```
alita_sdk/tools/chunkers/universal_chunker.py (new file, +270 lines)

```python
"""
Universal Chunker - Routes documents to appropriate chunkers based on file type.

This module provides a universal chunking interface that automatically selects
the appropriate chunking strategy based on the file extension:

- .md, .markdown → Markdown chunker (header-based splitting)
- .py, .js, .ts, .java, etc. → TreeSitter code chunker
- .json → JSON chunker
- other → Default text chunker

Usage:
    from alita_sdk.tools.chunkers.universal_chunker import universal_chunker

    # Chunk documents from a loader
    for chunk in universal_chunker(document_generator, config):
        print(chunk.page_content)
"""

import logging
import os
from typing import Generator, Dict, Any, Optional
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

from .code.codeparser import parse_code_files_for_db
from .sematic.markdown_chunker import markdown_chunker
from .sematic.json_chunker import json_chunker

logger = logging.getLogger(__name__)


# File extension mappings
MARKDOWN_EXTENSIONS = {'.md', '.markdown', '.mdown', '.mkd', '.mdx'}
JSON_EXTENSIONS = {'.json', '.jsonl', '.jsonc'}
CODE_EXTENSIONS = {
    '.py', '.js', '.jsx', '.mjs', '.cjs', '.ts', '.tsx',
    '.java', '.kt', '.rs', '.go', '.cpp', '.c', '.cs',
    '.hs', '.rb', '.scala', '.lua'
}


def get_file_extension(file_path: str) -> str:
    """Extract file extension from path."""
    return os.path.splitext(file_path)[-1].lower()


def get_file_type(file_path: str) -> str:
    """
    Determine the file type category for chunking.

    Returns:
        'markdown', 'json', 'code', or 'text'
    """
    ext = get_file_extension(file_path)

    if ext in MARKDOWN_EXTENSIONS:
        return 'markdown'
    elif ext in JSON_EXTENSIONS:
        return 'json'
    elif ext in CODE_EXTENSIONS:
        return 'code'
    else:
        return 'text'


def _default_text_chunker(
    documents: Generator[Document, None, None],
    config: Dict[str, Any]
) -> Generator[Document, None, None]:
    """
    Default text chunker for unknown file types.
    Uses recursive character splitting.
    """
    chunk_size = config.get('chunk_size', 1000)
    chunk_overlap = config.get('chunk_overlap', 100)

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )

    for doc in documents:
        chunks = splitter.split_documents([doc])
        for idx, chunk in enumerate(chunks, 1):
            chunk.metadata['chunk_id'] = idx
            chunk.metadata['chunk_type'] = 'text'
            chunk.metadata['method_name'] = 'text'
            yield chunk


def _code_chunker_from_documents(
    documents: Generator[Document, None, None],
    config: Dict[str, Any]
) -> Generator[Document, None, None]:
    """
    Adapter to convert Document generator to code parser format.
    """
    def file_content_generator():
        for doc in documents:
            yield {
                'file_name': doc.metadata.get('file_path', doc.metadata.get('filename', 'unknown')),
                'file_content': doc.page_content,
                'commit_hash': doc.metadata.get('commit_hash', ''),
            }

    # parse_code_files_for_db returns chunks with proper metadata
    for chunk in parse_code_files_for_db(file_content_generator()):
        # Ensure file_path is preserved
        if 'file_path' not in chunk.metadata and 'filename' in chunk.metadata:
            chunk.metadata['file_path'] = chunk.metadata['filename']
        yield chunk


def universal_chunker(
    documents: Generator[Document, None, None],
    config: Optional[Dict[str, Any]] = None
) -> Generator[Document, None, None]:
    """
    Universal chunker that routes documents to appropriate chunkers based on file type.

    Each document is inspected for its file extension (from metadata.file_path or
    metadata.file_name) and routed to the appropriate chunker:

    - Markdown files → markdown_chunker (header-based splitting)
    - JSON files → json_chunker (recursive JSON splitting)
    - Code files → code parser (TreeSitter-based parsing)
    - Other files → default text chunker (recursive character splitting)

    Args:
        documents: Generator yielding Document objects with file content
        config: Optional configuration dict with:
            - markdown_config: Config for markdown chunker
            - json_config: Config for JSON chunker
            - code_config: Config for code chunker
            - text_config: Config for default text chunker

    Yields:
        Document objects with chunked content and preserved metadata
    """
    if config is None:
        config = {}

    # Default configs for each chunker type
    markdown_config = config.get('markdown_config', {
        'strip_header': False,
        'return_each_line': False,
        'headers_to_split_on': [
            ('#', 'Header 1'),
            ('##', 'Header 2'),
            ('###', 'Header 3'),
            ('####', 'Header 4'),
        ],
        'max_tokens': 1024,
        'token_overlap': 50,
        'min_chunk_chars': 100,  # Merge chunks smaller than this
    })

    json_config = config.get('json_config', {
        'max_tokens': 512,
    })

    code_config = config.get('code_config', {})

    text_config = config.get('text_config', {
        'chunk_size': 1000,
        'chunk_overlap': 100,
    })

    # Buffer documents by type for batch processing
    # This is more efficient than processing one at a time
    markdown_docs = []
    json_docs = []
    code_docs = []
    text_docs = []

    # Buffer size before flushing
    BUFFER_SIZE = 10

    def flush_markdown():
        if markdown_docs:
            def gen():
                for d in markdown_docs:
                    yield d
            for chunk in markdown_chunker(gen(), markdown_config):
                yield chunk
            markdown_docs.clear()

    def flush_json():
        if json_docs:
            def gen():
                for d in json_docs:
                    yield d
            for chunk in json_chunker(gen(), json_config):
                yield chunk
            json_docs.clear()

    def flush_code():
        if code_docs:
            def gen():
                for d in code_docs:
                    yield d
            for chunk in _code_chunker_from_documents(gen(), code_config):
                yield chunk
            code_docs.clear()

    def flush_text():
        if text_docs:
            def gen():
                for d in text_docs:
                    yield d
            for chunk in _default_text_chunker(gen(), text_config):
                yield chunk
            text_docs.clear()

    for doc in documents:
        # Get file path from metadata
        file_path = (doc.metadata.get('file_path') or
                     doc.metadata.get('file_name') or
                     doc.metadata.get('source') or
                     'unknown')

        # Ensure file_path is in metadata for downstream use
        doc.metadata['file_path'] = file_path

        file_type = get_file_type(file_path)

        if file_type == 'markdown':
            markdown_docs.append(doc)
            if len(markdown_docs) >= BUFFER_SIZE:
                yield from flush_markdown()
        elif file_type == 'json':
            json_docs.append(doc)
            if len(json_docs) >= BUFFER_SIZE:
                yield from flush_json()
        elif file_type == 'code':
            code_docs.append(doc)
            if len(code_docs) >= BUFFER_SIZE:
                yield from flush_code()
        else:
            text_docs.append(doc)
            if len(text_docs) >= BUFFER_SIZE:
                yield from flush_text()

    # Flush remaining documents
    yield from flush_markdown()
    yield from flush_json()
    yield from flush_code()
    yield from flush_text()


def chunk_single_document(
    doc: Document,
    config: Optional[Dict[str, Any]] = None
) -> Generator[Document, None, None]:
    """
    Convenience function to chunk a single document.

    Args:
        doc: Single Document to chunk
        config: Optional chunker configuration

    Yields:
        Chunked Document objects
    """
    def single_doc_gen():
        yield doc

    yield from universal_chunker(single_doc_gen(), config)
```