alita-sdk 0.3.457__py3-none-any.whl → 0.3.486__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of alita-sdk might be problematic; see the registry listing for details.
- alita_sdk/cli/__init__.py +10 -0
- alita_sdk/cli/__main__.py +17 -0
- alita_sdk/cli/agent/__init__.py +5 -0
- alita_sdk/cli/agent/default.py +258 -0
- alita_sdk/cli/agent_executor.py +155 -0
- alita_sdk/cli/agent_loader.py +194 -0
- alita_sdk/cli/agent_ui.py +228 -0
- alita_sdk/cli/agents.py +3592 -0
- alita_sdk/cli/callbacks.py +647 -0
- alita_sdk/cli/cli.py +168 -0
- alita_sdk/cli/config.py +306 -0
- alita_sdk/cli/context/__init__.py +30 -0
- alita_sdk/cli/context/cleanup.py +198 -0
- alita_sdk/cli/context/manager.py +731 -0
- alita_sdk/cli/context/message.py +285 -0
- alita_sdk/cli/context/strategies.py +289 -0
- alita_sdk/cli/context/token_estimation.py +127 -0
- alita_sdk/cli/formatting.py +182 -0
- alita_sdk/cli/input_handler.py +419 -0
- alita_sdk/cli/inventory.py +1256 -0
- alita_sdk/cli/mcp_loader.py +315 -0
- alita_sdk/cli/toolkit.py +327 -0
- alita_sdk/cli/toolkit_loader.py +85 -0
- alita_sdk/cli/tools/__init__.py +43 -0
- alita_sdk/cli/tools/approval.py +224 -0
- alita_sdk/cli/tools/filesystem.py +1665 -0
- alita_sdk/cli/tools/planning.py +389 -0
- alita_sdk/cli/tools/terminal.py +414 -0
- alita_sdk/community/__init__.py +64 -8
- alita_sdk/community/inventory/__init__.py +224 -0
- alita_sdk/community/inventory/config.py +257 -0
- alita_sdk/community/inventory/enrichment.py +2137 -0
- alita_sdk/community/inventory/extractors.py +1469 -0
- alita_sdk/community/inventory/ingestion.py +3172 -0
- alita_sdk/community/inventory/knowledge_graph.py +1457 -0
- alita_sdk/community/inventory/parsers/__init__.py +218 -0
- alita_sdk/community/inventory/parsers/base.py +295 -0
- alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
- alita_sdk/community/inventory/parsers/go_parser.py +851 -0
- alita_sdk/community/inventory/parsers/html_parser.py +389 -0
- alita_sdk/community/inventory/parsers/java_parser.py +593 -0
- alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
- alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
- alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
- alita_sdk/community/inventory/parsers/python_parser.py +604 -0
- alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
- alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
- alita_sdk/community/inventory/parsers/text_parser.py +322 -0
- alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
- alita_sdk/community/inventory/patterns/__init__.py +61 -0
- alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
- alita_sdk/community/inventory/patterns/loader.py +348 -0
- alita_sdk/community/inventory/patterns/registry.py +198 -0
- alita_sdk/community/inventory/presets.py +535 -0
- alita_sdk/community/inventory/retrieval.py +1403 -0
- alita_sdk/community/inventory/toolkit.py +169 -0
- alita_sdk/community/inventory/visualize.py +1370 -0
- alita_sdk/configurations/bitbucket.py +0 -3
- alita_sdk/runtime/clients/client.py +99 -26
- alita_sdk/runtime/langchain/assistant.py +4 -2
- alita_sdk/runtime/langchain/constants.py +2 -1
- alita_sdk/runtime/langchain/langraph_agent.py +134 -31
- alita_sdk/runtime/langchain/utils.py +1 -1
- alita_sdk/runtime/llms/preloaded.py +2 -6
- alita_sdk/runtime/toolkits/__init__.py +2 -0
- alita_sdk/runtime/toolkits/application.py +1 -1
- alita_sdk/runtime/toolkits/mcp.py +46 -36
- alita_sdk/runtime/toolkits/planning.py +171 -0
- alita_sdk/runtime/toolkits/tools.py +39 -6
- alita_sdk/runtime/tools/function.py +17 -5
- alita_sdk/runtime/tools/llm.py +249 -14
- alita_sdk/runtime/tools/planning/__init__.py +36 -0
- alita_sdk/runtime/tools/planning/models.py +246 -0
- alita_sdk/runtime/tools/planning/wrapper.py +607 -0
- alita_sdk/runtime/tools/vectorstore_base.py +41 -6
- alita_sdk/runtime/utils/mcp_oauth.py +80 -0
- alita_sdk/runtime/utils/streamlit.py +6 -10
- alita_sdk/runtime/utils/toolkit_utils.py +19 -4
- alita_sdk/tools/__init__.py +54 -27
- alita_sdk/tools/ado/repos/repos_wrapper.py +1 -2
- alita_sdk/tools/base_indexer_toolkit.py +150 -19
- alita_sdk/tools/bitbucket/__init__.py +2 -2
- alita_sdk/tools/chunkers/__init__.py +3 -1
- alita_sdk/tools/chunkers/sematic/markdown_chunker.py +95 -6
- alita_sdk/tools/chunkers/universal_chunker.py +269 -0
- alita_sdk/tools/code_indexer_toolkit.py +55 -22
- alita_sdk/tools/elitea_base.py +86 -21
- alita_sdk/tools/jira/__init__.py +1 -1
- alita_sdk/tools/jira/api_wrapper.py +91 -40
- alita_sdk/tools/non_code_indexer_toolkit.py +1 -0
- alita_sdk/tools/qtest/__init__.py +1 -1
- alita_sdk/tools/qtest/api_wrapper.py +871 -32
- alita_sdk/tools/sharepoint/api_wrapper.py +22 -2
- alita_sdk/tools/sharepoint/authorization_helper.py +17 -1
- alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +8 -2
- alita_sdk/tools/zephyr_essential/api_wrapper.py +12 -13
- {alita_sdk-0.3.457.dist-info → alita_sdk-0.3.486.dist-info}/METADATA +146 -2
- {alita_sdk-0.3.457.dist-info → alita_sdk-0.3.486.dist-info}/RECORD +102 -40
- alita_sdk-0.3.486.dist-info/entry_points.txt +2 -0
- {alita_sdk-0.3.457.dist-info → alita_sdk-0.3.486.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.457.dist-info → alita_sdk-0.3.486.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.457.dist-info → alita_sdk-0.3.486.dist-info}/top_level.txt +0 -0
alita_sdk/tools/chunkers/universal_chunker.py
ADDED
@@ -0,0 +1,269 @@
+"""
+Universal Chunker - Routes documents to appropriate chunkers based on file type.
+
+This module provides a universal chunking interface that automatically selects
+the appropriate chunking strategy based on the file extension:
+
+- .md, .markdown → Markdown chunker (header-based splitting)
+- .py, .js, .ts, .java, etc. → TreeSitter code chunker
+- .json → JSON chunker
+- other → Default text chunker
+
+Usage:
+    from alita_sdk.tools.chunkers.universal_chunker import universal_chunker
+
+    # Chunk documents from a loader
+    for chunk in universal_chunker(document_generator, config):
+        print(chunk.page_content)
+"""
+
+import logging
+import os
+from typing import Generator, Dict, Any, Optional
+from langchain_core.documents import Document
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+from .code.codeparser import parse_code_files_for_db
+from .sematic.markdown_chunker import markdown_chunker
+from .sematic.json_chunker import json_chunker
+
+logger = logging.getLogger(__name__)
+
+
+# File extension mappings
+MARKDOWN_EXTENSIONS = {'.md', '.markdown', '.mdown', '.mkd', '.mdx'}
+JSON_EXTENSIONS = {'.json', '.jsonl', '.jsonc'}
+CODE_EXTENSIONS = {
+    '.py', '.js', '.jsx', '.mjs', '.cjs', '.ts', '.tsx',
+    '.java', '.kt', '.rs', '.go', '.cpp', '.c', '.cs',
+    '.hs', '.rb', '.scala', '.lua'
+}
+
+
+def get_file_extension(file_path: str) -> str:
+    """Extract file extension from path."""
+    return os.path.splitext(file_path)[-1].lower()
+
+
+def get_file_type(file_path: str) -> str:
+    """
+    Determine the file type category for chunking.
+
+    Returns:
+        'markdown', 'json', 'code', or 'text'
+    """
+    ext = get_file_extension(file_path)
+
+    if ext in MARKDOWN_EXTENSIONS:
+        return 'markdown'
+    elif ext in JSON_EXTENSIONS:
+        return 'json'
+    elif ext in CODE_EXTENSIONS:
+        return 'code'
+    else:
+        return 'text'
+
+
+def _default_text_chunker(
+    documents: Generator[Document, None, None],
+    config: Dict[str, Any]
+) -> Generator[Document, None, None]:
+    """
+    Default text chunker for unknown file types.
+    Uses recursive character splitting.
+    """
+    chunk_size = config.get('chunk_size', 1000)
+    chunk_overlap = config.get('chunk_overlap', 100)
+
+    splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+        length_function=len,
+    )
+
+    for doc in documents:
+        chunks = splitter.split_documents([doc])
+        for idx, chunk in enumerate(chunks, 1):
+            chunk.metadata['chunk_id'] = idx
+            chunk.metadata['chunk_type'] = 'text'
+            yield chunk
+
+
+def _code_chunker_from_documents(
+    documents: Generator[Document, None, None],
+    config: Dict[str, Any]
+) -> Generator[Document, None, None]:
+    """
+    Adapter to convert Document generator to code parser format.
+    """
+    def file_content_generator():
+        for doc in documents:
+            yield {
+                'file_name': doc.metadata.get('file_path', doc.metadata.get('filename', 'unknown')),
+                'file_content': doc.page_content,
+                'commit_hash': doc.metadata.get('commit_hash', ''),
+            }
+
+    # parse_code_files_for_db returns chunks with proper metadata
+    for chunk in parse_code_files_for_db(file_content_generator()):
+        # Ensure file_path is preserved
+        if 'file_path' not in chunk.metadata and 'filename' in chunk.metadata:
+            chunk.metadata['file_path'] = chunk.metadata['filename']
+        yield chunk
+
+
+def universal_chunker(
+    documents: Generator[Document, None, None],
+    config: Optional[Dict[str, Any]] = None
+) -> Generator[Document, None, None]:
+    """
+    Universal chunker that routes documents to appropriate chunkers based on file type.
+
+    Each document is inspected for its file extension (from metadata.file_path or
+    metadata.file_name) and routed to the appropriate chunker:
+
+    - Markdown files → markdown_chunker (header-based splitting)
+    - JSON files → json_chunker (recursive JSON splitting)
+    - Code files → code parser (TreeSitter-based parsing)
+    - Other files → default text chunker (recursive character splitting)
+
+    Args:
+        documents: Generator yielding Document objects with file content
+        config: Optional configuration dict with:
+            - markdown_config: Config for markdown chunker
+            - json_config: Config for JSON chunker
+            - code_config: Config for code chunker
+            - text_config: Config for default text chunker
+
+    Yields:
+        Document objects with chunked content and preserved metadata
+    """
+    if config is None:
+        config = {}
+
+    # Default configs for each chunker type
+    markdown_config = config.get('markdown_config', {
+        'strip_header': False,
+        'return_each_line': False,
+        'headers_to_split_on': [
+            ('#', 'Header 1'),
+            ('##', 'Header 2'),
+            ('###', 'Header 3'),
+            ('####', 'Header 4'),
+        ],
+        'max_tokens': 1024,
+        'token_overlap': 50,
+        'min_chunk_chars': 100,  # Merge chunks smaller than this
+    })
+
+    json_config = config.get('json_config', {
+        'max_tokens': 512,
+    })
+
+    code_config = config.get('code_config', {})
+
+    text_config = config.get('text_config', {
+        'chunk_size': 1000,
+        'chunk_overlap': 100,
+    })
+
+    # Buffer documents by type for batch processing
+    # This is more efficient than processing one at a time
+    markdown_docs = []
+    json_docs = []
+    code_docs = []
+    text_docs = []
+
+    # Buffer size before flushing
+    BUFFER_SIZE = 10
+
+    def flush_markdown():
+        if markdown_docs:
+            def gen():
+                for d in markdown_docs:
+                    yield d
+            for chunk in markdown_chunker(gen(), markdown_config):
+                yield chunk
+            markdown_docs.clear()
+
+    def flush_json():
+        if json_docs:
+            def gen():
+                for d in json_docs:
+                    yield d
+            for chunk in json_chunker(gen(), json_config):
+                yield chunk
+            json_docs.clear()
+
+    def flush_code():
+        if code_docs:
+            def gen():
+                for d in code_docs:
+                    yield d
+            for chunk in _code_chunker_from_documents(gen(), code_config):
+                yield chunk
+            code_docs.clear()
+
+    def flush_text():
+        if text_docs:
+            def gen():
+                for d in text_docs:
+                    yield d
+            for chunk in _default_text_chunker(gen(), text_config):
+                yield chunk
+            text_docs.clear()
+
+    for doc in documents:
+        # Get file path from metadata
+        file_path = (doc.metadata.get('file_path') or
+                     doc.metadata.get('file_name') or
+                     doc.metadata.get('source') or
+                     'unknown')
+
+        # Ensure file_path is in metadata for downstream use
+        doc.metadata['file_path'] = file_path
+
+        file_type = get_file_type(file_path)
+
+        if file_type == 'markdown':
+            markdown_docs.append(doc)
+            if len(markdown_docs) >= BUFFER_SIZE:
+                yield from flush_markdown()
+        elif file_type == 'json':
+            json_docs.append(doc)
+            if len(json_docs) >= BUFFER_SIZE:
+                yield from flush_json()
+        elif file_type == 'code':
+            code_docs.append(doc)
+            if len(code_docs) >= BUFFER_SIZE:
+                yield from flush_code()
+        else:
+            text_docs.append(doc)
+            if len(text_docs) >= BUFFER_SIZE:
+                yield from flush_text()
+
+    # Flush remaining documents
+    yield from flush_markdown()
+    yield from flush_json()
+    yield from flush_code()
+    yield from flush_text()
+
+
+def chunk_single_document(
+    doc: Document,
+    config: Optional[Dict[str, Any]] = None
+) -> Generator[Document, None, None]:
+    """
+    Convenience function to chunk a single document.
+
+    Args:
+        doc: Single Document to chunk
+        config: Optional chunker configuration
+
+    Yields:
+        Chunked Document objects
+    """
+    def single_doc_gen():
+        yield doc
+
+    yield from universal_chunker(single_doc_gen(), config)
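
For orientation, here is a minimal usage sketch of the new module based on its docstring and the default config keys above. The sample documents and file paths are invented for illustration, and the sketch assumes the alita-sdk package and its langchain dependencies are installed.

from langchain_core.documents import Document
from alita_sdk.tools.chunkers.universal_chunker import universal_chunker

def docs():
    # Hypothetical inputs; metadata['file_path'] drives routing (markdown/json/code/text).
    yield Document(page_content="# Title\n\nSome intro text.", metadata={'file_path': 'README.md'})
    yield Document(page_content="def add(a, b):\n    return a + b\n", metadata={'file_path': 'utils.py'})
    yield Document(page_content="plain notes with no special extension", metadata={'file_path': 'notes.txt'})

config = {
    # A per-type config replaces the corresponding default block shown above (it is not merged).
    'text_config': {'chunk_size': 800, 'chunk_overlap': 80},
}

for chunk in universal_chunker(docs(), config):
    print(chunk.metadata.get('file_path'), len(chunk.page_content))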
alita_sdk/tools/code_indexer_toolkit.py
CHANGED
@@ -9,13 +9,13 @@ from langchain_core.tools import ToolException
 from pydantic import Field

 from alita_sdk.tools.base_indexer_toolkit import BaseIndexerToolkit
-from .chunkers.code.codeparser import parse_code_files_for_db

 logger = logging.getLogger(__name__)


 class CodeIndexerToolkit(BaseIndexerToolkit):
     def _get_indexed_data(self, index_name: str):
+        self._ensure_vectorstore_initialized()
         if not self.vector_adapter:
             raise ToolException("Vector adapter is not initialized. "
                                 "Check your configuration: embedding_model and vectorstore_type.")
@@ -66,26 +66,40 @@ class CodeIndexerToolkit(BaseIndexerToolkit):
     def loader(self,
                branch: Optional[str] = None,
                whitelist: Optional[List[str]] = None,
-               blacklist: Optional[List[str]] = None
+               blacklist: Optional[List[str]] = None,
+               chunked: bool = True) -> Generator[Document, None, None]:
         """
-        Generates
+        Generates Documents from files in a branch, respecting whitelist and blacklist patterns.

         Parameters:
         - branch (Optional[str]): Branch for listing files. Defaults to the current branch if None.
         - whitelist (Optional[List[str]]): File extensions or paths to include. Defaults to all files if None.
         - blacklist (Optional[List[str]]): File extensions or paths to exclude. Defaults to no exclusions if None.
+        - chunked (bool): If True (default), applies universal chunker based on file type.
+          If False, returns raw Documents without chunking.

         Returns:
-        - generator: Yields
+        - generator: Yields Documents from files matching the whitelist but not the blacklist.
+          Each document has exactly the key 'filename' in metadata, which is used as an ID
+          for further operations (indexing, deduplication, and retrieval).

         Example:
         # Use 'feature-branch', include '.py' files, exclude 'test_' files
-
+        for doc in loader(branch='feature-branch', whitelist=['*.py'], blacklist=['*test_*']):
+            print(doc.page_content)

         Notes:
         - Whitelist and blacklist use Unix shell-style wildcards.
         - Files must match the whitelist and not the blacklist to be included.
+        - Each document MUST have exactly the key 'filename' in metadata. This key is used as an ID
+          for further operations such as indexing, deduplication, and retrieval.
+        - When chunked=True:
+          - .md files → markdown chunker (header-based splitting)
+          - .py/.js/.ts/etc → code parser (TreeSitter-based)
+          - .json files → JSON chunker
+          - other files → default text chunker
         """
+        import hashlib

         _files = self.__handle_get_files("", self.__get_branch(branch))
         self._log_tool_event(message="Listing files in branch", tool_name="loader")
@@ -103,41 +117,60 @@ class CodeIndexerToolkit(BaseIndexerToolkit):
                     or any(file_path.endswith(f'.{pattern}') for pattern in blacklist))
             return False

-        def
+        def raw_document_generator() -> Generator[Document, None, None]:
+            """Yields raw Documents without chunking."""
             self._log_tool_event(message="Reading the files", tool_name="loader")
-            # log the progress of file reading
             total_files = len(_files)
+            processed = 0
+
             for idx, file in enumerate(_files, 1):
                 if is_whitelisted(file) and not is_blacklisted(file):
-                    # read file ONLY if it matches whitelist and does not match blacklist
                     try:
                         file_content = self._read_file(file, self.__get_branch(branch))
                     except Exception as e:
                         logger.error(f"Failed to read file {file}: {e}")
-
+                        continue
+
                     if not file_content:
-                        # empty file, skip
                         continue
-
-                    #
+
+                    # Ensure file content is a string
                     if isinstance(file_content, bytes):
                         file_content = file_content.decode("utf-8", errors="ignore")
                     elif isinstance(file_content, dict) and file.endswith('.json'):
                         file_content = json.dumps(file_content)
                     elif not isinstance(file_content, str):
                         file_content = str(file_content)
-
-                    #
-                    import hashlib
+
+                    # Hash the file content for uniqueness tracking
                     file_hash = hashlib.sha256(file_content.encode("utf-8")).hexdigest()
-
-
-
+                    processed += 1
+
+                    yield Document(
+                        page_content=file_content,
+                        metadata={
+                            'file_path': file,
+                            'filename': file,
+                            'source': file,
+                            'commit_hash': file_hash,
+                        }
+                    )
+
                 if idx % 10 == 0 or idx == total_files:
-                    self._log_tool_event(
-
-
-
+                    self._log_tool_event(
+                        message=f"{idx} out of {total_files} files checked, {processed} matched",
+                        tool_name="loader"
+                    )
+
+            self._log_tool_event(message=f"{processed} files loaded", tool_name="loader")
+
+        if not chunked:
+            # Return raw documents without chunking
+            return raw_document_generator()
+
+        # Apply universal chunker based on file type
+        from .chunkers.universal_chunker import universal_chunker
+        return universal_chunker(raw_document_generator())

     def __handle_get_files(self, path: str, branch: str):
         """
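
A rough consumption sketch of the reworked loader follows. It assumes `toolkit` is an already-configured CodeIndexerToolkit instance (construction is outside this diff); only the call pattern and the metadata keys shown in the diff are illustrated.

# `toolkit` is assumed to be a configured CodeIndexerToolkit; its setup is not part of this diff.
# chunked=False yields one raw Document per matched file, keyed by metadata['filename'].
for doc in toolkit.loader(branch='main', whitelist=['*.py'], blacklist=['*test_*'], chunked=False):
    print(doc.metadata['filename'], doc.metadata['commit_hash'][:8])

# chunked=True (the default) routes the same raw documents through universal_chunker,
# so markdown, JSON, and code files come back split by the appropriate strategy.
for chunk in toolkit.loader(branch='main', whitelist=['*.md']):
    print(chunk.metadata.get('file_path'), len(chunk.page_content))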
alita_sdk/tools/elitea_base.py
CHANGED
@@ -128,12 +128,37 @@ BaseIndexDataParams = create_model(


 class BaseToolApiWrapper(BaseModel):
-
+
+    # Optional RunnableConfig for CLI/standalone usage (allows dispatch_custom_event to work)
+    _runnable_config: Optional[Dict[str, Any]] = None
+    # toolkit id propagated from backend
+    toolkit_id: int = 0
     def get_available_tools(self):
         raise NotImplementedError("Subclasses should implement this method")

-    def
-        """
+    def set_runnable_config(self, config: Optional[Dict[str, Any]]) -> None:
+        """
+        Set the RunnableConfig for dispatching custom events.
+
+        This is required when running outside of a LangChain agent context
+        (e.g., from CLI). Without a config containing a run_id,
+        dispatch_custom_event will fail with "Unable to dispatch an adhoc event
+        without a parent run id".
+
+        Args:
+            config: A RunnableConfig dict with at least {'run_id': uuid}
+        """
+        self._runnable_config = config
+
+    def _log_tool_event(self, message: str, tool_name: str = None, config: Optional[Dict[str, Any]] = None):
+        """Log data and dispatch custom event for the tool.
+
+        Args:
+            message: The message to log
+            tool_name: Name of the tool (defaults to 'tool_progress')
+            config: Optional RunnableConfig. If not provided, uses self._runnable_config.
+                Required when running outside a LangChain agent context.
+        """

         try:
             from langchain_core.callbacks import dispatch_custom_event
@@ -142,6 +167,10 @@ class BaseToolApiWrapper(BaseModel):
                 tool_name = 'tool_progress'

             logger.info(message)
+
+            # Use provided config, fall back to instance config
+            effective_config = config or self._runnable_config
+
             dispatch_custom_event(
                 name="thinking_step",
                 data={
@@ -149,6 +178,7 @@ class BaseToolApiWrapper(BaseModel):
                     "tool_name": tool_name,
                     "toolkit": self.__class__.__name__,
                 },
+                config=effective_config,
             )
         except Exception as e:
             logger.warning(f"Failed to dispatch progress event: {str(e)}")
@@ -165,6 +195,11 @@ class BaseToolApiWrapper(BaseModel):
             # execution = str(execution)
             return execution
         except Exception as e:
+            # Re-raise McpAuthorizationRequired directly without wrapping
+            from alita_sdk.runtime.utils.mcp_oauth import McpAuthorizationRequired
+            if isinstance(e, McpAuthorizationRequired):
+                raise
+
             # Catch all tool execution exceptions and provide user-friendly error messages
             error_type = type(e).__name__
             error_message = str(e)
@@ -589,27 +624,37 @@ class BaseCodeToolApiWrapper(BaseVectorStoreToolApiWrapper):
     def loader(self,
                branch: Optional[str] = None,
                whitelist: Optional[List[str]] = None,
-               blacklist: Optional[List[str]] = None
+               blacklist: Optional[List[str]] = None,
+               chunked: bool = True) -> Generator[Document, None, None]:
         """
-        Generates
+        Generates Documents from files in a branch, respecting whitelist and blacklist patterns.

         Parameters:
         - branch (Optional[str]): Branch for listing files. Defaults to the current branch if None.
         - whitelist (Optional[List[str]]): File extensions or paths to include. Defaults to all files if None.
         - blacklist (Optional[List[str]]): File extensions or paths to exclude. Defaults to no exclusions if None.
+        - chunked (bool): If True (default), applies universal chunker based on file type.
+          If False, returns raw Documents without chunking.

         Returns:
-        - generator: Yields
+        - generator: Yields Documents from files matching the whitelist but not the blacklist.

         Example:
         # Use 'feature-branch', include '.py' files, exclude 'test_' files
-
+        for doc in loader(branch='feature-branch', whitelist=['*.py'], blacklist=['*test_*']):
+            print(doc.page_content)

         Notes:
         - Whitelist and blacklist use Unix shell-style wildcards.
         - Files must match the whitelist and not the blacklist to be included.
+        - When chunked=True:
+          - .md files → markdown chunker (header-based splitting)
+          - .py/.js/.ts/etc → code parser (TreeSitter-based)
+          - .json files → JSON chunker
+          - other files → default text chunker
         """
-        from .
+        from langchain_core.documents import Document
+        import hashlib

         _files = self.__handle_get_files("", self.__get_branch(branch))
         self._log_tool_event(message="Listing files in branch", tool_name="loader")
@@ -627,32 +672,52 @@ class BaseCodeToolApiWrapper(BaseVectorStoreToolApiWrapper):
                     or any(file_path.endswith(f'.{pattern}') for pattern in blacklist))
             return False

-        def
+        def raw_document_generator() -> Generator[Document, None, None]:
+            """Yields raw Documents without chunking."""
             self._log_tool_event(message="Reading the files", tool_name="loader")
-            # log the progress of file reading
             total_files = len(_files)
+            processed = 0
+
             for idx, file in enumerate(_files, 1):
                 if is_whitelisted(file) and not is_blacklisted(file):
-                    # read file ONLY if it matches whitelist and does not match blacklist
                     try:
                         file_content = self._read_file(file, self.__get_branch(branch))
                     except Exception as e:
                         logger.error(f"Failed to read file {file}: {e}")
-
+                        continue
+
                     if not file_content:
-                        # empty file, skip
                         continue
-
-
+
+                    # Hash the file content for uniqueness tracking
                     file_hash = hashlib.sha256(file_content.encode("utf-8")).hexdigest()
-
-
-
+                    processed += 1
+
+                    yield Document(
+                        page_content=file_content,
+                        metadata={
+                            'file_path': file,
+                            'file_name': file,
+                            'source': file,
+                            'commit_hash': file_hash,
+                        }
+                    )
+
                 if idx % 10 == 0 or idx == total_files:
-                    self._log_tool_event(
-
+                    self._log_tool_event(
+                        message=f"{idx} out of {total_files} files checked, {processed} matched",
+                        tool_name="loader"
+                    )
+
+            self._log_tool_event(message=f"{processed} files loaded", tool_name="loader")

-
+        if not chunked:
+            # Return raw documents without chunking
+            return raw_document_generator()
+
+        # Apply universal chunker based on file type
+        from .chunkers.universal_chunker import universal_chunker
+        return universal_chunker(raw_document_generator())

     def index_data(self,
                    index_name: str,
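
For standalone or CLI use, the new set_runnable_config hook is what lets progress events dispatch without a parent agent run. A minimal sketch, assuming `wrapper` is an instance of some concrete BaseToolApiWrapper subclass:

import uuid

# Outside a LangChain agent there is no parent run, so dispatch_custom_event would fail;
# per the docstring above, a config with at least a run_id is enough.
wrapper.set_runnable_config({'run_id': uuid.uuid4()})

# Later tool calls can now report progress: _log_tool_event falls back to this
# instance-level config whenever no per-call config is passed.
wrapper._log_tool_event(message="indexing started", tool_name="loader")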
alita_sdk/tools/jira/__init__.py
CHANGED
@@ -68,7 +68,7 @@ class JiraToolkit(BaseToolkit):
             name,
             cloud=(bool, Field(description="Hosting Option", json_schema_extra={'configuration': True})),
             limit=(int, Field(description="Limit issues. Default is 5", gt=0, default=5)),
-            api_version=(
+            api_version=(Literal['2', '3'], Field(description="Rest API version: optional. Default is 2", default="3")),
             labels=(Optional[str], Field(
                 description="List of comma separated labels used for labeling of agent's created or updated entities",
                 default=None,
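
Since api_version is now typed as Literal['2', '3'], pydantic rejects any other value at configuration time. A small standalone sketch mirroring just that field (not the SDK's full schema; note the field description still says the default is 2 while the code default is '3'):

from typing import Literal
from pydantic import Field, ValidationError, create_model

# Hypothetical model name; only the api_version field definition is copied from the diff.
JiraConfigSketch = create_model(
    'JiraConfigSketch',
    api_version=(Literal['2', '3'], Field(description="Rest API version: optional. Default is 2", default="3")),
)

print(JiraConfigSketch().api_version)                 # '3' (code default)
print(JiraConfigSketch(api_version='2').api_version)  # '2'
try:
    JiraConfigSketch(api_version='4')                 # not one of '2', '3'
except ValidationError as e:
    print("rejected:", e.errors()[0]['type'])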