alita-sdk 0.3.465__py3-none-any.whl → 0.3.497__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of alita-sdk might be problematic.
- alita_sdk/cli/agent/__init__.py +5 -0
- alita_sdk/cli/agent/default.py +83 -1
- alita_sdk/cli/agent_loader.py +22 -4
- alita_sdk/cli/agent_ui.py +13 -3
- alita_sdk/cli/agents.py +1876 -186
- alita_sdk/cli/callbacks.py +96 -25
- alita_sdk/cli/cli.py +10 -1
- alita_sdk/cli/config.py +151 -9
- alita_sdk/cli/context/__init__.py +30 -0
- alita_sdk/cli/context/cleanup.py +198 -0
- alita_sdk/cli/context/manager.py +731 -0
- alita_sdk/cli/context/message.py +285 -0
- alita_sdk/cli/context/strategies.py +289 -0
- alita_sdk/cli/context/token_estimation.py +127 -0
- alita_sdk/cli/input_handler.py +167 -4
- alita_sdk/cli/inventory.py +1256 -0
- alita_sdk/cli/toolkit.py +14 -17
- alita_sdk/cli/toolkit_loader.py +35 -5
- alita_sdk/cli/tools/__init__.py +8 -1
- alita_sdk/cli/tools/filesystem.py +910 -64
- alita_sdk/cli/tools/planning.py +143 -157
- alita_sdk/cli/tools/terminal.py +154 -20
- alita_sdk/community/__init__.py +64 -8
- alita_sdk/community/inventory/__init__.py +224 -0
- alita_sdk/community/inventory/config.py +257 -0
- alita_sdk/community/inventory/enrichment.py +2137 -0
- alita_sdk/community/inventory/extractors.py +1469 -0
- alita_sdk/community/inventory/ingestion.py +3172 -0
- alita_sdk/community/inventory/knowledge_graph.py +1457 -0
- alita_sdk/community/inventory/parsers/__init__.py +218 -0
- alita_sdk/community/inventory/parsers/base.py +295 -0
- alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
- alita_sdk/community/inventory/parsers/go_parser.py +851 -0
- alita_sdk/community/inventory/parsers/html_parser.py +389 -0
- alita_sdk/community/inventory/parsers/java_parser.py +593 -0
- alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
- alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
- alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
- alita_sdk/community/inventory/parsers/python_parser.py +604 -0
- alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
- alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
- alita_sdk/community/inventory/parsers/text_parser.py +322 -0
- alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
- alita_sdk/community/inventory/patterns/__init__.py +61 -0
- alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
- alita_sdk/community/inventory/patterns/loader.py +348 -0
- alita_sdk/community/inventory/patterns/registry.py +198 -0
- alita_sdk/community/inventory/presets.py +535 -0
- alita_sdk/community/inventory/retrieval.py +1403 -0
- alita_sdk/community/inventory/toolkit.py +169 -0
- alita_sdk/community/inventory/visualize.py +1370 -0
- alita_sdk/configurations/bitbucket.py +0 -3
- alita_sdk/runtime/clients/client.py +108 -31
- alita_sdk/runtime/langchain/assistant.py +4 -2
- alita_sdk/runtime/langchain/constants.py +3 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +103 -60
- alita_sdk/runtime/langchain/document_loaders/constants.py +10 -6
- alita_sdk/runtime/langchain/langraph_agent.py +123 -31
- alita_sdk/runtime/llms/preloaded.py +2 -6
- alita_sdk/runtime/toolkits/__init__.py +2 -0
- alita_sdk/runtime/toolkits/application.py +1 -1
- alita_sdk/runtime/toolkits/mcp.py +107 -91
- alita_sdk/runtime/toolkits/planning.py +173 -0
- alita_sdk/runtime/toolkits/tools.py +59 -7
- alita_sdk/runtime/tools/artifact.py +46 -17
- alita_sdk/runtime/tools/function.py +2 -1
- alita_sdk/runtime/tools/llm.py +320 -32
- alita_sdk/runtime/tools/mcp_remote_tool.py +23 -7
- alita_sdk/runtime/tools/planning/__init__.py +36 -0
- alita_sdk/runtime/tools/planning/models.py +246 -0
- alita_sdk/runtime/tools/planning/wrapper.py +607 -0
- alita_sdk/runtime/tools/vectorstore_base.py +44 -9
- alita_sdk/runtime/utils/AlitaCallback.py +106 -20
- alita_sdk/runtime/utils/mcp_client.py +465 -0
- alita_sdk/runtime/utils/mcp_oauth.py +80 -0
- alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
- alita_sdk/runtime/utils/streamlit.py +6 -10
- alita_sdk/runtime/utils/toolkit_utils.py +14 -5
- alita_sdk/tools/__init__.py +54 -27
- alita_sdk/tools/ado/repos/repos_wrapper.py +1 -2
- alita_sdk/tools/base_indexer_toolkit.py +99 -20
- alita_sdk/tools/bitbucket/__init__.py +2 -2
- alita_sdk/tools/chunkers/__init__.py +3 -1
- alita_sdk/tools/chunkers/sematic/json_chunker.py +1 -0
- alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
- alita_sdk/tools/chunkers/universal_chunker.py +270 -0
- alita_sdk/tools/code/loaders/codesearcher.py +3 -2
- alita_sdk/tools/code_indexer_toolkit.py +55 -22
- alita_sdk/tools/confluence/api_wrapper.py +63 -14
- alita_sdk/tools/elitea_base.py +86 -21
- alita_sdk/tools/jira/__init__.py +1 -1
- alita_sdk/tools/jira/api_wrapper.py +91 -40
- alita_sdk/tools/non_code_indexer_toolkit.py +1 -0
- alita_sdk/tools/qtest/__init__.py +1 -1
- alita_sdk/tools/sharepoint/api_wrapper.py +2 -2
- alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +17 -13
- alita_sdk/tools/zephyr_essential/api_wrapper.py +12 -13
- {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.497.dist-info}/METADATA +2 -1
- {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.497.dist-info}/RECORD +103 -61
- {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.497.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.497.dist-info}/entry_points.txt +0 -0
- {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.497.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.497.dist-info}/top_level.txt +0 -0

alita_sdk/tools/chunkers/sematic/markdown_chunker.py:

@@ -1,4 +1,4 @@
-from typing import Generator
+from typing import Generator, List
 from langchain_core.documents import Document
 from langchain_text_splitters import MarkdownHeaderTextSplitter, ExperimentalMarkdownSyntaxTextSplitter
 from langchain.text_splitter import TokenTextSplitter
@@ -7,34 +7,60 @@ from copy import deepcopy as copy
 
 
 def markdown_chunker(file_content_generator: Generator[Document, None, None], config: dict, *args, **kwargs) -> Generator[Document, None, None]:
+    """
+    Chunks markdown documents by headers, with support for:
+    - Minimum chunk size to avoid tiny fragments
+    - Maximum token limit with overflow splitting
+    - Header metadata preservation
+
+    Config options:
+        strip_header (bool): Remove headers from content. Default: False
+        return_each_line (bool): Split on every line. Default: False
+        headers_to_split_on (list): Headers to split on, e.g. [('#', 'H1'), ('##', 'H2')]
+        max_tokens (int): Maximum tokens per chunk. Default: 512
+        token_overlap (int): Token overlap for large chunk splitting. Default: 10
+        min_chunk_chars (int): Minimum characters per chunk. Default: 100
+            Chunks smaller than this will be merged with the next chunk.
+    """
     strip_header = config.get("strip_header", False)
     return_each_line = config.get("return_each_line", False)
     headers_to_split_on = config.get("headers_to_split_on", [])
     max_tokens = config.get("max_tokens", 512)
     tokens_overlapping = config.get("token_overlap", 10)
+    min_chunk_chars = config.get("min_chunk_chars", 100)  # Minimum characters per chunk
+
     headers_to_split_on = [tuple(header) for header in headers_to_split_on]
+
     for doc in file_content_generator:
         doc_metadata = doc.metadata
         doc_content = doc.page_content
         chunk_id = 0
+
         markdown_splitter = MarkdownHeaderTextSplitter(
             headers_to_split_on=headers_to_split_on,
             strip_headers=strip_header,
             return_each_line=return_each_line
         )
         md_header_splits = markdown_splitter.split_text(doc_content)
-
+
+        # Merge small chunks with the next one
+        merged_chunks = _merge_small_chunks(md_header_splits, min_chunk_chars)
+
+        for chunk in merged_chunks:
             if tiktoken_length(chunk.page_content) > max_tokens:
-
-
-
-
+                # Split large chunks into smaller ones
+                for subchunk in TokenTextSplitter(
+                    encoding_name="cl100k_base",
+                    chunk_size=max_tokens,
+                    chunk_overlap=tokens_overlapping
+                ).split_text(chunk.page_content):
                     chunk_id += 1
                     headers_meta = list(chunk.metadata.values())
                     docmeta = copy(doc_metadata)
                     docmeta.update({"headers": "; ".join(headers_meta)})
                     docmeta['chunk_id'] = chunk_id
                     docmeta['chunk_type'] = "document"
+                    docmeta['method_name'] = 'markdown'
                     yield Document(
                         page_content=subchunk,
                         metadata=docmeta
@@ -46,12 +72,77 @@ def markdown_chunker(file_content_generator: Generator[Document, None, None], co
                 docmeta.update({"headers": "; ".join(headers_meta)})
                 docmeta['chunk_id'] = chunk_id
                 docmeta['chunk_type'] = "document"
+                docmeta['method_name'] = 'text'
                 yield Document(
                     page_content=chunk.page_content,
                     metadata=docmeta
                 )
 
 
+def _merge_small_chunks(chunks: List[Document], min_chars: int) -> List[Document]:
+    """
+    Merge chunks that are smaller than min_chars with the next chunk.
+
+    This prevents tiny fragments (like standalone headers or short notes)
+    from becoming separate chunks.
+
+    Args:
+        chunks: List of Document chunks from markdown splitter
+        min_chars: Minimum character count for a chunk
+
+    Returns:
+        List of merged Document chunks
+    """
+    if not chunks:
+        return chunks
+
+    merged = []
+    pending_content = ""
+    pending_metadata = {}
+
+    for i, chunk in enumerate(chunks):
+        content = chunk.page_content.strip()
+
+        if pending_content:
+            # Merge pending content with current chunk
+            combined_content = pending_content + "\n\n" + content
+            # Use the pending metadata (from the header) but can be extended
+            combined_metadata = {**pending_metadata}
+            # Add any new header info from current chunk
+            for key, value in chunk.metadata.items():
+                if key not in combined_metadata or not combined_metadata[key]:
+                    combined_metadata[key] = value
+
+            if len(combined_content) >= min_chars:
+                # Combined is big enough, emit it
+                merged.append(Document(
+                    page_content=combined_content,
+                    metadata=combined_metadata
+                ))
+                pending_content = ""
+                pending_metadata = {}
+            else:
+                # Still too small, keep accumulating
+                pending_content = combined_content
+                pending_metadata = combined_metadata
+        elif len(content) < min_chars:
+            # Current chunk is too small, start pending
+            pending_content = content
+            pending_metadata = dict(chunk.metadata)
+        else:
+            # Current chunk is big enough
+            merged.append(chunk)
+
+    # Don't forget any remaining pending content
+    if pending_content:
+        merged.append(Document(
+            page_content=pending_content,
+            metadata=pending_metadata
+        ))
+
+    return merged
+
+
 def markdown_by_headers_chunker(file_content_generator: Generator[Document, None, None], config: dict, *args, **kwargs) -> Generator[Document, None, None]:
     strip_header = config.get("strip_header", False)
     return_each_line = config.get("return_each_line", False)
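
For orientation, a minimal sketch of how the new min_chunk_chars option is meant to be driven; the sample document and config values are illustrative, and the import path is inferred from the file list above rather than shown in the diff.

    from langchain_core.documents import Document
    from alita_sdk.tools.chunkers.sematic.markdown_chunker import markdown_chunker

    def docs():
        # A markdown file whose first section is very short; with min_chunk_chars=100
        # the tiny fragment should be merged into the following chunk instead of
        # being emitted on its own.
        body = "# Notes\n\nTBD\n\n## Setup\n\n" + "Install the package and set the API token. " * 10
        yield Document(page_content=body, metadata={"source": "README.md"})

    config = {
        "headers_to_split_on": [["#", "H1"], ["##", "H2"]],
        "max_tokens": 512,
        "token_overlap": 10,
        "min_chunk_chars": 100,
    }

    for chunk in markdown_chunker(docs(), config):
        print(chunk.metadata.get("headers"), len(chunk.page_content))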
alita_sdk/tools/chunkers/universal_chunker.py:

@@ -0,0 +1,270 @@
+"""
+Universal Chunker - Routes documents to appropriate chunkers based on file type.
+
+This module provides a universal chunking interface that automatically selects
+the appropriate chunking strategy based on the file extension:
+
+- .md, .markdown → Markdown chunker (header-based splitting)
+- .py, .js, .ts, .java, etc. → TreeSitter code chunker
+- .json → JSON chunker
+- other → Default text chunker
+
+Usage:
+    from alita_sdk.tools.chunkers.universal_chunker import universal_chunker
+
+    # Chunk documents from a loader
+    for chunk in universal_chunker(document_generator, config):
+        print(chunk.page_content)
+"""
+
+import logging
+import os
+from typing import Generator, Dict, Any, Optional
+from langchain_core.documents import Document
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+from .code.codeparser import parse_code_files_for_db
+from .sematic.markdown_chunker import markdown_chunker
+from .sematic.json_chunker import json_chunker
+
+logger = logging.getLogger(__name__)
+
+
+# File extension mappings
+MARKDOWN_EXTENSIONS = {'.md', '.markdown', '.mdown', '.mkd', '.mdx'}
+JSON_EXTENSIONS = {'.json', '.jsonl', '.jsonc'}
+CODE_EXTENSIONS = {
+    '.py', '.js', '.jsx', '.mjs', '.cjs', '.ts', '.tsx',
+    '.java', '.kt', '.rs', '.go', '.cpp', '.c', '.cs',
+    '.hs', '.rb', '.scala', '.lua'
+}
+
+
+def get_file_extension(file_path: str) -> str:
+    """Extract file extension from path."""
+    return os.path.splitext(file_path)[-1].lower()
+
+
+def get_file_type(file_path: str) -> str:
+    """
+    Determine the file type category for chunking.
+
+    Returns:
+        'markdown', 'json', 'code', or 'text'
+    """
+    ext = get_file_extension(file_path)
+
+    if ext in MARKDOWN_EXTENSIONS:
+        return 'markdown'
+    elif ext in JSON_EXTENSIONS:
+        return 'json'
+    elif ext in CODE_EXTENSIONS:
+        return 'code'
+    else:
+        return 'text'
+
+
+def _default_text_chunker(
+    documents: Generator[Document, None, None],
+    config: Dict[str, Any]
+) -> Generator[Document, None, None]:
+    """
+    Default text chunker for unknown file types.
+    Uses recursive character splitting.
+    """
+    chunk_size = config.get('chunk_size', 1000)
+    chunk_overlap = config.get('chunk_overlap', 100)
+
+    splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+        length_function=len,
+    )
+
+    for doc in documents:
+        chunks = splitter.split_documents([doc])
+        for idx, chunk in enumerate(chunks, 1):
+            chunk.metadata['chunk_id'] = idx
+            chunk.metadata['chunk_type'] = 'text'
+            chunk.metadata['method_name'] = 'text'
+            yield chunk
+
+
+def _code_chunker_from_documents(
+    documents: Generator[Document, None, None],
+    config: Dict[str, Any]
+) -> Generator[Document, None, None]:
+    """
+    Adapter to convert Document generator to code parser format.
+    """
+    def file_content_generator():
+        for doc in documents:
+            yield {
+                'file_name': doc.metadata.get('file_path', doc.metadata.get('filename', 'unknown')),
+                'file_content': doc.page_content,
+                'commit_hash': doc.metadata.get('commit_hash', ''),
+            }
+
+    # parse_code_files_for_db returns chunks with proper metadata
+    for chunk in parse_code_files_for_db(file_content_generator()):
+        # Ensure file_path is preserved
+        if 'file_path' not in chunk.metadata and 'filename' in chunk.metadata:
+            chunk.metadata['file_path'] = chunk.metadata['filename']
+        yield chunk
+
+
+def universal_chunker(
+    documents: Generator[Document, None, None],
+    config: Optional[Dict[str, Any]] = None
+) -> Generator[Document, None, None]:
+    """
+    Universal chunker that routes documents to appropriate chunkers based on file type.
+
+    Each document is inspected for its file extension (from metadata.file_path or
+    metadata.file_name) and routed to the appropriate chunker:
+
+    - Markdown files → markdown_chunker (header-based splitting)
+    - JSON files → json_chunker (recursive JSON splitting)
+    - Code files → code parser (TreeSitter-based parsing)
+    - Other files → default text chunker (recursive character splitting)
+
+    Args:
+        documents: Generator yielding Document objects with file content
+        config: Optional configuration dict with:
+            - markdown_config: Config for markdown chunker
+            - json_config: Config for JSON chunker
+            - code_config: Config for code chunker
+            - text_config: Config for default text chunker
+
+    Yields:
+        Document objects with chunked content and preserved metadata
+    """
+    if config is None:
+        config = {}
+
+    # Default configs for each chunker type
+    markdown_config = config.get('markdown_config', {
+        'strip_header': False,
+        'return_each_line': False,
+        'headers_to_split_on': [
+            ('#', 'Header 1'),
+            ('##', 'Header 2'),
+            ('###', 'Header 3'),
+            ('####', 'Header 4'),
+        ],
+        'max_tokens': 1024,
+        'token_overlap': 50,
+        'min_chunk_chars': 100,  # Merge chunks smaller than this
+    })
+
+    json_config = config.get('json_config', {
+        'max_tokens': 512,
+    })
+
+    code_config = config.get('code_config', {})
+
+    text_config = config.get('text_config', {
+        'chunk_size': 1000,
+        'chunk_overlap': 100,
+    })
+
+    # Buffer documents by type for batch processing
+    # This is more efficient than processing one at a time
+    markdown_docs = []
+    json_docs = []
+    code_docs = []
+    text_docs = []
+
+    # Buffer size before flushing
+    BUFFER_SIZE = 10
+
+    def flush_markdown():
+        if markdown_docs:
+            def gen():
+                for d in markdown_docs:
+                    yield d
+            for chunk in markdown_chunker(gen(), markdown_config):
+                yield chunk
+            markdown_docs.clear()
+
+    def flush_json():
+        if json_docs:
+            def gen():
+                for d in json_docs:
+                    yield d
+            for chunk in json_chunker(gen(), json_config):
+                yield chunk
+            json_docs.clear()
+
+    def flush_code():
+        if code_docs:
+            def gen():
+                for d in code_docs:
+                    yield d
+            for chunk in _code_chunker_from_documents(gen(), code_config):
+                yield chunk
+            code_docs.clear()
+
+    def flush_text():
+        if text_docs:
+            def gen():
+                for d in text_docs:
+                    yield d
+            for chunk in _default_text_chunker(gen(), text_config):
+                yield chunk
+            text_docs.clear()
+
+    for doc in documents:
+        # Get file path from metadata
+        file_path = (doc.metadata.get('file_path') or
+                     doc.metadata.get('file_name') or
+                     doc.metadata.get('source') or
+                     'unknown')
+
+        # Ensure file_path is in metadata for downstream use
+        doc.metadata['file_path'] = file_path
+
+        file_type = get_file_type(file_path)
+
+        if file_type == 'markdown':
+            markdown_docs.append(doc)
+            if len(markdown_docs) >= BUFFER_SIZE:
+                yield from flush_markdown()
+        elif file_type == 'json':
+            json_docs.append(doc)
+            if len(json_docs) >= BUFFER_SIZE:
+                yield from flush_json()
+        elif file_type == 'code':
+            code_docs.append(doc)
+            if len(code_docs) >= BUFFER_SIZE:
+                yield from flush_code()
+        else:
+            text_docs.append(doc)
+            if len(text_docs) >= BUFFER_SIZE:
+                yield from flush_text()
+
+    # Flush remaining documents
+    yield from flush_markdown()
+    yield from flush_json()
+    yield from flush_code()
+    yield from flush_text()
+
+
+def chunk_single_document(
+    doc: Document,
+    config: Optional[Dict[str, Any]] = None
+) -> Generator[Document, None, None]:
+    """
+    Convenience function to chunk a single document.
+
+    Args:
+        doc: Single Document to chunk
+        config: Optional chunker configuration
+
+    Yields:
+        Chunked Document objects
+    """
+    def single_doc_gen():
+        yield doc
+
+    yield from universal_chunker(single_doc_gen(), config)
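
A small usage sketch of the routing and the per-type config override; the file names and values below are made up, and note that an overridden markdown_config replaces, rather than merges with, the defaults shown in the diff.

    from langchain_core.documents import Document
    from alita_sdk.tools.chunkers.universal_chunker import universal_chunker, get_file_type

    docs = [
        Document(page_content="# Title\n\nSome prose.", metadata={"file_path": "docs/guide.md"}),
        Document(page_content="def add(a, b):\n    return a + b", metadata={"file_path": "src/utils.py"}),
        Document(page_content="Plain meeting notes.", metadata={"file_path": "notes.txt"}),
    ]

    # get_file_type drives the routing: 'markdown', 'json', 'code', or 'text'
    print([get_file_type(d.metadata["file_path"]) for d in docs])  # ['markdown', 'code', 'text']

    config = {
        "markdown_config": {
            "headers_to_split_on": [["#", "H1"], ["##", "H2"]],
            "max_tokens": 256,
            "min_chunk_chars": 50,
        },
        "text_config": {"chunk_size": 500, "chunk_overlap": 50},
    }

    for chunk in universal_chunker(iter(docs), config):
        print(chunk.metadata["file_path"], chunk.metadata.get("method_name"))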
alita_sdk/tools/code/loaders/codesearcher.py:

@@ -4,8 +4,9 @@ def search_format(items):
     results = []
     for (doc, score) in items:
         res_chunk = ''
-        language = get_programming_language(get_file_extension(doc.metadata
-
+        language = get_programming_language(get_file_extension(doc.metadata.get("filename", "unknown")))
+        method_name = doc.metadata.get("method_name", "text")
+        res_chunk += doc.metadata.get("filename", "unknown") + " -> " + method_name + " (score: " + str(score) + ")"
         res_chunk += "\n\n```" + language.value + "\n"+ doc.page_content + "\n```\n\n"
         results.append(res_chunk)
     return results
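
As a quick illustration of the new result shape, a hedged sketch with an invented Document and score; the metadata values here are placeholders, not output from the package.

    from langchain_core.documents import Document
    from alita_sdk.tools.code.loaders.codesearcher import search_format

    doc = Document(
        page_content="def add(a, b):\n    return a + b",
        metadata={"filename": "src/utils.py", "method_name": "add"},
    )

    for entry in search_format([(doc, 0.87)]):
        print(entry)
    # Expected shape per the diff: "src/utils.py -> add (score: 0.87)" followed by the
    # chunk wrapped in a fenced block tagged with the language inferred from the extension.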
alita_sdk/tools/code_indexer_toolkit.py:

@@ -9,13 +9,13 @@ from langchain_core.tools import ToolException
 from pydantic import Field
 
 from alita_sdk.tools.base_indexer_toolkit import BaseIndexerToolkit
-from .chunkers.code.codeparser import parse_code_files_for_db
 
 logger = logging.getLogger(__name__)
 
 
 class CodeIndexerToolkit(BaseIndexerToolkit):
     def _get_indexed_data(self, index_name: str):
+        self._ensure_vectorstore_initialized()
         if not self.vector_adapter:
             raise ToolException("Vector adapter is not initialized. "
                                 "Check your configuration: embedding_model and vectorstore_type.")
@@ -66,26 +66,40 @@ class CodeIndexerToolkit(BaseIndexerToolkit):
     def loader(self,
               branch: Optional[str] = None,
               whitelist: Optional[List[str]] = None,
-              blacklist: Optional[List[str]] = None
+              blacklist: Optional[List[str]] = None,
+              chunked: bool = True) -> Generator[Document, None, None]:
         """
-        Generates
+        Generates Documents from files in a branch, respecting whitelist and blacklist patterns.
 
         Parameters:
         - branch (Optional[str]): Branch for listing files. Defaults to the current branch if None.
         - whitelist (Optional[List[str]]): File extensions or paths to include. Defaults to all files if None.
         - blacklist (Optional[List[str]]): File extensions or paths to exclude. Defaults to no exclusions if None.
+        - chunked (bool): If True (default), applies universal chunker based on file type.
+          If False, returns raw Documents without chunking.
 
         Returns:
-        - generator: Yields
+        - generator: Yields Documents from files matching the whitelist but not the blacklist.
+          Each document has exactly the key 'filename' in metadata, which is used as an ID
+          for further operations (indexing, deduplication, and retrieval).
 
         Example:
        # Use 'feature-branch', include '.py' files, exclude 'test_' files
-
+        for doc in loader(branch='feature-branch', whitelist=['*.py'], blacklist=['*test_*']):
+            print(doc.page_content)
 
         Notes:
         - Whitelist and blacklist use Unix shell-style wildcards.
         - Files must match the whitelist and not the blacklist to be included.
+        - Each document MUST have exactly the key 'filename' in metadata. This key is used as an ID
+          for further operations such as indexing, deduplication, and retrieval.
+        - When chunked=True:
+          - .md files → markdown chunker (header-based splitting)
+          - .py/.js/.ts/etc → code parser (TreeSitter-based)
+          - .json files → JSON chunker
+          - other files → default text chunker
         """
+        import hashlib
 
         _files = self.__handle_get_files("", self.__get_branch(branch))
         self._log_tool_event(message="Listing files in branch", tool_name="loader")
@@ -103,41 +117,60 @@ class CodeIndexerToolkit(BaseIndexerToolkit):
                     or any(file_path.endswith(f'.{pattern}') for pattern in blacklist))
             return False
 
-        def
+        def raw_document_generator() -> Generator[Document, None, None]:
+            """Yields raw Documents without chunking."""
            self._log_tool_event(message="Reading the files", tool_name="loader")
-            # log the progress of file reading
            total_files = len(_files)
+            processed = 0
+
            for idx, file in enumerate(_files, 1):
                if is_whitelisted(file) and not is_blacklisted(file):
-                    # read file ONLY if it matches whitelist and does not match blacklist
                    try:
                        file_content = self._read_file(file, self.__get_branch(branch))
                    except Exception as e:
                        logger.error(f"Failed to read file {file}: {e}")
-
+                        continue
+
                    if not file_content:
-                        # empty file, skip
                        continue
-
-                    #
+
+                    # Ensure file content is a string
                    if isinstance(file_content, bytes):
                        file_content = file_content.decode("utf-8", errors="ignore")
                    elif isinstance(file_content, dict) and file.endswith('.json'):
                        file_content = json.dumps(file_content)
                    elif not isinstance(file_content, str):
                        file_content = str(file_content)
-
-                    #
-                    import hashlib
+
+                    # Hash the file content for uniqueness tracking
                    file_hash = hashlib.sha256(file_content.encode("utf-8")).hexdigest()
-
-
-
+                    processed += 1
+
+                    yield Document(
+                        page_content=file_content,
+                        metadata={
+                            'file_path': file,
+                            'filename': file,
+                            'source': file,
+                            'commit_hash': file_hash,
+                        }
+                    )
+
                if idx % 10 == 0 or idx == total_files:
-                    self._log_tool_event(
-
-
-
+                    self._log_tool_event(
+                        message=f"{idx} out of {total_files} files checked, {processed} matched",
+                        tool_name="loader"
+                    )
+
+        self._log_tool_event(message=f"{processed} files loaded", tool_name="loader")
+
+        if not chunked:
+            # Return raw documents without chunking
+            return raw_document_generator()
+
+        # Apply universal chunker based on file type
+        from .chunkers.universal_chunker import universal_chunker
+        return universal_chunker(raw_document_generator())
 
     def __handle_get_files(self, path: str, branch: str):
         """
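
The loader changes above establish a metadata contract for downstream indexing; here is a hedged sketch of how it is meant to be consumed. The toolkit instance is assumed to be configured elsewhere, and the branch name and patterns are illustrative.

    # toolkit: an already configured CodeIndexerToolkit (construction omitted)

    # chunked=False yields raw Documents; each carries 'filename' (used as the index ID),
    # plus 'file_path', 'source', and a SHA-256 content hash in 'commit_hash'.
    for doc in toolkit.loader(branch="main", whitelist=["*.py"], chunked=False):
        print(doc.metadata["filename"], doc.metadata["commit_hash"][:8])

    # chunked=True (the default) routes the same stream through universal_chunker,
    # so .py files go to the code parser and .md files to the markdown chunker.
    for chunk in toolkit.loader(branch="main", whitelist=["*.py", "*.md"]):
        print(chunk.metadata.get("file_path"), chunk.metadata.get("method_name"))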
alita_sdk/tools/confluence/api_wrapper.py:

@@ -480,21 +480,69 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
         """Gets pages with specific label in the Confluence space."""
 
         start = 0
-        pages_info = []
-
-
-
+        pages_info: List[Dict[str, Any]] = []
+        seen_ids: set[str] = set()
+
+        # Use a while-loop driven by unique pages collected and
+        # presence of additional results instead of a fixed number
+        # of iterations based purely on max_pages/limit.
+        while len(pages_info) < (self.max_pages or 0):
+            pages = self.client.get_all_pages_by_label(
+                label,
+                start=start,
+                limit=self.limit,
+            )  # , expand="body.view.value"
             if not pages:
                 break
 
-
-
-
-
-
-
+            # Collect only ids we haven't processed yet to avoid
+            # calling get_page_by_id multiple times for the same
+            # Confluence page.
+            new_ids: List[str] = []
+            for p in pages:
+                page_id = p["id"] if isinstance(p, dict) else getattr(p, "id", None)
+                if page_id is None:
+                    continue
+                if page_id in seen_ids:
+                    continue
+                seen_ids.add(page_id)
+                new_ids.append(page_id)
+
+            if new_ids:
+                for page in self.get_pages_by_id(new_ids):
+                    meta = getattr(page, "metadata", {}) or {}
+                    page_id = meta.get("id")
+                    page_title = meta.get("title")
+                    page_url = meta.get("source")
+                    content = getattr(page, "page_content", None)
+
+                    if page_id is None:
+                        continue
+
+                    pages_info.append(
+                        {
+                            "page_id": page_id,
+                            "page_title": page_title,
+                            "page_url": page_url,
+                            "content": content,
+                        }
+                    )
+
+            # Respect max_pages on unique pages collected.
+            if len(pages_info) >= (self.max_pages or 0):
+                break
+
+            # Advance the offset by the requested page size.
             start += self.limit
-
+
+            # Defensive break: if the API returns fewer items than
+            # requested, there are likely no more pages to fetch.
+            if len(pages) < self.limit:
+                break
+
+        # Slice as an extra safety net in case of any race conditions
+        # around the max_pages guard in the loop above.
+        return pages_info[: (self.max_pages or len(pages_info))]
 
     def is_public_page(self, page: dict) -> bool:
         """Check if a page is publicly accessible."""
@@ -896,14 +944,14 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
 
             # Re-verify extension filters
             # Check if file should be skipped based on skip_extensions
-            if any(re.match(pattern.replace('
+            if any(re.match(re.escape(pattern).replace(r'\*', '.*') + '$', title, re.IGNORECASE)
                    for pattern in self._skip_extensions):
                 continue
 
             # Check if file should be included based on include_extensions
             # If include_extensions is empty, process all files (that weren't skipped)
             if self._include_extensions and not (
-                any(re.match(pattern.replace('
+                any(re.match(re.escape(pattern).replace(r'\*', '.*') + '$', title, re.IGNORECASE)
                     for pattern in self._include_extensions)):
                 continue
 
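
The attachment filters above now escape the glob pattern before re-expanding the wildcard; a short worked example of that transformation, with illustrative patterns and titles:

    import re

    def glob_to_regex(pattern: str) -> str:
        # Same transformation as in the hunk above: escape regex metacharacters,
        # then turn the escaped '*' back into '.*' and anchor the end of the string.
        return re.escape(pattern).replace(r'\*', '.*') + '$'

    print(glob_to_regex('*.pdf'))                                                  # .*\.pdf$
    print(bool(re.match(glob_to_regex('*.pdf'), 'report.pdf', re.IGNORECASE)))     # True
    print(bool(re.match(glob_to_regex('*.pdf'), 'report.pdf.txt', re.IGNORECASE))) # False
    # Without re.escape, the '.' in a pattern like '*.pdf' would act as a regex
    # wildcard, so unintended titles could slip past the extension filter.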
alita_sdk/tools/confluence/api_wrapper.py (continued):

@@ -1820,4 +1868,5 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
                 "description": self.get_page_attachments.__doc__,
                 "args_schema": GetPageAttachmentsInput,
             }
-    ]
+    ]
+