alita-sdk 0.3.465__py3-none-any.whl → 0.3.486__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of alita-sdk might be problematic.

Files changed (90)
  1. alita_sdk/cli/agent/__init__.py +5 -0
  2. alita_sdk/cli/agent/default.py +83 -1
  3. alita_sdk/cli/agent_loader.py +6 -9
  4. alita_sdk/cli/agent_ui.py +13 -3
  5. alita_sdk/cli/agents.py +1866 -185
  6. alita_sdk/cli/callbacks.py +96 -25
  7. alita_sdk/cli/cli.py +10 -1
  8. alita_sdk/cli/config.py +151 -9
  9. alita_sdk/cli/context/__init__.py +30 -0
  10. alita_sdk/cli/context/cleanup.py +198 -0
  11. alita_sdk/cli/context/manager.py +731 -0
  12. alita_sdk/cli/context/message.py +285 -0
  13. alita_sdk/cli/context/strategies.py +289 -0
  14. alita_sdk/cli/context/token_estimation.py +127 -0
  15. alita_sdk/cli/input_handler.py +167 -4
  16. alita_sdk/cli/inventory.py +1256 -0
  17. alita_sdk/cli/toolkit.py +14 -17
  18. alita_sdk/cli/toolkit_loader.py +35 -5
  19. alita_sdk/cli/tools/__init__.py +8 -1
  20. alita_sdk/cli/tools/filesystem.py +815 -55
  21. alita_sdk/cli/tools/planning.py +143 -157
  22. alita_sdk/cli/tools/terminal.py +154 -20
  23. alita_sdk/community/__init__.py +64 -8
  24. alita_sdk/community/inventory/__init__.py +224 -0
  25. alita_sdk/community/inventory/config.py +257 -0
  26. alita_sdk/community/inventory/enrichment.py +2137 -0
  27. alita_sdk/community/inventory/extractors.py +1469 -0
  28. alita_sdk/community/inventory/ingestion.py +3172 -0
  29. alita_sdk/community/inventory/knowledge_graph.py +1457 -0
  30. alita_sdk/community/inventory/parsers/__init__.py +218 -0
  31. alita_sdk/community/inventory/parsers/base.py +295 -0
  32. alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
  33. alita_sdk/community/inventory/parsers/go_parser.py +851 -0
  34. alita_sdk/community/inventory/parsers/html_parser.py +389 -0
  35. alita_sdk/community/inventory/parsers/java_parser.py +593 -0
  36. alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
  37. alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
  38. alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
  39. alita_sdk/community/inventory/parsers/python_parser.py +604 -0
  40. alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
  41. alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
  42. alita_sdk/community/inventory/parsers/text_parser.py +322 -0
  43. alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
  44. alita_sdk/community/inventory/patterns/__init__.py +61 -0
  45. alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
  46. alita_sdk/community/inventory/patterns/loader.py +348 -0
  47. alita_sdk/community/inventory/patterns/registry.py +198 -0
  48. alita_sdk/community/inventory/presets.py +535 -0
  49. alita_sdk/community/inventory/retrieval.py +1403 -0
  50. alita_sdk/community/inventory/toolkit.py +169 -0
  51. alita_sdk/community/inventory/visualize.py +1370 -0
  52. alita_sdk/configurations/bitbucket.py +0 -3
  53. alita_sdk/runtime/clients/client.py +84 -26
  54. alita_sdk/runtime/langchain/assistant.py +4 -2
  55. alita_sdk/runtime/langchain/langraph_agent.py +122 -31
  56. alita_sdk/runtime/llms/preloaded.py +2 -6
  57. alita_sdk/runtime/toolkits/__init__.py +2 -0
  58. alita_sdk/runtime/toolkits/application.py +1 -1
  59. alita_sdk/runtime/toolkits/mcp.py +46 -36
  60. alita_sdk/runtime/toolkits/planning.py +171 -0
  61. alita_sdk/runtime/toolkits/tools.py +39 -6
  62. alita_sdk/runtime/tools/llm.py +185 -8
  63. alita_sdk/runtime/tools/planning/__init__.py +36 -0
  64. alita_sdk/runtime/tools/planning/models.py +246 -0
  65. alita_sdk/runtime/tools/planning/wrapper.py +607 -0
  66. alita_sdk/runtime/tools/vectorstore_base.py +41 -6
  67. alita_sdk/runtime/utils/mcp_oauth.py +80 -0
  68. alita_sdk/runtime/utils/streamlit.py +6 -10
  69. alita_sdk/runtime/utils/toolkit_utils.py +19 -4
  70. alita_sdk/tools/__init__.py +54 -27
  71. alita_sdk/tools/ado/repos/repos_wrapper.py +1 -2
  72. alita_sdk/tools/base_indexer_toolkit.py +98 -19
  73. alita_sdk/tools/bitbucket/__init__.py +2 -2
  74. alita_sdk/tools/chunkers/__init__.py +3 -1
  75. alita_sdk/tools/chunkers/sematic/markdown_chunker.py +95 -6
  76. alita_sdk/tools/chunkers/universal_chunker.py +269 -0
  77. alita_sdk/tools/code_indexer_toolkit.py +55 -22
  78. alita_sdk/tools/elitea_base.py +86 -21
  79. alita_sdk/tools/jira/__init__.py +1 -1
  80. alita_sdk/tools/jira/api_wrapper.py +91 -40
  81. alita_sdk/tools/non_code_indexer_toolkit.py +1 -0
  82. alita_sdk/tools/qtest/__init__.py +1 -1
  83. alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +8 -2
  84. alita_sdk/tools/zephyr_essential/api_wrapper.py +12 -13
  85. {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.486.dist-info}/METADATA +2 -1
  86. {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.486.dist-info}/RECORD +90 -50
  87. {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.486.dist-info}/WHEEL +0 -0
  88. {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.486.dist-info}/entry_points.txt +0 -0
  89. {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.486.dist-info}/licenses/LICENSE +0 -0
  90. {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.486.dist-info}/top_level.txt +0 -0
@@ -47,8 +47,8 @@ class AlitaBitbucketToolkit(BaseToolkit):
         AlitaBitbucketToolkit.toolkit_max_length = get_max_toolkit_length(selected_tools)
         m = create_model(
             name,
-            project=(str, Field(description="Project/Workspace", json_schema_extra={'configuration': True})),
-            repository=(str, Field(description="Repository", json_schema_extra={'max_toolkit_length': AlitaBitbucketToolkit.toolkit_max_length, 'configuration': True})),
+            project=(str, Field(description="Project/Workspace")),
+            repository=(str, Field(description="Repository")),
             branch=(str, Field(description="Main branch", default="main")),
             cloud=(Optional[bool], Field(description="Hosting Option", default=None)),
             bitbucket_configuration=(BitbucketConfiguration, Field(description="Bitbucket Configuration", json_schema_extra={'configuration_types': ['bitbucket']})),
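
For context on the hunk above: these field tuples feed Pydantic's create_model, which builds the toolkit's configuration schema at runtime, and this release drops the json_schema_extra hints from the project and repository fields. Below is a minimal sketch of that pattern with a hypothetical model name and only the fields shown in the hunk, not the SDK's actual wiring:

from typing import Optional
from pydantic import Field, create_model

# Hypothetical, simplified reconstruction of the schema built above.
BitbucketToolkitConfig = create_model(
    "BitbucketToolkitConfig",
    project=(str, Field(description="Project/Workspace")),
    repository=(str, Field(description="Repository")),
    branch=(str, Field(description="Main branch", default="main")),
    cloud=(Optional[bool], Field(description="Hosting Option", default=None)),
)

cfg = BitbucketToolkitConfig(project="TEAM", repository="service-api")
print(cfg.model_dump())
# With json_schema_extra removed, the generated JSON schema for 'project'
# no longer carries the 'configuration' marker.
print(BitbucketToolkitConfig.model_json_schema()["properties"]["project"])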
@@ -3,6 +3,7 @@ from .sematic.statistical_chunker import statistical_chunker
 from .sematic.markdown_chunker import markdown_chunker
 from .sematic.proposal_chunker import proposal_chunker
 from .sematic.json_chunker import json_chunker
+from .universal_chunker import universal_chunker, chunk_single_document, get_file_type
 from .models import StatisticalChunkerConfig, MarkdownChunkerConfig, ProposalChunkerConfig
 
 __all__ = {
@@ -10,7 +11,8 @@ __all__ = {
     'statistical': statistical_chunker,
     'markdown': markdown_chunker,
     'proposal': proposal_chunker,
-    'json': json_chunker
+    'json': json_chunker,
+    'universal': universal_chunker,
 }
 
 __confluence_chunkers__ = {
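
The __all__ mapping above acts as a name-to-callable chunker registry, so downstream code can select a chunker by its registered name. A minimal lookup sketch, assuming the chunkers' optional dependencies (tiktoken, tree-sitter grammars) are installed; the sample document is illustrative:

from langchain_core.documents import Document
from alita_sdk.tools import chunkers

def run_chunker(name: str, docs, config: dict):
    # Resolve the chunker callable registered under `name`
    # ('markdown', 'json', 'universal', ...) and stream its chunks.
    chunker_fn = chunkers.__all__[name]
    yield from chunker_fn(docs, config)

docs = iter([Document(page_content="# Title\n\nSome body text.",
                      metadata={"file_path": "README.md"})])
for chunk in run_chunker("universal", docs, {}):
    print(chunk.metadata.get("file_path"), len(chunk.page_content))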
@@ -1,4 +1,4 @@
-from typing import Generator
+from typing import Generator, List
 from langchain_core.documents import Document
 from langchain_text_splitters import MarkdownHeaderTextSplitter, ExperimentalMarkdownSyntaxTextSplitter
 from langchain.text_splitter import TokenTextSplitter
@@ -7,28 +7,53 @@ from copy import deepcopy as copy
 
 
 def markdown_chunker(file_content_generator: Generator[Document, None, None], config: dict, *args, **kwargs) -> Generator[Document, None, None]:
+    """
+    Chunks markdown documents by headers, with support for:
+    - Minimum chunk size to avoid tiny fragments
+    - Maximum token limit with overflow splitting
+    - Header metadata preservation
+
+    Config options:
+        strip_header (bool): Remove headers from content. Default: False
+        return_each_line (bool): Split on every line. Default: False
+        headers_to_split_on (list): Headers to split on, e.g. [('#', 'H1'), ('##', 'H2')]
+        max_tokens (int): Maximum tokens per chunk. Default: 512
+        token_overlap (int): Token overlap for large chunk splitting. Default: 10
+        min_chunk_chars (int): Minimum characters per chunk. Default: 100
+            Chunks smaller than this will be merged with the next chunk.
+    """
     strip_header = config.get("strip_header", False)
     return_each_line = config.get("return_each_line", False)
     headers_to_split_on = config.get("headers_to_split_on", [])
     max_tokens = config.get("max_tokens", 512)
     tokens_overlapping = config.get("token_overlap", 10)
+    min_chunk_chars = config.get("min_chunk_chars", 100)  # Minimum characters per chunk
+
     headers_to_split_on = [tuple(header) for header in headers_to_split_on]
+
     for doc in file_content_generator:
         doc_metadata = doc.metadata
         doc_content = doc.page_content
         chunk_id = 0
+
         markdown_splitter = MarkdownHeaderTextSplitter(
             headers_to_split_on=headers_to_split_on,
             strip_headers=strip_header,
             return_each_line=return_each_line
         )
         md_header_splits = markdown_splitter.split_text(doc_content)
-        for chunk in md_header_splits:
+
+        # Merge small chunks with the next one
+        merged_chunks = _merge_small_chunks(md_header_splits, min_chunk_chars)
+
+        for chunk in merged_chunks:
             if tiktoken_length(chunk.page_content) > max_tokens:
-                for subchunk in TokenTextSplitter(encoding_name="cl100k_base",
-                                                  chunk_size=max_tokens,
-                                                  chunk_overlap=tokens_overlapping
-                                                  ).split_text(chunk.page_content):
+                # Split large chunks into smaller ones
+                for subchunk in TokenTextSplitter(
+                    encoding_name="cl100k_base",
+                    chunk_size=max_tokens,
+                    chunk_overlap=tokens_overlapping
+                ).split_text(chunk.page_content):
                     chunk_id += 1
                     headers_meta = list(chunk.metadata.values())
                     docmeta = copy(doc_metadata)
@@ -52,6 +77,70 @@ def markdown_chunker(file_content_generator: Generator[Document, None, None], co
                 )
 
 
+def _merge_small_chunks(chunks: List[Document], min_chars: int) -> List[Document]:
+    """
+    Merge chunks that are smaller than min_chars with the next chunk.
+
+    This prevents tiny fragments (like standalone headers or short notes)
+    from becoming separate chunks.
+
+    Args:
+        chunks: List of Document chunks from markdown splitter
+        min_chars: Minimum character count for a chunk
+
+    Returns:
+        List of merged Document chunks
+    """
+    if not chunks:
+        return chunks
+
+    merged = []
+    pending_content = ""
+    pending_metadata = {}
+
+    for i, chunk in enumerate(chunks):
+        content = chunk.page_content.strip()
+
+        if pending_content:
+            # Merge pending content with current chunk
+            combined_content = pending_content + "\n\n" + content
+            # Use the pending metadata (from the header) but can be extended
+            combined_metadata = {**pending_metadata}
+            # Add any new header info from current chunk
+            for key, value in chunk.metadata.items():
+                if key not in combined_metadata or not combined_metadata[key]:
+                    combined_metadata[key] = value
+
+            if len(combined_content) >= min_chars:
+                # Combined is big enough, emit it
+                merged.append(Document(
+                    page_content=combined_content,
+                    metadata=combined_metadata
+                ))
+                pending_content = ""
+                pending_metadata = {}
+            else:
+                # Still too small, keep accumulating
+                pending_content = combined_content
+                pending_metadata = combined_metadata
+        elif len(content) < min_chars:
+            # Current chunk is too small, start pending
+            pending_content = content
+            pending_metadata = dict(chunk.metadata)
+        else:
+            # Current chunk is big enough
+            merged.append(chunk)
+
+    # Don't forget any remaining pending content
+    if pending_content:
+        merged.append(Document(
+            page_content=pending_content,
+            metadata=pending_metadata
+        ))
+
+    return merged
+
+
 def markdown_by_headers_chunker(file_content_generator: Generator[Document, None, None], config: dict, *args, **kwargs) -> Generator[Document, None, None]:
     strip_header = config.get("strip_header", False)
     return_each_line = config.get("return_each_line", False)
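
To illustrate the new min_chunk_chars merging alongside the existing max_tokens overflow splitting, here is a hedged usage sketch; the sample document and config values are made up, and tiktoken must be available for the token-length check:

from langchain_core.documents import Document
from alita_sdk.tools.chunkers.sematic.markdown_chunker import markdown_chunker

docs = iter([Document(
    page_content="# Intro\nShort note.\n\n## Details\n" + "A longer section of text. " * 40,
    metadata={"source": "notes.md"},
)])

config = {
    "headers_to_split_on": [("#", "Header 1"), ("##", "Header 2")],
    "max_tokens": 128,        # oversized sections are re-split with token overlap
    "token_overlap": 10,
    "min_chunk_chars": 100,   # "# Intro\nShort note." gets merged into the next chunk
}

for chunk in markdown_chunker(docs, config):
    print(chunk.metadata, repr(chunk.page_content[:60]))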
@@ -0,0 +1,269 @@
+"""
+Universal Chunker - Routes documents to appropriate chunkers based on file type.
+
+This module provides a universal chunking interface that automatically selects
+the appropriate chunking strategy based on the file extension:
+
+- .md, .markdown → Markdown chunker (header-based splitting)
+- .py, .js, .ts, .java, etc. → TreeSitter code chunker
+- .json → JSON chunker
+- other → Default text chunker
+
+Usage:
+    from alita_sdk.tools.chunkers.universal_chunker import universal_chunker
+
+    # Chunk documents from a loader
+    for chunk in universal_chunker(document_generator, config):
+        print(chunk.page_content)
+"""
+
+import logging
+import os
+from typing import Generator, Dict, Any, Optional
+from langchain_core.documents import Document
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+from .code.codeparser import parse_code_files_for_db
+from .sematic.markdown_chunker import markdown_chunker
+from .sematic.json_chunker import json_chunker
+
+logger = logging.getLogger(__name__)
+
+
+# File extension mappings
+MARKDOWN_EXTENSIONS = {'.md', '.markdown', '.mdown', '.mkd', '.mdx'}
+JSON_EXTENSIONS = {'.json', '.jsonl', '.jsonc'}
+CODE_EXTENSIONS = {
+    '.py', '.js', '.jsx', '.mjs', '.cjs', '.ts', '.tsx',
+    '.java', '.kt', '.rs', '.go', '.cpp', '.c', '.cs',
+    '.hs', '.rb', '.scala', '.lua'
+}
+
+
+def get_file_extension(file_path: str) -> str:
+    """Extract file extension from path."""
+    return os.path.splitext(file_path)[-1].lower()
+
+
+def get_file_type(file_path: str) -> str:
+    """
+    Determine the file type category for chunking.
+
+    Returns:
+        'markdown', 'json', 'code', or 'text'
+    """
+    ext = get_file_extension(file_path)
+
+    if ext in MARKDOWN_EXTENSIONS:
+        return 'markdown'
+    elif ext in JSON_EXTENSIONS:
+        return 'json'
+    elif ext in CODE_EXTENSIONS:
+        return 'code'
+    else:
+        return 'text'
+
+
+def _default_text_chunker(
+    documents: Generator[Document, None, None],
+    config: Dict[str, Any]
+) -> Generator[Document, None, None]:
+    """
+    Default text chunker for unknown file types.
+    Uses recursive character splitting.
+    """
+    chunk_size = config.get('chunk_size', 1000)
+    chunk_overlap = config.get('chunk_overlap', 100)
+
+    splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+        length_function=len,
+    )
+
+    for doc in documents:
+        chunks = splitter.split_documents([doc])
+        for idx, chunk in enumerate(chunks, 1):
+            chunk.metadata['chunk_id'] = idx
+            chunk.metadata['chunk_type'] = 'text'
+            yield chunk
+
+
+def _code_chunker_from_documents(
+    documents: Generator[Document, None, None],
+    config: Dict[str, Any]
+) -> Generator[Document, None, None]:
+    """
+    Adapter to convert Document generator to code parser format.
+    """
+    def file_content_generator():
+        for doc in documents:
+            yield {
+                'file_name': doc.metadata.get('file_path', doc.metadata.get('filename', 'unknown')),
+                'file_content': doc.page_content,
+                'commit_hash': doc.metadata.get('commit_hash', ''),
+            }
+
+    # parse_code_files_for_db returns chunks with proper metadata
+    for chunk in parse_code_files_for_db(file_content_generator()):
+        # Ensure file_path is preserved
+        if 'file_path' not in chunk.metadata and 'filename' in chunk.metadata:
+            chunk.metadata['file_path'] = chunk.metadata['filename']
+        yield chunk
+
+
+def universal_chunker(
+    documents: Generator[Document, None, None],
+    config: Optional[Dict[str, Any]] = None
+) -> Generator[Document, None, None]:
+    """
+    Universal chunker that routes documents to appropriate chunkers based on file type.
+
+    Each document is inspected for its file extension (from metadata.file_path or
+    metadata.file_name) and routed to the appropriate chunker:
+
+    - Markdown files → markdown_chunker (header-based splitting)
+    - JSON files → json_chunker (recursive JSON splitting)
+    - Code files → code parser (TreeSitter-based parsing)
+    - Other files → default text chunker (recursive character splitting)
+
+    Args:
+        documents: Generator yielding Document objects with file content
+        config: Optional configuration dict with:
+            - markdown_config: Config for markdown chunker
+            - json_config: Config for JSON chunker
+            - code_config: Config for code chunker
+            - text_config: Config for default text chunker
+
+    Yields:
+        Document objects with chunked content and preserved metadata
+    """
+    if config is None:
+        config = {}
+
+    # Default configs for each chunker type
+    markdown_config = config.get('markdown_config', {
+        'strip_header': False,
+        'return_each_line': False,
+        'headers_to_split_on': [
+            ('#', 'Header 1'),
+            ('##', 'Header 2'),
+            ('###', 'Header 3'),
+            ('####', 'Header 4'),
+        ],
+        'max_tokens': 1024,
+        'token_overlap': 50,
+        'min_chunk_chars': 100,  # Merge chunks smaller than this
+    })
+
+    json_config = config.get('json_config', {
+        'max_tokens': 512,
+    })
+
+    code_config = config.get('code_config', {})
+
+    text_config = config.get('text_config', {
+        'chunk_size': 1000,
+        'chunk_overlap': 100,
+    })
+
+    # Buffer documents by type for batch processing
+    # This is more efficient than processing one at a time
+    markdown_docs = []
+    json_docs = []
+    code_docs = []
+    text_docs = []
+
+    # Buffer size before flushing
+    BUFFER_SIZE = 10
+
+    def flush_markdown():
+        if markdown_docs:
+            def gen():
+                for d in markdown_docs:
+                    yield d
+            for chunk in markdown_chunker(gen(), markdown_config):
+                yield chunk
+            markdown_docs.clear()
+
+    def flush_json():
+        if json_docs:
+            def gen():
+                for d in json_docs:
+                    yield d
+            for chunk in json_chunker(gen(), json_config):
+                yield chunk
+            json_docs.clear()
+
+    def flush_code():
+        if code_docs:
+            def gen():
+                for d in code_docs:
+                    yield d
+            for chunk in _code_chunker_from_documents(gen(), code_config):
+                yield chunk
+            code_docs.clear()
+
+    def flush_text():
+        if text_docs:
+            def gen():
+                for d in text_docs:
+                    yield d
+            for chunk in _default_text_chunker(gen(), text_config):
+                yield chunk
+            text_docs.clear()
+
+    for doc in documents:
+        # Get file path from metadata
+        file_path = (doc.metadata.get('file_path') or
+                     doc.metadata.get('file_name') or
+                     doc.metadata.get('source') or
+                     'unknown')
+
+        # Ensure file_path is in metadata for downstream use
+        doc.metadata['file_path'] = file_path
+
+        file_type = get_file_type(file_path)
+
+        if file_type == 'markdown':
+            markdown_docs.append(doc)
+            if len(markdown_docs) >= BUFFER_SIZE:
+                yield from flush_markdown()
+        elif file_type == 'json':
+            json_docs.append(doc)
+            if len(json_docs) >= BUFFER_SIZE:
+                yield from flush_json()
+        elif file_type == 'code':
+            code_docs.append(doc)
+            if len(code_docs) >= BUFFER_SIZE:
+                yield from flush_code()
+        else:
+            text_docs.append(doc)
+            if len(text_docs) >= BUFFER_SIZE:
+                yield from flush_text()
+
+    # Flush remaining documents
+    yield from flush_markdown()
+    yield from flush_json()
+    yield from flush_code()
+    yield from flush_text()
+
+
+def chunk_single_document(
+    doc: Document,
+    config: Optional[Dict[str, Any]] = None
+) -> Generator[Document, None, None]:
+    """
+    Convenience function to chunk a single document.
+
+    Args:
+        doc: Single Document to chunk
+        config: Optional chunker configuration
+
+    Yields:
+        Chunked Document objects
+    """
+    def single_doc_gen():
+        yield doc
+
+    yield from universal_chunker(single_doc_gen(), config)
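
A usage sketch for the new module, routing a mixed set of in-memory documents; the file names and contents are illustrative, and the code path assumes the optional parser dependencies (tree-sitter grammars, tiktoken) are installed:

from langchain_core.documents import Document
from alita_sdk.tools.chunkers.universal_chunker import universal_chunker, get_file_type

samples = [
    Document(page_content="# Title\n\n" + "Body text. " * 50, metadata={"file_path": "README.md"}),
    Document(page_content="def add(a, b):\n    return a + b\n", metadata={"file_path": "calc.py"}),
    Document(page_content='{"key": "value", "items": [1, 2, 3]}', metadata={"file_path": "data.json"}),
    Document(page_content="plain notes " * 300, metadata={"file_path": "notes.txt"}),
]

# Routing decisions are based purely on the file extension.
for path in ("README.md", "calc.py", "data.json", "notes.txt"):
    print(path, "->", get_file_type(path))

# Documents are buffered per type and flushed through the matching chunker.
for chunk in universal_chunker(iter(samples)):
    print(chunk.metadata.get("file_path"), len(chunk.page_content))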
@@ -9,13 +9,13 @@ from langchain_core.tools import ToolException
 from pydantic import Field
 
 from alita_sdk.tools.base_indexer_toolkit import BaseIndexerToolkit
-from .chunkers.code.codeparser import parse_code_files_for_db
 
 logger = logging.getLogger(__name__)
 
 
 class CodeIndexerToolkit(BaseIndexerToolkit):
     def _get_indexed_data(self, index_name: str):
+        self._ensure_vectorstore_initialized()
         if not self.vector_adapter:
             raise ToolException("Vector adapter is not initialized. "
                                 "Check your configuration: embedding_model and vectorstore_type.")
@@ -66,26 +66,40 @@ class CodeIndexerToolkit(BaseIndexerToolkit):
     def loader(self,
                branch: Optional[str] = None,
                whitelist: Optional[List[str]] = None,
-               blacklist: Optional[List[str]] = None) -> Generator[Document, None, None]:
+               blacklist: Optional[List[str]] = None,
+               chunked: bool = True) -> Generator[Document, None, None]:
         """
-        Generates file content from a branch, respecting whitelist and blacklist patterns.
+        Generates Documents from files in a branch, respecting whitelist and blacklist patterns.
 
         Parameters:
         - branch (Optional[str]): Branch for listing files. Defaults to the current branch if None.
         - whitelist (Optional[List[str]]): File extensions or paths to include. Defaults to all files if None.
        - blacklist (Optional[List[str]]): File extensions or paths to exclude. Defaults to no exclusions if None.
+        - chunked (bool): If True (default), applies universal chunker based on file type.
+          If False, returns raw Documents without chunking.
 
         Returns:
-        - generator: Yields content from files matching the whitelist but not the blacklist.
+        - generator: Yields Documents from files matching the whitelist but not the blacklist.
+          Each document has exactly the key 'filename' in metadata, which is used as an ID
+          for further operations (indexing, deduplication, and retrieval).
 
         Example:
         # Use 'feature-branch', include '.py' files, exclude 'test_' files
-        file_generator = loader(branch='feature-branch', whitelist=['*.py'], blacklist=['*test_*'])
+        for doc in loader(branch='feature-branch', whitelist=['*.py'], blacklist=['*test_*']):
+            print(doc.page_content)
 
         Notes:
         - Whitelist and blacklist use Unix shell-style wildcards.
         - Files must match the whitelist and not the blacklist to be included.
+        - Each document MUST have exactly the key 'filename' in metadata. This key is used as an ID
+          for further operations such as indexing, deduplication, and retrieval.
+        - When chunked=True:
+          - .md files → markdown chunker (header-based splitting)
+          - .py/.js/.ts/etc → code parser (TreeSitter-based)
+          - .json files → JSON chunker
+          - other files → default text chunker
         """
+        import hashlib
 
         _files = self.__handle_get_files("", self.__get_branch(branch))
         self._log_tool_event(message="Listing files in branch", tool_name="loader")
@@ -103,41 +117,60 @@ class CodeIndexerToolkit(BaseIndexerToolkit):
                         or any(file_path.endswith(f'.{pattern}') for pattern in blacklist))
             return False
 
-        def file_content_generator():
+        def raw_document_generator() -> Generator[Document, None, None]:
+            """Yields raw Documents without chunking."""
             self._log_tool_event(message="Reading the files", tool_name="loader")
-            # log the progress of file reading
             total_files = len(_files)
+            processed = 0
+
             for idx, file in enumerate(_files, 1):
                 if is_whitelisted(file) and not is_blacklisted(file):
-                    # read file ONLY if it matches whitelist and does not match blacklist
                     try:
                         file_content = self._read_file(file, self.__get_branch(branch))
                     except Exception as e:
                         logger.error(f"Failed to read file {file}: {e}")
-                        file_content = ""
+                        continue
+
                     if not file_content:
-                        # empty file, skip
                         continue
-                    #
-                    # ensure file content is a string
+
+                    # Ensure file content is a string
                     if isinstance(file_content, bytes):
                         file_content = file_content.decode("utf-8", errors="ignore")
                     elif isinstance(file_content, dict) and file.endswith('.json'):
                         file_content = json.dumps(file_content)
                     elif not isinstance(file_content, str):
                         file_content = str(file_content)
-                    #
-                    # hash the file content to ensure uniqueness
-                    import hashlib
+
+                    # Hash the file content for uniqueness tracking
                     file_hash = hashlib.sha256(file_content.encode("utf-8")).hexdigest()
-                    yield {"file_name": file,
-                           "file_content": file_content,
-                           "commit_hash": file_hash}
+                    processed += 1
+
+                    yield Document(
+                        page_content=file_content,
+                        metadata={
+                            'file_path': file,
+                            'filename': file,
+                            'source': file,
+                            'commit_hash': file_hash,
+                        }
+                    )
+
                 if idx % 10 == 0 or idx == total_files:
-                    self._log_tool_event(message=f"{idx} out of {total_files} files have been read", tool_name="loader")
-            self._log_tool_event(message=f"{len(_files)} have been read", tool_name="loader")
-
-        return parse_code_files_for_db(file_content_generator())
+                    self._log_tool_event(
+                        message=f"{idx} out of {total_files} files checked, {processed} matched",
+                        tool_name="loader"
+                    )
+
+            self._log_tool_event(message=f"{processed} files loaded", tool_name="loader")
+
+        if not chunked:
+            # Return raw documents without chunking
+            return raw_document_generator()
+
+        # Apply universal chunker based on file type
+        from .chunkers.universal_chunker import universal_chunker
+        return universal_chunker(raw_document_generator())
 
     def __handle_get_files(self, path: str, branch: str):
         """