alita-sdk 0.3.457__py3-none-any.whl → 0.3.486__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of alita-sdk might be problematic.
- alita_sdk/cli/__init__.py +10 -0
- alita_sdk/cli/__main__.py +17 -0
- alita_sdk/cli/agent/__init__.py +5 -0
- alita_sdk/cli/agent/default.py +258 -0
- alita_sdk/cli/agent_executor.py +155 -0
- alita_sdk/cli/agent_loader.py +194 -0
- alita_sdk/cli/agent_ui.py +228 -0
- alita_sdk/cli/agents.py +3592 -0
- alita_sdk/cli/callbacks.py +647 -0
- alita_sdk/cli/cli.py +168 -0
- alita_sdk/cli/config.py +306 -0
- alita_sdk/cli/context/__init__.py +30 -0
- alita_sdk/cli/context/cleanup.py +198 -0
- alita_sdk/cli/context/manager.py +731 -0
- alita_sdk/cli/context/message.py +285 -0
- alita_sdk/cli/context/strategies.py +289 -0
- alita_sdk/cli/context/token_estimation.py +127 -0
- alita_sdk/cli/formatting.py +182 -0
- alita_sdk/cli/input_handler.py +419 -0
- alita_sdk/cli/inventory.py +1256 -0
- alita_sdk/cli/mcp_loader.py +315 -0
- alita_sdk/cli/toolkit.py +327 -0
- alita_sdk/cli/toolkit_loader.py +85 -0
- alita_sdk/cli/tools/__init__.py +43 -0
- alita_sdk/cli/tools/approval.py +224 -0
- alita_sdk/cli/tools/filesystem.py +1665 -0
- alita_sdk/cli/tools/planning.py +389 -0
- alita_sdk/cli/tools/terminal.py +414 -0
- alita_sdk/community/__init__.py +64 -8
- alita_sdk/community/inventory/__init__.py +224 -0
- alita_sdk/community/inventory/config.py +257 -0
- alita_sdk/community/inventory/enrichment.py +2137 -0
- alita_sdk/community/inventory/extractors.py +1469 -0
- alita_sdk/community/inventory/ingestion.py +3172 -0
- alita_sdk/community/inventory/knowledge_graph.py +1457 -0
- alita_sdk/community/inventory/parsers/__init__.py +218 -0
- alita_sdk/community/inventory/parsers/base.py +295 -0
- alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
- alita_sdk/community/inventory/parsers/go_parser.py +851 -0
- alita_sdk/community/inventory/parsers/html_parser.py +389 -0
- alita_sdk/community/inventory/parsers/java_parser.py +593 -0
- alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
- alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
- alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
- alita_sdk/community/inventory/parsers/python_parser.py +604 -0
- alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
- alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
- alita_sdk/community/inventory/parsers/text_parser.py +322 -0
- alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
- alita_sdk/community/inventory/patterns/__init__.py +61 -0
- alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
- alita_sdk/community/inventory/patterns/loader.py +348 -0
- alita_sdk/community/inventory/patterns/registry.py +198 -0
- alita_sdk/community/inventory/presets.py +535 -0
- alita_sdk/community/inventory/retrieval.py +1403 -0
- alita_sdk/community/inventory/toolkit.py +169 -0
- alita_sdk/community/inventory/visualize.py +1370 -0
- alita_sdk/configurations/bitbucket.py +0 -3
- alita_sdk/runtime/clients/client.py +99 -26
- alita_sdk/runtime/langchain/assistant.py +4 -2
- alita_sdk/runtime/langchain/constants.py +2 -1
- alita_sdk/runtime/langchain/langraph_agent.py +134 -31
- alita_sdk/runtime/langchain/utils.py +1 -1
- alita_sdk/runtime/llms/preloaded.py +2 -6
- alita_sdk/runtime/toolkits/__init__.py +2 -0
- alita_sdk/runtime/toolkits/application.py +1 -1
- alita_sdk/runtime/toolkits/mcp.py +46 -36
- alita_sdk/runtime/toolkits/planning.py +171 -0
- alita_sdk/runtime/toolkits/tools.py +39 -6
- alita_sdk/runtime/tools/function.py +17 -5
- alita_sdk/runtime/tools/llm.py +249 -14
- alita_sdk/runtime/tools/planning/__init__.py +36 -0
- alita_sdk/runtime/tools/planning/models.py +246 -0
- alita_sdk/runtime/tools/planning/wrapper.py +607 -0
- alita_sdk/runtime/tools/vectorstore_base.py +41 -6
- alita_sdk/runtime/utils/mcp_oauth.py +80 -0
- alita_sdk/runtime/utils/streamlit.py +6 -10
- alita_sdk/runtime/utils/toolkit_utils.py +19 -4
- alita_sdk/tools/__init__.py +54 -27
- alita_sdk/tools/ado/repos/repos_wrapper.py +1 -2
- alita_sdk/tools/base_indexer_toolkit.py +150 -19
- alita_sdk/tools/bitbucket/__init__.py +2 -2
- alita_sdk/tools/chunkers/__init__.py +3 -1
- alita_sdk/tools/chunkers/sematic/markdown_chunker.py +95 -6
- alita_sdk/tools/chunkers/universal_chunker.py +269 -0
- alita_sdk/tools/code_indexer_toolkit.py +55 -22
- alita_sdk/tools/elitea_base.py +86 -21
- alita_sdk/tools/jira/__init__.py +1 -1
- alita_sdk/tools/jira/api_wrapper.py +91 -40
- alita_sdk/tools/non_code_indexer_toolkit.py +1 -0
- alita_sdk/tools/qtest/__init__.py +1 -1
- alita_sdk/tools/qtest/api_wrapper.py +871 -32
- alita_sdk/tools/sharepoint/api_wrapper.py +22 -2
- alita_sdk/tools/sharepoint/authorization_helper.py +17 -1
- alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +8 -2
- alita_sdk/tools/zephyr_essential/api_wrapper.py +12 -13
- {alita_sdk-0.3.457.dist-info → alita_sdk-0.3.486.dist-info}/METADATA +146 -2
- {alita_sdk-0.3.457.dist-info → alita_sdk-0.3.486.dist-info}/RECORD +102 -40
- alita_sdk-0.3.486.dist-info/entry_points.txt +2 -0
- {alita_sdk-0.3.457.dist-info → alita_sdk-0.3.486.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.457.dist-info → alita_sdk-0.3.486.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.457.dist-info → alita_sdk-0.3.486.dist-info}/top_level.txt +0 -0

alita_sdk/tools/base_indexer_toolkit.py

@@ -2,8 +2,10 @@ import copy
 import json
 import logging
 import time
+from enum import Enum
 from typing import Any, Optional, List, Dict, Generator
 
+from langchain_core.callbacks import dispatch_custom_event
 from langchain_core.documents import Document
 from pydantic import create_model, Field, SecretStr
 
@@ -15,7 +17,17 @@ from ..runtime.utils.utils import IndexerKeywords
 
 logger = logging.getLogger(__name__)
 
-DEFAULT_CUT_OFF = 0.
+DEFAULT_CUT_OFF = 0.1
+INDEX_META_UPDATE_INTERVAL = 600.0
+
+class IndexTools(str, Enum):
+    """Enum for index-related tool names."""
+    INDEX_DATA = "index_data"
+    SEARCH_INDEX = "search_index"
+    STEPBACK_SEARCH_INDEX = "stepback_search_index"
+    STEPBACK_SUMMARY_INDEX = "stepback_summary_index"
+    REMOVE_INDEX = "remove_index"
+    LIST_COLLECTIONS = "list_collections"
 
 # Base Vector Store Schema Models
 BaseIndexParams = create_model(
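
Because `IndexTools` mixes `str` into the Enum, each member compares equal to its plain string value, which is what lets the `get_available_tools` entries further down use `IndexTools.<NAME>.value` interchangeably with the literal tool names. A minimal standalone illustration (two members repeated here for demonstration, not SDK code):

```python
from enum import Enum

class IndexTools(str, Enum):
    """Illustrative subset of the SDK enum above."""
    INDEX_DATA = "index_data"
    SEARCH_INDEX = "search_index"

# str-mixin enum members compare equal to their plain string values
assert IndexTools.INDEX_DATA == "index_data"
assert IndexTools.INDEX_DATA.value == "index_data"
print([tool.value for tool in IndexTools])  # ['index_data', 'search_index']
```
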
@@ -156,6 +168,16 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         clean_index = kwargs.get("clean_index")
         chunking_tool = kwargs.get("chunking_tool")
         chunking_config = kwargs.get("chunking_config")
+
+        # Store the interval in a private dict to avoid Pydantic field errors
+        if not hasattr(self, "_index_meta_config"):
+            self._index_meta_config: Dict[str, Any] = {}
+
+        self._index_meta_config["update_interval"] = kwargs.get(
+            "meta_update_interval",
+            INDEX_META_UPDATE_INTERVAL,
+        )
+
         result = {"count": 0}
         #
         try:
@@ -163,6 +185,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             self._clean_index(index_name)
             #
             self.index_meta_init(index_name, kwargs)
+            self._emit_index_event(index_name)
             #
             self._log_tool_event(f"Indexing data into collection with suffix '{index_name}'. It can take some time...")
             self._log_tool_event(f"Loading the documents to index...{kwargs}")
@@ -178,16 +201,26 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             self._save_index_generator(documents, documents_count, chunking_tool, chunking_config, index_name=index_name, result=result)
             #
             results_count = result["count"]
-
+            # Final update should always be forced
+            self.index_meta_update(index_name, IndexerKeywords.INDEX_META_COMPLETED.value, results_count, update_force=True)
+            self._emit_index_event(index_name)
             #
             return {"status": "ok", "message": f"successfully indexed {results_count} documents" if results_count > 0
                     else "no new documents to index"}
         except Exception as e:
-
+            # Do maximum effort at least send custom event for supposed changed status
+            msg = str(e)
+            try:
+                # Error update should also be forced
+                self.index_meta_update(index_name, IndexerKeywords.INDEX_META_FAILED.value, result["count"], update_force=True)
+            except Exception as ie:
+                logger.error(f"Failed to update index meta status to FAILED for index '{index_name}': {ie}")
+                msg = f"{msg}; additionally failed to update index meta status to FAILED: {ie}"
+            self._emit_index_event(index_name, error=msg)
             raise e
-
 
     def _save_index_generator(self, base_documents: Generator[Document, None, None], base_total: int, chunking_tool, chunking_config, result, index_name: Optional[str] = None):
+        self._ensure_vectorstore_initialized()
         self._log_tool_event(f"Base documents are ready for indexing. {base_total} base documents in total to index.")
         from ..runtime.langchain.interfaces.llm_processor import add_documents
         #
@@ -240,6 +273,11 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             logger.debug(msg)
             self._log_tool_event(msg)
             result["count"] += dependent_docs_counter
+            # After each base document, try a non-forced meta update; throttling handled inside index_meta_update
+            try:
+                self.index_meta_update(index_name, IndexerKeywords.INDEX_META_IN_PROGRESS.value, result["count"], update_force=False)
+            except Exception as exc:  # best-effort, do not break indexing
+                logger.warning(f"Failed to update index meta during indexing process for index '{index_name}': {exc}")
             if pg_vector_add_docs_chunk:
                 add_documents(vectorstore=self.vectorstore, documents=pg_vector_add_docs_chunk)
 
@@ -305,6 +343,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             log_msg: str = "Verification of documents to index started"
     ) -> Generator[Document, None, None]:
         """Generic duplicate reduction logic for documents."""
+        self._ensure_vectorstore_initialized()
         self._log_tool_event(log_msg, tool_name="index_documents")
         indexed_data = self._get_indexed_data(index_name)
         indexed_keys = set(indexed_data.keys())
@@ -460,6 +499,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         )
 
     def index_meta_init(self, index_name: str, index_configuration: dict[str, Any]):
+        self._ensure_vectorstore_initialized()
         index_meta = super().get_index_meta(index_name)
         if not index_meta:
             self._log_tool_event(
@@ -479,12 +519,53 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             "updated_on": created_on,
             "task_id": None,
             "conversation_id": None,
+            "toolkit_id": self.toolkit_id,
         }
         metadata["history"] = json.dumps([metadata])
         index_meta_doc = Document(page_content=f"{IndexerKeywords.INDEX_META_TYPE.value}_{index_name}", metadata=metadata)
         add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc])
 
-    def index_meta_update(self, index_name: str, state: str, result: int):
+    def index_meta_update(self, index_name: str, state: str, result: int, update_force: bool = True, interval: Optional[float] = None):
+        """Update `index_meta` document with optional time-based throttling.
+
+        Args:
+            index_name: Index name to update meta for.
+            state: New state value for the `index_meta` record.
+            result: Number of processed documents to store in the `updated` field.
+            update_force: If `True`, perform the update unconditionally, ignoring throttling.
+                If `False`, perform the update only when the effective time interval has passed.
+            interval: Optional custom interval (in seconds) for this call when `update_force` is `False`.
+                If `None`, falls back to the value stored in `self._index_meta_config["update_interval"]`
+                if present, otherwise uses `INDEX_META_UPDATE_INTERVAL`.
+        """
+        self._ensure_vectorstore_initialized()
+        if not hasattr(self, "_index_meta_last_update_time"):
+            self._index_meta_last_update_time: Dict[str, float] = {}
+
+        if not update_force:
+            # Resolve effective interval:
+            # 1) explicit arg
+            # 2) value from `_index_meta_config`
+            # 3) default constant
+            cfg_interval = None
+            if hasattr(self, "_index_meta_config"):
+                cfg_interval = self._index_meta_config.get("update_interval")
+
+            eff_interval = (
+                interval
+                if interval is not None
+                else (cfg_interval if cfg_interval is not None else INDEX_META_UPDATE_INTERVAL)
+            )
+
+            last_time = self._index_meta_last_update_time.get(index_name)
+            now = time.time()
+            if last_time is not None and (now - last_time) < eff_interval:
+                return
+            self._index_meta_last_update_time[index_name] = now
+        else:
+            # For forced updates, always refresh last update time
+            self._index_meta_last_update_time[index_name] = time.time()
+
         index_meta_raw = super().get_index_meta(index_name)
         from ..runtime.langchain.interfaces.llm_processor import add_documents
         #
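
The throttling added to `index_meta_update` boils down to a per-index timestamp check: non-forced calls are skipped until the effective interval has elapsed, while forced calls always go through and refresh the timestamp. A minimal standalone sketch of that pattern (the function name and module-level dict are illustrative, not part of the SDK):

```python
import time

_last_update: dict[str, float] = {}  # per-index timestamp of the last accepted update

def should_update(index_name: str, interval: float = 600.0, force: bool = False) -> bool:
    """Return True when the caller should perform the real meta update."""
    now = time.time()
    last = _last_update.get(index_name)
    if not force and last is not None and (now - last) < interval:
        return False  # throttled: too soon since the previous update
    _last_update[index_name] = now
    return True

assert should_update("docs") is True              # first call always passes
assert should_update("docs") is False             # immediately after: throttled
assert should_update("docs", force=True) is True  # forced updates bypass the interval
```
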
@@ -511,6 +592,55 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         index_meta_doc = Document(page_content=index_meta_raw.get("content", ""), metadata=metadata)
         add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc], ids=[index_meta_raw.get("id")])
 
+    def _emit_index_event(self, index_name: str, error: Optional[str] = None):
+        """
+        Emit custom event for index data operation.
+
+        Args:
+            index_name: The name of the index
+            error: Error message if the operation failed, None otherwise
+        """
+        index_meta = super().get_index_meta(index_name)
+
+        if not index_meta:
+            logger.warning(
+                f"No index_meta found for index '{index_name}'. "
+                "Cannot emit index event."
+            )
+            return
+
+        metadata = index_meta.get("metadata", {})
+
+        # Determine if this is a reindex operation
+        history_raw = metadata.get("history", "[]")
+        try:
+            history = json.loads(history_raw) if history_raw.strip() else []
+            is_reindex = len(history) > 1
+        except (json.JSONDecodeError, TypeError):
+            is_reindex = False
+
+        # Build event message
+        event_data = {
+            "id": index_meta.get("id"),
+            "index_name": index_name,
+            "state": "failed" if error is not None else metadata.get("state"),
+            "error": error,
+            "reindex": is_reindex,
+            "indexed": metadata.get("indexed", 0),
+            "updated": metadata.get("updated", 0),
+            "toolkit_id": metadata.get("toolkit_id"),
+        }
+
+        # Emit the event
+        try:
+            dispatch_custom_event("index_data_status", event_data)
+            logger.debug(
+                f"Emitted index_data_status event for index "
+                f"'{index_name}': {event_data}"
+            )
+        except Exception as e:
+            logger.warning(f"Failed to emit index_data_status event: {e}")
+
     def get_available_tools(self):
         """
         Returns the standardized vector search tools (search operations only).
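
`_emit_index_event` relies on `dispatch_custom_event` from `langchain_core`, which publishes a named payload to whatever callback handlers are attached to the current run. A hedged sketch of how such events can be observed (handler and function names are illustrative; assumes a langchain_core version that supports custom events and that the dispatch happens inside a runnable context):

```python
from langchain_core.callbacks import BaseCallbackHandler, dispatch_custom_event
from langchain_core.runnables import RunnableLambda

class IndexStatusPrinter(BaseCallbackHandler):
    """Illustrative handler that reacts to the index_data_status event."""
    def on_custom_event(self, name, data, **kwargs):
        if name == "index_data_status":
            print(f"index '{data.get('index_name')}' -> state={data.get('state')}")

def fake_indexing(index_name, config):
    # dispatch_custom_event needs a run context; passing the config explicitly is the safe form
    dispatch_custom_event(
        "index_data_status",
        {"index_name": index_name, "state": "completed", "indexed": 0},
        config=config,
    )
    return index_name

RunnableLambda(fake_indexing).invoke("docs", config={"callbacks": [IndexStatusPrinter()]})
```
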
@@ -521,8 +651,8 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         """
         return [
             {
-                "name":
-                "mode":
+                "name": IndexTools.INDEX_DATA.value,
+                "mode": IndexTools.INDEX_DATA.value,
                 "ref": self.index_data,
                 "description": "Loads data to index.",
                 "args_schema": create_model(
@@ -532,38 +662,39 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
                 )
             },
             {
-                "name":
-                "mode":
+                "name": IndexTools.SEARCH_INDEX.value,
+                "mode": IndexTools.SEARCH_INDEX.value,
                 "ref": self.search_index,
                 "description": self.search_index.__doc__,
                 "args_schema": BaseSearchParams
             },
             {
-                "name":
-                "mode":
+                "name": IndexTools.STEPBACK_SEARCH_INDEX.value,
+                "mode": IndexTools.STEPBACK_SEARCH_INDEX.value,
                 "ref": self.stepback_search_index,
                 "description": self.stepback_search_index.__doc__,
                 "args_schema": BaseStepbackSearchParams
             },
             {
-                "name":
-                "mode":
+                "name": IndexTools.STEPBACK_SUMMARY_INDEX.value,
+                "mode": IndexTools.STEPBACK_SUMMARY_INDEX.value,
                 "ref": self.stepback_summary_index,
                 "description": self.stepback_summary_index.__doc__,
                 "args_schema": BaseStepbackSearchParams
             },
             {
-                "name":
-                "mode":
+                "name": IndexTools.REMOVE_INDEX.value,
+                "mode": IndexTools.REMOVE_INDEX.value,
                 "ref": self.remove_index,
                 "description": self.remove_index.__doc__,
                 "args_schema": RemoveIndexParams
             },
             {
-                "name":
-                "mode":
+                "name": IndexTools.LIST_COLLECTIONS.value,
+                "mode": IndexTools.LIST_COLLECTIONS.value,
                 "ref": self.list_collections,
                 "description": self.list_collections.__doc__,
-
+                # No parameters
+                "args_schema": create_model("ListCollectionsParams")
             },
-        ]
+        ]

alita_sdk/tools/bitbucket/__init__.py

@@ -47,8 +47,8 @@ class AlitaBitbucketToolkit(BaseToolkit):
         AlitaBitbucketToolkit.toolkit_max_length = get_max_toolkit_length(selected_tools)
         m = create_model(
             name,
-            project=(str, Field(description="Project/Workspace"
-            repository=(str, Field(description="Repository"
+            project=(str, Field(description="Project/Workspace")),
+            repository=(str, Field(description="Repository")),
             branch=(str, Field(description="Main branch", default="main")),
             cloud=(Optional[bool], Field(description="Hosting Option", default=None)),
             bitbucket_configuration=(BitbucketConfiguration, Field(description="Bitbucket Configuration", json_schema_extra={'configuration_types': ['bitbucket']})),
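
The Bitbucket change restores the closing parentheses on the `(type, Field(...))` tuples that `create_model` expects. A standalone sketch of that pydantic pattern (model and field names are illustrative; pydantic v2 assumed):

```python
from pydantic import Field, create_model

# Each field is a (type, Field(...)) tuple; an unclosed tuple is a SyntaxError
ExampleToolkitModel = create_model(
    "ExampleToolkitModel",
    project=(str, Field(description="Project/Workspace")),
    repository=(str, Field(description="Repository")),
    branch=(str, Field(description="Main branch", default="main")),
)

print(ExampleToolkitModel(project="PRJ", repository="repo").model_dump())
# {'project': 'PRJ', 'repository': 'repo', 'branch': 'main'}
```
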

alita_sdk/tools/chunkers/__init__.py

@@ -3,6 +3,7 @@ from .sematic.statistical_chunker import statistical_chunker
 from .sematic.markdown_chunker import markdown_chunker
 from .sematic.proposal_chunker import proposal_chunker
 from .sematic.json_chunker import json_chunker
+from .universal_chunker import universal_chunker, chunk_single_document, get_file_type
 from .models import StatisticalChunkerConfig, MarkdownChunkerConfig, ProposalChunkerConfig
 
 __all__ = {
@@ -10,7 +11,8 @@ __all__ = {
     'statistical': statistical_chunker,
     'markdown': markdown_chunker,
     'proposal': proposal_chunker,
-    'json': json_chunker
+    'json': json_chunker,
+    'universal': universal_chunker,
 }
 
 __confluence_chunkers__ = {
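
The chunker registry in this module's `__all__` is a plain dict mapping short names to chunker callables, so the new `universal` entry becomes selectable by name like the existing ones. A small illustrative lookup (import path assumed from the file list above):

```python
# Look up a chunker by name from the registry dict shown in the diff above
from alita_sdk.tools.chunkers import __all__ as chunker_registry

chunk_fn = chunker_registry["universal"]   # or "markdown", "statistical", "proposal", "json"
print(chunk_fn.__name__)                   # universal_chunker
```
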

alita_sdk/tools/chunkers/sematic/markdown_chunker.py

@@ -1,4 +1,4 @@
-from typing import Generator
+from typing import Generator, List
 from langchain_core.documents import Document
 from langchain_text_splitters import MarkdownHeaderTextSplitter, ExperimentalMarkdownSyntaxTextSplitter
 from langchain.text_splitter import TokenTextSplitter
@@ -7,28 +7,53 @@ from copy import deepcopy as copy
 
 
 def markdown_chunker(file_content_generator: Generator[Document, None, None], config: dict, *args, **kwargs) -> Generator[Document, None, None]:
+    """
+    Chunks markdown documents by headers, with support for:
+    - Minimum chunk size to avoid tiny fragments
+    - Maximum token limit with overflow splitting
+    - Header metadata preservation
+
+    Config options:
+        strip_header (bool): Remove headers from content. Default: False
+        return_each_line (bool): Split on every line. Default: False
+        headers_to_split_on (list): Headers to split on, e.g. [('#', 'H1'), ('##', 'H2')]
+        max_tokens (int): Maximum tokens per chunk. Default: 512
+        token_overlap (int): Token overlap for large chunk splitting. Default: 10
+        min_chunk_chars (int): Minimum characters per chunk. Default: 100
+            Chunks smaller than this will be merged with the next chunk.
+    """
     strip_header = config.get("strip_header", False)
     return_each_line = config.get("return_each_line", False)
     headers_to_split_on = config.get("headers_to_split_on", [])
     max_tokens = config.get("max_tokens", 512)
     tokens_overlapping = config.get("token_overlap", 10)
+    min_chunk_chars = config.get("min_chunk_chars", 100)  # Minimum characters per chunk
+
     headers_to_split_on = [tuple(header) for header in headers_to_split_on]
+
     for doc in file_content_generator:
         doc_metadata = doc.metadata
         doc_content = doc.page_content
         chunk_id = 0
+
         markdown_splitter = MarkdownHeaderTextSplitter(
             headers_to_split_on=headers_to_split_on,
             strip_headers=strip_header,
             return_each_line=return_each_line
         )
         md_header_splits = markdown_splitter.split_text(doc_content)
-
+
+        # Merge small chunks with the next one
+        merged_chunks = _merge_small_chunks(md_header_splits, min_chunk_chars)
+
+        for chunk in merged_chunks:
             if tiktoken_length(chunk.page_content) > max_tokens:
-
-
-
-
+                # Split large chunks into smaller ones
+                for subchunk in TokenTextSplitter(
+                    encoding_name="cl100k_base",
+                    chunk_size=max_tokens,
+                    chunk_overlap=tokens_overlapping
+                ).split_text(chunk.page_content):
                     chunk_id += 1
                     headers_meta = list(chunk.metadata.values())
                     docmeta = copy(doc_metadata)
@@ -52,6 +77,70 @@ def markdown_chunker(file_content_generator: Generator[Document, None, None], co
             )
 
 
+def _merge_small_chunks(chunks: List[Document], min_chars: int) -> List[Document]:
+    """
+    Merge chunks that are smaller than min_chars with the next chunk.
+
+    This prevents tiny fragments (like standalone headers or short notes)
+    from becoming separate chunks.
+
+    Args:
+        chunks: List of Document chunks from markdown splitter
+        min_chars: Minimum character count for a chunk
+
+    Returns:
+        List of merged Document chunks
+    """
+    if not chunks:
+        return chunks
+
+    merged = []
+    pending_content = ""
+    pending_metadata = {}
+
+    for i, chunk in enumerate(chunks):
+        content = chunk.page_content.strip()
+
+        if pending_content:
+            # Merge pending content with current chunk
+            combined_content = pending_content + "\n\n" + content
+            # Use the pending metadata (from the header) but can be extended
+            combined_metadata = {**pending_metadata}
+            # Add any new header info from current chunk
+            for key, value in chunk.metadata.items():
+                if key not in combined_metadata or not combined_metadata[key]:
+                    combined_metadata[key] = value
+
+            if len(combined_content) >= min_chars:
+                # Combined is big enough, emit it
+                merged.append(Document(
+                    page_content=combined_content,
+                    metadata=combined_metadata
+                ))
+                pending_content = ""
+                pending_metadata = {}
+            else:
+                # Still too small, keep accumulating
+                pending_content = combined_content
+                pending_metadata = combined_metadata
+        elif len(content) < min_chars:
+            # Current chunk is too small, start pending
+            pending_content = content
+            pending_metadata = dict(chunk.metadata)
+        else:
+            # Current chunk is big enough
+            merged.append(chunk)
+
+    # Don't forget any remaining pending content
+    if pending_content:
+        merged.append(Document(
+            page_content=pending_content,
+            metadata=pending_metadata
+        ))
+
+    return merged
+
+
 def markdown_by_headers_chunker(file_content_generator: Generator[Document, None, None], config: dict, *args, **kwargs) -> Generator[Document, None, None]:
     strip_header = config.get("strip_header", False)
     return_each_line = config.get("return_each_line", False)
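
Putting the markdown chunker changes together, a hedged usage sketch (import path and sample document are assumptions; the config keys follow the docstring added above):

```python
from langchain_core.documents import Document
from alita_sdk.tools.chunkers import markdown_chunker  # path assumed from the package layout

def load_docs():
    # One markdown document with a short header-only section and a long body
    yield Document(
        page_content="# Guide\n\nShort intro.\n\n## Details\n\n" + ("Some body text. " * 200),
        metadata={"source": "guide.md"},
    )

config = {
    "headers_to_split_on": [["#", "H1"], ["##", "H2"]],
    "max_tokens": 256,        # chunks above this are re-split with token overlap
    "token_overlap": 10,
    "min_chunk_chars": 100,   # smaller fragments get merged into the next chunk
}

for chunk in markdown_chunker(load_docs(), config):
    print(len(chunk.page_content), chunk.metadata)
```
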