alita-sdk 0.3.457__py3-none-any.whl → 0.3.486__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (102)
  1. alita_sdk/cli/__init__.py +10 -0
  2. alita_sdk/cli/__main__.py +17 -0
  3. alita_sdk/cli/agent/__init__.py +5 -0
  4. alita_sdk/cli/agent/default.py +258 -0
  5. alita_sdk/cli/agent_executor.py +155 -0
  6. alita_sdk/cli/agent_loader.py +194 -0
  7. alita_sdk/cli/agent_ui.py +228 -0
  8. alita_sdk/cli/agents.py +3592 -0
  9. alita_sdk/cli/callbacks.py +647 -0
  10. alita_sdk/cli/cli.py +168 -0
  11. alita_sdk/cli/config.py +306 -0
  12. alita_sdk/cli/context/__init__.py +30 -0
  13. alita_sdk/cli/context/cleanup.py +198 -0
  14. alita_sdk/cli/context/manager.py +731 -0
  15. alita_sdk/cli/context/message.py +285 -0
  16. alita_sdk/cli/context/strategies.py +289 -0
  17. alita_sdk/cli/context/token_estimation.py +127 -0
  18. alita_sdk/cli/formatting.py +182 -0
  19. alita_sdk/cli/input_handler.py +419 -0
  20. alita_sdk/cli/inventory.py +1256 -0
  21. alita_sdk/cli/mcp_loader.py +315 -0
  22. alita_sdk/cli/toolkit.py +327 -0
  23. alita_sdk/cli/toolkit_loader.py +85 -0
  24. alita_sdk/cli/tools/__init__.py +43 -0
  25. alita_sdk/cli/tools/approval.py +224 -0
  26. alita_sdk/cli/tools/filesystem.py +1665 -0
  27. alita_sdk/cli/tools/planning.py +389 -0
  28. alita_sdk/cli/tools/terminal.py +414 -0
  29. alita_sdk/community/__init__.py +64 -8
  30. alita_sdk/community/inventory/__init__.py +224 -0
  31. alita_sdk/community/inventory/config.py +257 -0
  32. alita_sdk/community/inventory/enrichment.py +2137 -0
  33. alita_sdk/community/inventory/extractors.py +1469 -0
  34. alita_sdk/community/inventory/ingestion.py +3172 -0
  35. alita_sdk/community/inventory/knowledge_graph.py +1457 -0
  36. alita_sdk/community/inventory/parsers/__init__.py +218 -0
  37. alita_sdk/community/inventory/parsers/base.py +295 -0
  38. alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
  39. alita_sdk/community/inventory/parsers/go_parser.py +851 -0
  40. alita_sdk/community/inventory/parsers/html_parser.py +389 -0
  41. alita_sdk/community/inventory/parsers/java_parser.py +593 -0
  42. alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
  43. alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
  44. alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
  45. alita_sdk/community/inventory/parsers/python_parser.py +604 -0
  46. alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
  47. alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
  48. alita_sdk/community/inventory/parsers/text_parser.py +322 -0
  49. alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
  50. alita_sdk/community/inventory/patterns/__init__.py +61 -0
  51. alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
  52. alita_sdk/community/inventory/patterns/loader.py +348 -0
  53. alita_sdk/community/inventory/patterns/registry.py +198 -0
  54. alita_sdk/community/inventory/presets.py +535 -0
  55. alita_sdk/community/inventory/retrieval.py +1403 -0
  56. alita_sdk/community/inventory/toolkit.py +169 -0
  57. alita_sdk/community/inventory/visualize.py +1370 -0
  58. alita_sdk/configurations/bitbucket.py +0 -3
  59. alita_sdk/runtime/clients/client.py +99 -26
  60. alita_sdk/runtime/langchain/assistant.py +4 -2
  61. alita_sdk/runtime/langchain/constants.py +2 -1
  62. alita_sdk/runtime/langchain/langraph_agent.py +134 -31
  63. alita_sdk/runtime/langchain/utils.py +1 -1
  64. alita_sdk/runtime/llms/preloaded.py +2 -6
  65. alita_sdk/runtime/toolkits/__init__.py +2 -0
  66. alita_sdk/runtime/toolkits/application.py +1 -1
  67. alita_sdk/runtime/toolkits/mcp.py +46 -36
  68. alita_sdk/runtime/toolkits/planning.py +171 -0
  69. alita_sdk/runtime/toolkits/tools.py +39 -6
  70. alita_sdk/runtime/tools/function.py +17 -5
  71. alita_sdk/runtime/tools/llm.py +249 -14
  72. alita_sdk/runtime/tools/planning/__init__.py +36 -0
  73. alita_sdk/runtime/tools/planning/models.py +246 -0
  74. alita_sdk/runtime/tools/planning/wrapper.py +607 -0
  75. alita_sdk/runtime/tools/vectorstore_base.py +41 -6
  76. alita_sdk/runtime/utils/mcp_oauth.py +80 -0
  77. alita_sdk/runtime/utils/streamlit.py +6 -10
  78. alita_sdk/runtime/utils/toolkit_utils.py +19 -4
  79. alita_sdk/tools/__init__.py +54 -27
  80. alita_sdk/tools/ado/repos/repos_wrapper.py +1 -2
  81. alita_sdk/tools/base_indexer_toolkit.py +150 -19
  82. alita_sdk/tools/bitbucket/__init__.py +2 -2
  83. alita_sdk/tools/chunkers/__init__.py +3 -1
  84. alita_sdk/tools/chunkers/sematic/markdown_chunker.py +95 -6
  85. alita_sdk/tools/chunkers/universal_chunker.py +269 -0
  86. alita_sdk/tools/code_indexer_toolkit.py +55 -22
  87. alita_sdk/tools/elitea_base.py +86 -21
  88. alita_sdk/tools/jira/__init__.py +1 -1
  89. alita_sdk/tools/jira/api_wrapper.py +91 -40
  90. alita_sdk/tools/non_code_indexer_toolkit.py +1 -0
  91. alita_sdk/tools/qtest/__init__.py +1 -1
  92. alita_sdk/tools/qtest/api_wrapper.py +871 -32
  93. alita_sdk/tools/sharepoint/api_wrapper.py +22 -2
  94. alita_sdk/tools/sharepoint/authorization_helper.py +17 -1
  95. alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +8 -2
  96. alita_sdk/tools/zephyr_essential/api_wrapper.py +12 -13
  97. {alita_sdk-0.3.457.dist-info → alita_sdk-0.3.486.dist-info}/METADATA +146 -2
  98. {alita_sdk-0.3.457.dist-info → alita_sdk-0.3.486.dist-info}/RECORD +102 -40
  99. alita_sdk-0.3.486.dist-info/entry_points.txt +2 -0
  100. {alita_sdk-0.3.457.dist-info → alita_sdk-0.3.486.dist-info}/WHEEL +0 -0
  101. {alita_sdk-0.3.457.dist-info → alita_sdk-0.3.486.dist-info}/licenses/LICENSE +0 -0
  102. {alita_sdk-0.3.457.dist-info → alita_sdk-0.3.486.dist-info}/top_level.txt +0 -0
alita_sdk/tools/base_indexer_toolkit.py

@@ -2,8 +2,10 @@ import copy
 import json
 import logging
 import time
+from enum import Enum
 from typing import Any, Optional, List, Dict, Generator
 
+from langchain_core.callbacks import dispatch_custom_event
 from langchain_core.documents import Document
 from pydantic import create_model, Field, SecretStr
 
@@ -15,7 +17,17 @@ from ..runtime.utils.utils import IndexerKeywords
 
 logger = logging.getLogger(__name__)
 
-DEFAULT_CUT_OFF = 0.2
+DEFAULT_CUT_OFF = 0.1
+INDEX_META_UPDATE_INTERVAL = 600.0
+
+class IndexTools(str, Enum):
+    """Enum for index-related tool names."""
+    INDEX_DATA = "index_data"
+    SEARCH_INDEX = "search_index"
+    STEPBACK_SEARCH_INDEX = "stepback_search_index"
+    STEPBACK_SUMMARY_INDEX = "stepback_summary_index"
+    REMOVE_INDEX = "remove_index"
+    LIST_COLLECTIONS = "list_collections"
 
 # Base Vector Store Schema Models
 BaseIndexParams = create_model(
@@ -156,6 +168,16 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         clean_index = kwargs.get("clean_index")
         chunking_tool = kwargs.get("chunking_tool")
         chunking_config = kwargs.get("chunking_config")
+
+        # Store the interval in a private dict to avoid Pydantic field errors
+        if not hasattr(self, "_index_meta_config"):
+            self._index_meta_config: Dict[str, Any] = {}
+
+        self._index_meta_config["update_interval"] = kwargs.get(
+            "meta_update_interval",
+            INDEX_META_UPDATE_INTERVAL,
+        )
+
         result = {"count": 0}
         #
         try:
@@ -163,6 +185,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             self._clean_index(index_name)
             #
             self.index_meta_init(index_name, kwargs)
+            self._emit_index_event(index_name)
             #
             self._log_tool_event(f"Indexing data into collection with suffix '{index_name}'. It can take some time...")
             self._log_tool_event(f"Loading the documents to index...{kwargs}")
@@ -178,16 +201,26 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             self._save_index_generator(documents, documents_count, chunking_tool, chunking_config, index_name=index_name, result=result)
             #
             results_count = result["count"]
-            self.index_meta_update(index_name, IndexerKeywords.INDEX_META_COMPLETED.value, results_count)
+            # Final update should always be forced
+            self.index_meta_update(index_name, IndexerKeywords.INDEX_META_COMPLETED.value, results_count, update_force=True)
+            self._emit_index_event(index_name)
             #
             return {"status": "ok", "message": f"successfully indexed {results_count} documents" if results_count > 0
                     else "no new documents to index"}
         except Exception as e:
-            self.index_meta_update(index_name, IndexerKeywords.INDEX_META_FAILED.value, result["count"])
+            # Do maximum effort at least send custom event for supposed changed status
+            msg = str(e)
+            try:
+                # Error update should also be forced
+                self.index_meta_update(index_name, IndexerKeywords.INDEX_META_FAILED.value, result["count"], update_force=True)
+            except Exception as ie:
+                logger.error(f"Failed to update index meta status to FAILED for index '{index_name}': {ie}")
+                msg = f"{msg}; additionally failed to update index meta status to FAILED: {ie}"
+            self._emit_index_event(index_name, error=msg)
             raise e
-
 
     def _save_index_generator(self, base_documents: Generator[Document, None, None], base_total: int, chunking_tool, chunking_config, result, index_name: Optional[str] = None):
+        self._ensure_vectorstore_initialized()
         self._log_tool_event(f"Base documents are ready for indexing. {base_total} base documents in total to index.")
         from ..runtime.langchain.interfaces.llm_processor import add_documents
         #
@@ -240,6 +273,11 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             logger.debug(msg)
             self._log_tool_event(msg)
             result["count"] += dependent_docs_counter
+            # After each base document, try a non-forced meta update; throttling handled inside index_meta_update
+            try:
+                self.index_meta_update(index_name, IndexerKeywords.INDEX_META_IN_PROGRESS.value, result["count"], update_force=False)
+            except Exception as exc:  # best-effort, do not break indexing
+                logger.warning(f"Failed to update index meta during indexing process for index '{index_name}': {exc}")
             if pg_vector_add_docs_chunk:
                 add_documents(vectorstore=self.vectorstore, documents=pg_vector_add_docs_chunk)
 
@@ -305,6 +343,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             log_msg: str = "Verification of documents to index started"
     ) -> Generator[Document, None, None]:
         """Generic duplicate reduction logic for documents."""
+        self._ensure_vectorstore_initialized()
         self._log_tool_event(log_msg, tool_name="index_documents")
         indexed_data = self._get_indexed_data(index_name)
         indexed_keys = set(indexed_data.keys())
@@ -460,6 +499,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         )
 
     def index_meta_init(self, index_name: str, index_configuration: dict[str, Any]):
+        self._ensure_vectorstore_initialized()
         index_meta = super().get_index_meta(index_name)
         if not index_meta:
             self._log_tool_event(
@@ -479,12 +519,53 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
                 "updated_on": created_on,
                 "task_id": None,
                 "conversation_id": None,
+                "toolkit_id": self.toolkit_id,
             }
             metadata["history"] = json.dumps([metadata])
             index_meta_doc = Document(page_content=f"{IndexerKeywords.INDEX_META_TYPE.value}_{index_name}", metadata=metadata)
             add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc])
 
-    def index_meta_update(self, index_name: str, state: str, result: int):
+    def index_meta_update(self, index_name: str, state: str, result: int, update_force: bool = True, interval: Optional[float] = None):
+        """Update `index_meta` document with optional time-based throttling.
+
+        Args:
+            index_name: Index name to update meta for.
+            state: New state value for the `index_meta` record.
+            result: Number of processed documents to store in the `updated` field.
+            update_force: If `True`, perform the update unconditionally, ignoring throttling.
+                If `False`, perform the update only when the effective time interval has passed.
+            interval: Optional custom interval (in seconds) for this call when `update_force` is `False`.
+                If `None`, falls back to the value stored in `self._index_meta_config["update_interval"]`
+                if present, otherwise uses `INDEX_META_UPDATE_INTERVAL`.
+        """
+        self._ensure_vectorstore_initialized()
+        if not hasattr(self, "_index_meta_last_update_time"):
+            self._index_meta_last_update_time: Dict[str, float] = {}
+
+        if not update_force:
+            # Resolve effective interval:
+            # 1) explicit arg
+            # 2) value from `_index_meta_config`
+            # 3) default constant
+            cfg_interval = None
+            if hasattr(self, "_index_meta_config"):
+                cfg_interval = self._index_meta_config.get("update_interval")
+
+            eff_interval = (
+                interval
+                if interval is not None
+                else (cfg_interval if cfg_interval is not None else INDEX_META_UPDATE_INTERVAL)
+            )
+
+            last_time = self._index_meta_last_update_time.get(index_name)
+            now = time.time()
+            if last_time is not None and (now - last_time) < eff_interval:
+                return
+            self._index_meta_last_update_time[index_name] = now
+        else:
+            # For forced updates, always refresh last update time
+            self._index_meta_last_update_time[index_name] = time.time()
+
         index_meta_raw = super().get_index_meta(index_name)
         from ..runtime.langchain.interfaces.llm_processor import add_documents
         #
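
With the throttling above, a long-running index_data call refreshes the index_meta record at most once per interval (default INDEX_META_UPDATE_INTERVAL, 600 s), while the initial, completed, and failed updates stay forced. A minimal sketch of shortening that interval per call; it assumes `toolkit` is an instance of a concrete BaseIndexerToolkit subclass and that extra keyword arguments reach index_data's kwargs as shown in the hunk above, so the exact call shape is illustrative rather than confirmed by this diff:

# Illustrative only: `toolkit` is assumed to be a concrete BaseIndexerToolkit subclass instance.
result = toolkit.index_data(
    index_name="docs",
    clean_index=False,           # read via kwargs.get("clean_index")
    chunking_tool="markdown",    # read via kwargs.get("chunking_tool")
    meta_update_interval=120.0,  # kwargs.get("meta_update_interval", INDEX_META_UPDATE_INTERVAL)
)
# Progress (IN_PROGRESS) meta updates are then written at most every 120 seconds,
# while COMPLETED/FAILED updates still pass update_force=True.
print(result)  # {"status": "ok", ...} on success
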
@@ -511,6 +592,55 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         index_meta_doc = Document(page_content=index_meta_raw.get("content", ""), metadata=metadata)
         add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc], ids=[index_meta_raw.get("id")])
 
+    def _emit_index_event(self, index_name: str, error: Optional[str] = None):
+        """
+        Emit custom event for index data operation.
+
+        Args:
+            index_name: The name of the index
+            error: Error message if the operation failed, None otherwise
+        """
+        index_meta = super().get_index_meta(index_name)
+
+        if not index_meta:
+            logger.warning(
+                f"No index_meta found for index '{index_name}'. "
+                "Cannot emit index event."
+            )
+            return
+
+        metadata = index_meta.get("metadata", {})
+
+        # Determine if this is a reindex operation
+        history_raw = metadata.get("history", "[]")
+        try:
+            history = json.loads(history_raw) if history_raw.strip() else []
+            is_reindex = len(history) > 1
+        except (json.JSONDecodeError, TypeError):
+            is_reindex = False
+
+        # Build event message
+        event_data = {
+            "id": index_meta.get("id"),
+            "index_name": index_name,
+            "state": "failed" if error is not None else metadata.get("state"),
+            "error": error,
+            "reindex": is_reindex,
+            "indexed": metadata.get("indexed", 0),
+            "updated": metadata.get("updated", 0),
+            "toolkit_id": metadata.get("toolkit_id"),
+        }
+
+        # Emit the event
+        try:
+            dispatch_custom_event("index_data_status", event_data)
+            logger.debug(
+                f"Emitted index_data_status event for index "
+                f"'{index_name}': {event_data}"
+            )
+        except Exception as e:
+            logger.warning(f"Failed to emit index_data_status event: {e}")
+
     def get_available_tools(self):
         """
         Returns the standardized vector search tools (search operations only).
@@ -521,8 +651,8 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         """
         return [
             {
-                "name": "index_data",
-                "mode": "index_data",
+                "name": IndexTools.INDEX_DATA.value,
+                "mode": IndexTools.INDEX_DATA.value,
                 "ref": self.index_data,
                 "description": "Loads data to index.",
                 "args_schema": create_model(
@@ -532,38 +662,39 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
                 )
             },
             {
-                "name": "search_index",
-                "mode": "search_index",
+                "name": IndexTools.SEARCH_INDEX.value,
+                "mode": IndexTools.SEARCH_INDEX.value,
                 "ref": self.search_index,
                 "description": self.search_index.__doc__,
                 "args_schema": BaseSearchParams
             },
             {
-                "name": "stepback_search_index",
-                "mode": "stepback_search_index",
+                "name": IndexTools.STEPBACK_SEARCH_INDEX.value,
+                "mode": IndexTools.STEPBACK_SEARCH_INDEX.value,
                 "ref": self.stepback_search_index,
                 "description": self.stepback_search_index.__doc__,
                 "args_schema": BaseStepbackSearchParams
             },
             {
-                "name": "stepback_summary_index",
-                "mode": "stepback_summary_index",
+                "name": IndexTools.STEPBACK_SUMMARY_INDEX.value,
+                "mode": IndexTools.STEPBACK_SUMMARY_INDEX.value,
                 "ref": self.stepback_summary_index,
                 "description": self.stepback_summary_index.__doc__,
                 "args_schema": BaseStepbackSearchParams
             },
             {
-                "name": "remove_index",
-                "mode": "remove_index",
+                "name": IndexTools.REMOVE_INDEX.value,
+                "mode": IndexTools.REMOVE_INDEX.value,
                 "ref": self.remove_index,
                 "description": self.remove_index.__doc__,
                 "args_schema": RemoveIndexParams
             },
             {
-                "name": "list_collections",
-                "mode": "list_collections",
+                "name": IndexTools.LIST_COLLECTIONS.value,
+                "mode": IndexTools.LIST_COLLECTIONS.value,
                 "ref": self.list_collections,
                 "description": self.list_collections.__doc__,
-                "args_schema": create_model("ListCollectionsParams") # No parameters
+                # No parameters
+                "args_schema": create_model("ListCollectionsParams")
             },
-        ]
+        ]
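
The _emit_index_event helper above publishes progress over LangChain's custom-event channel. A hedged sketch of a consumer follows; the handler class name and wiring are illustrative, not part of this diff, and the fields mirror the event_data dict built above. Because dispatch_custom_event only works inside a run context, the handler has to be registered in the invocation's callbacks (for example, config={"callbacks": [IndexStatusHandler()]} when invoking the agent or tool):

from typing import Any, Dict, Optional
from uuid import UUID

from langchain_core.callbacks import BaseCallbackHandler


class IndexStatusHandler(BaseCallbackHandler):
    """Illustrative handler that reacts to `index_data_status` custom events."""

    def on_custom_event(
        self,
        name: str,
        data: Any,
        *,
        run_id: UUID,
        tags: Optional[list] = None,
        metadata: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> None:
        if name != "index_data_status":
            return
        if data.get("error"):
            print(f"Index '{data.get('index_name')}' failed: {data['error']}")
        else:
            print(
                f"Index '{data.get('index_name')}' -> {data.get('state')} "
                f"(reindex={data.get('reindex')}, updated={data.get('updated')})"
            )
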
alita_sdk/tools/bitbucket/__init__.py

@@ -47,8 +47,8 @@ class AlitaBitbucketToolkit(BaseToolkit):
         AlitaBitbucketToolkit.toolkit_max_length = get_max_toolkit_length(selected_tools)
         m = create_model(
             name,
-            project=(str, Field(description="Project/Workspace", json_schema_extra={'configuration': True})),
-            repository=(str, Field(description="Repository", json_schema_extra={'max_toolkit_length': AlitaBitbucketToolkit.toolkit_max_length, 'configuration': True})),
+            project=(str, Field(description="Project/Workspace")),
+            repository=(str, Field(description="Repository")),
             branch=(str, Field(description="Main branch", default="main")),
             cloud=(Optional[bool], Field(description="Hosting Option", default=None)),
             bitbucket_configuration=(BitbucketConfiguration, Field(description="Bitbucket Configuration", json_schema_extra={'configuration_types': ['bitbucket']})),
alita_sdk/tools/chunkers/__init__.py

@@ -3,6 +3,7 @@ from .sematic.statistical_chunker import statistical_chunker
 from .sematic.markdown_chunker import markdown_chunker
 from .sematic.proposal_chunker import proposal_chunker
 from .sematic.json_chunker import json_chunker
+from .universal_chunker import universal_chunker, chunk_single_document, get_file_type
 from .models import StatisticalChunkerConfig, MarkdownChunkerConfig, ProposalChunkerConfig
 
 __all__ = {
@@ -10,7 +11,8 @@ __all__ = {
     'statistical': statistical_chunker,
     'markdown': markdown_chunker,
     'proposal': proposal_chunker,
-    'json': json_chunker
+    'json': json_chunker,
+    'universal': universal_chunker,
 }
 
 __confluence_chunkers__ = {
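
With the new 'universal' entry registered, callers can resolve a chunker from this dict by key. A minimal lookup sketch; the universal chunker's call signature is not shown in this diff, so treating it as the same (file_content_generator, config) convention used by the other chunkers is an assumption:

from langchain_core.documents import Document

from alita_sdk.tools.chunkers import __all__ as chunkers  # the registry dict above


def docs():
    yield Document(page_content="Some text to be chunked.", metadata={"source": "note.txt"})


chunker = chunkers["universal"]  # other keys: "statistical", "markdown", "proposal", "json"
# Assumed to follow the (file_content_generator, config) convention of the other chunkers;
# accepted config keys depend on the selected chunker.
for chunk in chunker(docs(), {}):
    print(chunk.metadata, len(chunk.page_content))
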
alita_sdk/tools/chunkers/sematic/markdown_chunker.py

@@ -1,4 +1,4 @@
-from typing import Generator
+from typing import Generator, List
 from langchain_core.documents import Document
 from langchain_text_splitters import MarkdownHeaderTextSplitter, ExperimentalMarkdownSyntaxTextSplitter
 from langchain.text_splitter import TokenTextSplitter
@@ -7,28 +7,53 @@ from copy import deepcopy as copy
 
 
 def markdown_chunker(file_content_generator: Generator[Document, None, None], config: dict, *args, **kwargs) -> Generator[Document, None, None]:
+    """
+    Chunks markdown documents by headers, with support for:
+    - Minimum chunk size to avoid tiny fragments
+    - Maximum token limit with overflow splitting
+    - Header metadata preservation
+
+    Config options:
+        strip_header (bool): Remove headers from content. Default: False
+        return_each_line (bool): Split on every line. Default: False
+        headers_to_split_on (list): Headers to split on, e.g. [('#', 'H1'), ('##', 'H2')]
+        max_tokens (int): Maximum tokens per chunk. Default: 512
+        token_overlap (int): Token overlap for large chunk splitting. Default: 10
+        min_chunk_chars (int): Minimum characters per chunk. Default: 100
+            Chunks smaller than this will be merged with the next chunk.
+    """
     strip_header = config.get("strip_header", False)
     return_each_line = config.get("return_each_line", False)
     headers_to_split_on = config.get("headers_to_split_on", [])
     max_tokens = config.get("max_tokens", 512)
     tokens_overlapping = config.get("token_overlap", 10)
+    min_chunk_chars = config.get("min_chunk_chars", 100)  # Minimum characters per chunk
+
     headers_to_split_on = [tuple(header) for header in headers_to_split_on]
+
     for doc in file_content_generator:
         doc_metadata = doc.metadata
         doc_content = doc.page_content
         chunk_id = 0
+
         markdown_splitter = MarkdownHeaderTextSplitter(
             headers_to_split_on=headers_to_split_on,
             strip_headers=strip_header,
             return_each_line=return_each_line
         )
         md_header_splits = markdown_splitter.split_text(doc_content)
-        for chunk in md_header_splits:
+
+        # Merge small chunks with the next one
+        merged_chunks = _merge_small_chunks(md_header_splits, min_chunk_chars)
+
+        for chunk in merged_chunks:
             if tiktoken_length(chunk.page_content) > max_tokens:
-                for subchunk in TokenTextSplitter(encoding_name="cl100k_base",
-                                                  chunk_size=max_tokens,
-                                                  chunk_overlap=tokens_overlapping
-                                                  ).split_text(chunk.page_content):
+                # Split large chunks into smaller ones
+                for subchunk in TokenTextSplitter(
+                    encoding_name="cl100k_base",
+                    chunk_size=max_tokens,
+                    chunk_overlap=tokens_overlapping
+                ).split_text(chunk.page_content):
                     chunk_id += 1
                     headers_meta = list(chunk.metadata.values())
                     docmeta = copy(doc_metadata)
@@ -52,6 +77,70 @@ def markdown_chunker(file_content_generator: Generator[Document, None, None], co
                 )
 
 
+def _merge_small_chunks(chunks: List[Document], min_chars: int) -> List[Document]:
+    """
+    Merge chunks that are smaller than min_chars with the next chunk.
+
+    This prevents tiny fragments (like standalone headers or short notes)
+    from becoming separate chunks.
+
+    Args:
+        chunks: List of Document chunks from markdown splitter
+        min_chars: Minimum character count for a chunk
+
+    Returns:
+        List of merged Document chunks
+    """
+    if not chunks:
+        return chunks
+
+    merged = []
+    pending_content = ""
+    pending_metadata = {}
+
+    for i, chunk in enumerate(chunks):
+        content = chunk.page_content.strip()
+
+        if pending_content:
+            # Merge pending content with current chunk
+            combined_content = pending_content + "\n\n" + content
+            # Use the pending metadata (from the header) but can be extended
+            combined_metadata = {**pending_metadata}
+            # Add any new header info from current chunk
+            for key, value in chunk.metadata.items():
+                if key not in combined_metadata or not combined_metadata[key]:
+                    combined_metadata[key] = value
+
+            if len(combined_content) >= min_chars:
+                # Combined is big enough, emit it
+                merged.append(Document(
+                    page_content=combined_content,
+                    metadata=combined_metadata
+                ))
+                pending_content = ""
+                pending_metadata = {}
+            else:
+                # Still too small, keep accumulating
+                pending_content = combined_content
+                pending_metadata = combined_metadata
+        elif len(content) < min_chars:
+            # Current chunk is too small, start pending
+            pending_content = content
+            pending_metadata = dict(chunk.metadata)
+        else:
+            # Current chunk is big enough
+            merged.append(chunk)
+
+    # Don't forget any remaining pending content
+    if pending_content:
+        merged.append(Document(
+            page_content=pending_content,
+            metadata=pending_metadata
+        ))
+
+    return merged
+
+
 def markdown_by_headers_chunker(file_content_generator: Generator[Document, None, None], config: dict, *args, **kwargs) -> Generator[Document, None, None]:
     strip_header = config.get("strip_header", False)
     return_each_line = config.get("return_each_line", False)
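
A usage sketch of the updated markdown_chunker, using only the config keys documented in its new docstring; the sample document and values are made up for illustration:

from langchain_core.documents import Document

from alita_sdk.tools.chunkers import markdown_chunker


def docs():
    # A short "Overview" section below min_chunk_chars, followed by a long "Details" section
    yield Document(
        page_content="# Overview\nShort note.\n\n## Details\n" + ("Long body text. " * 200),
        metadata={"source": "README.md"},
    )


config = {
    "headers_to_split_on": [("#", "H1"), ("##", "H2")],
    "max_tokens": 512,       # oversized sections are re-split with TokenTextSplitter
    "token_overlap": 10,
    "min_chunk_chars": 100,  # fragments below this are merged with the next chunk
}

for chunk in markdown_chunker(docs(), config):
    print(chunk.metadata, len(chunk.page_content))

With min_chunk_chars at 100, the short "# Overview" fragment is merged into the following "## Details" chunk instead of being emitted on its own, and the oversized section is re-split at the 512-token limit, matching the behavior added in this diff.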