hdsp-jupyter-extension 2.0.10__py3-none-any.whl → 2.0.11__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (70)
  1. agent_server/core/notebook_generator.py +4 -4
  2. agent_server/langchain/custom_middleware.py +95 -9
  3. agent_server/langchain/hitl_config.py +5 -0
  4. agent_server/langchain/llm_factory.py +1 -85
  5. agent_server/langchain/prompts.py +105 -128
  6. agent_server/prompts/file_action_prompts.py +8 -8
  7. agent_server/routers/langchain_agent.py +78 -12
  8. hdsp_agent_core/__init__.py +46 -47
  9. hdsp_agent_core/factory.py +6 -10
  10. hdsp_agent_core/interfaces.py +4 -2
  11. hdsp_agent_core/knowledge/__init__.py +5 -5
  12. hdsp_agent_core/knowledge/chunking.py +87 -61
  13. hdsp_agent_core/knowledge/loader.py +103 -101
  14. hdsp_agent_core/llm/service.py +192 -107
  15. hdsp_agent_core/managers/config_manager.py +16 -22
  16. hdsp_agent_core/managers/session_manager.py +5 -4
  17. hdsp_agent_core/models/__init__.py +12 -12
  18. hdsp_agent_core/models/agent.py +15 -8
  19. hdsp_agent_core/models/common.py +1 -2
  20. hdsp_agent_core/models/rag.py +48 -111
  21. hdsp_agent_core/prompts/__init__.py +12 -12
  22. hdsp_agent_core/prompts/cell_action_prompts.py +9 -7
  23. hdsp_agent_core/services/agent_service.py +10 -8
  24. hdsp_agent_core/services/chat_service.py +10 -6
  25. hdsp_agent_core/services/rag_service.py +3 -6
  26. hdsp_agent_core/tests/conftest.py +4 -1
  27. hdsp_agent_core/tests/test_factory.py +2 -2
  28. hdsp_agent_core/tests/test_services.py +12 -19
  29. {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/build_log.json +1 -1
  30. {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/package.json +2 -2
  31. hdsp_jupyter_extension-2.0.10.data/data/share/jupyter/labextensions/hdsp-agent/static/lib_index_js.dc6434bee96ab03a0539.js → hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/lib_index_js.58c1e128ba0b76f41f04.js +81 -77
  32. hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/lib_index_js.58c1e128ba0b76f41f04.js.map +1 -0
  33. hdsp_jupyter_extension-2.0.10.data/data/share/jupyter/labextensions/hdsp-agent/static/remoteEntry.4a252df3ade74efee8d6.js → hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/remoteEntry.9da31d1134a53b0c4af5.js +3 -3
  34. jupyter_ext/labextension/static/remoteEntry.4a252df3ade74efee8d6.js.map → hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/remoteEntry.9da31d1134a53b0c4af5.js.map +1 -1
  35. {hdsp_jupyter_extension-2.0.10.dist-info → hdsp_jupyter_extension-2.0.11.dist-info}/METADATA +1 -1
  36. {hdsp_jupyter_extension-2.0.10.dist-info → hdsp_jupyter_extension-2.0.11.dist-info}/RECORD +68 -68
  37. jupyter_ext/__init__.py +21 -11
  38. jupyter_ext/_version.py +1 -1
  39. jupyter_ext/handlers.py +69 -50
  40. jupyter_ext/labextension/build_log.json +1 -1
  41. jupyter_ext/labextension/package.json +2 -2
  42. jupyter_ext/labextension/static/{lib_index_js.dc6434bee96ab03a0539.js → lib_index_js.58c1e128ba0b76f41f04.js} +81 -77
  43. jupyter_ext/labextension/static/lib_index_js.58c1e128ba0b76f41f04.js.map +1 -0
  44. jupyter_ext/labextension/static/{remoteEntry.4a252df3ade74efee8d6.js → remoteEntry.9da31d1134a53b0c4af5.js} +3 -3
  45. hdsp_jupyter_extension-2.0.10.data/data/share/jupyter/labextensions/hdsp-agent/static/remoteEntry.4a252df3ade74efee8d6.js.map → jupyter_ext/labextension/static/remoteEntry.9da31d1134a53b0c4af5.js.map +1 -1
  46. hdsp_jupyter_extension-2.0.10.data/data/share/jupyter/labextensions/hdsp-agent/static/lib_index_js.dc6434bee96ab03a0539.js.map +0 -1
  47. jupyter_ext/labextension/static/lib_index_js.dc6434bee96ab03a0539.js.map +0 -1
  48. {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.11.data}/data/etc/jupyter/jupyter_server_config.d/hdsp_jupyter_extension.json +0 -0
  49. {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/install.json +0 -0
  50. {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/frontend_styles_index_js.2d9fb488c82498c45c2d.js +0 -0
  51. {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/frontend_styles_index_js.2d9fb488c82498c45c2d.js.map +0 -0
  52. {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/node_modules_emotion_use-insertion-effect-with-fallbacks_dist_emotion-use-insertion-effect-wi-3ba6b80.c095373419d05e6f141a.js +0 -0
  53. {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/node_modules_emotion_use-insertion-effect-with-fallbacks_dist_emotion-use-insertion-effect-wi-3ba6b80.c095373419d05e6f141a.js.map +0 -0
  54. {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/node_modules_emotion_use-insertion-effect-with-fallbacks_dist_emotion-use-insertion-effect-wi-3ba6b81.61e75fb98ecff46cf836.js +0 -0
  55. {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/node_modules_emotion_use-insertion-effect-with-fallbacks_dist_emotion-use-insertion-effect-wi-3ba6b81.61e75fb98ecff46cf836.js.map +0 -0
  56. {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/style.js +0 -0
  57. {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_babel_runtime_helpers_esm_extends_js-node_modules_emotion_serialize_dist-051195.e2553aab0c3963b83dd7.js +0 -0
  58. {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_babel_runtime_helpers_esm_extends_js-node_modules_emotion_serialize_dist-051195.e2553aab0c3963b83dd7.js.map +0 -0
  59. {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_cache_dist_emotion-cache_browser_development_esm_js.24edcc52a1c014a8a5f0.js +0 -0
  60. {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_cache_dist_emotion-cache_browser_development_esm_js.24edcc52a1c014a8a5f0.js.map +0 -0
  61. {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_react_dist_emotion-react_browser_development_esm_js.19ecf6babe00caff6b8a.js +0 -0
  62. {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_react_dist_emotion-react_browser_development_esm_js.19ecf6babe00caff6b8a.js.map +0 -0
  63. {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_styled_dist_emotion-styled_browser_development_esm_js.661fb5836f4978a7c6e1.js +0 -0
  64. {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_styled_dist_emotion-styled_browser_development_esm_js.661fb5836f4978a7c6e1.js.map +0 -0
  65. {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_mui_material_index_js.985697e0162d8d088ca2.js +0 -0
  66. {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_mui_material_index_js.985697e0162d8d088ca2.js.map +0 -0
  67. {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_mui_material_utils_createSvgIcon_js.1f5038488cdfd8b3a85d.js +0 -0
  68. {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_mui_material_utils_createSvgIcon_js.1f5038488cdfd8b3a85d.js.map +0 -0
  69. {hdsp_jupyter_extension-2.0.10.dist-info → hdsp_jupyter_extension-2.0.11.dist-info}/WHEEL +0 -0
  70. {hdsp_jupyter_extension-2.0.10.dist-info → hdsp_jupyter_extension-2.0.11.dist-info}/licenses/LICENSE +0 -0
@@ -909,6 +909,26 @@ async def stream_agent(request: AgentRequest):
909
909
  "event": "todos",
910
910
  "data": json.dumps({"todos": todos}),
911
911
  }
912
+ # Check if all todos are completed - terminate early
913
+ all_completed = all(
914
+ t.get("status") == "completed" for t in todos
915
+ )
916
+ if all_completed and len(todos) > 0:
917
+ logger.info(
918
+ "All %d todos completed in AIMessage tool_calls, auto-terminating",
919
+ len(todos),
920
+ )
921
+ yield {
922
+ "event": "debug_clear",
923
+ "data": json.dumps({}),
924
+ }
925
+ yield {
926
+ "event": "done",
927
+ "data": json.dumps(
928
+ {"reason": "all_todos_completed"}
929
+ ),
930
+ }
931
+ return # Exit before executing more tool calls
912
932
  for tool_call in tool_calls:
913
933
  tool_name = tool_call.get("name", "unknown")
914
934
  tool_args = tool_call.get("args", {})
@@ -1032,16 +1052,31 @@ async def stream_agent(request: AgentRequest):
1032
1052
  content = "\n".join(text_parts)
1033
1053
 
1034
1054
  # Filter out raw JSON tool responses
1055
+ content_stripped = content.strip() if content else ""
1056
+
1057
+ # Filter out tool call JSON (but allow summary/next_items JSON for frontend rendering)
1058
+ is_json_tool_response = (
1059
+ content_stripped.startswith('{"tool":')
1060
+ or content_stripped.startswith('{ "tool":')
1061
+ or content_stripped.startswith('{"tool" :')
1062
+ or content_stripped.startswith('{"status":')
1063
+ or '"pending_execution"' in content
1064
+ or '"status": "complete"' in content
1065
+ or (
1066
+ '"tool"' in content
1067
+ and '"write_todos"' in content
1068
+ )
1069
+ or (
1070
+ '"tool"' in content
1071
+ and '"arguments"' in content
1072
+ and content_stripped.startswith("{")
1073
+ )
1074
+ )
1035
1075
  if (
1036
1076
  content
1037
1077
  and isinstance(content, str)
1038
1078
  and not has_final_answer_tool
1039
- and not (
1040
- content.strip().startswith('{"tool":')
1041
- or content.strip().startswith('{"status":')
1042
- or '"pending_execution"' in content
1043
- or '"status": "complete"' in content
1044
- )
1079
+ and not is_json_tool_response
1045
1080
  ):
1046
1081
  # Check if we've already emitted this content (prevents duplicates)
1047
1082
  content_hash = hash(content)
@@ -1814,16 +1849,27 @@ async def resume_agent(request: ResumeRequest):
1814
1849
  content = "\n".join(text_parts)
1815
1850
 
1816
1851
  # Filter out raw JSON tool responses
1852
+ content_stripped = content.strip() if content else ""
1853
+ # Filter out tool call JSON (but allow summary/next_items JSON for frontend rendering)
1854
+ is_json_tool_response = (
1855
+ content_stripped.startswith('{"tool":')
1856
+ or content_stripped.startswith('{ "tool":')
1857
+ or content_stripped.startswith('{"tool" :')
1858
+ or content_stripped.startswith('{"status":')
1859
+ or '"pending_execution"' in content
1860
+ or '"status": "complete"' in content
1861
+ or ('"tool"' in content and '"write_todos"' in content)
1862
+ or (
1863
+ '"tool"' in content
1864
+ and '"arguments"' in content
1865
+ and content_stripped.startswith("{")
1866
+ )
1867
+ )
1817
1868
  if (
1818
1869
  content
1819
1870
  and isinstance(content, str)
1820
1871
  and not has_final_answer_tool
1821
- and not (
1822
- content.strip().startswith('{"tool":')
1823
- or content.strip().startswith('{"status":')
1824
- or '"pending_execution"' in content
1825
- or '"status": "complete"' in content
1826
- )
1872
+ and not is_json_tool_response
1827
1873
  ):
1828
1874
  # Check if we've already emitted this content (prevents duplicates)
1829
1875
  content_hash = hash(content)
@@ -1874,6 +1920,26 @@ async def resume_agent(request: ResumeRequest):
1874
1920
  "event": "todos",
1875
1921
  "data": json.dumps({"todos": todos}),
1876
1922
  }
1923
+ # Check if all todos are completed - terminate early
1924
+ all_completed = all(
1925
+ t.get("status") == "completed" for t in todos
1926
+ )
1927
+ if all_completed and len(todos) > 0:
1928
+ logger.info(
1929
+ "Resume: All %d todos completed in AIMessage tool_calls, auto-terminating",
1930
+ len(todos),
1931
+ )
1932
+ yield {
1933
+ "event": "debug_clear",
1934
+ "data": json.dumps({}),
1935
+ }
1936
+ yield {
1937
+ "event": "done",
1938
+ "data": json.dumps(
1939
+ {"reason": "all_todos_completed"}
1940
+ ),
1941
+ }
1942
+ return # Exit before executing more tool calls
1877
1943
 
1878
1944
  # Process tool calls
1879
1945
  for tool_call in new_tool_calls:
@@ -15,77 +15,76 @@ Modules:
15
15
  __version__ = "1.0.0"
16
16
 
17
17
  # Models
18
+ # Knowledge
19
+ from hdsp_agent_core.knowledge import (
20
+ LIBRARY_DESCRIPTIONS,
21
+ DocumentChunker,
22
+ KnowledgeBase,
23
+ KnowledgeLoader,
24
+ LibraryDetector,
25
+ chunk_file,
26
+ get_knowledge_base,
27
+ get_knowledge_loader,
28
+ get_library_detector,
29
+ )
30
+
31
+ # LLM
32
+ from hdsp_agent_core.llm import (
33
+ LLMService,
34
+ call_llm,
35
+ call_llm_stream,
36
+ )
37
+
38
+ # Managers
39
+ from hdsp_agent_core.managers import (
40
+ ConfigManager,
41
+ SessionManager,
42
+ get_config_manager,
43
+ get_session_manager,
44
+ )
18
45
  from hdsp_agent_core.models import (
19
46
  # Common
20
47
  APIResponse,
48
+ # Chat
49
+ ChatRequest,
50
+ ChatResponse,
51
+ # RAG
52
+ ChunkingConfig,
53
+ EmbeddingConfig,
21
54
  ErrorInfo,
55
+ # Agent
56
+ ExecutionPlan,
22
57
  GeminiConfig,
58
+ IndexStatusResponse,
23
59
  LLMConfig,
24
60
  NotebookContext,
25
61
  OpenAIConfig,
26
- ToolCall,
27
- VLLMConfig,
28
- # Agent
29
- ExecutionPlan,
30
62
  PlanRequest,
31
63
  PlanResponse,
32
64
  PlanStep,
33
- RefineRequest,
34
- RefineResponse,
35
- ReplanRequest,
36
- ReplanResponse,
37
- ValidationIssue,
38
- # Chat
39
- ChatRequest,
40
- ChatResponse,
41
- StreamChunk,
42
- # RAG
43
- ChunkingConfig,
44
- EmbeddingConfig,
45
- IndexStatusResponse,
46
65
  QdrantConfig,
47
66
  RAGConfig,
67
+ RefineRequest,
68
+ RefineResponse,
48
69
  ReindexRequest,
49
70
  ReindexResponse,
71
+ ReplanRequest,
72
+ ReplanResponse,
50
73
  SearchRequest,
51
74
  SearchResponse,
75
+ StreamChunk,
76
+ ToolCall,
77
+ ValidationIssue,
78
+ VLLMConfig,
52
79
  WatchdogConfig,
53
80
  )
54
81
 
55
- # Managers
56
- from hdsp_agent_core.managers import (
57
- ConfigManager,
58
- SessionManager,
59
- get_config_manager,
60
- get_session_manager,
61
- )
62
-
63
- # LLM
64
- from hdsp_agent_core.llm import (
65
- LLMService,
66
- call_llm,
67
- call_llm_stream,
68
- )
69
-
70
- # Knowledge
71
- from hdsp_agent_core.knowledge import (
72
- DocumentChunker,
73
- KnowledgeBase,
74
- KnowledgeLoader,
75
- LibraryDetector,
76
- chunk_file,
77
- get_knowledge_base,
78
- get_knowledge_loader,
79
- get_library_detector,
80
- LIBRARY_DESCRIPTIONS,
81
- )
82
-
83
82
  # Prompts
84
83
  from hdsp_agent_core.prompts import (
85
- PLAN_GENERATION_PROMPT,
84
+ ADAPTIVE_REPLAN_PROMPT,
86
85
  CODE_GENERATION_PROMPT,
87
86
  ERROR_REFINEMENT_PROMPT,
88
- ADAPTIVE_REPLAN_PROMPT,
87
+ PLAN_GENERATION_PROMPT,
89
88
  format_plan_prompt,
90
89
  format_refine_prompt,
91
90
  format_replan_prompt,
@@ -21,8 +21,9 @@ logger = logging.getLogger(__name__)
21
21
 
22
22
  class AgentMode(Enum):
23
23
  """Agent execution mode"""
24
+
24
25
  EMBEDDED = "embedded" # Direct in-process execution
25
- PROXY = "proxy" # HTTP proxy to external server
26
+ PROXY = "proxy" # HTTP proxy to external server
26
27
 
27
28
 
28
29
  class ServiceFactory:
@@ -84,9 +85,7 @@ class ServiceFactory:
84
85
  elif mode_str == "proxy":
85
86
  return AgentMode.PROXY
86
87
  else:
87
- logger.warning(
88
- f"Unknown HDSP_AGENT_MODE '{mode_str}', defaulting to proxy"
89
- )
88
+ logger.warning(f"Unknown HDSP_AGENT_MODE '{mode_str}', defaulting to proxy")
90
89
  return AgentMode.PROXY
91
90
 
92
91
  @property
@@ -164,16 +163,13 @@ class ServiceFactory:
164
163
 
165
164
  # Create proxy service instances
166
165
  self._agent_service = ProxyAgentService(
167
- base_url=self._server_url,
168
- timeout=self._timeout
166
+ base_url=self._server_url, timeout=self._timeout
169
167
  )
170
168
  self._chat_service = ProxyChatService(
171
- base_url=self._server_url,
172
- timeout=self._timeout
169
+ base_url=self._server_url, timeout=self._timeout
173
170
  )
174
171
  self._rag_service = ProxyRAGService(
175
- base_url=self._server_url,
176
- timeout=self._timeout
172
+ base_url=self._server_url, timeout=self._timeout
177
173
  )
178
174
 
179
175
  # Optionally validate connectivity
@@ -73,7 +73,9 @@ class IAgentService(ABC):
73
73
  ...
74
74
 
75
75
  @abstractmethod
76
- async def validate_code(self, code: str, notebook_context: Optional[Dict] = None) -> Dict[str, Any]:
76
+ async def validate_code(
77
+ self, code: str, notebook_context: Optional[Dict] = None
78
+ ) -> Dict[str, Any]:
77
79
  """
78
80
  Validate code before execution.
79
81
 
@@ -154,7 +156,7 @@ class IRAGService(ABC):
154
156
  self,
155
157
  query: str,
156
158
  detected_libraries: Optional[List[str]] = None,
157
- max_results: int = 5
159
+ max_results: int = 5,
158
160
  ) -> Optional[str]:
159
161
  """
160
162
  Get formatted context for a query (for prompt injection).
@@ -4,18 +4,18 @@ HDSP Agent Core - Knowledge Base
4
4
  Deterministic library detection and API guide management.
5
5
  """
6
6
 
7
+ from .chunking import (
8
+ DocumentChunker,
9
+ chunk_file,
10
+ )
7
11
  from .loader import (
12
+ LIBRARY_DESCRIPTIONS,
8
13
  KnowledgeBase,
9
14
  KnowledgeLoader,
10
15
  LibraryDetector,
11
16
  get_knowledge_base,
12
17
  get_knowledge_loader,
13
18
  get_library_detector,
14
- LIBRARY_DESCRIPTIONS,
15
- )
16
- from .chunking import (
17
- DocumentChunker,
18
- chunk_file,
19
19
  )
20
20
 
21
21
  __all__ = [
@@ -9,10 +9,10 @@ Provides intelligent chunking strategies:
9
9
  Each strategy preserves context and adds relevant metadata.
10
10
  """
11
11
 
12
- import re
13
12
  import logging
14
- from typing import List, Dict, Any, Optional, TYPE_CHECKING
13
+ import re
15
14
  from pathlib import Path
15
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional
16
16
 
17
17
  if TYPE_CHECKING:
18
18
  from hdsp_agent_core.models.rag import ChunkingConfig
@@ -36,13 +36,14 @@ class DocumentChunker:
36
36
 
37
37
  def __init__(self, config: Optional["ChunkingConfig"] = None):
38
38
  from hdsp_agent_core.models.rag import ChunkingConfig
39
+
39
40
  self._config = config or ChunkingConfig()
40
41
 
41
42
  def chunk_document(
42
43
  self,
43
44
  content: str,
44
45
  metadata: Optional[Dict[str, Any]] = None,
45
- file_type: Optional[str] = None
46
+ file_type: Optional[str] = None,
46
47
  ) -> List[Dict[str, Any]]:
47
48
  """
48
49
  Chunk document based on content type.
@@ -75,13 +76,12 @@ class DocumentChunker:
75
76
  for chunk in chunks:
76
77
  chunk_content = chunk["content"].strip()
77
78
  if len(chunk_content) >= self._config.min_chunk_size:
78
- result.append({
79
- "content": chunk_content,
80
- "metadata": {
81
- **metadata,
82
- **chunk.get("metadata", {})
79
+ result.append(
80
+ {
81
+ "content": chunk_content,
82
+ "metadata": {**metadata, **chunk.get("metadata", {})},
83
83
  }
84
- })
84
+ )
85
85
 
86
86
  logger.debug(f"Chunked document into {len(result)} chunks (type={file_type})")
87
87
  return result
@@ -108,9 +108,9 @@ class DocumentChunker:
108
108
  - Respect max chunk size with sub-splitting
109
109
  """
110
110
  # Pattern for markdown headers
111
- header_pattern = r'^(#{1,6})\s+(.+)$'
111
+ header_pattern = r"^(#{1,6})\s+(.+)$"
112
112
 
113
- lines = content.split('\n')
113
+ lines = content.split("\n")
114
114
  chunks = []
115
115
  current_chunk_lines = []
116
116
  current_headers = [] # Stack of (level, text)
@@ -121,13 +121,19 @@ class DocumentChunker:
121
121
  if header_match:
122
122
  # Save current chunk if it has content
123
123
  if current_chunk_lines:
124
- chunk_content = '\n'.join(current_chunk_lines).strip()
124
+ chunk_content = "\n".join(current_chunk_lines).strip()
125
125
  if chunk_content:
126
- section_path = ' > '.join(h[1] for h in current_headers) if current_headers else "Introduction"
127
- chunks.append({
128
- "content": chunk_content,
129
- "metadata": {"section": section_path}
130
- })
126
+ section_path = (
127
+ " > ".join(h[1] for h in current_headers)
128
+ if current_headers
129
+ else "Introduction"
130
+ )
131
+ chunks.append(
132
+ {
133
+ "content": chunk_content,
134
+ "metadata": {"section": section_path},
135
+ }
136
+ )
131
137
 
132
138
  # Update header stack
133
139
  level = len(header_match.group(1))
@@ -143,26 +149,35 @@ class DocumentChunker:
143
149
  current_chunk_lines.append(line)
144
150
 
145
151
  # Check chunk size limit
146
- chunk_text = '\n'.join(current_chunk_lines)
152
+ chunk_text = "\n".join(current_chunk_lines)
147
153
  if len(chunk_text) >= self._config.max_chunk_size:
148
- section_path = ' > '.join(h[1] for h in current_headers) if current_headers else "Content"
149
- chunks.append({
150
- "content": chunk_text.strip(),
151
- "metadata": {"section": section_path}
152
- })
154
+ section_path = (
155
+ " > ".join(h[1] for h in current_headers)
156
+ if current_headers
157
+ else "Content"
158
+ )
159
+ chunks.append(
160
+ {
161
+ "content": chunk_text.strip(),
162
+ "metadata": {"section": section_path},
163
+ }
164
+ )
153
165
  # Keep overlap for context continuity
154
166
  overlap_lines = self._get_overlap_lines(current_chunk_lines)
155
167
  current_chunk_lines = overlap_lines
156
168
 
157
169
  # Save final chunk
158
170
  if current_chunk_lines:
159
- chunk_content = '\n'.join(current_chunk_lines).strip()
171
+ chunk_content = "\n".join(current_chunk_lines).strip()
160
172
  if chunk_content:
161
- section_path = ' > '.join(h[1] for h in current_headers) if current_headers else "Content"
162
- chunks.append({
163
- "content": chunk_content,
164
- "metadata": {"section": section_path}
165
- })
173
+ section_path = (
174
+ " > ".join(h[1] for h in current_headers)
175
+ if current_headers
176
+ else "Content"
177
+ )
178
+ chunks.append(
179
+ {"content": chunk_content, "metadata": {"section": section_path}}
180
+ )
166
181
 
167
182
  return chunks
168
183
 
@@ -176,9 +191,9 @@ class DocumentChunker:
176
191
  - Preserve import statements and module docstrings
177
192
  """
178
193
  # Pattern for class and function definitions (top-level only)
179
- def_pattern = r'^(class|def|async\s+def)\s+(\w+)'
194
+ def_pattern = r"^(class|def|async\s+def)\s+(\w+)"
180
195
 
181
- lines = content.split('\n')
196
+ lines = content.split("\n")
182
197
  chunks = []
183
198
  current_chunk_lines = []
184
199
  current_def = None
@@ -193,15 +208,21 @@ class DocumentChunker:
193
208
  def_match = re.match(def_pattern, line)
194
209
 
195
210
  # Check if this is a top-level definition (not indented)
196
- if def_match and not line.startswith((' ', '\t')) and not in_multiline_string:
211
+ if (
212
+ def_match
213
+ and not line.startswith((" ", "\t"))
214
+ and not in_multiline_string
215
+ ):
197
216
  # Save current chunk
198
217
  if current_chunk_lines:
199
- chunk_content = '\n'.join(current_chunk_lines).strip()
218
+ chunk_content = "\n".join(current_chunk_lines).strip()
200
219
  if chunk_content:
201
- chunks.append({
202
- "content": chunk_content,
203
- "metadata": {"definition": current_def or "module"}
204
- })
220
+ chunks.append(
221
+ {
222
+ "content": chunk_content,
223
+ "metadata": {"definition": current_def or "module"},
224
+ }
225
+ )
205
226
 
206
227
  current_def = f"{def_match.group(1)} {def_match.group(2)}"
207
228
  current_chunk_lines = [line]
@@ -209,22 +230,26 @@ class DocumentChunker:
209
230
  current_chunk_lines.append(line)
210
231
 
211
232
  # Check max chunk size
212
- if len('\n'.join(current_chunk_lines)) >= self._config.max_chunk_size:
213
- chunks.append({
214
- "content": '\n'.join(current_chunk_lines).strip(),
215
- "metadata": {"definition": current_def or "module"}
216
- })
233
+ if len("\n".join(current_chunk_lines)) >= self._config.max_chunk_size:
234
+ chunks.append(
235
+ {
236
+ "content": "\n".join(current_chunk_lines).strip(),
237
+ "metadata": {"definition": current_def or "module"},
238
+ }
239
+ )
217
240
  overlap_lines = self._get_overlap_lines(current_chunk_lines)
218
241
  current_chunk_lines = overlap_lines
219
242
 
220
243
  # Save final chunk
221
244
  if current_chunk_lines:
222
- chunk_content = '\n'.join(current_chunk_lines).strip()
245
+ chunk_content = "\n".join(current_chunk_lines).strip()
223
246
  if chunk_content:
224
- chunks.append({
225
- "content": chunk_content,
226
- "metadata": {"definition": current_def or "module"}
227
- })
247
+ chunks.append(
248
+ {
249
+ "content": chunk_content,
250
+ "metadata": {"definition": current_def or "module"},
251
+ }
252
+ )
228
253
 
229
254
  return chunks
230
255
 
@@ -251,10 +276,12 @@ class DocumentChunker:
251
276
  if end >= len(content):
252
277
  chunk_content = content[start:].strip()
253
278
  if chunk_content:
254
- chunks.append({
255
- "content": chunk_content,
256
- "metadata": {"chunk_index": chunk_index}
257
- })
279
+ chunks.append(
280
+ {
281
+ "content": chunk_content,
282
+ "metadata": {"chunk_index": chunk_index},
283
+ }
284
+ )
258
285
  break
259
286
 
260
287
  # Try to find a good break point
@@ -263,10 +290,9 @@ class DocumentChunker:
263
290
 
264
291
  chunk_content = content[start:end].strip()
265
292
  if chunk_content:
266
- chunks.append({
267
- "content": chunk_content,
268
- "metadata": {"chunk_index": chunk_index}
269
- })
293
+ chunks.append(
294
+ {"content": chunk_content, "metadata": {"chunk_index": chunk_index}}
295
+ )
270
296
 
271
297
  # Move start with overlap
272
298
  start = max(end - overlap, start + 1)
@@ -288,12 +314,12 @@ class DocumentChunker:
288
314
  search_start = start + (end - start) // 2 # Search in latter half
289
315
 
290
316
  # Try paragraph break (double newline)
291
- para_break = content.rfind('\n\n', search_start, end)
317
+ para_break = content.rfind("\n\n", search_start, end)
292
318
  if para_break > search_start:
293
319
  return para_break + 2
294
320
 
295
321
  # Try sentence break (. or ! or ? followed by space or newline)
296
- sentence_pattern = r'[.!?]\s'
322
+ sentence_pattern = r"[.!?]\s"
297
323
  for match in re.finditer(sentence_pattern, content[search_start:end]):
298
324
  last_match_end = search_start + match.end()
299
325
  else:
@@ -301,15 +327,15 @@ class DocumentChunker:
301
327
 
302
328
  # Find last sentence break
303
329
  for i in range(end - 1, search_start, -1):
304
- if i + 1 < len(content) and content[i] in '.!?' and content[i + 1] in ' \n':
330
+ if i + 1 < len(content) and content[i] in ".!?" and content[i + 1] in " \n":
305
331
  return i + 1
306
332
 
307
333
  # Try word break (space or newline)
308
- space_break = content.rfind(' ', search_start, end)
334
+ space_break = content.rfind(" ", search_start, end)
309
335
  if space_break > search_start:
310
336
  return space_break + 1
311
337
 
312
- newline_break = content.rfind('\n', search_start, end)
338
+ newline_break = content.rfind("\n", search_start, end)
313
339
  if newline_break > search_start:
314
340
  return newline_break + 1
315
341
 
@@ -333,7 +359,7 @@ class DocumentChunker:
333
359
  def chunk_file(
334
360
  file_path: Path,
335
361
  config: Optional["ChunkingConfig"] = None,
336
- base_metadata: Optional[Dict[str, Any]] = None
362
+ base_metadata: Optional[Dict[str, Any]] = None,
337
363
  ) -> List[Dict[str, Any]]:
338
364
  """
339
365
  Convenience function to chunk a file directly.