hdsp-jupyter-extension 2.0.8__py3-none-any.whl → 2.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. agent_server/core/notebook_generator.py +4 -4
  2. agent_server/core/rag_manager.py +12 -3
  3. agent_server/core/retriever.py +2 -1
  4. agent_server/core/vllm_embedding_service.py +8 -5
  5. agent_server/langchain/ARCHITECTURE.md +7 -51
  6. agent_server/langchain/agent.py +31 -20
  7. agent_server/langchain/custom_middleware.py +234 -31
  8. agent_server/langchain/hitl_config.py +5 -8
  9. agent_server/langchain/logging_utils.py +7 -7
  10. agent_server/langchain/prompts.py +106 -120
  11. agent_server/langchain/tools/__init__.py +1 -10
  12. agent_server/langchain/tools/file_tools.py +9 -61
  13. agent_server/langchain/tools/jupyter_tools.py +0 -1
  14. agent_server/langchain/tools/lsp_tools.py +8 -8
  15. agent_server/langchain/tools/resource_tools.py +12 -12
  16. agent_server/langchain/tools/search_tools.py +3 -158
  17. agent_server/prompts/file_action_prompts.py +8 -8
  18. agent_server/routers/langchain_agent.py +200 -125
  19. hdsp_agent_core/__init__.py +46 -47
  20. hdsp_agent_core/factory.py +6 -10
  21. hdsp_agent_core/interfaces.py +4 -2
  22. hdsp_agent_core/knowledge/__init__.py +5 -5
  23. hdsp_agent_core/knowledge/chunking.py +87 -61
  24. hdsp_agent_core/knowledge/loader.py +103 -101
  25. hdsp_agent_core/llm/service.py +192 -107
  26. hdsp_agent_core/managers/config_manager.py +16 -22
  27. hdsp_agent_core/managers/session_manager.py +5 -4
  28. hdsp_agent_core/models/__init__.py +12 -12
  29. hdsp_agent_core/models/agent.py +15 -8
  30. hdsp_agent_core/models/common.py +1 -2
  31. hdsp_agent_core/models/rag.py +48 -111
  32. hdsp_agent_core/prompts/__init__.py +12 -12
  33. hdsp_agent_core/prompts/cell_action_prompts.py +9 -7
  34. hdsp_agent_core/services/agent_service.py +10 -8
  35. hdsp_agent_core/services/chat_service.py +10 -6
  36. hdsp_agent_core/services/rag_service.py +3 -6
  37. hdsp_agent_core/tests/conftest.py +4 -1
  38. hdsp_agent_core/tests/test_factory.py +2 -2
  39. hdsp_agent_core/tests/test_services.py +12 -19
  40. {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/build_log.json +1 -1
  41. {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/package.json +2 -2
  42. hdsp_jupyter_extension-2.0.8.data/data/share/jupyter/labextensions/hdsp-agent/static/frontend_styles_index_js.8740a527757068814573.js → hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/frontend_styles_index_js.2d9fb488c82498c45c2d.js +93 -4
  43. hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/frontend_styles_index_js.2d9fb488c82498c45c2d.js.map +1 -0
  44. hdsp_jupyter_extension-2.0.8.data/data/share/jupyter/labextensions/hdsp-agent/static/lib_index_js.e4ff4b5779b5e049f84c.js → hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/lib_index_js.58c1e128ba0b76f41f04.js +153 -130
  45. hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/lib_index_js.58c1e128ba0b76f41f04.js.map +1 -0
  46. hdsp_jupyter_extension-2.0.8.data/data/share/jupyter/labextensions/hdsp-agent/static/remoteEntry.020cdb0b864cfaa4e41e.js → hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/remoteEntry.9da31d1134a53b0c4af5.js +6 -6
  47. hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/remoteEntry.9da31d1134a53b0c4af5.js.map +1 -0
  48. {hdsp_jupyter_extension-2.0.8.dist-info → hdsp_jupyter_extension-2.0.11.dist-info}/METADATA +1 -3
  49. hdsp_jupyter_extension-2.0.11.dist-info/RECORD +144 -0
  50. jupyter_ext/__init__.py +21 -11
  51. jupyter_ext/_version.py +1 -1
  52. jupyter_ext/handlers.py +69 -50
  53. jupyter_ext/labextension/build_log.json +1 -1
  54. jupyter_ext/labextension/package.json +2 -2
  55. jupyter_ext/labextension/static/{frontend_styles_index_js.8740a527757068814573.js → frontend_styles_index_js.2d9fb488c82498c45c2d.js} +93 -4
  56. jupyter_ext/labextension/static/frontend_styles_index_js.2d9fb488c82498c45c2d.js.map +1 -0
  57. jupyter_ext/labextension/static/{lib_index_js.e4ff4b5779b5e049f84c.js → lib_index_js.58c1e128ba0b76f41f04.js} +153 -130
  58. jupyter_ext/labextension/static/lib_index_js.58c1e128ba0b76f41f04.js.map +1 -0
  59. jupyter_ext/labextension/static/{remoteEntry.020cdb0b864cfaa4e41e.js → remoteEntry.9da31d1134a53b0c4af5.js} +6 -6
  60. jupyter_ext/labextension/static/remoteEntry.9da31d1134a53b0c4af5.js.map +1 -0
  61. hdsp_jupyter_extension-2.0.8.data/data/share/jupyter/labextensions/hdsp-agent/static/frontend_styles_index_js.8740a527757068814573.js.map +0 -1
  62. hdsp_jupyter_extension-2.0.8.data/data/share/jupyter/labextensions/hdsp-agent/static/lib_index_js.e4ff4b5779b5e049f84c.js.map +0 -1
  63. hdsp_jupyter_extension-2.0.8.data/data/share/jupyter/labextensions/hdsp-agent/static/remoteEntry.020cdb0b864cfaa4e41e.js.map +0 -1
  64. hdsp_jupyter_extension-2.0.8.dist-info/RECORD +0 -144
  65. jupyter_ext/labextension/static/frontend_styles_index_js.8740a527757068814573.js.map +0 -1
  66. jupyter_ext/labextension/static/lib_index_js.e4ff4b5779b5e049f84c.js.map +0 -1
  67. jupyter_ext/labextension/static/remoteEntry.020cdb0b864cfaa4e41e.js.map +0 -1
  68. {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/etc/jupyter/jupyter_server_config.d/hdsp_jupyter_extension.json +0 -0
  69. {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/install.json +0 -0
  70. {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/node_modules_emotion_use-insertion-effect-with-fallbacks_dist_emotion-use-insertion-effect-wi-3ba6b80.c095373419d05e6f141a.js +0 -0
  71. {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/node_modules_emotion_use-insertion-effect-with-fallbacks_dist_emotion-use-insertion-effect-wi-3ba6b80.c095373419d05e6f141a.js.map +0 -0
  72. {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/node_modules_emotion_use-insertion-effect-with-fallbacks_dist_emotion-use-insertion-effect-wi-3ba6b81.61e75fb98ecff46cf836.js +0 -0
  73. {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/node_modules_emotion_use-insertion-effect-with-fallbacks_dist_emotion-use-insertion-effect-wi-3ba6b81.61e75fb98ecff46cf836.js.map +0 -0
  74. {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/style.js +0 -0
  75. {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_babel_runtime_helpers_esm_extends_js-node_modules_emotion_serialize_dist-051195.e2553aab0c3963b83dd7.js +0 -0
  76. {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_babel_runtime_helpers_esm_extends_js-node_modules_emotion_serialize_dist-051195.e2553aab0c3963b83dd7.js.map +0 -0
  77. {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_cache_dist_emotion-cache_browser_development_esm_js.24edcc52a1c014a8a5f0.js +0 -0
  78. {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_cache_dist_emotion-cache_browser_development_esm_js.24edcc52a1c014a8a5f0.js.map +0 -0
  79. {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_react_dist_emotion-react_browser_development_esm_js.19ecf6babe00caff6b8a.js +0 -0
  80. {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_react_dist_emotion-react_browser_development_esm_js.19ecf6babe00caff6b8a.js.map +0 -0
  81. {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_styled_dist_emotion-styled_browser_development_esm_js.661fb5836f4978a7c6e1.js +0 -0
  82. {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_styled_dist_emotion-styled_browser_development_esm_js.661fb5836f4978a7c6e1.js.map +0 -0
  83. {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_mui_material_index_js.985697e0162d8d088ca2.js +0 -0
  84. {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_mui_material_index_js.985697e0162d8d088ca2.js.map +0 -0
  85. {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_mui_material_utils_createSvgIcon_js.1f5038488cdfd8b3a85d.js +0 -0
  86. {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_mui_material_utils_createSvgIcon_js.1f5038488cdfd8b3a85d.js.map +0 -0
  87. {hdsp_jupyter_extension-2.0.8.dist-info → hdsp_jupyter_extension-2.0.11.dist-info}/WHEEL +0 -0
  88. {hdsp_jupyter_extension-2.0.8.dist-info → hdsp_jupyter_extension-2.0.11.dist-info}/licenses/LICENSE +0 -0
hdsp_agent_core/interfaces.py

@@ -73,7 +73,9 @@ class IAgentService(ABC):
        ...

    @abstractmethod
-    async def validate_code(self, code: str, notebook_context: Optional[Dict] = None) -> Dict[str, Any]:
+    async def validate_code(
+        self, code: str, notebook_context: Optional[Dict] = None
+    ) -> Dict[str, Any]:
        """
        Validate code before execution.

@@ -154,7 +156,7 @@ class IRAGService(ABC):
        self,
        query: str,
        detected_libraries: Optional[List[str]] = None,
-        max_results: int = 5
+        max_results: int = 5,
    ) -> Optional[str]:
        """
        Get formatted context for a query (for prompt injection).
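Note on the interfaces.py hunks above: only the signature wrapping changed; the contract is the same. A minimal, self-contained sketch of a coroutine with the same shape (the return keys used here are illustrative assumptions, not part of the interface):

# Illustrative only: standalone coroutine mirroring the reformatted
# IAgentService.validate_code signature; the "valid"/"issues" keys are assumptions.
import asyncio
from typing import Any, Dict, Optional


async def validate_code(
    code: str, notebook_context: Optional[Dict] = None
) -> Dict[str, Any]:
    issues = []
    if not code.strip():
        issues.append("empty code cell")
    return {
        "valid": not issues,
        "issues": issues,
        "used_context": bool(notebook_context),
    }


print(asyncio.run(validate_code("print('hello')")))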
hdsp_agent_core/knowledge/__init__.py

@@ -4,18 +4,18 @@ HDSP Agent Core - Knowledge Base
 Deterministic library detection and API guide management.
 """

+from .chunking import (
+    DocumentChunker,
+    chunk_file,
+)
 from .loader import (
+    LIBRARY_DESCRIPTIONS,
     KnowledgeBase,
     KnowledgeLoader,
     LibraryDetector,
     get_knowledge_base,
     get_knowledge_loader,
     get_library_detector,
-    LIBRARY_DESCRIPTIONS,
-)
-from .chunking import (
-    DocumentChunker,
-    chunk_file,
 )

 __all__ = [
hdsp_agent_core/knowledge/chunking.py

@@ -9,10 +9,10 @@ Provides intelligent chunking strategies:
 Each strategy preserves context and adds relevant metadata.
 """

-import re
 import logging
-from typing import List, Dict, Any, Optional, TYPE_CHECKING
+import re
 from pathlib import Path
+from typing import TYPE_CHECKING, Any, Dict, List, Optional

 if TYPE_CHECKING:
     from hdsp_agent_core.models.rag import ChunkingConfig
@@ -36,13 +36,14 @@ class DocumentChunker:

    def __init__(self, config: Optional["ChunkingConfig"] = None):
        from hdsp_agent_core.models.rag import ChunkingConfig
+
        self._config = config or ChunkingConfig()

    def chunk_document(
        self,
        content: str,
        metadata: Optional[Dict[str, Any]] = None,
-        file_type: Optional[str] = None
+        file_type: Optional[str] = None,
    ) -> List[Dict[str, Any]]:
        """
        Chunk document based on content type.
@@ -75,13 +76,12 @@ class DocumentChunker:
        for chunk in chunks:
            chunk_content = chunk["content"].strip()
            if len(chunk_content) >= self._config.min_chunk_size:
-                result.append({
-                    "content": chunk_content,
-                    "metadata": {
-                        **metadata,
-                        **chunk.get("metadata", {})
+                result.append(
+                    {
+                        "content": chunk_content,
+                        "metadata": {**metadata, **chunk.get("metadata", {})},
                    }
-                })
+                )

        logger.debug(f"Chunked document into {len(result)} chunks (type={file_type})")
        return result
@@ -108,9 +108,9 @@ class DocumentChunker:
        - Respect max chunk size with sub-splitting
        """
        # Pattern for markdown headers
-        header_pattern = r'^(#{1,6})\s+(.+)$'
+        header_pattern = r"^(#{1,6})\s+(.+)$"

-        lines = content.split('\n')
+        lines = content.split("\n")
        chunks = []
        current_chunk_lines = []
        current_headers = []  # Stack of (level, text)
@@ -121,13 +121,19 @@ class DocumentChunker:
            if header_match:
                # Save current chunk if it has content
                if current_chunk_lines:
-                    chunk_content = '\n'.join(current_chunk_lines).strip()
+                    chunk_content = "\n".join(current_chunk_lines).strip()
                    if chunk_content:
-                        section_path = ' > '.join(h[1] for h in current_headers) if current_headers else "Introduction"
-                        chunks.append({
-                            "content": chunk_content,
-                            "metadata": {"section": section_path}
-                        })
+                        section_path = (
+                            " > ".join(h[1] for h in current_headers)
+                            if current_headers
+                            else "Introduction"
+                        )
+                        chunks.append(
+                            {
+                                "content": chunk_content,
+                                "metadata": {"section": section_path},
+                            }
+                        )

                # Update header stack
                level = len(header_match.group(1))
@@ -143,26 +149,35 @@ class DocumentChunker:
            current_chunk_lines.append(line)

            # Check chunk size limit
-            chunk_text = '\n'.join(current_chunk_lines)
+            chunk_text = "\n".join(current_chunk_lines)
            if len(chunk_text) >= self._config.max_chunk_size:
-                section_path = ' > '.join(h[1] for h in current_headers) if current_headers else "Content"
-                chunks.append({
-                    "content": chunk_text.strip(),
-                    "metadata": {"section": section_path}
-                })
+                section_path = (
+                    " > ".join(h[1] for h in current_headers)
+                    if current_headers
+                    else "Content"
+                )
+                chunks.append(
+                    {
+                        "content": chunk_text.strip(),
+                        "metadata": {"section": section_path},
+                    }
+                )
                # Keep overlap for context continuity
                overlap_lines = self._get_overlap_lines(current_chunk_lines)
                current_chunk_lines = overlap_lines

        # Save final chunk
        if current_chunk_lines:
-            chunk_content = '\n'.join(current_chunk_lines).strip()
+            chunk_content = "\n".join(current_chunk_lines).strip()
            if chunk_content:
-                section_path = ' > '.join(h[1] for h in current_headers) if current_headers else "Content"
-                chunks.append({
-                    "content": chunk_content,
-                    "metadata": {"section": section_path}
-                })
+                section_path = (
+                    " > ".join(h[1] for h in current_headers)
+                    if current_headers
+                    else "Content"
+                )
+                chunks.append(
+                    {"content": chunk_content, "metadata": {"section": section_path}}
+                )

        return chunks

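Note on the markdown strategy above: each saved chunk records a "section" path built from the header stack. A rough usage sketch against the chunk_document API shown in this diff (the file_type value for markdown and the assumption that these small chunks pass min_chunk_size are guesses, not taken from the diff):

# Sketch only: exercises DocumentChunker.chunk_document as it appears in this diff.
# The file_type string accepted for markdown ("md" here) is an assumption.
from hdsp_agent_core.knowledge.chunking import DocumentChunker

doc = "# Dask\nIntro paragraph.\n\n## Reading data\nUse dd.read_csv for lazy loading."
chunker = DocumentChunker()  # default ChunkingConfig, matching the fallback in __init__
chunks = chunker.chunk_document(doc, metadata={"source": "dask.md"}, file_type="md")
for chunk in chunks:
    print(chunk["metadata"].get("section"), "->", chunk["content"][:40])
# Expected sections, if min_chunk_size does not filter them: "Dask", "Dask > Reading data"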
@@ -176,9 +191,9 @@ class DocumentChunker:
        - Preserve import statements and module docstrings
        """
        # Pattern for class and function definitions (top-level only)
-        def_pattern = r'^(class|def|async\s+def)\s+(\w+)'
+        def_pattern = r"^(class|def|async\s+def)\s+(\w+)"

-        lines = content.split('\n')
+        lines = content.split("\n")
        chunks = []
        current_chunk_lines = []
        current_def = None
@@ -193,15 +208,21 @@ class DocumentChunker:
            def_match = re.match(def_pattern, line)

            # Check if this is a top-level definition (not indented)
-            if def_match and not line.startswith((' ', '\t')) and not in_multiline_string:
+            if (
+                def_match
+                and not line.startswith((" ", "\t"))
+                and not in_multiline_string
+            ):
                # Save current chunk
                if current_chunk_lines:
-                    chunk_content = '\n'.join(current_chunk_lines).strip()
+                    chunk_content = "\n".join(current_chunk_lines).strip()
                    if chunk_content:
-                        chunks.append({
-                            "content": chunk_content,
-                            "metadata": {"definition": current_def or "module"}
-                        })
+                        chunks.append(
+                            {
+                                "content": chunk_content,
+                                "metadata": {"definition": current_def or "module"},
+                            }
+                        )

                current_def = f"{def_match.group(1)} {def_match.group(2)}"
                current_chunk_lines = [line]
@@ -209,22 +230,26 @@ class DocumentChunker:
                current_chunk_lines.append(line)

                # Check max chunk size
-                if len('\n'.join(current_chunk_lines)) >= self._config.max_chunk_size:
-                    chunks.append({
-                        "content": '\n'.join(current_chunk_lines).strip(),
-                        "metadata": {"definition": current_def or "module"}
-                    })
+                if len("\n".join(current_chunk_lines)) >= self._config.max_chunk_size:
+                    chunks.append(
+                        {
+                            "content": "\n".join(current_chunk_lines).strip(),
+                            "metadata": {"definition": current_def or "module"},
+                        }
+                    )
                    overlap_lines = self._get_overlap_lines(current_chunk_lines)
                    current_chunk_lines = overlap_lines

        # Save final chunk
        if current_chunk_lines:
-            chunk_content = '\n'.join(current_chunk_lines).strip()
+            chunk_content = "\n".join(current_chunk_lines).strip()
            if chunk_content:
-                chunks.append({
-                    "content": chunk_content,
-                    "metadata": {"definition": current_def or "module"}
-                })
+                chunks.append(
+                    {
+                        "content": chunk_content,
+                        "metadata": {"definition": current_def or "module"},
+                    }
+                )

        return chunks

@@ -251,10 +276,12 @@ class DocumentChunker:
            if end >= len(content):
                chunk_content = content[start:].strip()
                if chunk_content:
-                    chunks.append({
-                        "content": chunk_content,
-                        "metadata": {"chunk_index": chunk_index}
-                    })
+                    chunks.append(
+                        {
+                            "content": chunk_content,
+                            "metadata": {"chunk_index": chunk_index},
+                        }
+                    )
                break

            # Try to find a good break point
@@ -263,10 +290,9 @@ class DocumentChunker:

            chunk_content = content[start:end].strip()
            if chunk_content:
-                chunks.append({
-                    "content": chunk_content,
-                    "metadata": {"chunk_index": chunk_index}
-                })
+                chunks.append(
+                    {"content": chunk_content, "metadata": {"chunk_index": chunk_index}}
+                )

            # Move start with overlap
            start = max(end - overlap, start + 1)
@@ -288,12 +314,12 @@ class DocumentChunker:
        search_start = start + (end - start) // 2  # Search in latter half

        # Try paragraph break (double newline)
-        para_break = content.rfind('\n\n', search_start, end)
+        para_break = content.rfind("\n\n", search_start, end)
        if para_break > search_start:
            return para_break + 2

        # Try sentence break (. or ! or ? followed by space or newline)
-        sentence_pattern = r'[.!?]\s'
+        sentence_pattern = r"[.!?]\s"
        for match in re.finditer(sentence_pattern, content[search_start:end]):
            last_match_end = search_start + match.end()
        else:
@@ -301,15 +327,15 @@ class DocumentChunker:

        # Find last sentence break
        for i in range(end - 1, search_start, -1):
-            if i + 1 < len(content) and content[i] in '.!?' and content[i + 1] in ' \n':
+            if i + 1 < len(content) and content[i] in ".!?" and content[i + 1] in " \n":
                return i + 1

        # Try word break (space or newline)
-        space_break = content.rfind(' ', search_start, end)
+        space_break = content.rfind(" ", search_start, end)
        if space_break > search_start:
            return space_break + 1

-        newline_break = content.rfind('\n', search_start, end)
+        newline_break = content.rfind("\n", search_start, end)
        if newline_break > search_start:
            return newline_break + 1

@@ -333,7 +359,7 @@
 def chunk_file(
     file_path: Path,
     config: Optional["ChunkingConfig"] = None,
-    base_metadata: Optional[Dict[str, Any]] = None
+    base_metadata: Optional[Dict[str, Any]] = None,
 ) -> List[Dict[str, Any]]:
     """
     Convenience function to chunk a file directly.
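The chunk_file helper keeps the same parameters, now with a trailing comma. A hedged usage sketch (the guide path is hypothetical; passing ChunkingConfig() with defaults matches the fallback used in DocumentChunker.__init__ above):

# Sketch only: chunk a guide file with the chunk_file signature shown above.
from pathlib import Path

from hdsp_agent_core.knowledge.chunking import chunk_file
from hdsp_agent_core.models.rag import ChunkingConfig

chunks = chunk_file(
    Path("knowledge/libraries/dask.md"),  # hypothetical path
    config=ChunkingConfig(),
    base_metadata={"library": "dask"},
)
print(len(chunks), chunks[0]["metadata"] if chunks else "no chunks")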
hdsp_agent_core/knowledge/loader.py

@@ -5,19 +5,19 @@ Keyword matching + regex based library detection for loading appropriate API gui
 (No LLM calls - saves tokens and improves reliability)
 """

-from pathlib import Path
-from typing import List, Dict, Optional, Set
 import re
+from pathlib import Path
+from typing import Dict, List, Optional, Set

 # Library descriptions (reference)
 LIBRARY_DESCRIPTIONS: Dict[str, str] = {
-    'matplotlib': 'Visualization, graphs, charts, plot, histogram, scatter plot, EDA, data visualization, used with seaborn',
-    'dask': 'Large-scale data processing, pandas replacement, distributed processing, lazy evaluation, dd.read_csv',
-    'polars': 'High-performance DataFrame, pandas replacement, Rust-based, pl.read_csv',
-    'pyspark': 'Spark-based distributed processing, big data, SparkSession',
-    'vaex': 'Large-scale data exploration, out-of-core processing',
-    'modin': 'pandas acceleration, parallel processing',
-    'ray': 'Distributed computing, parallel processing framework',
+    "matplotlib": "Visualization, graphs, charts, plot, histogram, scatter plot, EDA, data visualization, used with seaborn",
+    "dask": "Large-scale data processing, pandas replacement, distributed processing, lazy evaluation, dd.read_csv",
+    "polars": "High-performance DataFrame, pandas replacement, Rust-based, pl.read_csv",
+    "pyspark": "Spark-based distributed processing, big data, SparkSession",
+    "vaex": "Large-scale data exploration, out-of-core processing",
+    "modin": "pandas acceleration, parallel processing",
+    "ray": "Distributed computing, parallel processing framework",
 }

@@ -29,90 +29,90 @@ class LibraryDetector:

    # Explicit library mention patterns (highest priority)
    EXPLICIT_PATTERNS: Dict[str, str] = {
-        r'\bdask\b': 'dask',
-        r'\bpolars\b': 'polars',
-        r'\bpyspark\b': 'pyspark',
-        r'\bvaex\b': 'vaex',
-        r'\bmodin\b': 'modin',
-        r'\bray\b': 'ray',
-        r'\bmatplotlib\b': 'matplotlib',
-        r'\bseaborn\b': 'matplotlib',  # seaborn -> matplotlib guide
-        r'\bplt\.': 'matplotlib',
-        r'\bdd\.read': 'dask',
-        r'\bpl\.read': 'polars',
-        r'\bpl\.DataFrame': 'polars',
+        r"\bdask\b": "dask",
+        r"\bpolars\b": "polars",
+        r"\bpyspark\b": "pyspark",
+        r"\bvaex\b": "vaex",
+        r"\bmodin\b": "modin",
+        r"\bray\b": "ray",
+        r"\bmatplotlib\b": "matplotlib",
+        r"\bseaborn\b": "matplotlib",  # seaborn -> matplotlib guide
+        r"\bplt\.": "matplotlib",
+        r"\bdd\.read": "dask",
+        r"\bpl\.read": "polars",
+        r"\bpl\.DataFrame": "polars",
    }

    # Keyword scores per library (0.0 ~ 1.0)
    KEYWORD_SCORES: Dict[str, Dict[str, float]] = {
-        'dask': {
-            '대용량': 0.7,
-            'big data': 0.7,
-            'bigdata': 0.7,
-            '빅데이터': 0.7,
-            'lazy': 0.8,
-            'lazy evaluation': 0.9,
-            'out-of-core': 0.9,
-            'out of core': 0.9,
-            '분산 처리': 0.6,
-            'distributed': 0.6,
-            'parallel dataframe': 0.8,
-            '병렬 데이터프레임': 0.8,
+        "dask": {
+            "대용량": 0.7,
+            "big data": 0.7,
+            "bigdata": 0.7,
+            "빅데이터": 0.7,
+            "lazy": 0.8,
+            "lazy evaluation": 0.9,
+            "out-of-core": 0.9,
+            "out of core": 0.9,
+            "분산 처리": 0.6,
+            "distributed": 0.6,
+            "parallel dataframe": 0.8,
+            "병렬 데이터프레임": 0.8,
        },
-        'polars': {
-            'rust 기반': 0.9,
-            'rust-based': 0.9,
-            'fast dataframe': 0.7,
-            '고성능 dataframe': 0.7,
-            '빠른 데이터프레임': 0.7,
+        "polars": {
+            "rust 기반": 0.9,
+            "rust-based": 0.9,
+            "fast dataframe": 0.7,
+            "고성능 dataframe": 0.7,
+            "빠른 데이터프레임": 0.7,
        },
-        'matplotlib': {
-            '시각화': 0.7,
-            'visualization': 0.7,
-            'visualize': 0.7,
-            'plot': 0.7,
-            'chart': 0.7,
-            'graph': 0.6,
-            '그래프': 0.6,
-            '차트': 0.7,
-            'histogram': 0.8,
-            '히스토그램': 0.8,
-            'scatter': 0.8,
-            '산점도': 0.8,
-            'line plot': 0.8,
-            '라인 플롯': 0.8,
-            'bar chart': 0.8,
-            '막대 그래프': 0.8,
-            'eda': 0.5,
-            '탐색적 데이터 분석': 0.6,
-            'figure': 0.5,
-            'subplot': 0.8,
-            'heatmap': 0.7,
-            '히트맵': 0.7,
+        "matplotlib": {
+            "시각화": 0.7,
+            "visualization": 0.7,
+            "visualize": 0.7,
+            "plot": 0.7,
+            "chart": 0.7,
+            "graph": 0.6,
+            "그래프": 0.6,
+            "차트": 0.7,
+            "histogram": 0.8,
+            "히스토그램": 0.8,
+            "scatter": 0.8,
+            "산점도": 0.8,
+            "line plot": 0.8,
+            "라인 플롯": 0.8,
+            "bar chart": 0.8,
+            "막대 그래프": 0.8,
+            "eda": 0.5,
+            "탐색적 데이터 분석": 0.6,
+            "figure": 0.5,
+            "subplot": 0.8,
+            "heatmap": 0.7,
+            "히트맵": 0.7,
        },
-        'pyspark': {
-            'spark': 0.9,
-            'sparksession': 0.95,
-            'spark session': 0.95,
-            'rdd': 0.9,
-            'hadoop': 0.7,
-            '클러스터': 0.6,
-            'cluster': 0.6,
+        "pyspark": {
+            "spark": 0.9,
+            "sparksession": 0.95,
+            "spark session": 0.95,
+            "rdd": 0.9,
+            "hadoop": 0.7,
+            "클러스터": 0.6,
+            "cluster": 0.6,
        },
-        'vaex': {
-            'vaex': 1.0,
-            'memory mapping': 0.8,
-            '메모리 매핑': 0.8,
+        "vaex": {
+            "vaex": 1.0,
+            "memory mapping": 0.8,
+            "메모리 매핑": 0.8,
        },
-        'modin': {
-            'modin': 1.0,
-            'pandas 가속': 0.8,
-            'pandas acceleration': 0.8,
+        "modin": {
+            "modin": 1.0,
+            "pandas 가속": 0.8,
+            "pandas acceleration": 0.8,
        },
-        'ray': {
-            'ray': 0.9,
-            '분산 컴퓨팅': 0.7,
-            'distributed computing': 0.7,
+        "ray": {
+            "ray": 0.9,
+            "분산 컴퓨팅": 0.7,
+            "distributed computing": 0.7,
        },
    }

@@ -123,7 +123,7 @@ class LibraryDetector:
        self,
        request: str,
        available_libraries: List[str],
-        imported_libraries: List[str] = None
+        imported_libraries: List[str] = None,
    ) -> List[str]:
        """
        Detect required libraries from user request.
@@ -141,7 +141,9 @@ class LibraryDetector:

        # Step 1: Explicit pattern matching (highest priority)
        for pattern, lib in self.EXPLICIT_PATTERNS.items():
-            if lib in available_libraries and re.search(pattern, request, re.IGNORECASE):
+            if lib in available_libraries and re.search(
+                pattern, request, re.IGNORECASE
+            ):
                detected.add(lib)

        # Step 2: Keyword scoring
@@ -162,8 +164,8 @@ class LibraryDetector:
        for lib in imported_libraries:
            lib_lower = lib.lower()
            # seaborn -> matplotlib
-            if lib_lower == 'seaborn' and 'matplotlib' in available_libraries:
-                detected.add('matplotlib')
+            if lib_lower == "seaborn" and "matplotlib" in available_libraries:
+                detected.add("matplotlib")
            elif lib_lower in available_libraries:
                detected.add(lib_lower)

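Note on detect() above: explicit regex hits are taken first, then keyword scores, then already-imported libraries (seaborn maps to the matplotlib guide). A hedged sketch of a call (the no-argument constructor and whether these keyword scores cross the selection threshold are assumptions not shown in this diff):

# Sketch only: LibraryDetector.detect with the parameters shown above.
from hdsp_agent_core.knowledge.loader import LibraryDetector

detector = LibraryDetector()  # assumes a default constructor
libs = detector.detect(
    "Read a large CSV with lazy evaluation and draw a histogram",
    available_libraries=["dask", "polars", "matplotlib", "pyspark"],
    imported_libraries=["seaborn"],
)
# "matplotlib" is expected via the seaborn mapping; "dask" depends on the
# keyword-score threshold inside detect(), which this diff does not show.
print(libs)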
@@ -183,7 +185,7 @@ def get_library_detector() -> LibraryDetector:


 # LLM library detection prompt
-LIBRARY_DETECTION_PROMPT = '''Analyze the user's request and determine which libraries to use for code generation.
+LIBRARY_DETECTION_PROMPT = """Analyze the user's request and determine which libraries to use for code generation.

 ## Available Library API Guides:
 {library_list}
@@ -205,7 +207,7 @@ LIBRARY_DETECTION_PROMPT = '''Analyze the user's request and determine which lib
 {{"libraries": ["library1", "library2"]}}

 Empty array is also valid: {{"libraries": []}}
-'''
+"""


 class KnowledgeBase:
@@ -216,7 +218,7 @@ class KnowledgeBase:
            self.knowledge_dir = Path(knowledge_dir)
        else:
            # Default path: knowledge/libraries
-            self.knowledge_dir = Path(__file__).parent / 'libraries'
+            self.knowledge_dir = Path(__file__).parent / "libraries"

        self._cache: Dict[str, str] = {}

@@ -225,19 +227,19 @@ class KnowledgeBase:
        available = self.list_available_libraries()
        lines = []
        for lib in available:
-            desc = LIBRARY_DESCRIPTIONS.get(lib, 'Other library')
+            desc = LIBRARY_DESCRIPTIONS.get(lib, "Other library")
            lines.append(f"- **{lib}**: {desc}")
        return "\n".join(lines)

-    def get_detection_prompt(self, request: str, imported_libraries: List[str] = None) -> str:
+    def get_detection_prompt(
+        self, request: str, imported_libraries: List[str] = None
+    ) -> str:
        """Generate LLM library detection prompt"""
        library_list = self.get_library_list_for_prompt()
        imported = ", ".join(imported_libraries) if imported_libraries else "None"

        return LIBRARY_DETECTION_PROMPT.format(
-            library_list=library_list,
-            request=request,
-            imported_libraries=imported
+            library_list=library_list, request=request, imported_libraries=imported
        )

    def load_library_guide(self, library: str) -> Optional[str]:
@@ -255,9 +257,9 @@ class KnowledgeBase:
            return self._cache[library]

        # Load file
-        file_path = self.knowledge_dir / f'{library}.md'
+        file_path = self.knowledge_dir / f"{library}.md"
        if file_path.exists():
-            content = file_path.read_text(encoding='utf-8')
+            content = file_path.read_text(encoding="utf-8")
            self._cache[library] = content
            return content

@@ -274,7 +276,7 @@ class KnowledgeBase:
            Combined guide string
        """
        if not libraries:
-            return ''
+            return ""

        guides = []
        for lib in sorted(libraries):
@@ -283,7 +285,7 @@ class KnowledgeBase:
                guides.append(f"## {lib.upper()} Library API Guide\n\n{guide}")

        if not guides:
-            return ''
+            return ""

        return "\n\n---\n\n".join(guides)

@@ -300,7 +302,7 @@ class KnowledgeBase:
        knowledge = self.load_libraries_knowledge(libraries)

        if not knowledge:
-            return ''
+            return ""

        return f"""
## 📚 Library API Reference (MUST follow!)
@@ -317,7 +319,7 @@ Follow the API usage in the guides below. Avoid ❌ incorrect code and use ✅ c
        if not self.knowledge_dir.exists():
            return []

-        return [f.stem for f in self.knowledge_dir.glob('*.md')]
+        return [f.stem for f in self.knowledge_dir.glob("*.md")]


# Singleton instance
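Closing note on KnowledgeBase: the hunks above only change quoting and wrapping; behavior is unchanged. A hedged end-to-end sketch of the methods visible in this diff (calling the constructor with no argument assumes knowledge_dir defaults to None, which this diff does not show):

# Sketch only: load guides and build the prompt section with the methods above.
from hdsp_agent_core.knowledge.loader import KnowledgeBase

kb = KnowledgeBase()  # falls back to the packaged knowledge/libraries directory
print(kb.list_available_libraries())  # stems of *.md guide files, e.g. ["dask", ...]
section = kb.get_knowledge_prompt_section(["dask", "matplotlib"])
print(section[:200] if section else "no guides found")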