hdsp-jupyter-extension 2.0.8__py3-none-any.whl → 2.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_server/core/notebook_generator.py +4 -4
- agent_server/core/rag_manager.py +12 -3
- agent_server/core/retriever.py +2 -1
- agent_server/core/vllm_embedding_service.py +8 -5
- agent_server/langchain/ARCHITECTURE.md +7 -51
- agent_server/langchain/agent.py +31 -20
- agent_server/langchain/custom_middleware.py +234 -31
- agent_server/langchain/hitl_config.py +5 -8
- agent_server/langchain/logging_utils.py +7 -7
- agent_server/langchain/prompts.py +106 -120
- agent_server/langchain/tools/__init__.py +1 -10
- agent_server/langchain/tools/file_tools.py +9 -61
- agent_server/langchain/tools/jupyter_tools.py +0 -1
- agent_server/langchain/tools/lsp_tools.py +8 -8
- agent_server/langchain/tools/resource_tools.py +12 -12
- agent_server/langchain/tools/search_tools.py +3 -158
- agent_server/prompts/file_action_prompts.py +8 -8
- agent_server/routers/langchain_agent.py +200 -125
- hdsp_agent_core/__init__.py +46 -47
- hdsp_agent_core/factory.py +6 -10
- hdsp_agent_core/interfaces.py +4 -2
- hdsp_agent_core/knowledge/__init__.py +5 -5
- hdsp_agent_core/knowledge/chunking.py +87 -61
- hdsp_agent_core/knowledge/loader.py +103 -101
- hdsp_agent_core/llm/service.py +192 -107
- hdsp_agent_core/managers/config_manager.py +16 -22
- hdsp_agent_core/managers/session_manager.py +5 -4
- hdsp_agent_core/models/__init__.py +12 -12
- hdsp_agent_core/models/agent.py +15 -8
- hdsp_agent_core/models/common.py +1 -2
- hdsp_agent_core/models/rag.py +48 -111
- hdsp_agent_core/prompts/__init__.py +12 -12
- hdsp_agent_core/prompts/cell_action_prompts.py +9 -7
- hdsp_agent_core/services/agent_service.py +10 -8
- hdsp_agent_core/services/chat_service.py +10 -6
- hdsp_agent_core/services/rag_service.py +3 -6
- hdsp_agent_core/tests/conftest.py +4 -1
- hdsp_agent_core/tests/test_factory.py +2 -2
- hdsp_agent_core/tests/test_services.py +12 -19
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/build_log.json +1 -1
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/package.json +2 -2
- hdsp_jupyter_extension-2.0.8.data/data/share/jupyter/labextensions/hdsp-agent/static/frontend_styles_index_js.8740a527757068814573.js → hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/frontend_styles_index_js.2d9fb488c82498c45c2d.js +93 -4
- hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/frontend_styles_index_js.2d9fb488c82498c45c2d.js.map +1 -0
- hdsp_jupyter_extension-2.0.8.data/data/share/jupyter/labextensions/hdsp-agent/static/lib_index_js.e4ff4b5779b5e049f84c.js → hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/lib_index_js.58c1e128ba0b76f41f04.js +153 -130
- hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/lib_index_js.58c1e128ba0b76f41f04.js.map +1 -0
- hdsp_jupyter_extension-2.0.8.data/data/share/jupyter/labextensions/hdsp-agent/static/remoteEntry.020cdb0b864cfaa4e41e.js → hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/remoteEntry.9da31d1134a53b0c4af5.js +6 -6
- hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/remoteEntry.9da31d1134a53b0c4af5.js.map +1 -0
- {hdsp_jupyter_extension-2.0.8.dist-info → hdsp_jupyter_extension-2.0.11.dist-info}/METADATA +1 -3
- hdsp_jupyter_extension-2.0.11.dist-info/RECORD +144 -0
- jupyter_ext/__init__.py +21 -11
- jupyter_ext/_version.py +1 -1
- jupyter_ext/handlers.py +69 -50
- jupyter_ext/labextension/build_log.json +1 -1
- jupyter_ext/labextension/package.json +2 -2
- jupyter_ext/labextension/static/{frontend_styles_index_js.8740a527757068814573.js → frontend_styles_index_js.2d9fb488c82498c45c2d.js} +93 -4
- jupyter_ext/labextension/static/frontend_styles_index_js.2d9fb488c82498c45c2d.js.map +1 -0
- jupyter_ext/labextension/static/{lib_index_js.e4ff4b5779b5e049f84c.js → lib_index_js.58c1e128ba0b76f41f04.js} +153 -130
- jupyter_ext/labextension/static/lib_index_js.58c1e128ba0b76f41f04.js.map +1 -0
- jupyter_ext/labextension/static/{remoteEntry.020cdb0b864cfaa4e41e.js → remoteEntry.9da31d1134a53b0c4af5.js} +6 -6
- jupyter_ext/labextension/static/remoteEntry.9da31d1134a53b0c4af5.js.map +1 -0
- hdsp_jupyter_extension-2.0.8.data/data/share/jupyter/labextensions/hdsp-agent/static/frontend_styles_index_js.8740a527757068814573.js.map +0 -1
- hdsp_jupyter_extension-2.0.8.data/data/share/jupyter/labextensions/hdsp-agent/static/lib_index_js.e4ff4b5779b5e049f84c.js.map +0 -1
- hdsp_jupyter_extension-2.0.8.data/data/share/jupyter/labextensions/hdsp-agent/static/remoteEntry.020cdb0b864cfaa4e41e.js.map +0 -1
- hdsp_jupyter_extension-2.0.8.dist-info/RECORD +0 -144
- jupyter_ext/labextension/static/frontend_styles_index_js.8740a527757068814573.js.map +0 -1
- jupyter_ext/labextension/static/lib_index_js.e4ff4b5779b5e049f84c.js.map +0 -1
- jupyter_ext/labextension/static/remoteEntry.020cdb0b864cfaa4e41e.js.map +0 -1
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/etc/jupyter/jupyter_server_config.d/hdsp_jupyter_extension.json +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/install.json +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/node_modules_emotion_use-insertion-effect-with-fallbacks_dist_emotion-use-insertion-effect-wi-3ba6b80.c095373419d05e6f141a.js +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/node_modules_emotion_use-insertion-effect-with-fallbacks_dist_emotion-use-insertion-effect-wi-3ba6b80.c095373419d05e6f141a.js.map +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/node_modules_emotion_use-insertion-effect-with-fallbacks_dist_emotion-use-insertion-effect-wi-3ba6b81.61e75fb98ecff46cf836.js +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/node_modules_emotion_use-insertion-effect-with-fallbacks_dist_emotion-use-insertion-effect-wi-3ba6b81.61e75fb98ecff46cf836.js.map +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/style.js +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_babel_runtime_helpers_esm_extends_js-node_modules_emotion_serialize_dist-051195.e2553aab0c3963b83dd7.js +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_babel_runtime_helpers_esm_extends_js-node_modules_emotion_serialize_dist-051195.e2553aab0c3963b83dd7.js.map +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_cache_dist_emotion-cache_browser_development_esm_js.24edcc52a1c014a8a5f0.js +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_cache_dist_emotion-cache_browser_development_esm_js.24edcc52a1c014a8a5f0.js.map +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_react_dist_emotion-react_browser_development_esm_js.19ecf6babe00caff6b8a.js +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_react_dist_emotion-react_browser_development_esm_js.19ecf6babe00caff6b8a.js.map +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_styled_dist_emotion-styled_browser_development_esm_js.661fb5836f4978a7c6e1.js +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_styled_dist_emotion-styled_browser_development_esm_js.661fb5836f4978a7c6e1.js.map +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_mui_material_index_js.985697e0162d8d088ca2.js +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_mui_material_index_js.985697e0162d8d088ca2.js.map +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_mui_material_utils_createSvgIcon_js.1f5038488cdfd8b3a85d.js +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_mui_material_utils_createSvgIcon_js.1f5038488cdfd8b3a85d.js.map +0 -0
- {hdsp_jupyter_extension-2.0.8.dist-info → hdsp_jupyter_extension-2.0.11.dist-info}/WHEEL +0 -0
- {hdsp_jupyter_extension-2.0.8.dist-info → hdsp_jupyter_extension-2.0.11.dist-info}/licenses/LICENSE +0 -0
hdsp_agent_core/interfaces.py
CHANGED
@@ -73,7 +73,9 @@ class IAgentService(ABC):
         ...

     @abstractmethod
-    async def validate_code(self, code: str, notebook_context: Optional[Dict] = None) -> Dict[str, Any]:
+    async def validate_code(
+        self, code: str, notebook_context: Optional[Dict] = None
+    ) -> Dict[str, Any]:
         """
         Validate code before execution.

@@ -154,7 +156,7 @@ class IRAGService(ABC):
         self,
         query: str,
         detected_libraries: Optional[List[str]] = None,
-        max_results: int = 5
+        max_results: int = 5,
     ) -> Optional[str]:
         """
         Get formatted context for a query (for prompt injection).
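
Both hunks above are formatter reflow only (wrapped signatures plus a trailing comma); the abstract interface is unchanged. For orientation, a minimal sketch of a concrete class satisfying the reflowed validate_code signature; the body and the returned dict keys are assumptions for illustration, not taken from the package:

from typing import Any, Dict, Optional


class StubAgentService:  # would subclass IAgentService in the real package
    async def validate_code(
        self, code: str, notebook_context: Optional[Dict] = None
    ) -> Dict[str, Any]:
        # Hypothetical body: a syntax-only check via compile(); the real
        # service presumably validates more than syntax.
        try:
            compile(code, "<cell>", "exec")
        except SyntaxError as exc:
            return {"valid": False, "error": str(exc)}
        return {"valid": True, "error": None}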
hdsp_agent_core/knowledge/__init__.py
CHANGED

@@ -4,18 +4,18 @@ HDSP Agent Core - Knowledge Base
 Deterministic library detection and API guide management.
 """

+from .chunking import (
+    DocumentChunker,
+    chunk_file,
+)
 from .loader import (
+    LIBRARY_DESCRIPTIONS,
     KnowledgeBase,
     KnowledgeLoader,
     LibraryDetector,
     get_knowledge_base,
     get_knowledge_loader,
     get_library_detector,
-    LIBRARY_DESCRIPTIONS,
-)
-from .chunking import (
-    DocumentChunker,
-    chunk_file,
-)
 )

 __all__ = [
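
This hunk is an isort pass: the .chunking import moves ahead of .loader, and LIBRARY_DESCRIPTIONS moves to its alphabetical slot. The exported names are untouched, so a consumer import such as the following sketch behaves identically on 2.0.8 and 2.0.11:

from hdsp_agent_core.knowledge import (
    LIBRARY_DESCRIPTIONS,
    DocumentChunker,
    chunk_file,
    get_library_detector,
)

print(LIBRARY_DESCRIPTIONS["dask"])  # unchanged description string from loader.py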
hdsp_agent_core/knowledge/chunking.py
CHANGED

@@ -9,10 +9,10 @@ Provides intelligent chunking strategies:
 Each strategy preserves context and adds relevant metadata.
 """

-import re
 import logging
-from typing import Dict, List, Optional, Any, TYPE_CHECKING
+import re
 from pathlib import Path
+from typing import TYPE_CHECKING, Any, Dict, List, Optional

 if TYPE_CHECKING:
     from hdsp_agent_core.models.rag import ChunkingConfig

@@ -36,13 +36,14 @@ class DocumentChunker:

     def __init__(self, config: Optional["ChunkingConfig"] = None):
         from hdsp_agent_core.models.rag import ChunkingConfig
+
         self._config = config or ChunkingConfig()

     def chunk_document(
         self,
         content: str,
         metadata: Optional[Dict[str, Any]] = None,
-        file_type: Optional[str] = None
+        file_type: Optional[str] = None,
     ) -> List[Dict[str, Any]]:
         """
         Chunk document based on content type.

@@ -75,13 +76,12 @@ class DocumentChunker:
         for chunk in chunks:
             chunk_content = chunk["content"].strip()
             if len(chunk_content) >= self._config.min_chunk_size:
-                result.append({
-                    "content": chunk_content,
-                    "metadata": {
-                        **metadata,
-                        **chunk.get("metadata", {})
+                result.append(
+                    {
+                        "content": chunk_content,
+                        "metadata": {**metadata, **chunk.get("metadata", {})},
                     }
-                })
+                )

         logger.debug(f"Chunked document into {len(result)} chunks (type={file_type})")
         return result

@@ -108,9 +108,9 @@ class DocumentChunker:
         - Respect max chunk size with sub-splitting
         """
         # Pattern for markdown headers
-        header_pattern = r'^(#{1,6})\s+(.+)$'
+        header_pattern = r"^(#{1,6})\s+(.+)$"

-        lines = content.split('\n')
+        lines = content.split("\n")
         chunks = []
         current_chunk_lines = []
         current_headers = []  # Stack of (level, text)

@@ -121,13 +121,19 @@ class DocumentChunker:
             if header_match:
                 # Save current chunk if it has content
                 if current_chunk_lines:
-                    chunk_content = '\n'.join(current_chunk_lines).strip()
+                    chunk_content = "\n".join(current_chunk_lines).strip()
                     if chunk_content:
-                        section_path = " > ".join(h[1] for h in current_headers) if current_headers else "Introduction"
-                        chunks.append({
-                            "content": chunk_content,
-                            "metadata": {"section": section_path}
-                        })
+                        section_path = (
+                            " > ".join(h[1] for h in current_headers)
+                            if current_headers
+                            else "Introduction"
+                        )
+                        chunks.append(
+                            {
+                                "content": chunk_content,
+                                "metadata": {"section": section_path},
+                            }
+                        )

                 # Update header stack
                 level = len(header_match.group(1))

@@ -143,26 +149,35 @@ class DocumentChunker:
             current_chunk_lines.append(line)

             # Check chunk size limit
-            chunk_text = '\n'.join(current_chunk_lines)
+            chunk_text = "\n".join(current_chunk_lines)
             if len(chunk_text) >= self._config.max_chunk_size:
-                section_path = " > ".join(h[1] for h in current_headers) if current_headers else "Content"
-                chunks.append({
-                    "content": chunk_text.strip(),
-                    "metadata": {"section": section_path}
-                })
+                section_path = (
+                    " > ".join(h[1] for h in current_headers)
+                    if current_headers
+                    else "Content"
+                )
+                chunks.append(
+                    {
+                        "content": chunk_text.strip(),
+                        "metadata": {"section": section_path},
+                    }
+                )
                 # Keep overlap for context continuity
                 overlap_lines = self._get_overlap_lines(current_chunk_lines)
                 current_chunk_lines = overlap_lines

         # Save final chunk
         if current_chunk_lines:
-            chunk_content = '\n'.join(current_chunk_lines).strip()
+            chunk_content = "\n".join(current_chunk_lines).strip()
             if chunk_content:
-                section_path = " > ".join(h[1] for h in current_headers) if current_headers else "Content"
-                chunks.append({
-                    "content": chunk_content,
-                    "metadata": {"section": section_path}
-                })
+                section_path = (
+                    " > ".join(h[1] for h in current_headers)
+                    if current_headers
+                    else "Content"
+                )
+                chunks.append(
+                    {"content": chunk_content, "metadata": {"section": section_path}}
+                )

         return chunks

@@ -176,9 +191,9 @@ class DocumentChunker:
         - Preserve import statements and module docstrings
         """
         # Pattern for class and function definitions (top-level only)
-        def_pattern = r'^(class|def|async\s+def)\s+(\w+)'
+        def_pattern = r"^(class|def|async\s+def)\s+(\w+)"

-        lines = content.split('\n')
+        lines = content.split("\n")
         chunks = []
         current_chunk_lines = []
         current_def = None

@@ -193,15 +208,21 @@ class DocumentChunker:
             def_match = re.match(def_pattern, line)

             # Check if this is a top-level definition (not indented)
-            if def_match and not line.startswith((' ', '\t')) and not in_multiline_string:
+            if (
+                def_match
+                and not line.startswith((" ", "\t"))
+                and not in_multiline_string
+            ):
                 # Save current chunk
                 if current_chunk_lines:
-                    chunk_content = '\n'.join(current_chunk_lines).strip()
+                    chunk_content = "\n".join(current_chunk_lines).strip()
                     if chunk_content:
-                        chunks.append({
-                            "content": chunk_content,
-                            "metadata": {"definition": current_def or "module"}
-                        })
+                        chunks.append(
+                            {
+                                "content": chunk_content,
+                                "metadata": {"definition": current_def or "module"},
+                            }
+                        )

                 current_def = f"{def_match.group(1)} {def_match.group(2)}"
                 current_chunk_lines = [line]

@@ -209,22 +230,26 @@ class DocumentChunker:
                 current_chunk_lines.append(line)

                 # Check max chunk size
-                if len('\n'.join(current_chunk_lines)) >= self._config.max_chunk_size:
-                    chunks.append({
-                        "content": '\n'.join(current_chunk_lines).strip(),
-                        "metadata": {"definition": current_def or "module"}
-                    })
+                if len("\n".join(current_chunk_lines)) >= self._config.max_chunk_size:
+                    chunks.append(
+                        {
+                            "content": "\n".join(current_chunk_lines).strip(),
+                            "metadata": {"definition": current_def or "module"},
+                        }
+                    )
                     overlap_lines = self._get_overlap_lines(current_chunk_lines)
                     current_chunk_lines = overlap_lines

         # Save final chunk
         if current_chunk_lines:
-            chunk_content = '\n'.join(current_chunk_lines).strip()
+            chunk_content = "\n".join(current_chunk_lines).strip()
             if chunk_content:
-                chunks.append({
-                    "content": chunk_content,
-                    "metadata": {"definition": current_def or "module"}
-                })
+                chunks.append(
+                    {
+                        "content": chunk_content,
+                        "metadata": {"definition": current_def or "module"},
+                    }
+                )

         return chunks

@@ -251,10 +276,12 @@ class DocumentChunker:
         if end >= len(content):
             chunk_content = content[start:].strip()
             if chunk_content:
-                chunks.append({
-                    "content": chunk_content,
-                    "metadata": {"chunk_index": chunk_index}
-                })
+                chunks.append(
+                    {
+                        "content": chunk_content,
+                        "metadata": {"chunk_index": chunk_index},
+                    }
+                )
             break

         # Try to find a good break point

@@ -263,10 +290,9 @@ class DocumentChunker:

         chunk_content = content[start:end].strip()
         if chunk_content:
-            chunks.append({
-                "content": chunk_content,
-                "metadata": {"chunk_index": chunk_index}
-            })
+            chunks.append(
+                {"content": chunk_content, "metadata": {"chunk_index": chunk_index}}
+            )

         # Move start with overlap
         start = max(end - overlap, start + 1)

@@ -288,12 +314,12 @@ class DocumentChunker:
         search_start = start + (end - start) // 2  # Search in latter half

         # Try paragraph break (double newline)
-        para_break = content.rfind('\n\n', search_start, end)
+        para_break = content.rfind("\n\n", search_start, end)
         if para_break > search_start:
            return para_break + 2

         # Try sentence break (. or ! or ? followed by space or newline)
-        sentence_pattern = r'[.!?]\s'
+        sentence_pattern = r"[.!?]\s"
         for match in re.finditer(sentence_pattern, content[search_start:end]):
             last_match_end = search_start + match.end()
         else:

@@ -301,15 +327,15 @@ class DocumentChunker:

         # Find last sentence break
         for i in range(end - 1, search_start, -1):
-            if i + 1 < len(content) and content[i] in '.!?' and content[i + 1] in ' \n':
+            if i + 1 < len(content) and content[i] in ".!?" and content[i + 1] in " \n":
                 return i + 1

         # Try word break (space or newline)
-        space_break = content.rfind(' ', search_start, end)
+        space_break = content.rfind(" ", search_start, end)
         if space_break > search_start:
             return space_break + 1

-        newline_break = content.rfind('\n', search_start, end)
+        newline_break = content.rfind("\n", search_start, end)
         if newline_break > search_start:
             return newline_break + 1

@@ -333,7 +359,7 @@
 def chunk_file(
     file_path: Path,
     config: Optional["ChunkingConfig"] = None,
-    base_metadata: Optional[Dict[str, Any]] = None
+    base_metadata: Optional[Dict[str, Any]] = None,
 ) -> List[Dict[str, Any]]:
     """
     Convenience function to chunk a file directly.
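
Every chunking.py hunk is a Black-style reflow: quote normalization, trailing commas, and re-wrapped dict literals; the chunking logic itself is unchanged. The chunk shape visible throughout is {"content": ..., "metadata": {...}}, where markdown input gets a "section" breadcrumb and Python source a "definition" label. A usage sketch against that interface; the file_type value "markdown" is an assumption, since only the parameter name appears in the diff:

from hdsp_agent_core.knowledge.chunking import DocumentChunker

chunker = DocumentChunker()  # falls back to the default ChunkingConfig
doc = "# Guide\n\nIntro text.\n\n## Usage\n\nDetails here."
chunks = chunker.chunk_document(
    doc, metadata={"source": "guide.md"}, file_type="markdown"
)
for chunk in chunks:
    # Base metadata is merged with per-chunk metadata, e.g.
    # {"source": "guide.md", "section": "Guide > Usage"}
    print(chunk["metadata"], len(chunk["content"]))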
hdsp_agent_core/knowledge/loader.py
CHANGED

@@ -5,19 +5,19 @@ Keyword matching + regex based library detection for loading appropriate API guides
 (No LLM calls - saves tokens and improves reliability)
 """

-from pathlib import Path
-from typing import List, Dict, Optional, Set
 import re
+from pathlib import Path
+from typing import Dict, List, Optional, Set

 # Library descriptions (reference)
 LIBRARY_DESCRIPTIONS: Dict[str, str] = {
-    'matplotlib': 'Visualization, graphs, charts, plot, histogram, scatter plot, EDA, data visualization, used with seaborn',
-    'dask': 'Large-scale data processing, pandas replacement, distributed processing, lazy evaluation, dd.read_csv',
-    'polars': 'High-performance DataFrame, pandas replacement, Rust-based, pl.read_csv',
-    'pyspark': 'Spark-based distributed processing, big data, SparkSession',
-    'vaex': 'Large-scale data exploration, out-of-core processing',
-    'modin': 'pandas acceleration, parallel processing',
-    'ray': 'Distributed computing, parallel processing framework',
+    "matplotlib": "Visualization, graphs, charts, plot, histogram, scatter plot, EDA, data visualization, used with seaborn",
+    "dask": "Large-scale data processing, pandas replacement, distributed processing, lazy evaluation, dd.read_csv",
+    "polars": "High-performance DataFrame, pandas replacement, Rust-based, pl.read_csv",
+    "pyspark": "Spark-based distributed processing, big data, SparkSession",
+    "vaex": "Large-scale data exploration, out-of-core processing",
+    "modin": "pandas acceleration, parallel processing",
+    "ray": "Distributed computing, parallel processing framework",
 }


@@ -29,90 +29,90 @@ class LibraryDetector:

     # Explicit library mention patterns (highest priority)
     EXPLICIT_PATTERNS: Dict[str, str] = {
-        r'\bdask\b': 'dask',
-        r'\bpolars\b': 'polars',
-        r'\bpyspark\b': 'pyspark',
-        r'\bvaex\b': 'vaex',
-        r'\bmodin\b': 'modin',
-        r'\bray\b': 'ray',
-        r'\bmatplotlib\b': 'matplotlib',
-        r'\bseaborn\b': 'matplotlib',  # seaborn -> matplotlib guide
-        r'\bplt\.': 'matplotlib',
-        r'\bdd\.read': 'dask',
-        r'\bpl\.read': 'polars',
-        r'\bpl\.DataFrame': 'polars',
+        r"\bdask\b": "dask",
+        r"\bpolars\b": "polars",
+        r"\bpyspark\b": "pyspark",
+        r"\bvaex\b": "vaex",
+        r"\bmodin\b": "modin",
+        r"\bray\b": "ray",
+        r"\bmatplotlib\b": "matplotlib",
+        r"\bseaborn\b": "matplotlib",  # seaborn -> matplotlib guide
+        r"\bplt\.": "matplotlib",
+        r"\bdd\.read": "dask",
+        r"\bpl\.read": "polars",
+        r"\bpl\.DataFrame": "polars",
     }

     # Keyword scores per library (0.0 ~ 1.0)
     KEYWORD_SCORES: Dict[str, Dict[str, float]] = {
-        'dask': {
-            '대용량': 0.7,
-            'big data': 0.7,
-            'bigdata': 0.7,
-            '빅데이터': 0.7,
-            'lazy': 0.8,
-            'lazy evaluation': 0.9,
-            'out-of-core': 0.9,
-            'out of core': 0.9,
-            '분산 처리': 0.6,
-            'distributed': 0.6,
-            'parallel dataframe': 0.8,
-            '병렬 데이터프레임': 0.8,
+        "dask": {
+            "대용량": 0.7,
+            "big data": 0.7,
+            "bigdata": 0.7,
+            "빅데이터": 0.7,
+            "lazy": 0.8,
+            "lazy evaluation": 0.9,
+            "out-of-core": 0.9,
+            "out of core": 0.9,
+            "분산 처리": 0.6,
+            "distributed": 0.6,
+            "parallel dataframe": 0.8,
+            "병렬 데이터프레임": 0.8,
         },
-        'polars': {
-            'rust 기반': 0.9,
-            'rust-based': 0.9,
-            'fast dataframe': 0.7,
-            '고성능 dataframe': 0.7,
-            '빠른 데이터프레임': 0.7,
+        "polars": {
+            "rust 기반": 0.9,
+            "rust-based": 0.9,
+            "fast dataframe": 0.7,
+            "고성능 dataframe": 0.7,
+            "빠른 데이터프레임": 0.7,
         },
-        'matplotlib': {
-            '시각화': 0.7,
-            'visualization': 0.7,
-            'visualize': 0.7,
-            'plot': 0.7,
-            'chart': 0.7,
-            'graph': 0.6,
-            '그래프': 0.6,
-            '차트': 0.7,
-            'histogram': 0.8,
-            '히스토그램': 0.8,
-            'scatter': 0.8,
-            '산점도': 0.8,
-            'line plot': 0.8,
-            '라인 플롯': 0.8,
-            'bar chart': 0.8,
-            '막대 그래프': 0.8,
-            'eda': 0.5,
-            '탐색적 데이터 분석': 0.6,
-            'figure': 0.5,
-            'subplot': 0.8,
-            'heatmap': 0.7,
-            '히트맵': 0.7,
+        "matplotlib": {
+            "시각화": 0.7,
+            "visualization": 0.7,
+            "visualize": 0.7,
+            "plot": 0.7,
+            "chart": 0.7,
+            "graph": 0.6,
+            "그래프": 0.6,
+            "차트": 0.7,
+            "histogram": 0.8,
+            "히스토그램": 0.8,
+            "scatter": 0.8,
+            "산점도": 0.8,
+            "line plot": 0.8,
+            "라인 플롯": 0.8,
+            "bar chart": 0.8,
+            "막대 그래프": 0.8,
+            "eda": 0.5,
+            "탐색적 데이터 분석": 0.6,
+            "figure": 0.5,
+            "subplot": 0.8,
+            "heatmap": 0.7,
+            "히트맵": 0.7,
         },
-        'pyspark': {
-            'spark': 0.9,
-            'sparksession': 0.95,
-            'spark session': 0.95,
-            'rdd': 0.9,
-            'hadoop': 0.7,
-            '클러스터': 0.6,
-            'cluster': 0.6,
+        "pyspark": {
+            "spark": 0.9,
+            "sparksession": 0.95,
+            "spark session": 0.95,
+            "rdd": 0.9,
+            "hadoop": 0.7,
+            "클러스터": 0.6,
+            "cluster": 0.6,
         },
-        'vaex': {
-            'vaex': 1.0,
-            'memory mapping': 0.8,
-            '메모리 매핑': 0.8,
+        "vaex": {
+            "vaex": 1.0,
+            "memory mapping": 0.8,
+            "메모리 매핑": 0.8,
        },
-        'modin': {
-            'modin': 1.0,
-            'pandas 가속': 0.8,
-            'pandas acceleration': 0.8,
+        "modin": {
+            "modin": 1.0,
+            "pandas 가속": 0.8,
+            "pandas acceleration": 0.8,
         },
-        'ray': {
-            'ray': 0.9,
-            '분산 컴퓨팅': 0.7,
-            'distributed computing': 0.7,
+        "ray": {
+            "ray": 0.9,
+            "분산 컴퓨팅": 0.7,
+            "distributed computing": 0.7,
         },
     }

@@ -123,7 +123,7 @@ class LibraryDetector:
         self,
         request: str,
         available_libraries: List[str],
-        imported_libraries: List[str] = None
+        imported_libraries: List[str] = None,
     ) -> List[str]:
         """
         Detect required libraries from user request.

@@ -141,7 +141,9 @@ class LibraryDetector:

         # Step 1: Explicit pattern matching (highest priority)
         for pattern, lib in self.EXPLICIT_PATTERNS.items():
-            if lib in available_libraries and re.search(pattern, request, re.IGNORECASE):
+            if lib in available_libraries and re.search(
+                pattern, request, re.IGNORECASE
+            ):
                 detected.add(lib)

         # Step 2: Keyword scoring

@@ -162,8 +164,8 @@ class LibraryDetector:
         for lib in imported_libraries:
             lib_lower = lib.lower()
             # seaborn -> matplotlib
-            if lib_lower == 'seaborn' and 'matplotlib' in available_libraries:
-                detected.add('matplotlib')
+            if lib_lower == "seaborn" and "matplotlib" in available_libraries:
+                detected.add("matplotlib")
             elif lib_lower in available_libraries:
                 detected.add(lib_lower)

@@ -183,7 +185,7 @@ def get_library_detector() -> LibraryDetector:


 # LLM library detection prompt
-LIBRARY_DETECTION_PROMPT = '''Analyze the user's request and determine which libraries to use for code generation.
+LIBRARY_DETECTION_PROMPT = """Analyze the user's request and determine which libraries to use for code generation.

 ## Available Library API Guides:
 {library_list}

@@ -205,7 +207,7 @@ LIBRARY_DETECTION_PROMPT = '''Analyze the user's request and determine which libraries to use for code generation.
 {{"libraries": ["library1", "library2"]}}

 Empty array is also valid: {{"libraries": []}}
-'''
+"""


 class KnowledgeBase:

@@ -216,7 +218,7 @@ class KnowledgeBase:
             self.knowledge_dir = Path(knowledge_dir)
         else:
             # Default path: knowledge/libraries
-            self.knowledge_dir = Path(__file__).parent / 'libraries'
+            self.knowledge_dir = Path(__file__).parent / "libraries"

         self._cache: Dict[str, str] = {}

@@ -225,19 +227,19 @@ class KnowledgeBase:
         available = self.list_available_libraries()
         lines = []
         for lib in available:
-            desc = LIBRARY_DESCRIPTIONS.get(lib, 'Other library')
+            desc = LIBRARY_DESCRIPTIONS.get(lib, "Other library")
             lines.append(f"- **{lib}**: {desc}")
         return "\n".join(lines)

-    def get_detection_prompt(self, request: str, imported_libraries: List[str] = None) -> str:
+    def get_detection_prompt(
+        self, request: str, imported_libraries: List[str] = None
+    ) -> str:
         """Generate LLM library detection prompt"""
         library_list = self.get_library_list_for_prompt()
         imported = ", ".join(imported_libraries) if imported_libraries else "None"

         return LIBRARY_DETECTION_PROMPT.format(
-            library_list=library_list,
-            request=request,
-            imported_libraries=imported
+            library_list=library_list, request=request, imported_libraries=imported
         )

     def load_library_guide(self, library: str) -> Optional[str]:

@@ -255,9 +257,9 @@ class KnowledgeBase:
             return self._cache[library]

         # Load file
-        file_path = self.knowledge_dir / f'{library}.md'
+        file_path = self.knowledge_dir / f"{library}.md"
         if file_path.exists():
-            content = file_path.read_text(encoding='utf-8')
+            content = file_path.read_text(encoding="utf-8")
             self._cache[library] = content
             return content

@@ -274,7 +276,7 @@ class KnowledgeBase:
             Combined guide string
         """
         if not libraries:
-            return ''
+            return ""

         guides = []
         for lib in sorted(libraries):

@@ -283,7 +285,7 @@ class KnowledgeBase:
             guides.append(f"## {lib.upper()} Library API Guide\n\n{guide}")

         if not guides:
-            return ''
+            return ""

         return "\n\n---\n\n".join(guides)

@@ -300,7 +302,7 @@ class KnowledgeBase:
         knowledge = self.load_libraries_knowledge(libraries)

         if not knowledge:
-            return ''
+            return ""

         return f"""
 ## 📚 Library API Reference (MUST follow!)

@@ -317,7 +319,7 @@ Follow the API usage in the guides below. Avoid ❌ incorrect code and use ✅ correct code.
         if not self.knowledge_dir.exists():
             return []

-        return [f.stem for f in self.knowledge_dir.glob('*.md')]
+        return [f.stem for f in self.knowledge_dir.glob("*.md")]


 # Singleton instance
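
loader.py gets the same treatment: isort-ordered imports, single quotes normalized to double, and long signatures and conditionals wrapped; the detection patterns, keyword weights, and prompt template are unchanged apart from quoting. A usage sketch built only from the KnowledgeBase methods visible in the hunks above (the no-argument constructor and the default knowledge/libraries path are read off the else branch in __init__, so treat them as assumptions):

from hdsp_agent_core.knowledge.loader import KnowledgeBase

kb = KnowledgeBase()  # assumed default: the knowledge/libraries dir next to loader.py
print(kb.list_available_libraries())         # *.md stems, e.g. ["dask", "matplotlib"]
guide = kb.load_library_guide("matplotlib")  # cached guide text, or None if missing
prompt = kb.get_detection_prompt(
    "draw a scatter plot of the dataframe", imported_libraries=["seaborn"]
)
print(prompt.splitlines()[0])  # first line of LIBRARY_DETECTION_PROMPT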