hdsp-jupyter-extension 2.0.8__py3-none-any.whl → 2.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_server/core/notebook_generator.py +4 -4
- agent_server/core/rag_manager.py +12 -3
- agent_server/core/retriever.py +2 -1
- agent_server/core/vllm_embedding_service.py +8 -5
- agent_server/langchain/ARCHITECTURE.md +7 -51
- agent_server/langchain/agent.py +31 -20
- agent_server/langchain/custom_middleware.py +234 -31
- agent_server/langchain/hitl_config.py +5 -8
- agent_server/langchain/logging_utils.py +7 -7
- agent_server/langchain/prompts.py +106 -120
- agent_server/langchain/tools/__init__.py +1 -10
- agent_server/langchain/tools/file_tools.py +9 -61
- agent_server/langchain/tools/jupyter_tools.py +0 -1
- agent_server/langchain/tools/lsp_tools.py +8 -8
- agent_server/langchain/tools/resource_tools.py +12 -12
- agent_server/langchain/tools/search_tools.py +3 -158
- agent_server/prompts/file_action_prompts.py +8 -8
- agent_server/routers/langchain_agent.py +200 -125
- hdsp_agent_core/__init__.py +46 -47
- hdsp_agent_core/factory.py +6 -10
- hdsp_agent_core/interfaces.py +4 -2
- hdsp_agent_core/knowledge/__init__.py +5 -5
- hdsp_agent_core/knowledge/chunking.py +87 -61
- hdsp_agent_core/knowledge/loader.py +103 -101
- hdsp_agent_core/llm/service.py +192 -107
- hdsp_agent_core/managers/config_manager.py +16 -22
- hdsp_agent_core/managers/session_manager.py +5 -4
- hdsp_agent_core/models/__init__.py +12 -12
- hdsp_agent_core/models/agent.py +15 -8
- hdsp_agent_core/models/common.py +1 -2
- hdsp_agent_core/models/rag.py +48 -111
- hdsp_agent_core/prompts/__init__.py +12 -12
- hdsp_agent_core/prompts/cell_action_prompts.py +9 -7
- hdsp_agent_core/services/agent_service.py +10 -8
- hdsp_agent_core/services/chat_service.py +10 -6
- hdsp_agent_core/services/rag_service.py +3 -6
- hdsp_agent_core/tests/conftest.py +4 -1
- hdsp_agent_core/tests/test_factory.py +2 -2
- hdsp_agent_core/tests/test_services.py +12 -19
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/build_log.json +1 -1
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/package.json +2 -2
- hdsp_jupyter_extension-2.0.8.data/data/share/jupyter/labextensions/hdsp-agent/static/frontend_styles_index_js.8740a527757068814573.js → hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/frontend_styles_index_js.2d9fb488c82498c45c2d.js +93 -4
- hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/frontend_styles_index_js.2d9fb488c82498c45c2d.js.map +1 -0
- hdsp_jupyter_extension-2.0.8.data/data/share/jupyter/labextensions/hdsp-agent/static/lib_index_js.e4ff4b5779b5e049f84c.js → hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/lib_index_js.58c1e128ba0b76f41f04.js +153 -130
- hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/lib_index_js.58c1e128ba0b76f41f04.js.map +1 -0
- hdsp_jupyter_extension-2.0.8.data/data/share/jupyter/labextensions/hdsp-agent/static/remoteEntry.020cdb0b864cfaa4e41e.js → hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/remoteEntry.9da31d1134a53b0c4af5.js +6 -6
- hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/remoteEntry.9da31d1134a53b0c4af5.js.map +1 -0
- {hdsp_jupyter_extension-2.0.8.dist-info → hdsp_jupyter_extension-2.0.11.dist-info}/METADATA +1 -3
- hdsp_jupyter_extension-2.0.11.dist-info/RECORD +144 -0
- jupyter_ext/__init__.py +21 -11
- jupyter_ext/_version.py +1 -1
- jupyter_ext/handlers.py +69 -50
- jupyter_ext/labextension/build_log.json +1 -1
- jupyter_ext/labextension/package.json +2 -2
- jupyter_ext/labextension/static/{frontend_styles_index_js.8740a527757068814573.js → frontend_styles_index_js.2d9fb488c82498c45c2d.js} +93 -4
- jupyter_ext/labextension/static/frontend_styles_index_js.2d9fb488c82498c45c2d.js.map +1 -0
- jupyter_ext/labextension/static/{lib_index_js.e4ff4b5779b5e049f84c.js → lib_index_js.58c1e128ba0b76f41f04.js} +153 -130
- jupyter_ext/labextension/static/lib_index_js.58c1e128ba0b76f41f04.js.map +1 -0
- jupyter_ext/labextension/static/{remoteEntry.020cdb0b864cfaa4e41e.js → remoteEntry.9da31d1134a53b0c4af5.js} +6 -6
- jupyter_ext/labextension/static/remoteEntry.9da31d1134a53b0c4af5.js.map +1 -0
- hdsp_jupyter_extension-2.0.8.data/data/share/jupyter/labextensions/hdsp-agent/static/frontend_styles_index_js.8740a527757068814573.js.map +0 -1
- hdsp_jupyter_extension-2.0.8.data/data/share/jupyter/labextensions/hdsp-agent/static/lib_index_js.e4ff4b5779b5e049f84c.js.map +0 -1
- hdsp_jupyter_extension-2.0.8.data/data/share/jupyter/labextensions/hdsp-agent/static/remoteEntry.020cdb0b864cfaa4e41e.js.map +0 -1
- hdsp_jupyter_extension-2.0.8.dist-info/RECORD +0 -144
- jupyter_ext/labextension/static/frontend_styles_index_js.8740a527757068814573.js.map +0 -1
- jupyter_ext/labextension/static/lib_index_js.e4ff4b5779b5e049f84c.js.map +0 -1
- jupyter_ext/labextension/static/remoteEntry.020cdb0b864cfaa4e41e.js.map +0 -1
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/etc/jupyter/jupyter_server_config.d/hdsp_jupyter_extension.json +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/install.json +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/node_modules_emotion_use-insertion-effect-with-fallbacks_dist_emotion-use-insertion-effect-wi-3ba6b80.c095373419d05e6f141a.js +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/node_modules_emotion_use-insertion-effect-with-fallbacks_dist_emotion-use-insertion-effect-wi-3ba6b80.c095373419d05e6f141a.js.map +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/node_modules_emotion_use-insertion-effect-with-fallbacks_dist_emotion-use-insertion-effect-wi-3ba6b81.61e75fb98ecff46cf836.js +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/node_modules_emotion_use-insertion-effect-with-fallbacks_dist_emotion-use-insertion-effect-wi-3ba6b81.61e75fb98ecff46cf836.js.map +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/style.js +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_babel_runtime_helpers_esm_extends_js-node_modules_emotion_serialize_dist-051195.e2553aab0c3963b83dd7.js +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_babel_runtime_helpers_esm_extends_js-node_modules_emotion_serialize_dist-051195.e2553aab0c3963b83dd7.js.map +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_cache_dist_emotion-cache_browser_development_esm_js.24edcc52a1c014a8a5f0.js +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_cache_dist_emotion-cache_browser_development_esm_js.24edcc52a1c014a8a5f0.js.map +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_react_dist_emotion-react_browser_development_esm_js.19ecf6babe00caff6b8a.js +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_react_dist_emotion-react_browser_development_esm_js.19ecf6babe00caff6b8a.js.map +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_styled_dist_emotion-styled_browser_development_esm_js.661fb5836f4978a7c6e1.js +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_styled_dist_emotion-styled_browser_development_esm_js.661fb5836f4978a7c6e1.js.map +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_mui_material_index_js.985697e0162d8d088ca2.js +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_mui_material_index_js.985697e0162d8d088ca2.js.map +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_mui_material_utils_createSvgIcon_js.1f5038488cdfd8b3a85d.js +0 -0
- {hdsp_jupyter_extension-2.0.8.data → hdsp_jupyter_extension-2.0.11.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_mui_material_utils_createSvgIcon_js.1f5038488cdfd8b3a85d.js.map +0 -0
- {hdsp_jupyter_extension-2.0.8.dist-info → hdsp_jupyter_extension-2.0.11.dist-info}/WHEEL +0 -0
- {hdsp_jupyter_extension-2.0.8.dist-info → hdsp_jupyter_extension-2.0.11.dist-info}/licenses/LICENSE +0 -0
hdsp_agent_core/interfaces.py
CHANGED
@@ -73,7 +73,9 @@ class IAgentService(ABC):
         ...

     @abstractmethod
-    async def validate_code(self, code: str, notebook_context: Optional[Dict] = None) -> Dict[str, Any]:
+    async def validate_code(
+        self, code: str, notebook_context: Optional[Dict] = None
+    ) -> Dict[str, Any]:
         """
         Validate code before execution.

@@ -154,7 +156,7 @@ class IRAGService(ABC):
         self,
         query: str,
         detected_libraries: Optional[List[str]] = None,
-        max_results: int = 5
+        max_results: int = 5,
     ) -> Optional[str]:
         """
         Get formatted context for a query (for prompt injection).
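
Both hunks above are formatter reflow only (wrapped signatures plus a trailing comma); the abstract interface is unchanged. For orientation, a minimal sketch of a concrete class satisfying the reflowed validate_code signature; the body and the returned dict keys are assumptions for illustration, not taken from the package:

from typing import Any, Dict, Optional


class StubAgentService:  # would subclass IAgentService in the real package
    async def validate_code(
        self, code: str, notebook_context: Optional[Dict] = None
    ) -> Dict[str, Any]:
        # Hypothetical body: a syntax-only check via compile(); the real
        # service presumably validates more than syntax.
        try:
            compile(code, "<cell>", "exec")
        except SyntaxError as exc:
            return {"valid": False, "error": str(exc)}
        return {"valid": True, "error": None}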
hdsp_agent_core/knowledge/__init__.py
CHANGED

@@ -4,18 +4,18 @@ HDSP Agent Core - Knowledge Base
 Deterministic library detection and API guide management.
 """

+from .chunking import (
+    DocumentChunker,
+    chunk_file,
+)
 from .loader import (
+    LIBRARY_DESCRIPTIONS,
     KnowledgeBase,
     KnowledgeLoader,
     LibraryDetector,
     get_knowledge_base,
     get_knowledge_loader,
     get_library_detector,
-    LIBRARY_DESCRIPTIONS,
-)
-from .chunking import (
-    DocumentChunker,
-    chunk_file,
-)
 )

 __all__ = [
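
This hunk is an isort pass: the .chunking import moves ahead of .loader, and LIBRARY_DESCRIPTIONS moves to its alphabetical slot. The exported names are untouched, so a consumer import such as the following sketch behaves identically on 2.0.8 and 2.0.11:

from hdsp_agent_core.knowledge import (
    LIBRARY_DESCRIPTIONS,
    DocumentChunker,
    chunk_file,
    get_library_detector,
)

print(LIBRARY_DESCRIPTIONS["dask"])  # unchanged description string from loader.py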
hdsp_agent_core/knowledge/chunking.py
CHANGED

@@ -9,10 +9,10 @@ Provides intelligent chunking strategies:
 Each strategy preserves context and adds relevant metadata.
 """

-import re
 import logging
-from typing import Dict, List, Optional, Any, TYPE_CHECKING
+import re
 from pathlib import Path
+from typing import TYPE_CHECKING, Any, Dict, List, Optional

 if TYPE_CHECKING:
     from hdsp_agent_core.models.rag import ChunkingConfig

@@ -36,13 +36,14 @@ class DocumentChunker:

     def __init__(self, config: Optional["ChunkingConfig"] = None):
         from hdsp_agent_core.models.rag import ChunkingConfig
+
         self._config = config or ChunkingConfig()

     def chunk_document(
         self,
         content: str,
         metadata: Optional[Dict[str, Any]] = None,
-        file_type: Optional[str] = None
+        file_type: Optional[str] = None,
     ) -> List[Dict[str, Any]]:
         """
         Chunk document based on content type.

@@ -75,13 +76,12 @@ class DocumentChunker:
         for chunk in chunks:
             chunk_content = chunk["content"].strip()
             if len(chunk_content) >= self._config.min_chunk_size:
-                result.append({
-                    "content": chunk_content,
-                    "metadata": {
-                        **metadata,
-                        **chunk.get("metadata", {})
+                result.append(
+                    {
+                        "content": chunk_content,
+                        "metadata": {**metadata, **chunk.get("metadata", {})},
                     }
-                })
+                )

         logger.debug(f"Chunked document into {len(result)} chunks (type={file_type})")
         return result

@@ -108,9 +108,9 @@ class DocumentChunker:
         - Respect max chunk size with sub-splitting
         """
         # Pattern for markdown headers
-        header_pattern = r'^(#{1,6})\s+(.+)$'
+        header_pattern = r"^(#{1,6})\s+(.+)$"

-        lines = content.split('\n')
+        lines = content.split("\n")
         chunks = []
         current_chunk_lines = []
         current_headers = []  # Stack of (level, text)

@@ -121,13 +121,19 @@ class DocumentChunker:
             if header_match:
                 # Save current chunk if it has content
                 if current_chunk_lines:
-                    chunk_content = '\n'.join(current_chunk_lines).strip()
+                    chunk_content = "\n".join(current_chunk_lines).strip()
                     if chunk_content:
-                        section_path = " > ".join(h[1] for h in current_headers) if current_headers else "Introduction"
-                        chunks.append({
-                            "content": chunk_content,
-                            "metadata": {"section": section_path}
-                        })
+                        section_path = (
+                            " > ".join(h[1] for h in current_headers)
+                            if current_headers
+                            else "Introduction"
+                        )
+                        chunks.append(
+                            {
+                                "content": chunk_content,
+                                "metadata": {"section": section_path},
+                            }
+                        )

                 # Update header stack
                 level = len(header_match.group(1))

@@ -143,26 +149,35 @@ class DocumentChunker:
             current_chunk_lines.append(line)

             # Check chunk size limit
-            chunk_text = '\n'.join(current_chunk_lines)
+            chunk_text = "\n".join(current_chunk_lines)
             if len(chunk_text) >= self._config.max_chunk_size:
-                section_path = " > ".join(h[1] for h in current_headers) if current_headers else "Content"
-                chunks.append({
-                    "content": chunk_text.strip(),
-                    "metadata": {"section": section_path}
-                })
+                section_path = (
+                    " > ".join(h[1] for h in current_headers)
+                    if current_headers
+                    else "Content"
+                )
+                chunks.append(
+                    {
+                        "content": chunk_text.strip(),
+                        "metadata": {"section": section_path},
+                    }
+                )
                 # Keep overlap for context continuity
                 overlap_lines = self._get_overlap_lines(current_chunk_lines)
                 current_chunk_lines = overlap_lines

         # Save final chunk
         if current_chunk_lines:
-            chunk_content = '\n'.join(current_chunk_lines).strip()
+            chunk_content = "\n".join(current_chunk_lines).strip()
             if chunk_content:
-                section_path = " > ".join(h[1] for h in current_headers) if current_headers else "Content"
-                chunks.append({
-                    "content": chunk_content,
-                    "metadata": {"section": section_path}
-                })
+                section_path = (
+                    " > ".join(h[1] for h in current_headers)
+                    if current_headers
+                    else "Content"
+                )
+                chunks.append(
+                    {"content": chunk_content, "metadata": {"section": section_path}}
+                )

         return chunks

@@ -176,9 +191,9 @@ class DocumentChunker:
         - Preserve import statements and module docstrings
         """
         # Pattern for class and function definitions (top-level only)
-        def_pattern = r'^(class|def|async\s+def)\s+(\w+)'
+        def_pattern = r"^(class|def|async\s+def)\s+(\w+)"

-        lines = content.split('\n')
+        lines = content.split("\n")
         chunks = []
         current_chunk_lines = []
         current_def = None

@@ -193,15 +208,21 @@ class DocumentChunker:
             def_match = re.match(def_pattern, line)

             # Check if this is a top-level definition (not indented)
-            if def_match and not line.startswith((' ', '\t')) and not in_multiline_string:
+            if (
+                def_match
+                and not line.startswith((" ", "\t"))
+                and not in_multiline_string
+            ):
                 # Save current chunk
                 if current_chunk_lines:
-                    chunk_content = '\n'.join(current_chunk_lines).strip()
+                    chunk_content = "\n".join(current_chunk_lines).strip()
                     if chunk_content:
-                        chunks.append({
-                            "content": chunk_content,
-                            "metadata": {"definition": current_def or "module"}
-                        })
+                        chunks.append(
+                            {
+                                "content": chunk_content,
+                                "metadata": {"definition": current_def or "module"},
+                            }
+                        )

                 current_def = f"{def_match.group(1)} {def_match.group(2)}"
                 current_chunk_lines = [line]

@@ -209,22 +230,26 @@ class DocumentChunker:
                 current_chunk_lines.append(line)

                 # Check max chunk size
-                if len('\n'.join(current_chunk_lines)) >= self._config.max_chunk_size:
-                    chunks.append({
-                        "content": '\n'.join(current_chunk_lines).strip(),
-                        "metadata": {"definition": current_def or "module"}
-                    })
+                if len("\n".join(current_chunk_lines)) >= self._config.max_chunk_size:
+                    chunks.append(
+                        {
+                            "content": "\n".join(current_chunk_lines).strip(),
+                            "metadata": {"definition": current_def or "module"},
+                        }
+                    )
                     overlap_lines = self._get_overlap_lines(current_chunk_lines)
                     current_chunk_lines = overlap_lines

         # Save final chunk
         if current_chunk_lines:
-            chunk_content = '\n'.join(current_chunk_lines).strip()
+            chunk_content = "\n".join(current_chunk_lines).strip()
             if chunk_content:
-                chunks.append({
-                    "content": chunk_content,
-                    "metadata": {"definition": current_def or "module"}
-                })
+                chunks.append(
+                    {
+                        "content": chunk_content,
+                        "metadata": {"definition": current_def or "module"},
+                    }
+                )

         return chunks

@@ -251,10 +276,12 @@ class DocumentChunker:
         if end >= len(content):
             chunk_content = content[start:].strip()
             if chunk_content:
-                chunks.append({
-                    "content": chunk_content,
-                    "metadata": {"chunk_index": chunk_index}
-                })
+                chunks.append(
+                    {
+                        "content": chunk_content,
+                        "metadata": {"chunk_index": chunk_index},
+                    }
+                )
             break

         # Try to find a good break point

@@ -263,10 +290,9 @@ class DocumentChunker:

         chunk_content = content[start:end].strip()
         if chunk_content:
-            chunks.append({
-                "content": chunk_content,
-                "metadata": {"chunk_index": chunk_index}
-            })
+            chunks.append(
+                {"content": chunk_content, "metadata": {"chunk_index": chunk_index}}
+            )

         # Move start with overlap
         start = max(end - overlap, start + 1)

@@ -288,12 +314,12 @@ class DocumentChunker:
         search_start = start + (end - start) // 2  # Search in latter half

         # Try paragraph break (double newline)
-        para_break = content.rfind('\n\n', search_start, end)
+        para_break = content.rfind("\n\n", search_start, end)
         if para_break > search_start:
            return para_break + 2

         # Try sentence break (. or ! or ? followed by space or newline)
-        sentence_pattern = r'[.!?]\s'
+        sentence_pattern = r"[.!?]\s"
         for match in re.finditer(sentence_pattern, content[search_start:end]):
             last_match_end = search_start + match.end()
         else:

@@ -301,15 +327,15 @@ class DocumentChunker:

         # Find last sentence break
         for i in range(end - 1, search_start, -1):
-            if i + 1 < len(content) and content[i] in '.!?' and content[i + 1] in ' \n':
+            if i + 1 < len(content) and content[i] in ".!?" and content[i + 1] in " \n":
                 return i + 1

         # Try word break (space or newline)
-        space_break = content.rfind(' ', search_start, end)
+        space_break = content.rfind(" ", search_start, end)
         if space_break > search_start:
             return space_break + 1

-        newline_break = content.rfind('\n', search_start, end)
+        newline_break = content.rfind("\n", search_start, end)
         if newline_break > search_start:
             return newline_break + 1

@@ -333,7 +359,7 @@
 def chunk_file(
     file_path: Path,
     config: Optional["ChunkingConfig"] = None,
-    base_metadata: Optional[Dict[str, Any]] = None
+    base_metadata: Optional[Dict[str, Any]] = None,
 ) -> List[Dict[str, Any]]:
     """
     Convenience function to chunk a file directly.
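
Every chunking.py hunk is a Black-style reflow: quote normalization, trailing commas, and re-wrapped dict literals; the chunking logic itself is unchanged. The chunk shape visible throughout is {"content": ..., "metadata": {...}}, where markdown input gets a "section" breadcrumb and Python source a "definition" label. A usage sketch against that interface; the file_type value "markdown" is an assumption, since only the parameter name appears in the diff:

from hdsp_agent_core.knowledge.chunking import DocumentChunker

chunker = DocumentChunker()  # falls back to the default ChunkingConfig
doc = "# Guide\n\nIntro text.\n\n## Usage\n\nDetails here."
chunks = chunker.chunk_document(
    doc, metadata={"source": "guide.md"}, file_type="markdown"
)
for chunk in chunks:
    # Base metadata is merged with per-chunk metadata, e.g.
    # {"source": "guide.md", "section": "Guide > Usage"}
    print(chunk["metadata"], len(chunk["content"]))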
hdsp_agent_core/knowledge/loader.py
CHANGED

@@ -5,19 +5,19 @@ Keyword matching + regex based library detection for loading appropriate API guides
 (No LLM calls - saves tokens and improves reliability)
 """

-from pathlib import Path
-from typing import List, Dict, Optional, Set
 import re
+from pathlib import Path
+from typing import Dict, List, Optional, Set

 # Library descriptions (reference)
 LIBRARY_DESCRIPTIONS: Dict[str, str] = {
-    'matplotlib': 'Visualization, graphs, charts, plot, histogram, scatter plot, EDA, data visualization, used with seaborn',
-    'dask': 'Large-scale data processing, pandas replacement, distributed processing, lazy evaluation, dd.read_csv',
-    'polars': 'High-performance DataFrame, pandas replacement, Rust-based, pl.read_csv',
-    'pyspark': 'Spark-based distributed processing, big data, SparkSession',
-    'vaex': 'Large-scale data exploration, out-of-core processing',
-    'modin': 'pandas acceleration, parallel processing',
-    'ray': 'Distributed computing, parallel processing framework',
+    "matplotlib": "Visualization, graphs, charts, plot, histogram, scatter plot, EDA, data visualization, used with seaborn",
+    "dask": "Large-scale data processing, pandas replacement, distributed processing, lazy evaluation, dd.read_csv",
+    "polars": "High-performance DataFrame, pandas replacement, Rust-based, pl.read_csv",
+    "pyspark": "Spark-based distributed processing, big data, SparkSession",
+    "vaex": "Large-scale data exploration, out-of-core processing",
+    "modin": "pandas acceleration, parallel processing",
+    "ray": "Distributed computing, parallel processing framework",
 }


@@ -29,90 +29,90 @@ class LibraryDetector:

     # Explicit library mention patterns (highest priority)
     EXPLICIT_PATTERNS: Dict[str, str] = {
-        r'\bdask\b': 'dask',
-        r'\bpolars\b': 'polars',
-        r'\bpyspark\b': 'pyspark',
-        r'\bvaex\b': 'vaex',
-        r'\bmodin\b': 'modin',
-        r'\bray\b': 'ray',
-        r'\bmatplotlib\b': 'matplotlib',
-        r'\bseaborn\b': 'matplotlib',  # seaborn -> matplotlib guide
-        r'\bplt\.': 'matplotlib',
-        r'\bdd\.read': 'dask',
-        r'\bpl\.read': 'polars',
-        r'\bpl\.DataFrame': 'polars',
+        r"\bdask\b": "dask",
+        r"\bpolars\b": "polars",
+        r"\bpyspark\b": "pyspark",
+        r"\bvaex\b": "vaex",
+        r"\bmodin\b": "modin",
+        r"\bray\b": "ray",
+        r"\bmatplotlib\b": "matplotlib",
+        r"\bseaborn\b": "matplotlib",  # seaborn -> matplotlib guide
+        r"\bplt\.": "matplotlib",
+        r"\bdd\.read": "dask",
+        r"\bpl\.read": "polars",
+        r"\bpl\.DataFrame": "polars",
     }

     # Keyword scores per library (0.0 ~ 1.0)
     KEYWORD_SCORES: Dict[str, Dict[str, float]] = {
-        'dask': {
-            '대용량': 0.7,
-            'big data': 0.7,
-            'bigdata': 0.7,
-            '빅데이터': 0.7,
-            'lazy': 0.8,
-            'lazy evaluation': 0.9,
-            'out-of-core': 0.9,
-            'out of core': 0.9,
-            '분산 처리': 0.6,
-            'distributed': 0.6,
-            'parallel dataframe': 0.8,
-            '병렬 데이터프레임': 0.8,
+        "dask": {
+            "대용량": 0.7,
+            "big data": 0.7,
+            "bigdata": 0.7,
+            "빅데이터": 0.7,
+            "lazy": 0.8,
+            "lazy evaluation": 0.9,
+            "out-of-core": 0.9,
+            "out of core": 0.9,
+            "분산 처리": 0.6,
+            "distributed": 0.6,
+            "parallel dataframe": 0.8,
+            "병렬 데이터프레임": 0.8,
         },
-        'polars': {
-            'rust 기반': 0.9,
-            'rust-based': 0.9,
-            'fast dataframe': 0.7,
-            '고성능 dataframe': 0.7,
-            '빠른 데이터프레임': 0.7,
+        "polars": {
+            "rust 기반": 0.9,
+            "rust-based": 0.9,
+            "fast dataframe": 0.7,
+            "고성능 dataframe": 0.7,
+            "빠른 데이터프레임": 0.7,
         },
-        'matplotlib': {
-            '시각화': 0.7,
-            'visualization': 0.7,
-            'visualize': 0.7,
-            'plot': 0.7,
-            'chart': 0.7,
-            'graph': 0.6,
-            '그래프': 0.6,
-            '차트': 0.7,
-            'histogram': 0.8,
-            '히스토그램': 0.8,
-            'scatter': 0.8,
-            '산점도': 0.8,
-            'line plot': 0.8,
-            '라인 플롯': 0.8,
-            'bar chart': 0.8,
-            '막대 그래프': 0.8,
-            'eda': 0.5,
-            '탐색적 데이터 분석': 0.6,
-            'figure': 0.5,
-            'subplot': 0.8,
-            'heatmap': 0.7,
-            '히트맵': 0.7,
+        "matplotlib": {
+            "시각화": 0.7,
+            "visualization": 0.7,
+            "visualize": 0.7,
+            "plot": 0.7,
+            "chart": 0.7,
+            "graph": 0.6,
+            "그래프": 0.6,
+            "차트": 0.7,
+            "histogram": 0.8,
+            "히스토그램": 0.8,
+            "scatter": 0.8,
+            "산점도": 0.8,
+            "line plot": 0.8,
+            "라인 플롯": 0.8,
+            "bar chart": 0.8,
+            "막대 그래프": 0.8,
+            "eda": 0.5,
+            "탐색적 데이터 분석": 0.6,
+            "figure": 0.5,
+            "subplot": 0.8,
+            "heatmap": 0.7,
+            "히트맵": 0.7,
         },
-        'pyspark': {
-            'spark': 0.9,
-            'sparksession': 0.95,
-            'spark session': 0.95,
-            'rdd': 0.9,
-            'hadoop': 0.7,
-            '클러스터': 0.6,
-            'cluster': 0.6,
+        "pyspark": {
+            "spark": 0.9,
+            "sparksession": 0.95,
+            "spark session": 0.95,
+            "rdd": 0.9,
+            "hadoop": 0.7,
+            "클러스터": 0.6,
+            "cluster": 0.6,
         },
-        'vaex': {
-            'vaex': 1.0,
-            'memory mapping': 0.8,
-            '메모리 매핑': 0.8,
+        "vaex": {
+            "vaex": 1.0,
+            "memory mapping": 0.8,
+            "메모리 매핑": 0.8,
        },
-        'modin': {
-            'modin': 1.0,
-            'pandas 가속': 0.8,
-            'pandas acceleration': 0.8,
+        "modin": {
+            "modin": 1.0,
+            "pandas 가속": 0.8,
+            "pandas acceleration": 0.8,
         },
-        'ray': {
-            'ray': 0.9,
-            '분산 컴퓨팅': 0.7,
-            'distributed computing': 0.7,
+        "ray": {
+            "ray": 0.9,
+            "분산 컴퓨팅": 0.7,
+            "distributed computing": 0.7,
         },
     }

@@ -123,7 +123,7 @@ class LibraryDetector:
         self,
         request: str,
         available_libraries: List[str],
-        imported_libraries: List[str] = None
+        imported_libraries: List[str] = None,
     ) -> List[str]:
         """
         Detect required libraries from user request.

@@ -141,7 +141,9 @@ class LibraryDetector:

         # Step 1: Explicit pattern matching (highest priority)
         for pattern, lib in self.EXPLICIT_PATTERNS.items():
-            if lib in available_libraries and re.search(pattern, request, re.IGNORECASE):
+            if lib in available_libraries and re.search(
+                pattern, request, re.IGNORECASE
+            ):
                 detected.add(lib)

         # Step 2: Keyword scoring

@@ -162,8 +164,8 @@ class LibraryDetector:
         for lib in imported_libraries:
             lib_lower = lib.lower()
             # seaborn -> matplotlib
-            if lib_lower == 'seaborn' and 'matplotlib' in available_libraries:
-                detected.add('matplotlib')
+            if lib_lower == "seaborn" and "matplotlib" in available_libraries:
+                detected.add("matplotlib")
             elif lib_lower in available_libraries:
                 detected.add(lib_lower)

@@ -183,7 +185,7 @@ def get_library_detector() -> LibraryDetector:


 # LLM library detection prompt
-LIBRARY_DETECTION_PROMPT = '''Analyze the user's request and determine which libraries to use for code generation.
+LIBRARY_DETECTION_PROMPT = """Analyze the user's request and determine which libraries to use for code generation.

 ## Available Library API Guides:
 {library_list}

@@ -205,7 +207,7 @@ LIBRARY_DETECTION_PROMPT = '''Analyze the user's request and determine which libraries to use for code generation.
 {{"libraries": ["library1", "library2"]}}

 Empty array is also valid: {{"libraries": []}}
-'''
+"""


 class KnowledgeBase:

@@ -216,7 +218,7 @@ class KnowledgeBase:
             self.knowledge_dir = Path(knowledge_dir)
         else:
             # Default path: knowledge/libraries
-            self.knowledge_dir = Path(__file__).parent / 'libraries'
+            self.knowledge_dir = Path(__file__).parent / "libraries"

         self._cache: Dict[str, str] = {}

@@ -225,19 +227,19 @@ class KnowledgeBase:
         available = self.list_available_libraries()
         lines = []
         for lib in available:
-            desc = LIBRARY_DESCRIPTIONS.get(lib, 'Other library')
+            desc = LIBRARY_DESCRIPTIONS.get(lib, "Other library")
             lines.append(f"- **{lib}**: {desc}")
         return "\n".join(lines)

-    def get_detection_prompt(self, request: str, imported_libraries: List[str] = None) -> str:
+    def get_detection_prompt(
+        self, request: str, imported_libraries: List[str] = None
+    ) -> str:
         """Generate LLM library detection prompt"""
         library_list = self.get_library_list_for_prompt()
         imported = ", ".join(imported_libraries) if imported_libraries else "None"

         return LIBRARY_DETECTION_PROMPT.format(
-            library_list=library_list,
-            request=request,
-            imported_libraries=imported
+            library_list=library_list, request=request, imported_libraries=imported
         )

     def load_library_guide(self, library: str) -> Optional[str]:

@@ -255,9 +257,9 @@ class KnowledgeBase:
             return self._cache[library]

         # Load file
-        file_path = self.knowledge_dir / f'{library}.md'
+        file_path = self.knowledge_dir / f"{library}.md"
         if file_path.exists():
-            content = file_path.read_text(encoding='utf-8')
+            content = file_path.read_text(encoding="utf-8")
             self._cache[library] = content
             return content

@@ -274,7 +276,7 @@ class KnowledgeBase:
             Combined guide string
         """
         if not libraries:
-            return ''
+            return ""

         guides = []
         for lib in sorted(libraries):

@@ -283,7 +285,7 @@ class KnowledgeBase:
             guides.append(f"## {lib.upper()} Library API Guide\n\n{guide}")

         if not guides:
-            return ''
+            return ""

         return "\n\n---\n\n".join(guides)

@@ -300,7 +302,7 @@ class KnowledgeBase:
         knowledge = self.load_libraries_knowledge(libraries)

         if not knowledge:
-            return ''
+            return ""

         return f"""
 ## 📚 Library API Reference (MUST follow!)

@@ -317,7 +319,7 @@ Follow the API usage in the guides below. Avoid ❌ incorrect code and use ✅ correct code.
         if not self.knowledge_dir.exists():
             return []

-        return [f.stem for f in self.knowledge_dir.glob('*.md')]
+        return [f.stem for f in self.knowledge_dir.glob("*.md")]


 # Singleton instance
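
loader.py gets the same treatment: isort-ordered imports, single quotes normalized to double, and long signatures and conditionals wrapped; the detection patterns, keyword weights, and prompt template are unchanged apart from quoting. A usage sketch built only from the KnowledgeBase methods visible in the hunks above (the no-argument constructor and the default knowledge/libraries path are read off the else branch in __init__, so treat them as assumptions):

from hdsp_agent_core.knowledge.loader import KnowledgeBase

kb = KnowledgeBase()  # assumed default: the knowledge/libraries dir next to loader.py
print(kb.list_available_libraries())         # *.md stems, e.g. ["dask", "matplotlib"]
guide = kb.load_library_guide("matplotlib")  # cached guide text, or None if missing
prompt = kb.get_detection_prompt(
    "draw a scatter plot of the dataframe", imported_libraries=["seaborn"]
)
print(prompt.splitlines()[0])  # first line of LIBRARY_DETECTION_PROMPT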