claude-code-workflow 6.3.19 → 6.3.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. package/.claude/agents/issue-plan-agent.md +31 -2
  2. package/.claude/commands/issue/new.md +92 -2
  3. package/.claude/commands/issue/plan.md +3 -2
  4. package/.codex/prompts/issue-execute.md +5 -0
  5. package/ccw/dist/core/routes/litellm-api-routes.d.ts.map +1 -1
  6. package/ccw/dist/core/routes/litellm-api-routes.js +8 -0
  7. package/ccw/dist/core/routes/litellm-api-routes.js.map +1 -1
  8. package/ccw/dist/core/server.d.ts.map +1 -1
  9. package/ccw/dist/core/server.js +5 -0
  10. package/ccw/dist/core/server.js.map +1 -1
  11. package/ccw/dist/core/services/api-key-tester.d.ts +11 -0
  12. package/ccw/dist/core/services/api-key-tester.d.ts.map +1 -1
  13. package/ccw/dist/core/services/api-key-tester.js +30 -10
  14. package/ccw/dist/core/services/api-key-tester.js.map +1 -1
  15. package/ccw/dist/core/services/health-check-service.d.ts +6 -0
  16. package/ccw/dist/core/services/health-check-service.d.ts.map +1 -1
  17. package/ccw/dist/core/services/health-check-service.js +22 -0
  18. package/ccw/dist/core/services/health-check-service.js.map +1 -1
  19. package/ccw/src/core/routes/litellm-api-routes.ts +8 -0
  20. package/ccw/src/core/server.ts +6 -0
  21. package/ccw/src/core/services/api-key-tester.ts +33 -10
  22. package/ccw/src/core/services/health-check-service.ts +26 -0
  23. package/ccw/src/templates/dashboard-js/i18n.js +10 -0
  24. package/ccw/src/templates/dashboard-js/views/codexlens-manager.js +10 -0
  25. package/codex-lens/src/codexlens/__pycache__/config.cpython-312.pyc +0 -0
  26. package/codex-lens/src/codexlens/__pycache__/config.cpython-313.pyc +0 -0
  27. package/codex-lens/src/codexlens/__pycache__/env_config.cpython-312.pyc +0 -0
  28. package/codex-lens/src/codexlens/__pycache__/env_config.cpython-313.pyc +0 -0
  29. package/codex-lens/src/codexlens/cli/__pycache__/embedding_manager.cpython-312.pyc +0 -0
  30. package/codex-lens/src/codexlens/cli/__pycache__/embedding_manager.cpython-313.pyc +0 -0
  31. package/codex-lens/src/codexlens/cli/embedding_manager.py +13 -4
  32. package/codex-lens/src/codexlens/config.py +35 -0
  33. package/codex-lens/src/codexlens/env_config.py +6 -0
  34. package/codex-lens/src/codexlens/search/__pycache__/chain_search.cpython-312.pyc +0 -0
  35. package/codex-lens/src/codexlens/search/__pycache__/chain_search.cpython-313.pyc +0 -0
  36. package/codex-lens/src/codexlens/search/__pycache__/ranking.cpython-312.pyc +0 -0
  37. package/codex-lens/src/codexlens/search/__pycache__/ranking.cpython-313.pyc +0 -0
  38. package/codex-lens/src/codexlens/search/chain_search.py +10 -0
  39. package/codex-lens/src/codexlens/search/ranking.py +50 -0
  40. package/codex-lens/src/codexlens/semantic/__pycache__/chunker.cpython-313.pyc +0 -0
  41. package/codex-lens/src/codexlens/semantic/chunker.py +328 -23
  42. package/codex-lens/src/codexlens/semantic/reranker/__pycache__/__init__.cpython-312.pyc +0 -0
  43. package/codex-lens/src/codexlens/semantic/reranker/__pycache__/api_reranker.cpython-312.pyc +0 -0
  44. package/codex-lens/src/codexlens/semantic/reranker/__pycache__/base.cpython-312.pyc +0 -0
  45. package/codex-lens/src/codexlens/semantic/reranker/__pycache__/factory.cpython-312.pyc +0 -0
  46. package/codex-lens/src/codexlens/semantic/reranker/__pycache__/fastembed_reranker.cpython-312.pyc +0 -0
  47. package/codex-lens/src/codexlens/semantic/reranker/__pycache__/legacy.cpython-312.pyc +0 -0
  48. package/codex-lens/src/codexlens/semantic/reranker/__pycache__/onnx_reranker.cpython-312.pyc +0 -0
  49. package/codex-lens/src/codexlens/storage/__pycache__/index_tree.cpython-313.pyc +0 -0
  50. package/codex-lens/src/codexlens/storage/index_tree.py +46 -2
  51. package/package.json +1 -1
@@ -43,6 +43,250 @@ class ChunkConfig:
43
43
  strategy: str = "auto" # Chunking strategy: auto, symbol, sliding_window, hybrid
44
44
  min_chunk_size: int = 50 # Minimum chunk size
45
45
  skip_token_count: bool = False # Skip expensive token counting (use char/4 estimate)
46
+ strip_comments: bool = True # Remove comments from chunk content for embedding
47
+ strip_docstrings: bool = True # Remove docstrings from chunk content for embedding
48
+ preserve_original: bool = True # Store original content in metadata when stripping
49
+
50
+
51
class CommentStripper:
    """Remove comments from source code while preserving structure.

    Stateless utility: all entry points are static/class methods, so callers
    may use the class without instantiating it. The scanners are lightweight
    character scans, not real tokenizers.
    """

    @staticmethod
    def _is_escaped(text: str, index: int) -> bool:
        """Return True if text[index] is escaped, i.e. preceded by an odd run of backslashes."""
        backslashes = 0
        j = index - 1
        while j >= 0 and text[j] == '\\':
            backslashes += 1
            j -= 1
        return backslashes % 2 == 1

    @staticmethod
    def strip_python_comments(content: str) -> str:
        """Strip Python comments (# style) but preserve docstrings.

        String literals are tracked so a '#' inside a string is never treated
        as a comment; triple-quoted strings may span lines.

        Args:
            content: Python source code

        Returns:
            Code with comments removed
        """
        lines = content.splitlines(keepends=True)
        result_lines: List[str] = []
        in_string = False
        string_char = None  # opening delimiter: ', ", ''' or three double quotes

        for line in lines:
            new_line = []
            i = 0
            while i < len(line):
                char = line[i]

                # Handle string literals
                if char in ('"', "'") and not in_string:
                    # Check for triple quotes first (longest match wins)
                    if line[i:i + 3] in ('"""', "'''"):
                        in_string = True
                        string_char = line[i:i + 3]
                        new_line.append(line[i:i + 3])
                        i += 3
                        continue
                    else:
                        in_string = True
                        string_char = char
                elif in_string:
                    if string_char and len(string_char) == 3:
                        if line[i:i + 3] == string_char:
                            in_string = False
                            new_line.append(line[i:i + 3])
                            i += 3
                            string_char = None
                            continue
                    elif char == string_char:
                        # Fix: count the whole backslash run. Checking only the
                        # previous character mis-read an escaped backslash
                        # ("\\") as escaping the closing quote.
                        if not CommentStripper._is_escaped(line, i):
                            in_string = False
                            string_char = None

                # Handle comments (only outside strings)
                if char == '#' and not in_string:
                    # Rest of line is comment; keep the newline if present
                    new_line.append('\n' if line.endswith('\n') else '')
                    break

                new_line.append(char)
                i += 1

            result_lines.append(''.join(new_line))

        return ''.join(result_lines)

    @staticmethod
    def strip_c_style_comments(content: str) -> str:
        """Strip C-style comments (// and /* */) from code.

        String literals (including backtick template literals) are tracked so
        comment markers inside strings are preserved.

        Args:
            content: Source code with C-style comments

        Returns:
            Code with comments removed
        """
        result = []
        i = 0
        in_string = False
        string_char = None
        in_multiline_comment = False

        while i < len(content):
            # Handle multi-line comment end
            if in_multiline_comment:
                if content[i:i + 2] == '*/':
                    in_multiline_comment = False
                    i += 2
                    continue
                i += 1
                continue

            char = content[i]

            # Handle string literals
            if char in ('"', "'", '`') and not in_string:
                in_string = True
                string_char = char
                result.append(char)
                i += 1
                continue
            elif in_string:
                result.append(char)
                # Fix: full backslash-run escape check (see _is_escaped); the
                # old single-char check broke on strings ending in "\\".
                if char == string_char and not CommentStripper._is_escaped(content, i):
                    in_string = False
                    string_char = None
                i += 1
                continue

            # Single-line comment: skip to end of line, keeping the newline
            if content[i:i + 2] == '//':
                while i < len(content) and content[i] != '\n':
                    i += 1
                if i < len(content):
                    result.append('\n')
                    i += 1
                continue

            if content[i:i + 2] == '/*':
                in_multiline_comment = True
                i += 2
                continue

            result.append(char)
            i += 1

        return ''.join(result)

    @classmethod
    def strip_comments(cls, content: str, language: str) -> str:
        """Strip comments based on language.

        Args:
            content: Source code content
            language: Programming language

        Returns:
            Code with comments removed; unknown languages pass through unchanged
        """
        if language == "python":
            return cls.strip_python_comments(content)
        elif language in {"javascript", "typescript", "java", "c", "cpp", "go", "rust"}:
            return cls.strip_c_style_comments(content)
        return content
193
+
194
+
195
class DocstringStripper:
    """Remove docstrings from source code."""

    @staticmethod
    def strip_python_docstrings(content: str) -> str:
        """Strip Python docstrings (triple-quoted strings at statement position).

        NOTE(review): this is a heuristic line scan — any line that *starts*
        with a triple quote is treated as a docstring opener, which also
        removes statement-level triple-quoted strings that are not true
        docstrings. Acceptable for embedding cleanup; confirm if reused.

        Args:
            content: Python source code

        Returns:
            Code with docstrings removed
        """
        lines = content.splitlines(keepends=True)
        result_lines: List[str] = []
        i = 0

        while i < len(lines):
            line = lines[i]
            stripped = line.strip()

            # Check for docstring start
            if stripped.startswith('"""') or stripped.startswith("'''"):
                quote_type = '"""' if stripped.startswith('"""') else "'''"

                # Single-line docstring: opener and closer on the same line
                if stripped.count(quote_type) >= 2:
                    i += 1
                    continue

                # Multi-line docstring: skip until the closing delimiter
                i += 1
                while i < len(lines):
                    if quote_type in lines[i]:
                        i += 1
                        break
                    i += 1
                continue

            result_lines.append(line)
            i += 1

        return ''.join(result_lines)

    @staticmethod
    def strip_jsdoc_comments(content: str) -> str:
        """Strip JSDoc comments (/** ... */) from code.

        Fix: string literals (", ', and ` template literals) are now tracked,
        so a "/**" sequence inside a string literal is no longer stripped.

        Args:
            content: JavaScript/TypeScript source code

        Returns:
            Code with JSDoc comments removed
        """
        result = []
        i = 0
        in_jsdoc = False
        in_string = False
        string_char = None

        while i < len(content):
            if in_jsdoc:
                if content[i:i + 2] == '*/':
                    in_jsdoc = False
                    i += 2
                    continue
                i += 1
                continue

            char = content[i]

            # Inside a string literal: copy verbatim until the closing quote
            if in_string:
                result.append(char)
                if char == string_char and (i == 0 or content[i - 1] != '\\'):
                    in_string = False
                    string_char = None
                i += 1
                continue

            if char in ('"', "'", '`'):
                in_string = True
                string_char = char
                result.append(char)
                i += 1
                continue

            # Check for JSDoc start (/** — a plain /* is left alone here)
            if content[i:i + 3] == '/**':
                in_jsdoc = True
                i += 3
                continue

            result.append(char)
            i += 1

        return ''.join(result)

    @classmethod
    def strip_docstrings(cls, content: str, language: str) -> str:
        """Strip docstrings based on language.

        Args:
            content: Source code content
            language: Programming language

        Returns:
            Code with docstrings removed; unknown languages pass through unchanged
        """
        if language == "python":
            return cls.strip_python_docstrings(content)
        elif language in {"javascript", "typescript"}:
            return cls.strip_jsdoc_comments(content)
        return content
46
290
 
47
291
 
48
292
  class Chunker:
@@ -51,6 +295,33 @@ class Chunker:
51
295
  def __init__(self, config: ChunkConfig | None = None) -> None:
52
296
  self.config = config or ChunkConfig()
53
297
  self._tokenizer = get_default_tokenizer()
298
+ self._comment_stripper = CommentStripper()
299
+ self._docstring_stripper = DocstringStripper()
300
+
301
+ def _process_content(self, content: str, language: str) -> Tuple[str, Optional[str]]:
302
+ """Process chunk content by stripping comments/docstrings if configured.
303
+
304
+ Args:
305
+ content: Original chunk content
306
+ language: Programming language
307
+
308
+ Returns:
309
+ Tuple of (processed_content, original_content_if_preserved)
310
+ """
311
+ original = content if self.config.preserve_original else None
312
+ processed = content
313
+
314
+ if self.config.strip_comments:
315
+ processed = self._comment_stripper.strip_comments(processed, language)
316
+
317
+ if self.config.strip_docstrings:
318
+ processed = self._docstring_stripper.strip_docstrings(processed, language)
319
+
320
+ # If nothing changed, don't store original
321
+ if processed == content:
322
+ original = None
323
+
324
+ return processed, original
54
325
 
55
326
  def _estimate_token_count(self, text: str) -> int:
56
327
  """Estimate token count based on config.
@@ -120,30 +391,45 @@ class Chunker:
120
391
  sub_chunk.metadata["symbol_name"] = symbol.name
121
392
  sub_chunk.metadata["symbol_kind"] = symbol.kind
122
393
  sub_chunk.metadata["strategy"] = "symbol_split"
394
+ sub_chunk.metadata["chunk_type"] = "code"
123
395
  sub_chunk.metadata["parent_symbol_range"] = (start_line, end_line)
124
396
 
125
397
  chunks.extend(sub_chunks)
126
398
  else:
399
+ # Process content (strip comments/docstrings if configured)
400
+ processed_content, original_content = self._process_content(chunk_content, language)
401
+
402
+ # Skip if processed content is too small
403
+ if len(processed_content.strip()) < self.config.min_chunk_size:
404
+ continue
405
+
127
406
  # Calculate token count if not provided
128
407
  token_count = None
129
408
  if symbol_token_counts and symbol.name in symbol_token_counts:
130
409
  token_count = symbol_token_counts[symbol.name]
131
410
  else:
132
- token_count = self._estimate_token_count(chunk_content)
411
+ token_count = self._estimate_token_count(processed_content)
412
+
413
+ metadata = {
414
+ "file": str(file_path),
415
+ "language": language,
416
+ "symbol_name": symbol.name,
417
+ "symbol_kind": symbol.kind,
418
+ "start_line": start_line,
419
+ "end_line": end_line,
420
+ "strategy": "symbol",
421
+ "chunk_type": "code",
422
+ "token_count": token_count,
423
+ }
424
+
425
+ # Store original content if it was modified
426
+ if original_content is not None:
427
+ metadata["original_content"] = original_content
133
428
 
134
429
  chunks.append(SemanticChunk(
135
- content=chunk_content,
430
+ content=processed_content,
136
431
  embedding=None,
137
- metadata={
138
- "file": str(file_path),
139
- "language": language,
140
- "symbol_name": symbol.name,
141
- "symbol_kind": symbol.kind,
142
- "start_line": start_line,
143
- "end_line": end_line,
144
- "strategy": "symbol",
145
- "token_count": token_count,
146
- }
432
+ metadata=metadata
147
433
  ))
148
434
 
149
435
  return chunks
@@ -188,7 +474,19 @@ class Chunker:
188
474
  chunk_content = "".join(lines[start:end])
189
475
 
190
476
  if len(chunk_content.strip()) >= self.config.min_chunk_size:
191
- token_count = self._estimate_token_count(chunk_content)
477
+ # Process content (strip comments/docstrings if configured)
478
+ processed_content, original_content = self._process_content(chunk_content, language)
479
+
480
+ # Skip if processed content is too small
481
+ if len(processed_content.strip()) < self.config.min_chunk_size:
482
+ # Move window forward
483
+ step = lines_per_chunk - overlap_lines
484
+ if step <= 0:
485
+ step = 1
486
+ start += step
487
+ continue
488
+
489
+ token_count = self._estimate_token_count(processed_content)
192
490
 
193
491
  # Calculate correct line numbers
194
492
  if line_mapping:
@@ -200,18 +498,25 @@ class Chunker:
200
498
  start_line = start + 1
201
499
  end_line = end
202
500
 
501
+ metadata = {
502
+ "file": str(file_path),
503
+ "language": language,
504
+ "chunk_index": chunk_idx,
505
+ "start_line": start_line,
506
+ "end_line": end_line,
507
+ "strategy": "sliding_window",
508
+ "chunk_type": "code",
509
+ "token_count": token_count,
510
+ }
511
+
512
+ # Store original content if it was modified
513
+ if original_content is not None:
514
+ metadata["original_content"] = original_content
515
+
203
516
  chunks.append(SemanticChunk(
204
- content=chunk_content,
517
+ content=processed_content,
205
518
  embedding=None,
206
- metadata={
207
- "file": str(file_path),
208
- "language": language,
209
- "chunk_index": chunk_idx,
210
- "start_line": start_line,
211
- "end_line": end_line,
212
- "strategy": "sliding_window",
213
- "token_count": token_count,
214
- }
519
+ metadata=metadata
215
520
  ))
216
521
  chunk_idx += 1
217
522
 
@@ -412,7 +412,8 @@ class IndexTreeBuilder:
412
412
  A directory is indexed if:
413
413
  1. It's not in IGNORE_DIRS
414
414
  2. It doesn't start with '.'
415
- 3. It contains at least one supported language file
415
+ 3. It contains at least one supported language file, OR
416
+ 4. It has subdirectories that contain supported files (transitive)
416
417
 
417
418
  Args:
418
419
  dir_path: Directory to check
@@ -427,7 +428,50 @@ class IndexTreeBuilder:
427
428
 
428
429
  # Check for supported files in this directory
429
430
  source_files = self._iter_source_files(dir_path, languages)
430
- return len(source_files) > 0
431
+ if len(source_files) > 0:
432
+ return True
433
+
434
+ # Check if any subdirectory has indexable files (transitive)
435
+ # This handles cases like 'src' which has no direct files but has 'src/codexlens'
436
+ for item in dir_path.iterdir():
437
+ if not item.is_dir():
438
+ continue
439
+ if item.name in self.IGNORE_DIRS or item.name.startswith("."):
440
+ continue
441
+ # Recursively check subdirectories
442
+ if self._has_indexable_files_recursive(item, languages):
443
+ return True
444
+
445
+ return False
446
+
447
+ def _has_indexable_files_recursive(self, dir_path: Path, languages: List[str] = None) -> bool:
448
+ """Check if directory or any subdirectory has indexable files.
449
+
450
+ Args:
451
+ dir_path: Directory to check
452
+ languages: Optional language filter
453
+
454
+ Returns:
455
+ True if directory tree contains indexable files
456
+ """
457
+ # Check for supported files in this directory
458
+ source_files = self._iter_source_files(dir_path, languages)
459
+ if len(source_files) > 0:
460
+ return True
461
+
462
+ # Check subdirectories
463
+ try:
464
+ for item in dir_path.iterdir():
465
+ if not item.is_dir():
466
+ continue
467
+ if item.name in self.IGNORE_DIRS or item.name.startswith("."):
468
+ continue
469
+ if self._has_indexable_files_recursive(item, languages):
470
+ return True
471
+ except PermissionError:
472
+ pass
473
+
474
+ return False
431
475
 
432
476
  def _build_level_parallel(
433
477
  self,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "claude-code-workflow",
3
- "version": "6.3.19",
3
+ "version": "6.3.20",
4
4
  "description": "JSON-driven multi-agent development framework with intelligent CLI orchestration (Gemini/Qwen/Codex), context-first architecture, and automated workflow execution",
5
5
  "type": "module",
6
6
  "main": "ccw/src/index.js",