kopipasta 0.35.0__py3-none-any.whl → 0.37.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kopipasta might be problematic. Click here for more details.

kopipasta/cache.py CHANGED
@@ -6,32 +6,35 @@ from typing import List, Tuple
6
6
  # Define FileTuple for type hinting
7
7
  FileTuple = Tuple[str, bool, List[str] | None, str]
8
8
 
9
+
9
10
def get_cache_file_path() -> Path:
    """Return the cross-platform path of the last-selection cache file.

    Ensures the ~/.cache/kopipasta directory exists before returning.
    """
    cache_directory = Path.home() / ".cache" / "kopipasta"
    cache_directory.mkdir(parents=True, exist_ok=True)
    return cache_directory / "last_selection.json"
14
15
 
16
+
15
17
def save_selection_to_cache(files_to_include: List[FileTuple]):
    """Persist the relative paths of the selected files to the cache file.

    Write failures are reported with a warning and otherwise ignored, so a
    broken cache never aborts the main flow.
    """
    cache_file = get_cache_file_path()
    # Store paths relative to the cwd, in deterministic sorted order.
    relative_paths = sorted(os.path.relpath(entry[0]) for entry in files_to_include)
    try:
        with open(cache_file, "w", encoding="utf-8") as fh:
            json.dump(relative_paths, fh, indent=2)
    except IOError as e:
        print(f"\nWarning: Could not save selection to cache: {e}")
24
26
 
27
+
25
28
def load_selection_from_cache() -> List[str]:
    """Return the previously cached selection, dropping paths that vanished.

    An absent or unreadable cache yields an empty list rather than raising.
    """
    cache_file = get_cache_file_path()
    if not cache_file.exists():
        return []
    try:
        with open(cache_file, "r", encoding="utf-8") as fh:
            cached_paths = json.load(fh)
    except (IOError, json.JSONDecodeError) as e:
        print(f"\nWarning: Could not load previous selection from cache: {e}")
        return []
    # Filter out paths that no longer exist on disk.
    return [p for p in cached_paths if os.path.exists(p)]
kopipasta/file.py CHANGED
@@ -1,14 +1,45 @@
1
1
  import fnmatch
2
2
  import os
3
- from typing import List, Optional, Tuple
3
+ from typing import List, Optional, Tuple, Set
4
4
  from pathlib import Path
5
5
 
6
6
  FileTuple = Tuple[str, bool, Optional[List[str]], str]
7
7
 
8
- # --- Cache for .gitignore patterns ---
9
- # Key: Directory path
10
- # Value: List of patterns
8
+ # --- Caches ---
11
9
  _gitignore_cache: dict[str, list[str]] = {}
10
+ _is_ignored_cache: dict[str, bool] = {}
11
+ _is_binary_cache: dict[str, bool] = {}
12
+
13
+ # --- Known File Extensions for is_binary ---
14
+ # Using sets for O(1) average time complexity lookups
15
+ TEXT_EXTENSIONS = {
16
+ # Code
17
+ ".py", ".js", ".ts", ".jsx", ".tsx", ".java", ".c", ".cpp", ".h", ".hpp",
18
+ ".cs", ".go", ".rs", ".sh", ".bash", ".ps1", ".rb", ".php", ".swift",
19
+ ".kt", ".kts", ".scala", ".pl", ".pm", ".tcl",
20
+ # Markup & Data
21
+ ".html", ".htm", ".xml", ".css", ".scss", ".sass", ".less", ".json",
22
+ ".yaml", ".yml", ".toml", ".ini", ".cfg", ".conf", ".md", ".txt", ".rtf",
23
+ ".csv", ".tsv", ".sql", ".graphql", ".gql",
24
+ # Config & Other
25
+ ".gitignore", ".dockerfile", "dockerfile", ".env", ".properties", ".mdx",
26
+ }
27
+
28
+ BINARY_EXTENSIONS = {
29
+ # Images
30
+ ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".ico", ".webp", ".svg",
31
+ # Audio/Video
32
+ ".mp3", ".wav", ".ogg", ".flac", ".mp4", ".avi", ".mov", ".wmv", ".mkv",
33
+ # Archives
34
+ ".zip", ".rar", ".7z", ".tar", ".gz", ".bz2", ".xz",
35
+ # Documents
36
+ ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx", ".odt",
37
+ # Executables & Compiled
38
+ ".exe", ".dll", ".so", ".dylib", ".class", ".jar", ".pyc", ".pyd", ".whl",
39
+ # Databases & Other
40
+ ".db", ".sqlite", ".sqlite3", ".db-wal", ".db-shm", ".lock",
41
+ ".bak", ".swo", ".swp",
42
+ }
12
43
 
13
44
  def _read_gitignore_patterns(gitignore_path: str) -> list[str]:
14
45
  """Reads patterns from a single .gitignore file and caches them."""
@@ -19,31 +50,88 @@ def _read_gitignore_patterns(gitignore_path: str) -> list[str]:
19
50
  return []
20
51
  patterns = []
21
52
  try:
22
- with open(gitignore_path, 'r', encoding='utf-8') as f:
53
+ with open(gitignore_path, "r", encoding="utf-8") as f:
23
54
  for line in f:
24
55
  stripped_line = line.strip()
25
- if stripped_line and not stripped_line.startswith('#'):
56
+ if stripped_line and not stripped_line.startswith("#"):
26
57
  patterns.append(stripped_line)
27
58
  except IOError:
28
59
  pass
29
60
  _gitignore_cache[gitignore_path] = patterns
30
61
  return patterns
31
62
 
32
def is_ignored(
    path: str, default_ignore_patterns: list[str], project_root: Optional[str] = None
) -> bool:
    """
    Decide whether *path* should be ignored.

    Patterns are split into basename patterns (cheap fnmatch against the
    final path component) and path patterns (matched against every prefix
    of the path relative to the project root). Verdicts are memoized per
    absolute path, and a path inherits an "ignored" verdict from its
    parent directory when one is already cached.
    """
    abs_path = os.path.abspath(path)
    memoized = _is_ignored_cache.get(abs_path)
    if memoized is not None:
        return memoized

    # Inherit a cached "ignored" verdict from the containing directory.
    containing_dir = os.path.dirname(abs_path)
    if containing_dir != abs_path and _is_ignored_cache.get(containing_dir, False):
        _is_ignored_cache[abs_path] = True
        return True

    root = project_root if project_root is not None else os.getcwd()
    abs_root = os.path.abspath(root)

    basename_patterns, path_patterns = get_all_patterns(
        default_ignore_patterns, abs_path, abs_root
    )

    # Cheap check first: basename-only patterns (e.g. "*.log", "__pycache__").
    leaf = os.path.basename(abs_path)
    if any(fnmatch.fnmatch(leaf, pat) for pat in basename_patterns):
        _is_ignored_cache[abs_path] = True
        return True

    try:
        rel = os.path.relpath(abs_path, abs_root)
    except ValueError:
        # e.g. a different drive on Windows -- treat as outside the project.
        _is_ignored_cache[abs_path] = False
        return False

    # Every prefix of the relative path, built once up front so the nested
    # match below does no repeated joins.
    parts = Path(rel).parts
    prefixes = [os.path.join(*parts[: i + 1]) for i in range(len(parts))]
    # Strip trailing slashes from directory patterns a single time.
    normalized = [pat.rstrip("/") for pat in path_patterns]

    verdict = any(
        fnmatch.fnmatch(prefix, pat) for prefix in prefixes for pat in normalized
    )
    _is_ignored_cache[abs_path] = verdict
    return verdict
117
+
118
+ def get_all_patterns(default_ignore_patterns, path_abs, project_root_abs) -> Tuple[Set[str], Set[str]]:
119
+ """
120
+ Gathers all applicable ignore patterns, splitting them into two sets
121
+ for optimized checking: one for basenames, one for full paths.
122
+ """
123
+ basename_patterns = set()
124
+ path_patterns = set()
125
+
126
+ for p in default_ignore_patterns:
127
+ if "/" in p:
128
+ path_patterns.add(p)
129
+ else:
130
+ basename_patterns.add(p)
131
+
132
+ search_start_dir = (
133
+ path_abs if os.path.isdir(path_abs) else os.path.dirname(path_abs)
134
+ )
47
135
 
48
136
  current_dir = search_start_dir
49
137
  while True:
@@ -52,78 +140,86 @@ def is_ignored(path: str, default_ignore_patterns: list[str], project_root: Opti
52
140
 
53
141
  if patterns_from_file:
54
142
  gitignore_dir_rel = os.path.relpath(current_dir, project_root_abs)
55
- if gitignore_dir_rel == '.': gitignore_dir_rel = ''
143
+ if gitignore_dir_rel == ".":
144
+ gitignore_dir_rel = ""
56
145
 
57
146
  for p in patterns_from_file:
58
- # Patterns with a '/' are relative to the .gitignore file's location.
59
- # We construct a new pattern relative to the project root.
60
- if '/' in p:
61
- all_patterns.add(os.path.join(gitignore_dir_rel, p.lstrip('/')))
147
+ if "/" in p:
148
+ # Path patterns are relative to the .gitignore file's location
149
+ path_patterns.add(os.path.join(gitignore_dir_rel, p.lstrip("/")))
62
150
  else:
63
- # Patterns without a '/' (e.g., `*.log`) can match anywhere.
64
- all_patterns.add(p)
151
+ basename_patterns.add(p)
65
152
 
66
- if not current_dir.startswith(project_root_abs) or current_dir == project_root_abs:
153
+ if (
154
+ not current_dir.startswith(project_root_abs)
155
+ or current_dir == project_root_abs
156
+ ):
67
157
  break
68
158
  parent = os.path.dirname(current_dir)
69
- if parent == current_dir: break
159
+ if parent == current_dir:
160
+ break
70
161
  current_dir = parent
162
+ return basename_patterns, path_patterns
71
163
 
72
- # --- Step 2: Check the path and its parents against the patterns ---
73
- try:
74
- path_rel_to_root = os.path.relpath(path_abs, project_root_abs)
75
- except ValueError:
76
- return False # Path is outside the project root
77
-
78
- path_parts = Path(path_rel_to_root).parts
79
-
80
- for pattern in all_patterns:
81
- # Check against basename for simple wildcards (e.g., `*.log`, `__pycache__`)
82
- # This is a primary matching mechanism.
83
- if fnmatch.fnmatch(os.path.basename(path_abs), pattern):
84
- return True
85
-
86
- # Check the full path and its parent directories against the pattern.
87
- # This handles directory ignores (`node_modules/`) and specific path ignores (`src/*.tmp`).
88
- for i in range(len(path_parts)):
89
- current_check_path = os.path.join(*path_parts[:i+1])
90
-
91
- # Handle directory patterns like `node_modules/`
92
- if pattern.endswith('/'):
93
- if fnmatch.fnmatch(current_check_path, pattern.rstrip('/')):
94
- return True
95
- # Handle full path patterns
96
- else:
97
- if fnmatch.fnmatch(current_check_path, pattern):
98
- return True
99
-
100
- return False
101
164
 
102
165
def read_file_contents(file_path):
    """Read a text file and return its contents, or "" if it cannot be read.

    Best-effort by design: any failure (missing file, permissions, decode
    error) prints a warning and yields an empty string instead of raising.
    """
    try:
        # Explicit UTF-8 for consistency with every other text read/write in
        # this package; previously this used the locale-default encoding,
        # which mis-decodes UTF-8 sources on e.g. Windows (cp1252).
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return ""
109
172
 
110
def is_binary(file_path: str) -> bool:
    """
    Efficiently checks if a file is binary.

    The check follows a fast, multi-step process to minimize I/O:
    1. Checks a memory cache for a previously determined result.
    2. Checks the extension AND the bare filename against known text types.
    3. Checks the extension AND the bare filename against known binary types.
    4. As a last resort, reads the first 512 bytes of the file to check for
       a null byte, a common indicator of a binary file.
    """
    # Step 1: Check cache first for fastest response
    if file_path in _is_binary_cache:
        return _is_binary_cache[file_path]

    # Step 2: Fast check based on known text/binary extensions (no I/O).
    # Defect fix: entries like ".gitignore", "dockerfile" and ".env" in
    # TEXT_EXTENSIONS are whole filenames, not extensions --
    # os.path.splitext(".gitignore") yields an empty extension -- so the
    # basename must be checked as well for those entries to ever match.
    basename = os.path.basename(file_path).lower()
    _, extension = os.path.splitext(basename)

    if extension in TEXT_EXTENSIONS or basename in TEXT_EXTENSIONS:
        _is_binary_cache[file_path] = False
        return False
    if extension in BINARY_EXTENSIONS or basename in BINARY_EXTENSIONS:
        _is_binary_cache[file_path] = True
        return True

    # Step 3: Fallback to content analysis for unknown extensions
    try:
        with open(file_path, "rb") as file:
            # 512 bytes is usually enough to find a null byte
            chunk = file.read(512)
        result = b"\0" in chunk
    except IOError:
        # If we can't open it, treat it as binary to be safe
        result = True
    _is_binary_cache[file_path] = result
    return result
215
+
121
216
 
122
217
def get_human_readable_size(size):
    """Format a byte count as a human-readable string, e.g. 1536 -> "1.50 KB".

    Args:
        size: Size in bytes (int or float, non-negative).

    Returns:
        A string like "12.34 MB".
    """
    for unit in ["B", "KB", "MB", "GB", "TB"]:
        if size < 1024.0:
            return f"{size:.2f} {unit}"
        size /= 1024.0
    # Defect fix: previously the loop fell off the end and the function
    # implicitly returned None for sizes >= 1024 TB.
    return f"{size:.2f} PB"
127
222
 
223
+
128
224
def is_large_file(file_path, threshold=102400):
    """Return True if the file exceeds threshold bytes (default 100 KiB)."""
    size_in_bytes = os.path.getsize(file_path)
    return size_in_bytes > threshold