kopipasta 0.35.0__py3-none-any.whl → 0.37.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kopipasta might be problematic. Click here for more details.

kopipasta/cache.py CHANGED
@@ -6,32 +6,35 @@ from typing import List, Tuple
6
6
  # Define FileTuple for type hinting
7
7
  FileTuple = Tuple[str, bool, List[str] | None, str]
8
8
 
9
+
9
10
def get_cache_file_path() -> Path:
    """Return the cross-platform path of the last-selection cache file.

    Ensures the ~/.cache/kopipasta directory exists before returning.
    """
    cache_directory = Path.home() / ".cache" / "kopipasta"
    cache_directory.mkdir(parents=True, exist_ok=True)
    return cache_directory / "last_selection.json"
14
15
 
16
+
15
17
def save_selection_to_cache(files_to_include: List[FileTuple]):
    """Persist the relative paths of the selected files to the cache file.

    Write failures are reported with a warning and otherwise ignored, so a
    broken cache never aborts the main flow.
    """
    cache_file = get_cache_file_path()
    # Store paths relative to the cwd, in deterministic sorted order.
    relative_paths = sorted(os.path.relpath(entry[0]) for entry in files_to_include)
    try:
        with open(cache_file, "w", encoding="utf-8") as fh:
            json.dump(relative_paths, fh, indent=2)
    except IOError as e:
        print(f"\nWarning: Could not save selection to cache: {e}")
24
26
 
27
+
25
28
def load_selection_from_cache() -> List[str]:
    """Return the previously cached selection, dropping paths that vanished.

    An absent or unreadable cache yields an empty list rather than raising.
    """
    cache_file = get_cache_file_path()
    if not cache_file.exists():
        return []
    try:
        with open(cache_file, "r", encoding="utf-8") as fh:
            cached_paths = json.load(fh)
    except (IOError, json.JSONDecodeError) as e:
        print(f"\nWarning: Could not load previous selection from cache: {e}")
        return []
    # Filter out paths that no longer exist on disk.
    return [p for p in cached_paths if os.path.exists(p)]
kopipasta/file.py CHANGED
@@ -1,14 +1,45 @@
1
1
  import fnmatch
2
2
  import os
3
- from typing import List, Optional, Tuple
3
+ from typing import List, Optional, Tuple, Set
4
4
  from pathlib import Path
5
5
 
6
6
  FileTuple = Tuple[str, bool, Optional[List[str]], str]
7
7
 
8
- # --- Cache for .gitignore patterns ---
9
- # Key: Directory path
10
- # Value: List of patterns
8
+ # --- Caches ---
11
9
  _gitignore_cache: dict[str, list[str]] = {}
10
+ _is_ignored_cache: dict[str, bool] = {}
11
+ _is_binary_cache: dict[str, bool] = {}
12
+
13
+ # --- Known File Extensions for is_binary ---
14
+ # Using sets for O(1) average time complexity lookups
15
+ TEXT_EXTENSIONS = {
16
+ # Code
17
+ ".py", ".js", ".ts", ".jsx", ".tsx", ".java", ".c", ".cpp", ".h", ".hpp",
18
+ ".cs", ".go", ".rs", ".sh", ".bash", ".ps1", ".rb", ".php", ".swift",
19
+ ".kt", ".kts", ".scala", ".pl", ".pm", ".tcl",
20
+ # Markup & Data
21
+ ".html", ".htm", ".xml", ".css", ".scss", ".sass", ".less", ".json",
22
+ ".yaml", ".yml", ".toml", ".ini", ".cfg", ".conf", ".md", ".txt", ".rtf",
23
+ ".csv", ".tsv", ".sql", ".graphql", ".gql",
24
+ # Config & Other
25
+ ".gitignore", ".dockerfile", "dockerfile", ".env", ".properties", ".mdx",
26
+ }
27
+
28
+ BINARY_EXTENSIONS = {
29
+ # Images
30
+ ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".ico", ".webp", ".svg",
31
+ # Audio/Video
32
+ ".mp3", ".wav", ".ogg", ".flac", ".mp4", ".avi", ".mov", ".wmv", ".mkv",
33
+ # Archives
34
+ ".zip", ".rar", ".7z", ".tar", ".gz", ".bz2", ".xz",
35
+ # Documents
36
+ ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx", ".odt",
37
+ # Executables & Compiled
38
+ ".exe", ".dll", ".so", ".dylib", ".class", ".jar", ".pyc", ".pyd", ".whl",
39
+ # Databases & Other
40
+ ".db", ".sqlite", ".sqlite3", ".db-wal", ".db-shm", ".lock",
41
+ ".bak", ".swo", ".swp",
42
+ }
12
43
 
13
44
  def _read_gitignore_patterns(gitignore_path: str) -> list[str]:
14
45
  """Reads patterns from a single .gitignore file and caches them."""
@@ -19,31 +50,88 @@ def _read_gitignore_patterns(gitignore_path: str) -> list[str]:
19
50
  return []
20
51
  patterns = []
21
52
  try:
22
- with open(gitignore_path, 'r', encoding='utf-8') as f:
53
+ with open(gitignore_path, "r", encoding="utf-8") as f:
23
54
  for line in f:
24
55
  stripped_line = line.strip()
25
- if stripped_line and not stripped_line.startswith('#'):
56
+ if stripped_line and not stripped_line.startswith("#"):
26
57
  patterns.append(stripped_line)
27
58
  except IOError:
28
59
  pass
29
60
  _gitignore_cache[gitignore_path] = patterns
30
61
  return patterns
31
62
 
32
def is_ignored(
    path: str, default_ignore_patterns: list[str], project_root: Optional[str] = None
) -> bool:
    """
    Decide whether *path* should be ignored.

    Patterns are split into basename patterns (cheap fnmatch against the
    final path component) and path patterns (matched against every prefix
    of the path relative to the project root). Verdicts are memoized per
    absolute path, and a path inherits an "ignored" verdict from its
    parent directory when one is already cached.
    """
    abs_path = os.path.abspath(path)
    memoized = _is_ignored_cache.get(abs_path)
    if memoized is not None:
        return memoized

    # Inherit a cached "ignored" verdict from the containing directory.
    containing_dir = os.path.dirname(abs_path)
    if containing_dir != abs_path and _is_ignored_cache.get(containing_dir, False):
        _is_ignored_cache[abs_path] = True
        return True

    root = project_root if project_root is not None else os.getcwd()
    abs_root = os.path.abspath(root)

    basename_patterns, path_patterns = get_all_patterns(
        default_ignore_patterns, abs_path, abs_root
    )

    # Cheap check first: basename-only patterns (e.g. "*.log", "__pycache__").
    leaf = os.path.basename(abs_path)
    if any(fnmatch.fnmatch(leaf, pat) for pat in basename_patterns):
        _is_ignored_cache[abs_path] = True
        return True

    try:
        rel = os.path.relpath(abs_path, abs_root)
    except ValueError:
        # e.g. a different drive on Windows -- treat as outside the project.
        _is_ignored_cache[abs_path] = False
        return False

    # Every prefix of the relative path, built once up front so the nested
    # match below does no repeated joins.
    parts = Path(rel).parts
    prefixes = [os.path.join(*parts[: i + 1]) for i in range(len(parts))]
    # Strip trailing slashes from directory patterns a single time.
    normalized = [pat.rstrip("/") for pat in path_patterns]

    verdict = any(
        fnmatch.fnmatch(prefix, pat) for prefix in prefixes for pat in normalized
    )
    _is_ignored_cache[abs_path] = verdict
    return verdict
117
+
118
+ def get_all_patterns(default_ignore_patterns, path_abs, project_root_abs) -> Tuple[Set[str], Set[str]]:
119
+ """
120
+ Gathers all applicable ignore patterns, splitting them into two sets
121
+ for optimized checking: one for basenames, one for full paths.
122
+ """
123
+ basename_patterns = set()
124
+ path_patterns = set()
125
+
126
+ for p in default_ignore_patterns:
127
+ if "/" in p:
128
+ path_patterns.add(p)
129
+ else:
130
+ basename_patterns.add(p)
131
+
132
+ search_start_dir = (
133
+ path_abs if os.path.isdir(path_abs) else os.path.dirname(path_abs)
134
+ )
47
135
 
48
136
  current_dir = search_start_dir
49
137
  while True:
@@ -52,78 +140,86 @@ def is_ignored(path: str, default_ignore_patterns: list[str], project_root: Opti
52
140
 
53
141
  if patterns_from_file:
54
142
  gitignore_dir_rel = os.path.relpath(current_dir, project_root_abs)
55
- if gitignore_dir_rel == '.': gitignore_dir_rel = ''
143
+ if gitignore_dir_rel == ".":
144
+ gitignore_dir_rel = ""
56
145
 
57
146
  for p in patterns_from_file:
58
- # Patterns with a '/' are relative to the .gitignore file's location.
59
- # We construct a new pattern relative to the project root.
60
- if '/' in p:
61
- all_patterns.add(os.path.join(gitignore_dir_rel, p.lstrip('/')))
147
+ if "/" in p:
148
+ # Path patterns are relative to the .gitignore file's location
149
+ path_patterns.add(os.path.join(gitignore_dir_rel, p.lstrip("/")))
62
150
  else:
63
- # Patterns without a '/' (e.g., `*.log`) can match anywhere.
64
- all_patterns.add(p)
151
+ basename_patterns.add(p)
65
152
 
66
- if not current_dir.startswith(project_root_abs) or current_dir == project_root_abs:
153
+ if (
154
+ not current_dir.startswith(project_root_abs)
155
+ or current_dir == project_root_abs
156
+ ):
67
157
  break
68
158
  parent = os.path.dirname(current_dir)
69
- if parent == current_dir: break
159
+ if parent == current_dir:
160
+ break
70
161
  current_dir = parent
162
+ return basename_patterns, path_patterns
71
163
 
72
- # --- Step 2: Check the path and its parents against the patterns ---
73
- try:
74
- path_rel_to_root = os.path.relpath(path_abs, project_root_abs)
75
- except ValueError:
76
- return False # Path is outside the project root
77
-
78
- path_parts = Path(path_rel_to_root).parts
79
-
80
- for pattern in all_patterns:
81
- # Check against basename for simple wildcards (e.g., `*.log`, `__pycache__`)
82
- # This is a primary matching mechanism.
83
- if fnmatch.fnmatch(os.path.basename(path_abs), pattern):
84
- return True
85
-
86
- # Check the full path and its parent directories against the pattern.
87
- # This handles directory ignores (`node_modules/`) and specific path ignores (`src/*.tmp`).
88
- for i in range(len(path_parts)):
89
- current_check_path = os.path.join(*path_parts[:i+1])
90
-
91
- # Handle directory patterns like `node_modules/`
92
- if pattern.endswith('/'):
93
- if fnmatch.fnmatch(current_check_path, pattern.rstrip('/')):
94
- return True
95
- # Handle full path patterns
96
- else:
97
- if fnmatch.fnmatch(current_check_path, pattern):
98
- return True
99
-
100
- return False
101
164
 
102
165
def read_file_contents(file_path):
    """Read a text file and return its contents, or "" if it cannot be read.

    Best-effort by design: any failure (missing file, permissions, decode
    error) prints a warning and yields an empty string instead of raising.
    """
    try:
        # Explicit UTF-8 for consistency with every other text read/write in
        # this package; previously this used the locale-default encoding,
        # which mis-decodes UTF-8 sources on e.g. Windows (cp1252).
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return ""
109
172
 
110
def is_binary(file_path: str) -> bool:
    """
    Efficiently checks if a file is binary.

    The check follows a fast, multi-step process to minimize I/O:
    1. Checks a memory cache for a previously determined result.
    2. Checks the extension AND the bare filename against known text types.
    3. Checks the extension AND the bare filename against known binary types.
    4. As a last resort, reads the first 512 bytes of the file to check for
       a null byte, a common indicator of a binary file.
    """
    # Step 1: Check cache first for fastest response
    if file_path in _is_binary_cache:
        return _is_binary_cache[file_path]

    # Step 2: Fast check based on known text/binary extensions (no I/O).
    # Defect fix: entries like ".gitignore", "dockerfile" and ".env" in
    # TEXT_EXTENSIONS are whole filenames, not extensions --
    # os.path.splitext(".gitignore") yields an empty extension -- so the
    # basename must be checked as well for those entries to ever match.
    basename = os.path.basename(file_path).lower()
    _, extension = os.path.splitext(basename)

    if extension in TEXT_EXTENSIONS or basename in TEXT_EXTENSIONS:
        _is_binary_cache[file_path] = False
        return False
    if extension in BINARY_EXTENSIONS or basename in BINARY_EXTENSIONS:
        _is_binary_cache[file_path] = True
        return True

    # Step 3: Fallback to content analysis for unknown extensions
    try:
        with open(file_path, "rb") as file:
            # 512 bytes is usually enough to find a null byte
            chunk = file.read(512)
        result = b"\0" in chunk
    except IOError:
        # If we can't open it, treat it as binary to be safe
        result = True
    _is_binary_cache[file_path] = result
    return result
215
+
121
216
 
122
217
def get_human_readable_size(size):
    """Format a byte count as a human-readable string, e.g. 1536 -> "1.50 KB".

    Args:
        size: Size in bytes (int or float, non-negative).

    Returns:
        A string like "12.34 MB".
    """
    for unit in ["B", "KB", "MB", "GB", "TB"]:
        if size < 1024.0:
            return f"{size:.2f} {unit}"
        size /= 1024.0
    # Defect fix: previously the loop fell off the end and the function
    # implicitly returned None for sizes >= 1024 TB.
    return f"{size:.2f} PB"
127
222
 
223
+
128
224
def is_large_file(file_path, threshold=102400):
    """Return True if the file exceeds threshold bytes (default 100 KiB)."""
    size_in_bytes = os.path.getsize(file_path)
    return size_in_bytes > threshold