kopipasta 0.2.0__tar.gz → 0.41.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kopipasta-0.41.0/PKG-INFO +140 -0
- kopipasta-0.41.0/README.md +112 -0
- kopipasta-0.41.0/kopipasta/cache.py +40 -0
- kopipasta-0.41.0/kopipasta/file.py +322 -0
- kopipasta-0.41.0/kopipasta/import_parser.py +356 -0
- kopipasta-0.41.0/kopipasta/main.py +262 -0
- kopipasta-0.41.0/kopipasta/ops.py +476 -0
- kopipasta-0.41.0/kopipasta/patcher.py +245 -0
- kopipasta-0.41.0/kopipasta/prompt.py +271 -0
- kopipasta-0.41.0/kopipasta/tree_selector.py +831 -0
- kopipasta-0.41.0/kopipasta.egg-info/PKG-INFO +140 -0
- kopipasta-0.41.0/kopipasta.egg-info/SOURCES.txt +25 -0
- kopipasta-0.41.0/kopipasta.egg-info/requires.txt +6 -0
- kopipasta-0.41.0/requirements.txt +6 -0
- {kopipasta-0.2.0 → kopipasta-0.41.0}/setup.py +8 -5
- kopipasta-0.41.0/tests/test_file.py +67 -0
- kopipasta-0.41.0/tests/test_patcher.py +246 -0
- kopipasta-0.41.0/tests/test_patcher_edge_cases.py +111 -0
- kopipasta-0.41.0/tests/test_patcher_regex.py +34 -0
- kopipasta-0.41.0/tests/test_tree_selector.py +118 -0
- kopipasta-0.2.0/PKG-INFO +0 -122
- kopipasta-0.2.0/README.md +0 -99
- kopipasta-0.2.0/kopipasta/main.py +0 -251
- kopipasta-0.2.0/kopipasta.egg-info/PKG-INFO +0 -122
- kopipasta-0.2.0/kopipasta.egg-info/SOURCES.txt +0 -13
- kopipasta-0.2.0/kopipasta.egg-info/requires.txt +0 -1
- kopipasta-0.2.0/requirements.txt +0 -1
- {kopipasta-0.2.0 → kopipasta-0.41.0}/LICENSE +0 -0
- {kopipasta-0.2.0 → kopipasta-0.41.0}/MANIFEST.in +0 -0
- {kopipasta-0.2.0 → kopipasta-0.41.0}/kopipasta/__init__.py +0 -0
- {kopipasta-0.2.0 → kopipasta-0.41.0}/kopipasta.egg-info/dependency_links.txt +0 -0
- {kopipasta-0.2.0 → kopipasta-0.41.0}/kopipasta.egg-info/entry_points.txt +0 -0
- {kopipasta-0.2.0 → kopipasta-0.41.0}/kopipasta.egg-info/top_level.txt +0 -0
- {kopipasta-0.2.0 → kopipasta-0.41.0}/setup.cfg +0 -0
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: kopipasta
|
|
3
|
+
Version: 0.41.0
|
|
4
|
+
Summary: A CLI tool to generate prompts with project structure and file contents
|
|
5
|
+
Home-page: https://github.com/mkorpela/kopipasta
|
|
6
|
+
Author: Mikko Korpela
|
|
7
|
+
Author-email: mikko.korpela@gmail.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Requires-Python: >=3.8
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: pyperclip==1.9.0
|
|
23
|
+
Requires-Dist: requests==2.32.3
|
|
24
|
+
Requires-Dist: Pygments==2.18.0
|
|
25
|
+
Requires-Dist: rich==13.8.1
|
|
26
|
+
Requires-Dist: click==8.2.1
|
|
27
|
+
Requires-Dist: prompt-toolkit==3.0.52
|
|
28
|
+
|
|
29
|
+
# kopipasta
|
|
30
|
+
|
|
31
|
+
[kopipasta on PyPI](https://pypi.python.org/pypi/kopipasta)
|
|
32
|
+
[Download statistics on pepy.tech](http://pepy.tech/project/kopipasta)
|
|
33
|
+
|
|
34
|
+
**kopipasta bridges the gap between your local file system and LLM context windows.**
|
|
35
|
+
|
|
36
|
+
A CLI tool for taking **full, transparent control** of your prompt. No black boxes.
|
|
37
|
+
|
|
38
|
+
<img src="kopipasta.jpg" alt="kopipasta" width="300">
|
|
39
|
+
|
|
40
|
+
- An LLM told me that "kopi" means Coffee in some languages... and a Diffusion model then made this delicious soup.
|
|
41
|
+
|
|
42
|
+
## The Philosophy: You Control the Context
|
|
43
|
+
|
|
44
|
+
Many AI coding assistants use Retrieval-Augmented Generation (RAG) to automatically find what *they think* is relevant context. This is a black box. When the LLM gives a bad answer, you can't debug it because you don't know what context it was actually given.
|
|
45
|
+
|
|
46
|
+
**`kopipasta` is the opposite.** I built it for myself on the principle of **explicit context control**. You are in the driver's seat. You decide *exactly* what files, functions, and snippets go into the prompt. This transparency is the key to getting reliable, debuggable results from an LLM.
|
|
47
|
+
|
|
48
|
+
It's a "smart copy" command for your project, not a magic wand.
|
|
49
|
+
|
|
50
|
+
## How It Works
|
|
51
|
+
|
|
52
|
+
The workflow is dead simple:
|
|
53
|
+
|
|
54
|
+
1. **Gather:** Run `kopipasta` and point it at the files, directories, and URLs that matter for your task.
|
|
55
|
+
2. **Select:** The tool interactively helps you choose what to include. For large files, you can send just a snippet or even hand-pick individual functions.
|
|
56
|
+
3. **Define:** Write your instructions to the LLM in an interactive prompt directly in your terminal.
|
|
57
|
+
4. **Paste:** The final, comprehensive prompt is now on your clipboard, ready to be pasted into ChatGPT, Gemini, Claude, or your LLM of choice.
|
|
58
|
+
5. **Apply:** Inside the file selector, press `p`, paste the LLM's markdown response, and the tool will automatically patch your local files.
|
|
59
|
+
|
|
60
|
+
## Installation
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
# Using pipx (recommended for CLI tools)
|
|
64
|
+
pipx install kopipasta
|
|
65
|
+
|
|
66
|
+
# Or using standard pip
|
|
67
|
+
pip install kopipasta
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Usage
|
|
71
|
+
|
|
72
|
+
`kopipasta` has two main modes: creating prompts and applying patches.
|
|
73
|
+
|
|
74
|
+
### Creating a Prompt
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
kopipasta [options] [files_or_directories_or_urls...]
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
**Arguments:**
|
|
81
|
+
|
|
82
|
+
* `[files_or_directories_or_urls...]`: One or more paths to files, directories, or web URLs to use as the starting point for your context.
|
|
83
|
+
|
|
84
|
+
**Options:**
|
|
85
|
+
|
|
86
|
+
* `-t TASK`, `--task TASK`: Provide the task description directly on the command line, skipping the editor.
|
|
87
|
+
|
|
88
|
+
### Applying Patches
|
|
89
|
+
|
|
90
|
+
`kopipasta` automatically injects strict instructions into your prompt, teaching the LLM how to format code for this tool.
|
|
91
|
+
`kopipasta` can apply changes suggested by an LLM directly to your codebase, assuming you are in a Git repository.
|
|
92
|
+
|
|
93
|
+
1. While running `kopipasta` in the interactive file selector, press the `p` key.
|
|
94
|
+
2. Paste the entire markdown response from your LLM into the terminal prompt and submit.
|
|
95
|
+
3. The tool will find code blocks with file paths (e.g., `// FILE: src/main.py`) and immediately write those changes to your local files.
|
|
96
|
+
4. After applying, use standard Git commands like `git diff` to review the changes before staging and committing them.
|
|
97
|
+
|
|
98
|
+
## Key Features
|
|
99
|
+
|
|
100
|
+
* **Total Context Control:** Interactively select files, directories, or snippets. You see everything that goes into the prompt.
|
|
101
|
+
* **Smart Dependency Analysis:** Press `d` on a Python or TypeScript/JavaScript file, and `kopipasta` will scan imports to find and add related local files to your context automatically.
|
|
102
|
+
* **Interactive Code Patcher:** Press `p` in the file selector to paste and apply LLM-suggested changes directly to your local files. Relies on your version control (like Git) for safety, enabling a fast workflow.
|
|
103
|
+
* **Built-in Search:** Press `g` to grep for text patterns inside directories to find relevant files.
|
|
104
|
+
* **Transparent & Explicit:** No hidden RAG. You know exactly what's in the prompt because you built it. This makes debugging LLM failures possible.
|
|
105
|
+
* **Web-Aware:** Pulls in content directly from URLs—perfect for API documentation.
|
|
106
|
+
* **Safety First:**
|
|
107
|
+
* Automatically respects your `.gitignore` rules.
|
|
108
|
+
* Detects if you're about to include secrets from a `.env` file and asks what to do.
|
|
109
|
+
* **Context-Aware:** Keeps a running total of the prompt size (in characters and estimated tokens) so you don't overload the LLM's context window.
|
|
110
|
+
* **Developer-Friendly:**
|
|
111
|
+
* Provides a rich, interactive prompt for writing task descriptions in the terminal.
|
|
112
|
+
* Copies the final prompt directly to your clipboard.
|
|
113
|
+
* Provides syntax highlighting during chunk selection.
|
|
114
|
+
|
|
115
|
+
## Interactive Controls
|
|
116
|
+
|
|
117
|
+
| Key | Action |
|
|
118
|
+
| :--- | :--- |
|
|
119
|
+
| `Space` | Toggle file/directory selection |
|
|
120
|
+
| `s` | Toggle **Snippet Mode** (include only the first 50 lines) |
|
|
121
|
+
| `d` | **Analyze Dependencies** (find and add imported files) |
|
|
122
|
+
| `g` | **Grep** (search text in directory) |
|
|
123
|
+
| `a` | Add all files in directory |
|
|
124
|
+
| `p` | **Apply Patch** (paste LLM response) |
|
|
125
|
+
| `r` | Reuse selection from previous run |
|
|
126
|
+
| `Enter` | Expand/Collapse directory |
|
|
127
|
+
| `q` | Quit and finalize selection |
|
|
128
|
+
|
|
129
|
+
## A Real-World Example
|
|
130
|
+
|
|
131
|
+
I had a bug where my `setup.py` didn't include all the dependencies from `requirements.txt`.
|
|
132
|
+
|
|
133
|
+
1. I ran `kopipasta -t "Update setup.py to read dependencies dynamically from requirements.txt" setup.py requirements.txt`.
|
|
134
|
+
2. The tool confirmed the inclusion of both files and copied the complete prompt to my clipboard.
|
|
135
|
+
3. I pasted the prompt into my LLM chat window.
|
|
136
|
+
4. I copied the LLM's response (which included a modified `setup.py` in a markdown code block).
|
|
137
|
+
5. Inside `kopipasta`, I pressed `p`, pasted the response, and my local `setup.py` was updated.
|
|
138
|
+
6. I ran `git diff` to review the changes, then tested and committed.
|
|
139
|
+
|
|
140
|
+
No manual file reading, no clumsy copy-pasting, just a clean, context-rich prompt that I had full control over, and a seamless way to apply the results.
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
# kopipasta
|
|
2
|
+
|
|
3
|
+
[kopipasta on PyPI](https://pypi.python.org/pypi/kopipasta)
|
|
4
|
+
[Download statistics on pepy.tech](http://pepy.tech/project/kopipasta)
|
|
5
|
+
|
|
6
|
+
**kopipasta bridges the gap between your local file system and LLM context windows.**
|
|
7
|
+
|
|
8
|
+
A CLI tool for taking **full, transparent control** of your prompt. No black boxes.
|
|
9
|
+
|
|
10
|
+
<img src="kopipasta.jpg" alt="kopipasta" width="300">
|
|
11
|
+
|
|
12
|
+
- An LLM told me that "kopi" means Coffee in some languages... and a Diffusion model then made this delicious soup.
|
|
13
|
+
|
|
14
|
+
## The Philosophy: You Control the Context
|
|
15
|
+
|
|
16
|
+
Many AI coding assistants use Retrieval-Augmented Generation (RAG) to automatically find what *they think* is relevant context. This is a black box. When the LLM gives a bad answer, you can't debug it because you don't know what context it was actually given.
|
|
17
|
+
|
|
18
|
+
**`kopipasta` is the opposite.** I built it for myself on the principle of **explicit context control**. You are in the driver's seat. You decide *exactly* what files, functions, and snippets go into the prompt. This transparency is the key to getting reliable, debuggable results from an LLM.
|
|
19
|
+
|
|
20
|
+
It's a "smart copy" command for your project, not a magic wand.
|
|
21
|
+
|
|
22
|
+
## How It Works
|
|
23
|
+
|
|
24
|
+
The workflow is dead simple:
|
|
25
|
+
|
|
26
|
+
1. **Gather:** Run `kopipasta` and point it at the files, directories, and URLs that matter for your task.
|
|
27
|
+
2. **Select:** The tool interactively helps you choose what to include. For large files, you can send just a snippet or even hand-pick individual functions.
|
|
28
|
+
3. **Define:** Write your instructions to the LLM in an interactive prompt directly in your terminal.
|
|
29
|
+
4. **Paste:** The final, comprehensive prompt is now on your clipboard, ready to be pasted into ChatGPT, Gemini, Claude, or your LLM of choice.
|
|
30
|
+
5. **Apply:** Inside the file selector, press `p`, paste the LLM's markdown response, and the tool will automatically patch your local files.
|
|
31
|
+
|
|
32
|
+
## Installation
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
# Using pipx (recommended for CLI tools)
|
|
36
|
+
pipx install kopipasta
|
|
37
|
+
|
|
38
|
+
# Or using standard pip
|
|
39
|
+
pip install kopipasta
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Usage
|
|
43
|
+
|
|
44
|
+
`kopipasta` has two main modes: creating prompts and applying patches.
|
|
45
|
+
|
|
46
|
+
### Creating a Prompt
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
kopipasta [options] [files_or_directories_or_urls...]
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
**Arguments:**
|
|
53
|
+
|
|
54
|
+
* `[files_or_directories_or_urls...]`: One or more paths to files, directories, or web URLs to use as the starting point for your context.
|
|
55
|
+
|
|
56
|
+
**Options:**
|
|
57
|
+
|
|
58
|
+
* `-t TASK`, `--task TASK`: Provide the task description directly on the command line, skipping the editor.
|
|
59
|
+
|
|
60
|
+
### Applying Patches
|
|
61
|
+
|
|
62
|
+
`kopipasta` automatically injects strict instructions into your prompt, teaching the LLM how to format code for this tool.
|
|
63
|
+
`kopipasta` can apply changes suggested by an LLM directly to your codebase, assuming you are in a Git repository.
|
|
64
|
+
|
|
65
|
+
1. While running `kopipasta` in the interactive file selector, press the `p` key.
|
|
66
|
+
2. Paste the entire markdown response from your LLM into the terminal prompt and submit.
|
|
67
|
+
3. The tool will find code blocks with file paths (e.g., `// FILE: src/main.py`) and immediately write those changes to your local files.
|
|
68
|
+
4. After applying, use standard Git commands like `git diff` to review the changes before staging and committing them.
|
|
69
|
+
|
|
70
|
+
## Key Features
|
|
71
|
+
|
|
72
|
+
* **Total Context Control:** Interactively select files, directories, or snippets. You see everything that goes into the prompt.
|
|
73
|
+
* **Smart Dependency Analysis:** Press `d` on a Python or TypeScript/JavaScript file, and `kopipasta` will scan imports to find and add related local files to your context automatically.
|
|
74
|
+
* **Interactive Code Patcher:** Press `p` in the file selector to paste and apply LLM-suggested changes directly to your local files. Relies on your version control (like Git) for safety, enabling a fast workflow.
|
|
75
|
+
* **Built-in Search:** Press `g` to grep for text patterns inside directories to find relevant files.
|
|
76
|
+
* **Transparent & Explicit:** No hidden RAG. You know exactly what's in the prompt because you built it. This makes debugging LLM failures possible.
|
|
77
|
+
* **Web-Aware:** Pulls in content directly from URLs—perfect for API documentation.
|
|
78
|
+
* **Safety First:**
|
|
79
|
+
* Automatically respects your `.gitignore` rules.
|
|
80
|
+
* Detects if you're about to include secrets from a `.env` file and asks what to do.
|
|
81
|
+
* **Context-Aware:** Keeps a running total of the prompt size (in characters and estimated tokens) so you don't overload the LLM's context window.
|
|
82
|
+
* **Developer-Friendly:**
|
|
83
|
+
* Provides a rich, interactive prompt for writing task descriptions in the terminal.
|
|
84
|
+
* Copies the final prompt directly to your clipboard.
|
|
85
|
+
* Provides syntax highlighting during chunk selection.
|
|
86
|
+
|
|
87
|
+
## Interactive Controls
|
|
88
|
+
|
|
89
|
+
| Key | Action |
|
|
90
|
+
| :--- | :--- |
|
|
91
|
+
| `Space` | Toggle file/directory selection |
|
|
92
|
+
| `s` | Toggle **Snippet Mode** (include only the first 50 lines) |
|
|
93
|
+
| `d` | **Analyze Dependencies** (find and add imported files) |
|
|
94
|
+
| `g` | **Grep** (search text in directory) |
|
|
95
|
+
| `a` | Add all files in directory |
|
|
96
|
+
| `p` | **Apply Patch** (paste LLM response) |
|
|
97
|
+
| `r` | Reuse selection from previous run |
|
|
98
|
+
| `Enter` | Expand/Collapse directory |
|
|
99
|
+
| `q` | Quit and finalize selection |
|
|
100
|
+
|
|
101
|
+
## A Real-World Example
|
|
102
|
+
|
|
103
|
+
I had a bug where my `setup.py` didn't include all the dependencies from `requirements.txt`.
|
|
104
|
+
|
|
105
|
+
1. I ran `kopipasta -t "Update setup.py to read dependencies dynamically from requirements.txt" setup.py requirements.txt`.
|
|
106
|
+
2. The tool confirmed the inclusion of both files and copied the complete prompt to my clipboard.
|
|
107
|
+
3. I pasted the prompt into my LLM chat window.
|
|
108
|
+
4. I copied the LLM's response (which included a modified `setup.py` in a markdown code block).
|
|
109
|
+
5. Inside `kopipasta`, I pressed `p`, pasted the response, and my local `setup.py` was updated.
|
|
110
|
+
6. I ran `git diff` to review the changes, then tested and committed.
|
|
111
|
+
|
|
112
|
+
No manual file reading, no clumsy copy-pasting, just a clean, context-rich prompt that I had full control over, and a seamless way to apply the results.
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import json
import os
from pathlib import Path
from typing import List, Optional, Tuple
|
|
5
|
+
|
|
6
|
+
# Define FileTuple for type hinting.
# Spelled with Optional[...] instead of `List[str] | None`: this alias is
# evaluated at import time, and the `|` operator on typing generics only
# exists on Python 3.10+, so the old spelling raised TypeError on 3.9.
FileTuple = Tuple[str, bool, Optional[List[str]], str]
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def get_cache_file_path() -> Path:
    """Return the location of the JSON file that remembers the last selection.

    The containing directory (``~/.cache/kopipasta``) is created on demand,
    together with any missing parent directories.
    """
    directory = Path.home().joinpath(".cache", "kopipasta")
    directory.mkdir(parents=True, exist_ok=True)
    return directory.joinpath("last_selection.json")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def save_selection_to_cache(files_to_include: List[FileTuple]):
    """Persist the paths of the chosen files so a later run can reuse them.

    Only the first element of each tuple (the file path) is stored. Each path
    is made relative to the current working directory, the list is sorted,
    and the result is written as indented JSON. A failed write is reported
    as a warning on stdout instead of being raised.
    """
    target = get_cache_file_path()
    rel_paths = [os.path.relpath(entry[0]) for entry in files_to_include]
    rel_paths.sort()
    try:
        with open(target, "w", encoding="utf-8") as handle:
            json.dump(rel_paths, handle, indent=2)
    except IOError as err:
        print(f"\nWarning: Could not save selection to cache: {err}")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def load_selection_from_cache() -> List[str]:
    """Load the previously saved selection from the cache file.

    Returns:
        The cached paths that still exist on disk, in stored order. An empty
        list is returned when there is no cache file, when it cannot be read
        or decoded, or when its content is not a JSON list.
    """
    cache_file = get_cache_file_path()
    if not cache_file.exists():
        return []
    try:
        with open(cache_file, "r", encoding="utf-8") as f:
            paths = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError as well as UnicodeDecodeError,
        # so a corrupt or mis-encoded cache degrades to "no previous selection"
        # instead of crashing the caller.
        print(f"\nWarning: Could not load previous selection from cache: {e}")
        return []

    if not isinstance(paths, list):
        print(
            "\nWarning: Could not load previous selection from cache: "
            "unexpected content (expected a list of paths)"
        )
        return []

    # Keep only string entries that still exist. Non-string JSON values must
    # be filtered first: os.path.exists() interprets an int as an open file
    # descriptor and raises TypeError for values such as None.
    return [p for p in paths if isinstance(p, str) and os.path.exists(p)]
|
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
import fnmatch
|
|
2
|
+
import os
|
|
3
|
+
from typing import List, Optional, Tuple, Set
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
FileTuple = Tuple[str, bool, Optional[List[str]], str]
|
|
7
|
+
|
|
8
|
+
# --- Caches ---
|
|
9
|
+
_gitignore_cache: dict[str, list[str]] = {}
|
|
10
|
+
_is_ignored_cache: dict[str, bool] = {}
|
|
11
|
+
_is_binary_cache: dict[str, bool] = {}
|
|
12
|
+
|
|
13
|
+
# --- Known File Extensions for is_binary ---
|
|
14
|
+
# Using sets for O(1) average time complexity lookups
|
|
15
|
+
TEXT_EXTENSIONS = {
    # Code
    ".py", ".js", ".ts", ".jsx", ".tsx",
    ".java", ".c", ".cpp", ".h", ".hpp",
    ".cs", ".go", ".rs", ".sh", ".bash",
    ".ps1", ".rb", ".php", ".swift", ".kt",
    ".kts", ".scala", ".pl", ".pm", ".tcl",
    # Markup & Data
    ".html", ".htm", ".xml", ".css", ".scss",
    ".sass", ".less", ".json", ".yaml", ".yml",
    ".toml", ".ini", ".cfg", ".conf", ".md",
    ".txt", ".rtf", ".csv", ".tsv", ".sql",
    ".graphql", ".gql",
    # Config & Other
    ".gitignore", ".dockerfile", "dockerfile",
    ".env", ".properties", ".mdx",
}
|
|
73
|
+
|
|
74
|
+
BINARY_EXTENSIONS = {
    # Images
    ".png", ".jpg", ".jpeg", ".gif", ".bmp",
    ".tiff", ".ico", ".webp", ".svg",
    # Audio/Video
    ".mp3", ".wav", ".ogg", ".flac", ".mp4",
    ".avi", ".mov", ".wmv", ".mkv",
    # Archives
    ".zip", ".rar", ".7z", ".tar", ".gz", ".bz2", ".xz",
    # Documents
    ".pdf", ".doc", ".docx", ".xls", ".xlsx",
    ".ppt", ".pptx", ".odt",
    # Executables & Compiled
    ".exe", ".dll", ".so", ".dylib", ".class",
    ".jar", ".pyc", ".pyd", ".whl",
    # Databases & Other
    ".db", ".sqlite", ".sqlite3", ".db-wal", ".db-shm",
    ".lock", ".bak", ".swo", ".swp",
}
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _read_gitignore_patterns(gitignore_path: str) -> list[str]:
    """Read the patterns from a single .gitignore file and cache them.

    Blank lines and lines starting with ``#`` are skipped; every other line
    is kept with its surrounding whitespace stripped. The result is stored in
    ``_gitignore_cache`` under ``gitignore_path`` so each file is read at most
    once; a missing file is cached as an empty list.

    Args:
        gitignore_path: Path of the .gitignore file to read.

    Returns:
        The list of pattern strings (possibly empty).
    """
    if gitignore_path in _gitignore_cache:
        return _gitignore_cache[gitignore_path]

    patterns: list[str] = []
    if os.path.isfile(gitignore_path):
        try:
            with open(gitignore_path, "r", encoding="utf-8") as f:
                for line in f:
                    stripped_line = line.strip()
                    if stripped_line and not stripped_line.startswith("#"):
                        patterns.append(stripped_line)
        except (IOError, UnicodeDecodeError):
            # Best effort: an unreadable or non-UTF-8 .gitignore must not
            # abort the caller. Patterns collected before the failure are kept.
            pass

    _gitignore_cache[gitignore_path] = patterns
    return patterns
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def is_ignored(
    path: str, default_ignore_patterns: list[str], project_root: Optional[str] = None
) -> bool:
    """
    Decide whether ``path`` matches any ignore pattern.

    Results are memoised per absolute path in ``_is_ignored_cache``, and a
    path whose parent directory is already cached as ignored is ignored
    without further matching. Otherwise patterns without a slash are matched
    against the basename, and patterns with a slash are matched against every
    leading prefix of the path relative to the project root (which defaults
    to the current working directory).
    """
    abs_path = os.path.abspath(path)
    cached = _is_ignored_cache.get(abs_path)
    if cached is not None:
        return cached

    # An ignored parent directory makes everything beneath it ignored.
    containing_dir = os.path.dirname(abs_path)
    if containing_dir != abs_path and _is_ignored_cache.get(containing_dir, False):
        _is_ignored_cache[abs_path] = True
        return True

    root_abs = os.path.abspath(os.getcwd() if project_root is None else project_root)
    name_globs, path_globs = get_all_patterns(
        default_ignore_patterns, abs_path, root_abs
    )

    # Cheap check first: slash-free patterns only need the last component.
    leaf = os.path.basename(abs_path)
    verdict = any(fnmatch.fnmatch(leaf, glob) for glob in name_globs)

    if not verdict:
        try:
            rel_path = os.path.relpath(abs_path, root_abs)
        except ValueError:
            # No relative path can be formed; treat the path as not ignored.
            rel_path = None

        if rel_path is not None:
            parts = Path(rel_path).parts
            # Every leading prefix of the relative path: "a", "a/b", "a/b/c", ...
            prefixes = [
                os.path.join(*parts[:end]) for end in range(1, len(parts) + 1)
            ]
            # Trailing slashes are dropped once, up front.
            trimmed_globs = [glob.rstrip("/") for glob in path_globs]
            verdict = any(
                fnmatch.fnmatch(prefix, glob)
                for prefix in prefixes
                for glob in trimmed_globs
            )

    _is_ignored_cache[abs_path] = verdict
    return verdict
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def get_all_patterns(
    default_ignore_patterns, path_abs, project_root_abs
) -> Tuple[Set[str], Set[str]]:
    """
    Gather all applicable ignore patterns, split into two sets.

    Patterns come from ``default_ignore_patterns`` and from every .gitignore
    found while walking from the directory of ``path_abs`` up to
    ``project_root_abs``. If the starting directory is not inside the project
    root, only that directory's own .gitignore is consulted.

    Args:
        default_ignore_patterns: Iterable of built-in pattern strings.
        path_abs: Absolute path being checked (file or directory).
        project_root_abs: Absolute path of the project root.

    Returns:
        ``(basename_patterns, path_patterns)``: patterns without a ``/`` and
        patterns containing one. Path patterns read from a .gitignore are
        prefixed with that file's directory relative to the project root.
    """
    basename_patterns = set()
    path_patterns = set()

    for p in default_ignore_patterns:
        if "/" in p:
            path_patterns.add(p)
        else:
            basename_patterns.add(p)

    current_dir = path_abs if os.path.isdir(path_abs) else os.path.dirname(path_abs)

    # Containment must be decided on a path-component boundary. A bare
    # startswith(project_root_abs) would treat "/x/project2" as being inside
    # "/x/project" and keep walking up into unrelated directories.
    root_prefix = project_root_abs.rstrip(os.sep) + os.sep

    while True:
        gitignore_path = os.path.join(current_dir, ".gitignore")
        patterns_from_file = _read_gitignore_patterns(gitignore_path)

        if patterns_from_file:
            gitignore_dir_rel = os.path.relpath(current_dir, project_root_abs)
            if gitignore_dir_rel == ".":
                gitignore_dir_rel = ""

            for p in patterns_from_file:
                if "/" in p:
                    # Path patterns are relative to the .gitignore file's location
                    path_patterns.add(os.path.join(gitignore_dir_rel, p.lstrip("/")))
                else:
                    basename_patterns.add(p)

        # Stop once the root has been processed, or when this directory is
        # not strictly below the root.
        if current_dir == project_root_abs or not current_dir.startswith(root_prefix):
            break
        parent = os.path.dirname(current_dir)
        if parent == current_dir:
            # Reached the filesystem root.
            break
        current_dir = parent
    return basename_patterns, path_patterns
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def read_file_contents(file_path):
    """Return the text of ``file_path`` decoded as UTF-8.

    When the file cannot be opened, read or decoded, the problem is printed
    and a ``<.. Error reading ... ..>`` placeholder string is returned
    instead of raising.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            text = handle.read()
    except (IOError, UnicodeDecodeError) as err:
        failure = f"Error reading {file_path}: {err}"
        print(failure)
        text = f"<.. {failure} ..>"
    return text
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def is_binary(file_path: str) -> bool:
    """
    Efficiently check whether a file is binary.

    The check follows a multi-step process to minimise I/O:
    1. Look up a previously determined result in ``_is_binary_cache``.
    2. Compare the lowercase extension, or the whole lowercase file name,
       against ``TEXT_EXTENSIONS``.
    3. Compare the lowercase extension against ``BINARY_EXTENSIONS``.
    4. As a last resort, read the first 512 bytes and look for a null byte.

    Args:
        file_path: Path of the file to classify; also used as the cache key.

    Returns:
        True if the file is considered binary, or cannot be opened;
        False otherwise.
    """
    # Step 1: Check cache first for fastest response
    if file_path in _is_binary_cache:
        return _is_binary_cache[file_path]

    # Step 2: Fast check based on known text/binary names (no I/O).
    # os.path.splitext() yields an empty extension for dotfiles such as
    # ".gitignore" or ".env" and for dot-less names such as "Dockerfile",
    # so those entries can only match on the whole file name.
    name = os.path.basename(file_path).lower()
    extension = os.path.splitext(name)[1]

    if extension in TEXT_EXTENSIONS or name in TEXT_EXTENSIONS:
        result = False
    elif extension in BINARY_EXTENSIONS:
        result = True
    else:
        # Step 3: Fallback to content analysis for unknown names
        try:
            with open(file_path, "rb") as file:
                # 512 bytes is usually enough to find a null byte; with no
                # null byte the file is assumed to be text.
                result = b"\0" in file.read(512)
        except IOError:
            # If we can't open it, treat it as binary to be safe
            result = True

    _is_binary_cache[file_path] = result
    return result
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
def get_human_readable_size(size):
    """Format a byte count as a string with two decimals and a binary unit.

    The value is divided by 1024 until it drops below 1024, stepping through
    B, KB, MB, GB and TB. Values of 1024 TB or more are reported in PB rather
    than falling off the end of the unit list and returning ``None``.

    Args:
        size: Size in bytes (int or float).

    Returns:
        A string such as ``"1.50 KB"``.
    """
    for unit in ("B", "KB", "MB", "GB", "TB"):
        if size < 1024.0:
            return f"{size:.2f} {unit}"
        size /= 1024.0
    # Anything still >= 1024 after the TB step is expressed in petabytes.
    return f"{size:.2f} PB"
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def is_large_file(file_path, threshold=102400):
    """Tell whether the file at ``file_path`` is strictly larger than ``threshold`` bytes.

    The default threshold is 102400 bytes. Errors from querying the file
    (for example a missing file) propagate to the caller.
    """
    size_in_bytes = os.stat(file_path).st_size
    return size_in_bytes > threshold
|