kopipasta 0.2.0__tar.gz → 0.45.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kopipasta-0.45.0/PKG-INFO +161 -0
- kopipasta-0.45.0/README.md +133 -0
- kopipasta-0.45.0/kopipasta/cache.py +40 -0
- kopipasta-0.45.0/kopipasta/file.py +322 -0
- kopipasta-0.45.0/kopipasta/import_parser.py +356 -0
- kopipasta-0.45.0/kopipasta/main.py +265 -0
- kopipasta-0.45.0/kopipasta/ops.py +450 -0
- kopipasta-0.45.0/kopipasta/patcher.py +447 -0
- kopipasta-0.45.0/kopipasta/prompt.py +293 -0
- kopipasta-0.45.0/kopipasta/tree_selector.py +834 -0
- kopipasta-0.45.0/kopipasta.egg-info/PKG-INFO +161 -0
- kopipasta-0.45.0/kopipasta.egg-info/SOURCES.txt +27 -0
- kopipasta-0.45.0/kopipasta.egg-info/requires.txt +6 -0
- kopipasta-0.45.0/requirements.txt +6 -0
- {kopipasta-0.2.0 → kopipasta-0.45.0}/setup.py +8 -5
- kopipasta-0.45.0/tests/test_file.py +67 -0
- kopipasta-0.45.0/tests/test_patcher.py +246 -0
- kopipasta-0.45.0/tests/test_patcher_edge_cases.py +115 -0
- kopipasta-0.45.0/tests/test_patcher_regex.py +67 -0
- kopipasta-0.45.0/tests/test_patcher_repro.py +100 -0
- kopipasta-0.45.0/tests/test_patcher_repro_failures.py +118 -0
- kopipasta-0.45.0/tests/test_tree_selector.py +118 -0
- kopipasta-0.2.0/PKG-INFO +0 -122
- kopipasta-0.2.0/README.md +0 -99
- kopipasta-0.2.0/kopipasta/main.py +0 -251
- kopipasta-0.2.0/kopipasta.egg-info/PKG-INFO +0 -122
- kopipasta-0.2.0/kopipasta.egg-info/SOURCES.txt +0 -13
- kopipasta-0.2.0/kopipasta.egg-info/requires.txt +0 -1
- kopipasta-0.2.0/requirements.txt +0 -1
- {kopipasta-0.2.0 → kopipasta-0.45.0}/LICENSE +0 -0
- {kopipasta-0.2.0 → kopipasta-0.45.0}/MANIFEST.in +0 -0
- {kopipasta-0.2.0 → kopipasta-0.45.0}/kopipasta/__init__.py +0 -0
- {kopipasta-0.2.0 → kopipasta-0.45.0}/kopipasta.egg-info/dependency_links.txt +0 -0
- {kopipasta-0.2.0 → kopipasta-0.45.0}/kopipasta.egg-info/entry_points.txt +0 -0
- {kopipasta-0.2.0 → kopipasta-0.45.0}/kopipasta.egg-info/top_level.txt +0 -0
- {kopipasta-0.2.0 → kopipasta-0.45.0}/setup.cfg +0 -0
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: kopipasta
|
|
3
|
+
Version: 0.45.0
|
|
4
|
+
Summary: A CLI tool to generate prompts with project structure and file contents
|
|
5
|
+
Home-page: https://github.com/mkorpela/kopipasta
|
|
6
|
+
Author: Mikko Korpela
|
|
7
|
+
Author-email: mikko.korpela@gmail.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Requires-Python: >=3.8
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: pyperclip==1.9.0
|
|
23
|
+
Requires-Dist: requests==2.32.3
|
|
24
|
+
Requires-Dist: Pygments==2.18.0
|
|
25
|
+
Requires-Dist: rich==13.8.1
|
|
26
|
+
Requires-Dist: click==8.2.1
|
|
27
|
+
Requires-Dist: prompt-toolkit==3.0.52
|
|
28
|
+
|
|
29
|
+
# kopipasta
|
|
30
|
+
|
|
31
|
+
[kopipasta on PyPI](https://pypi.python.org/pypi/kopipasta)
|
|
32
|
+
[Download statistics on pepy.tech](http://pepy.tech/project/kopipasta)
|
|
33
|
+
|
|
34
|
+
**kopipasta bridges the gap between your local file system and LLM context windows.**
|
|
35
|
+
|
|
36
|
+
A CLI tool for taking **full, transparent control** of your prompt. No black boxes.
|
|
37
|
+
|
|
38
|
+
```text
|
|
39
|
+
➜ ~ kopipasta
|
|
40
|
+
|
|
41
|
+
📁 Project Files
|
|
42
|
+
|-- 📂 src/
|
|
43
|
+
| |-- ● 📄 main.py (4.2 KB)
|
|
44
|
+
| |-- ○ 📄 utils.py (1.5 KB)
|
|
45
|
+
|-- ○ 📄 README.md (2.1 KB)
|
|
46
|
+
|
|
47
|
+
Current: src/main.py | Selected: 1 full | ~4,200 chars
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## You Control the Context
|
|
51
|
+
|
|
52
|
+
Many AI coding assistants automatically find what *they think* is relevant context. This is a black box. When the LLM gives a bad answer, you can't debug it because you don't know what context it was actually given.
|
|
53
|
+
|
|
54
|
+
**`kopipasta` is the opposite.** I built it on the principle of **explicit context control**. You are in the driver's seat. You decide *exactly* what files, functions, and snippets go into the prompt.
|
|
55
|
+
|
|
56
|
+
It's a "smart copy" command for your project, not a magic wand.
|
|
57
|
+
|
|
58
|
+
## How It Works
|
|
59
|
+
|
|
60
|
+
The workflow is a fast, iterative cycle:
|
|
61
|
+
|
|
62
|
+
1. **Context:** Run `kopipasta` to select files and define your task.
|
|
63
|
+
2. **Generate:** Paste the prompt into your LLM (ChatGPT, Claude, etc.).
|
|
64
|
+
3. **Patch:** Press `p` in `kopipasta` and paste the LLM's response to apply changes locally.
|
|
65
|
+
4. **Iterate:** Review with `git diff`, then repeat for the next step.
|
|
66
|
+
|
|
67
|
+
## Use Cases
|
|
68
|
+
|
|
69
|
+
* **Targeted Refactoring:** Select just the module you are cleaning up and its immediate dependencies.
|
|
70
|
+
* **Test Generation:** Pipe your implementation file and a similar existing test file to the LLM to generate consistent new tests.
|
|
71
|
+
* **Docs to Code:** Select an API documentation file (or web URL) and your source file to implement a feature against a spec.
|
|
72
|
+
* **Bug Fixing:** Grab the relevant traceback files and the config to diagnose issues without distracting the LLM with the whole repo.
|
|
73
|
+
|
|
74
|
+
## Installation
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
# Using pipx (recommended for CLI tools)
|
|
78
|
+
pipx install kopipasta
|
|
79
|
+
|
|
80
|
+
# Or using standard pip
|
|
81
|
+
pip install kopipasta
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## Usage
|
|
85
|
+
|
|
86
|
+
`kopipasta` has two main modes: creating prompts and applying patches.
|
|
87
|
+
|
|
88
|
+
### Creating a Prompt
|
|
89
|
+
|
|
90
|
+
By default, `kopipasta` opens the tree selector in the current directory.
|
|
91
|
+
|
|
92
|
+
You may also use the command line arguments:
|
|
93
|
+
```bash
|
|
94
|
+
kopipasta [options] [files_or_directories_or_urls...]
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
**Arguments:**
|
|
98
|
+
|
|
99
|
+
* `[files_or_directories_or_urls...]`: One or more paths to files, directories, or web URLs to use as the starting point for your context.
|
|
100
|
+
|
|
101
|
+
**Options:**
|
|
102
|
+
|
|
103
|
+
* `-t TASK`, `--task TASK`: Provide the task description directly on the command line, skipping the editor.
|
|
104
|
+
|
|
105
|
+
### Applying Patches
|
|
106
|
+
|
|
107
|
+
`kopipasta` automatically injects strict instructions into your prompt, teaching the LLM how to format code for this tool.
|
|
108
|
+
`kopipasta` can apply changes suggested by an LLM directly to your codebase, assuming you are in a Git repository.
|
|
109
|
+
|
|
110
|
+
1. Press `p` in the file selector.
|
|
111
|
+
2. Paste the **entire** markdown response from your LLM.
|
|
112
|
+
3. The tool robustly detects code blocks, handles indentation quirks, and applies changes (full files or diffs).
|
|
113
|
+
4. If a patch fails, the tool provides **diagnostic feedback** telling you exactly why (e.g., missing headers).
|
|
114
|
+
5. **Always** review changes with `git diff` before committing.
|
|
115
|
+
|
|
116
|
+
**Example of supported LLM output formats:**
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
# FILE: src/utils.py
|
|
120
|
+
def new_feature():
|
|
121
|
+
print("kopipasta handles full file creation")
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
```diff
|
|
125
|
+
# FILE: src/main.py
|
|
126
|
+
@@ -10,2 +10,3 @@
|
|
127
|
+
def main():
|
|
128
|
+
- pass
|
|
129
|
+
+ new_feature()
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## Key Features
|
|
133
|
+
|
|
134
|
+
* **Total Context Control:** Interactively select files, directories, or snippets. You see everything that goes into the prompt.
|
|
135
|
+
* **Smart Dependency Analysis:** Press `d` on a Python or TypeScript/JavaScript file, and `kopipasta` will scan imports to find and add related local files to your context automatically.
|
|
136
|
+
* **Robust Code Patcher:** Applies LLM suggestions directly. Handles indentation, various comment styles (`#`, `//`, `<!--`), and multiple files per block.
|
|
137
|
+
* **Built-in Search:** Press `g` to grep for text patterns inside directories to find relevant files.
|
|
138
|
+
* **Transparent & Explicit:** No hidden RAG. You know exactly what's in the prompt because you built it. This makes debugging LLM failures possible.
|
|
139
|
+
* **Web-Aware:** Pulls in content directly from URLs—perfect for API documentation.
|
|
140
|
+
* **Safety First:**
|
|
141
|
+
* Automatically respects your `.gitignore` rules.
|
|
142
|
+
* Detects if you're about to include secrets from a `.env` file and asks what to do.
|
|
143
|
+
* **Context-Aware:** Keeps a running total of the prompt size (in characters and estimated tokens) so you don't overload the LLM's context window.
|
|
144
|
+
* **Developer-Friendly:**
|
|
145
|
+
* Provides a rich, interactive prompt for writing task descriptions in the terminal.
|
|
146
|
+
* Copies the final prompt directly to your clipboard.
|
|
147
|
+
* Provides syntax highlighting during chunk selection.
|
|
148
|
+
|
|
149
|
+
## Interactive Controls
|
|
150
|
+
|
|
151
|
+
| Key | Action |
|
|
152
|
+
| :--- | :--- |
|
|
153
|
+
| `Space` | Toggle file/directory selection |
|
|
154
|
+
| `s` | Toggle **Snippet Mode** (include only the first 50 lines) |
|
|
155
|
+
| `d` | **Analyze Dependencies** (find and add imported files) |
|
|
156
|
+
| `g` | **Grep** (search text in directory) |
|
|
157
|
+
| `a` | Add all files in directory |
|
|
158
|
+
| `p` | **Apply Patch** (paste LLM response) |
|
|
159
|
+
| `r` | Reuse selection from previous run |
|
|
160
|
+
| `Enter` | Expand/Collapse directory |
|
|
161
|
+
| `q` | Quit and finalize selection |
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
# kopipasta
|
|
2
|
+
|
|
3
|
+
[kopipasta on PyPI](https://pypi.python.org/pypi/kopipasta)
|
|
4
|
+
[Download statistics on pepy.tech](http://pepy.tech/project/kopipasta)
|
|
5
|
+
|
|
6
|
+
**kopipasta bridges the gap between your local file system and LLM context windows.**
|
|
7
|
+
|
|
8
|
+
A CLI tool for taking **full, transparent control** of your prompt. No black boxes.
|
|
9
|
+
|
|
10
|
+
```text
|
|
11
|
+
➜ ~ kopipasta
|
|
12
|
+
|
|
13
|
+
📁 Project Files
|
|
14
|
+
|-- 📂 src/
|
|
15
|
+
| |-- ● 📄 main.py (4.2 KB)
|
|
16
|
+
| |-- ○ 📄 utils.py (1.5 KB)
|
|
17
|
+
|-- ○ 📄 README.md (2.1 KB)
|
|
18
|
+
|
|
19
|
+
Current: src/main.py | Selected: 1 full | ~4,200 chars
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## You Control the Context
|
|
23
|
+
|
|
24
|
+
Many AI coding assistants automatically find what *they think* is relevant context. This is a black box. When the LLM gives a bad answer, you can't debug it because you don't know what context it was actually given.
|
|
25
|
+
|
|
26
|
+
**`kopipasta` is the opposite.** I built it on the principle of **explicit context control**. You are in the driver's seat. You decide *exactly* what files, functions, and snippets go into the prompt.
|
|
27
|
+
|
|
28
|
+
It's a "smart copy" command for your project, not a magic wand.
|
|
29
|
+
|
|
30
|
+
## How It Works
|
|
31
|
+
|
|
32
|
+
The workflow is a fast, iterative cycle:
|
|
33
|
+
|
|
34
|
+
1. **Context:** Run `kopipasta` to select files and define your task.
|
|
35
|
+
2. **Generate:** Paste the prompt into your LLM (ChatGPT, Claude, etc.).
|
|
36
|
+
3. **Patch:** Press `p` in `kopipasta` and paste the LLM's response to apply changes locally.
|
|
37
|
+
4. **Iterate:** Review with `git diff`, then repeat for the next step.
|
|
38
|
+
|
|
39
|
+
## Use Cases
|
|
40
|
+
|
|
41
|
+
* **Targeted Refactoring:** Select just the module you are cleaning up and its immediate dependencies.
|
|
42
|
+
* **Test Generation:** Pipe your implementation file and a similar existing test file to the LLM to generate consistent new tests.
|
|
43
|
+
* **Docs to Code:** Select an API documentation file (or web URL) and your source file to implement a feature against a spec.
|
|
44
|
+
* **Bug Fixing:** Grab the relevant traceback files and the config to diagnose issues without distracting the LLM with the whole repo.
|
|
45
|
+
|
|
46
|
+
## Installation
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
# Using pipx (recommended for CLI tools)
|
|
50
|
+
pipx install kopipasta
|
|
51
|
+
|
|
52
|
+
# Or using standard pip
|
|
53
|
+
pip install kopipasta
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Usage
|
|
57
|
+
|
|
58
|
+
`kopipasta` has two main modes: creating prompts and applying patches.
|
|
59
|
+
|
|
60
|
+
### Creating a Prompt
|
|
61
|
+
|
|
62
|
+
By default, `kopipasta` opens the tree selector in the current directory.
|
|
63
|
+
|
|
64
|
+
You may also use the command line arguments:
|
|
65
|
+
```bash
|
|
66
|
+
kopipasta [options] [files_or_directories_or_urls...]
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
**Arguments:**
|
|
70
|
+
|
|
71
|
+
* `[files_or_directories_or_urls...]`: One or more paths to files, directories, or web URLs to use as the starting point for your context.
|
|
72
|
+
|
|
73
|
+
**Options:**
|
|
74
|
+
|
|
75
|
+
* `-t TASK`, `--task TASK`: Provide the task description directly on the command line, skipping the editor.
|
|
76
|
+
|
|
77
|
+
### Applying Patches
|
|
78
|
+
|
|
79
|
+
`kopipasta` automatically injects strict instructions into your prompt, teaching the LLM how to format code for this tool.
|
|
80
|
+
`kopipasta` can apply changes suggested by an LLM directly to your codebase, assuming you are in a Git repository.
|
|
81
|
+
|
|
82
|
+
1. Press `p` in the file selector.
|
|
83
|
+
2. Paste the **entire** markdown response from your LLM.
|
|
84
|
+
3. The tool robustly detects code blocks, handles indentation quirks, and applies changes (full files or diffs).
|
|
85
|
+
4. If a patch fails, the tool provides **diagnostic feedback** telling you exactly why (e.g., missing headers).
|
|
86
|
+
5. **Always** review changes with `git diff` before committing.
|
|
87
|
+
|
|
88
|
+
**Example of supported LLM output formats:**
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
# FILE: src/utils.py
|
|
92
|
+
def new_feature():
|
|
93
|
+
print("kopipasta handles full file creation")
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
```diff
|
|
97
|
+
# FILE: src/main.py
|
|
98
|
+
@@ -10,2 +10,3 @@
|
|
99
|
+
def main():
|
|
100
|
+
- pass
|
|
101
|
+
+ new_feature()
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## Key Features
|
|
105
|
+
|
|
106
|
+
* **Total Context Control:** Interactively select files, directories, or snippets. You see everything that goes into the prompt.
|
|
107
|
+
* **Smart Dependency Analysis:** Press `d` on a Python or TypeScript/JavaScript file, and `kopipasta` will scan imports to find and add related local files to your context automatically.
|
|
108
|
+
* **Robust Code Patcher:** Applies LLM suggestions directly. Handles indentation, various comment styles (`#`, `//`, `<!--`), and multiple files per block.
|
|
109
|
+
* **Built-in Search:** Press `g` to grep for text patterns inside directories to find relevant files.
|
|
110
|
+
* **Transparent & Explicit:** No hidden RAG. You know exactly what's in the prompt because you built it. This makes debugging LLM failures possible.
|
|
111
|
+
* **Web-Aware:** Pulls in content directly from URLs—perfect for API documentation.
|
|
112
|
+
* **Safety First:**
|
|
113
|
+
* Automatically respects your `.gitignore` rules.
|
|
114
|
+
* Detects if you're about to include secrets from a `.env` file and asks what to do.
|
|
115
|
+
* **Context-Aware:** Keeps a running total of the prompt size (in characters and estimated tokens) so you don't overload the LLM's context window.
|
|
116
|
+
* **Developer-Friendly:**
|
|
117
|
+
* Provides a rich, interactive prompt for writing task descriptions in the terminal.
|
|
118
|
+
* Copies the final prompt directly to your clipboard.
|
|
119
|
+
* Provides syntax highlighting during chunk selection.
|
|
120
|
+
|
|
121
|
+
## Interactive Controls
|
|
122
|
+
|
|
123
|
+
| Key | Action |
|
|
124
|
+
| :--- | :--- |
|
|
125
|
+
| `Space` | Toggle file/directory selection |
|
|
126
|
+
| `s` | Toggle **Snippet Mode** (include only the first 50 lines) |
|
|
127
|
+
| `d` | **Analyze Dependencies** (find and add imported files) |
|
|
128
|
+
| `g` | **Grep** (search text in directory) |
|
|
129
|
+
| `a` | Add all files in directory |
|
|
130
|
+
| `p` | **Apply Patch** (paste LLM response) |
|
|
131
|
+
| `r` | Reuse selection from previous run |
|
|
132
|
+
| `Enter` | Expand/Collapse directory |
|
|
133
|
+
| `q` | Quit and finalize selection |
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import List, Optional, Tuple

# Shape of one selected-file entry. In this module only element 0 (the file
# path) is read; the meaning of the remaining elements is defined by callers.
# NOTE: spelled with Optional[...] rather than ``List[str] | None`` because the
# ``|`` operator on typing generics is evaluated at import time and raises
# TypeError on Python < 3.10, while the package metadata declares >=3.8.
FileTuple = Tuple[str, bool, Optional[List[str]], str]
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def get_cache_file_path() -> Path:
    """Return the path of the JSON file that stores the last selection.

    The containing directory, ``~/.cache/kopipasta``, is created on demand
    (including missing parents); an already existing directory is fine.
    """
    directory = Path.home().joinpath(".cache", "kopipasta")
    directory.mkdir(parents=True, exist_ok=True)
    return directory.joinpath("last_selection.json")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def save_selection_to_cache(files_to_include: List[FileTuple]):
    """Persist the paths of the selected files so a later run can reuse them.

    The first element of every entry is converted to a path relative to the
    current working directory; the sorted list is written to the cache file
    as indented JSON. A failure to write is reported as a warning on stdout
    instead of being raised.
    """
    target = get_cache_file_path()
    entries = [os.path.relpath(item[0]) for item in files_to_include]
    entries.sort()
    try:
        with open(target, "w", encoding="utf-8") as handle:
            json.dump(entries, handle, indent=2)
    except IOError as exc:
        print(f"\nWarning: Could not save selection to cache: {exc}")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def load_selection_from_cache() -> List[str]:
    """Load the previously saved selection, dropping paths that no longer exist.

    Returns:
        The cached paths for which ``os.path.exists`` is still true, or an
        empty list when there is no cache file, it cannot be read or parsed,
        or its content is not a JSON list.
    """
    cache_file = get_cache_file_path()
    if not cache_file.exists():
        return []
    try:
        with open(cache_file, "r", encoding="utf-8") as f:
            paths = json.load(f)
    except (IOError, json.JSONDecodeError, UnicodeDecodeError) as e:
        # UnicodeDecodeError is not an OSError, so it must be named explicitly
        # to keep a corrupted cache file from crashing the caller.
        print(f"\nWarning: Could not load previous selection from cache: {e}")
        return []

    # Valid JSON of the wrong shape (an object, a number, ...) would otherwise
    # fail with TypeError in the filter below instead of meaning "no selection".
    if not isinstance(paths, list):
        print(
            "\nWarning: Could not load previous selection from cache: "
            "unexpected cache format"
        )
        return []

    # Filter out non-string entries and paths that no longer exist.
    return [p for p in paths if isinstance(p, str) and os.path.exists(p)]
|
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
import fnmatch
|
|
2
|
+
import os
|
|
3
|
+
from typing import List, Optional, Tuple, Set
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
FileTuple = Tuple[str, bool, Optional[List[str]], str]
|
|
7
|
+
|
|
8
|
+
# --- Caches ---
# Patterns read from each .gitignore file, keyed by the path of that file.
_gitignore_cache: dict[str, list[str]] = {}
# Ignore verdict per absolute path. NOTE: the key is the path alone; it does
# not include the ignore patterns or project root the verdict was computed with.
_is_ignored_cache: dict[str, bool] = {}
# Binary/text verdict per file, keyed by the path string exactly as passed in.
_is_binary_cache: dict[str, bool] = {}
|
|
12
|
+
|
|
13
|
+
# --- Known File Extensions for is_binary ---
# Kept as sets so membership tests are O(1) on average.
TEXT_EXTENSIONS = {
    # Code
    ".py", ".js", ".ts", ".jsx", ".tsx",
    ".java", ".c", ".cpp", ".h", ".hpp",
    ".cs", ".go", ".rs", ".sh", ".bash",
    ".ps1", ".rb", ".php", ".swift", ".kt",
    ".kts", ".scala", ".pl", ".pm", ".tcl",
    # Markup & Data
    ".html", ".htm", ".xml", ".css", ".scss",
    ".sass", ".less", ".json", ".yaml", ".yml",
    ".toml", ".ini", ".cfg", ".conf", ".md",
    ".txt", ".rtf", ".csv", ".tsv", ".sql",
    ".graphql", ".gql",
    # Config & Other
    ".gitignore", ".dockerfile", "dockerfile",
    ".env", ".properties", ".mdx",
}
|
|
73
|
+
|
|
74
|
+
# Extensions treated as binary without inspecting file content.
BINARY_EXTENSIONS = {
    # Images
    # NOTE(review): ".svg" is XML text but is classified as binary here.
    ".png", ".jpg", ".jpeg", ".gif", ".bmp",
    ".tiff", ".ico", ".webp", ".svg",
    # Audio/Video
    ".mp3", ".wav", ".ogg", ".flac", ".mp4",
    ".avi", ".mov", ".wmv", ".mkv",
    # Archives
    ".zip", ".rar", ".7z", ".tar", ".gz", ".bz2", ".xz",
    # Documents
    ".pdf", ".doc", ".docx", ".xls", ".xlsx",
    ".ppt", ".pptx", ".odt",
    # Executables & Compiled
    ".exe", ".dll", ".so", ".dylib", ".class",
    ".jar", ".pyc", ".pyd", ".whl",
    # Databases & Other
    ".db", ".sqlite", ".sqlite3", ".db-wal", ".db-shm",
    ".lock", ".bak", ".swo", ".swp",
}
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _read_gitignore_patterns(gitignore_path: str) -> list[str]:
    """Return the patterns listed in one .gitignore file, memoized per path.

    Blank lines and lines starting with ``#`` are skipped; every other line
    is returned with surrounding whitespace stripped. A missing file yields
    an empty list. If reading fails part-way, the patterns collected so far
    are kept. In every case the result is cached under ``gitignore_path``.
    """
    if gitignore_path in _gitignore_cache:
        return _gitignore_cache[gitignore_path]

    patterns: list[str] = []
    if os.path.isfile(gitignore_path):
        try:
            with open(gitignore_path, "r", encoding="utf-8") as f:
                for line in f:
                    stripped_line = line.strip()
                    if stripped_line and not stripped_line.startswith("#"):
                        patterns.append(stripped_line)
        except (IOError, UnicodeDecodeError):
            # Best effort: an unreadable or non-UTF-8 .gitignore must not abort
            # the scan. UnicodeDecodeError is a ValueError, not an OSError, so
            # it has to be listed explicitly (as read_file_contents does).
            pass

    _gitignore_cache[gitignore_path] = patterns
    return patterns
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def is_ignored(
    path: str, default_ignore_patterns: list[str], project_root: Optional[str] = None
) -> bool:
    """
    Decide whether ``path`` matches any applicable ignore pattern.

    The verdict is cached per absolute path. A path whose parent directory
    has a cached "ignored" verdict is ignored without further matching.
    Otherwise patterns without a slash are matched against the basename, and
    patterns with a slash are matched (trailing ``/`` removed) against every
    leading prefix of the path relative to the project root. When
    ``project_root`` is None the current working directory is used.
    """
    target = os.path.abspath(path)
    try:
        return _is_ignored_cache[target]
    except KeyError:
        pass

    # Inherit the verdict of an already-ignored parent directory.
    containing_dir = os.path.dirname(target)
    if containing_dir != target and _is_ignored_cache.get(containing_dir, False):
        _is_ignored_cache[target] = True
        return True

    root = os.path.abspath(os.getcwd() if project_root is None else project_root)
    name_globs, path_globs = get_all_patterns(default_ignore_patterns, target, root)

    # Cheap pass first: basename-only patterns.
    leaf = os.path.basename(target)
    verdict = any(fnmatch.fnmatch(leaf, glob) for glob in name_globs)

    if not verdict:
        try:
            relative = os.path.relpath(target, root)
        except ValueError:
            # relpath could not relate the two paths; treat as not ignored.
            relative = None

        if relative is not None:
            parts = Path(relative).parts
            # Build each leading prefix once, outside the pattern loop.
            prefixes = [
                os.path.join(*parts[:depth]) for depth in range(1, len(parts) + 1)
            ]
            trimmed_globs = [glob.rstrip("/") for glob in path_globs]
            verdict = any(
                fnmatch.fnmatch(prefix, glob)
                for prefix in prefixes
                for glob in trimmed_globs
            )

    _is_ignored_cache[target] = verdict
    return verdict
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def get_all_patterns(
    default_ignore_patterns, path_abs, project_root_abs
) -> Tuple[Set[str], Set[str]]:
    """
    Gather every ignore pattern that applies to ``path_abs``.

    Patterns come from ``default_ignore_patterns`` and from each ``.gitignore``
    found while walking from the path's directory up to ``project_root_abs``.

    Returns:
        ``(basename_patterns, path_patterns)``. Patterns without a ``/`` go
        into the first set and are meant to be matched against a basename.
        Patterns containing a ``/`` go into the second set; those read from a
        .gitignore are re-based onto that file's directory, expressed relative
        to the project root.
    """
    basename_patterns = set()
    path_patterns = set()

    for p in default_ignore_patterns:
        if "/" in p:
            path_patterns.add(p)
        else:
            basename_patterns.add(p)

    # Prefix used to test "strictly inside the project root" on a directory
    # boundary. A bare startswith(project_root_abs) would also accept siblings
    # such as "/work/project2" for a root of "/work/proj".
    root_prefix = project_root_abs.rstrip(os.sep) + os.sep

    current_dir = path_abs if os.path.isdir(path_abs) else os.path.dirname(path_abs)
    while True:
        gitignore_path = os.path.join(current_dir, ".gitignore")
        patterns_from_file = _read_gitignore_patterns(gitignore_path)

        if patterns_from_file:
            gitignore_dir_rel = os.path.relpath(current_dir, project_root_abs)
            if gitignore_dir_rel == ".":
                gitignore_dir_rel = ""

            for p in patterns_from_file:
                if "/" in p:
                    # Path patterns are relative to the .gitignore file's location
                    path_patterns.add(os.path.join(gitignore_dir_rel, p.lstrip("/")))
                else:
                    basename_patterns.add(p)

        # Stop once the root itself has been processed, or as soon as the walk
        # is found to be outside the project root.
        if current_dir == project_root_abs or not current_dir.startswith(root_prefix):
            break
        parent = os.path.dirname(current_dir)
        if parent == current_dir:
            # Reached the filesystem root.
            break
        current_dir = parent
    return basename_patterns, path_patterns
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def read_file_contents(file_path):
    """Return the UTF-8 decoded text of ``file_path``.

    If the file cannot be opened, read or decoded, the error is printed and a
    placeholder string of the form ``<.. Error reading ...: ... ..>`` is
    returned instead of raising.
    """
    try:
        with open(file_path, mode="r", encoding="utf-8") as source:
            text = source.read()
    except (IOError, UnicodeDecodeError) as exc:
        message = f"Error reading {file_path}: {exc}"
        print(message)
        return f"<.. {message} ..>"
    return text
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def is_binary(file_path: str) -> bool:
    """
    Efficiently checks if a file is binary.

    The check follows a fast, multi-step process to minimize I/O:
    1. Checks a memory cache for a previously determined result.
    2. Checks the file extension (or, for names without an extension, the
       whole lowercased basename) against the known text and binary sets.
    3. As a last resort, reads the first 512 bytes of the file to check for
       a null byte, a common indicator of a binary file.

    A file that cannot be opened is reported as binary. Every verdict is
    cached under ``file_path`` exactly as passed in.
    """
    # Step 1: Check cache first for fastest response
    if file_path in _is_binary_cache:
        return _is_binary_cache[file_path]

    # Step 2: Fast check based on known text/binary extensions (no I/O)
    _, extension = os.path.splitext(file_path)
    # splitext() reports an empty extension for dotfiles (".env",
    # ".gitignore") and extension-less names ("Dockerfile"), so those entries
    # in TEXT_EXTENSIONS could never match. Fall back to the whole basename.
    lookup_key = extension.lower() or os.path.basename(file_path).lower()

    if lookup_key in TEXT_EXTENSIONS:
        _is_binary_cache[file_path] = False
        return False
    if lookup_key in BINARY_EXTENSIONS:
        _is_binary_cache[file_path] = True
        return True

    # Step 3: Fallback to content analysis for unknown extensions
    try:
        with open(file_path, "rb") as file:
            # Read a small chunk, 512 bytes is usually enough to find a null byte
            chunk = file.read(512)
    except IOError:
        # If we can't open it, treat it as binary to be safe
        result = True
    else:
        # If no null byte, assume it's a text file
        result = b"\0" in chunk

    _is_binary_cache[file_path] = result
    return result
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
def get_human_readable_size(size):
    """Format a byte count with two decimals and a 1024-based unit.

    Args:
        size: Number of bytes (int or float).

    Returns:
        A string such as ``"1.50 KB"``. Units run from B to TB; anything of
        1024 TB or more is expressed in PB.
    """
    for unit in ["B", "KB", "MB", "GB", "TB"]:
        if size < 1024.0:
            return f"{size:.2f} {unit}"
        size /= 1024.0
    # The loop is exhausted for sizes >= 1024 TB; previously the function fell
    # through here and implicitly returned None.
    return f"{size:.2f} PB"
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def is_large_file(file_path, threshold=102400):
    """Return True when the file's size in bytes is strictly above ``threshold``.

    The default threshold is 102400 bytes. Errors from querying the file size
    (for example a missing file) propagate to the caller.
    """
    size_in_bytes = os.path.getsize(file_path)
    if size_in_bytes > threshold:
        return True
    return False
|