mcp-codebase-searcher 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcp_codebase_searcher-0.1.0/LICENSE +21 -0
- mcp_codebase_searcher-0.1.0/PKG-INFO +292 -0
- mcp_codebase_searcher-0.1.0/README.md +269 -0
- mcp_codebase_searcher-0.1.0/pyproject.toml +56 -0
- mcp_codebase_searcher-0.1.0/setup.cfg +4 -0
- mcp_codebase_searcher-0.1.0/src/config.py +20 -0
- mcp_codebase_searcher-0.1.0/src/file_scanner.py +213 -0
- mcp_codebase_searcher-0.1.0/src/mcp_codebase_searcher.egg-info/PKG-INFO +292 -0
- mcp_codebase_searcher-0.1.0/src/mcp_codebase_searcher.egg-info/SOURCES.txt +22 -0
- mcp_codebase_searcher-0.1.0/src/mcp_codebase_searcher.egg-info/dependency_links.txt +1 -0
- mcp_codebase_searcher-0.1.0/src/mcp_codebase_searcher.egg-info/entry_points.txt +2 -0
- mcp_codebase_searcher-0.1.0/src/mcp_codebase_searcher.egg-info/requires.txt +2 -0
- mcp_codebase_searcher-0.1.0/src/mcp_codebase_searcher.egg-info/top_level.txt +7 -0
- mcp_codebase_searcher-0.1.0/src/mcp_elaborate.py +277 -0
- mcp_codebase_searcher-0.1.0/src/mcp_search.py +295 -0
- mcp_codebase_searcher-0.1.0/src/mcp_searcher.py +217 -0
- mcp_codebase_searcher-0.1.0/src/output_generator.py +177 -0
- mcp_codebase_searcher-0.1.0/src/report_elaborator.py +203 -0
- mcp_codebase_searcher-0.1.0/tests/test_file_scanner.py +264 -0
- mcp_codebase_searcher-0.1.0/tests/test_mcp_elaborate.py +252 -0
- mcp_codebase_searcher-0.1.0/tests/test_mcp_search.py +340 -0
- mcp_codebase_searcher-0.1.0/tests/test_mcp_searcher.py +220 -0
- mcp_codebase_searcher-0.1.0/tests/test_output_generator.py +165 -0
- mcp_codebase_searcher-0.1.0/tests/test_report_elaborator.py +169 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Sakilmostak
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mcp_codebase_searcher
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A Python tool to scan codebases, search for text/regex patterns, and elaborate on findings using Google Gemini.
|
|
5
|
+
Author-email: Sakilmostak <skmahim71@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/sakilmostak/mcp-codebase-searcher
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/sakilmostak/mcp-codebase-searcher/issues
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Topic :: Software Development :: Build Tools
|
|
15
|
+
Classifier: Topic :: Text Processing :: Indexing
|
|
16
|
+
Classifier: Topic :: Utilities
|
|
17
|
+
Requires-Python: >=3.8
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
License-File: LICENSE
|
|
20
|
+
Requires-Dist: python-dotenv
|
|
21
|
+
Requires-Dist: google-generativeai
|
|
22
|
+
Dynamic: license-file
|
|
23
|
+
|
|
24
|
+
# MCP Codebase Searcher
|
|
25
|
+
|
|
26
|
+
MCP Codebase Searcher is a Python tool designed to scan codebases, search for text or regular expression patterns, and optionally elaborate on the findings using Google Gemini.
|
|
27
|
+
|
|
28
|
+
## Features
|
|
29
|
+
|
|
30
|
+
* Search for exact strings or regular expression patterns.
|
|
31
|
+
* Case-sensitive or case-insensitive searching.
|
|
32
|
+
* Specify context lines to display around matches.
|
|
33
|
+
* Exclude specific directories and file patterns.
|
|
34
|
+
* Option to include/exclude hidden files and directories.
|
|
35
|
+
* Output results in console, JSON, or Markdown format.
|
|
36
|
+
* Save search results to a file.
|
|
37
|
+
* Elaborate on individual findings from a JSON report using Google Gemini.
|
|
38
|
+
|
|
39
|
+
## Installation
|
|
40
|
+
|
|
41
|
+
This project uses Python 3.8+.
|
|
42
|
+
|
|
43
|
+
1. **Clone the repository (if applicable):**
|
|
44
|
+
```bash
|
|
45
|
+
git clone <repository_url>
|
|
46
|
+
cd mcp-codebase-searcher
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
2. **Create and activate a virtual environment:**
|
|
50
|
+
```bash
|
|
51
|
+
python3 -m venv venv
|
|
52
|
+
source venv/bin/activate # On Windows use `venv\Scripts\activate`
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
3. **Install the package:**
|
|
56
|
+
Once the package is built (see Building section below), you can install it using pip:
|
|
57
|
+
```bash
|
|
58
|
+
pip install dist/mcp_codebase_searcher-*.whl
|
|
59
|
+
```
|
|
60
|
+
Alternatively, for development, install in editable mode from the project root:
|
|
61
|
+
```bash
|
|
62
|
+
pip install -e .
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
4. **API Key (for Elaboration):**
|
|
66
|
+
To use the elaboration feature, you need a Google API key for Gemini. You can provide it via:
|
|
67
|
+
* The `--api-key` argument when using the `elaborate` command.
|
|
68
|
+
* A JSON configuration file specified with `--config-file` (containing `{"GOOGLE_API_KEY": "YOUR_KEY"}`).
|
|
69
|
+
* An environment variable `GOOGLE_API_KEY`.
|
|
70
|
+
* A `config.py` file in the project root (if running from source) that has a `load_api_key()` function returning the key.
|
|
71
|
+
|
|
72
|
+
The API key is sourced with the following precedence: `--api-key` argument > `--config-file` > `GOOGLE_API_KEY` environment variable > `config.py` module.
|
|
73
|
+
|
|
74
|
+
Create a `.env` file in the project root for local development if using environment variables:
|
|
75
|
+
```
|
|
76
|
+
GOOGLE_API_KEY="YOUR_API_KEY_HERE"
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## Usage
|
|
80
|
+
|
|
81
|
+
The tool provides two main commands: `search` and `elaborate`.
|
|
82
|
+
|
|
83
|
+
### Search
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
mcp-searcher search "your_query" path/to/search [--regex] [--case-sensitive] [--context LINES] [--exclude-dirs .git,node_modules] [--exclude-files *.log] [--include-hidden] [--output-format json] [--output-file results.json]
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
**Arguments:**
|
|
90
|
+
|
|
91
|
+
* `query`: The search term or regex pattern.
|
|
92
|
+
* `paths`: One or more file or directory paths to search within.
|
|
93
|
+
* `--regex`, `-r`: Treat the `query` as a Python regular expression pattern.
|
|
94
|
+
* `--case-sensitive`, `-c`: Perform a case-sensitive search. By default, search is case-insensitive.
|
|
95
|
+
* `--context LINES`, `-C LINES`: Number of context lines to show around each match (default: 3). Set to 0 for no context.
|
|
96
|
+
* `--exclude-dirs PATTERNS`: Comma-separated list of directory name patterns (using `fnmatch` wildcards like `*`, `?`) to exclude (e.g., `.git,node_modules,build,*cache*`).
|
|
97
|
+
* `--exclude-files PATTERNS`: Comma-separated list of file name patterns (using `fnmatch` wildcards) to exclude (e.g., `*.log,*.tmp,temp_*`).
|
|
98
|
+
* `--include-hidden`: Include hidden files and directories (those starting with a period `.`) in the scan. By default, they are excluded unless they are explicitly provided in `paths`.
|
|
99
|
+
* `--output-format FORMAT`: Format for the output. Choices: `console` (default), `json`, `md` (or `markdown`).
|
|
100
|
+
* `--output-file FILE`: Path to save the output. If not provided, prints to the console.
|
|
101
|
+
|
|
102
|
+
**Examples:**
|
|
103
|
+
|
|
104
|
+
1. Search for "TODO" (case-insensitive) in the `src` directory and its subdirectories, excluding `__pycache__` directories and any `.tmp` or `.log` files, and save the results as JSON:
|
|
105
|
+
```bash
|
|
106
|
+
mcp-searcher search "TODO" src --exclude-dirs __pycache__ --exclude-files "*.tmp,*.log" --output-format json --output-file todos.json
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
2. Search for Python function definitions (e.g., `def my_function(`) using a regular expression in the current directory (`.`) and its subdirectories. The documented options provide no include-only filter, so every non-excluded file is scanned, not just `.py` files:
|
|
110
|
+
```bash
|
|
111
|
+
mcp-searcher search "^\s*def\s+\w+\s*\(.*\):" . --regex --exclude-files "!*.py" # Caution: this only restricts the search to .py files if --exclude-files treats "!*.py" as an include pattern; fnmatch wildcards have no "!" negation, so do not rely on it.
|
|
112
|
+
# A better way if FileScanner doesn't support include patterns in exclude-files:
|
|
113
|
+
# Find .py files first, then pass to mcp-searcher, or rely on mcp-searcher scanning all and then filtering if it did.
|
|
114
|
+
# For this tool, it scans all non-excluded, so to search only .py, you'd typically not exclude others unless they are binaries etc.
|
|
115
|
+
# Corrected Example for just regex:
|
|
116
|
+
mcp-searcher search "^\s*def\s+\w+\s*\(.*\):" . --regex
|
|
117
|
+
```
|
|
118
|
+
*Note: Ensure your regex is quoted correctly for your shell, especially if it contains special characters.*
|
|
119
|
+
|
|
120
|
+
3. Perform a case-sensitive search for the exact string "ErrorLog" in all files in `/var/log`, include hidden files, and output to a Markdown file:
|
|
121
|
+
```bash
|
|
122
|
+
mcp-searcher search "ErrorLog" /var/log --case-sensitive --include-hidden --output-format md --output-file errors_report.md
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
### Elaborate
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
mcp-searcher elaborate --report-file path/to/report.json --finding-id INDEX [--api-key YOUR_KEY] [--config-file path/to/config.json] [--context-lines LINES]
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
**Arguments:**
|
|
132
|
+
|
|
133
|
+
* `--report-file FILE`: (Required) Path to the JSON search report file generated by the `search` command.
|
|
134
|
+
* `--finding-id INDEX`: (Required) The 0-based index (ID) of the specific finding within the report file that you want to elaborate on.
|
|
135
|
+
* `--api-key KEY`: Your Google API key for Gemini. If provided, this takes precedence over other key sources.
|
|
136
|
+
* `--config-file FILE`: Path to an optional JSON configuration file containing your `GOOGLE_API_KEY` (e.g., `{"GOOGLE_API_KEY": "YOUR_KEY"}`).
|
|
137
|
+
* `--context-lines LINES`: Number of lines of broader context from the source file (surrounding the original snippet) to provide to the LLM for better understanding (default: 10).
|
|
138
|
+
|
|
139
|
+
**Examples:**
|
|
140
|
+
|
|
141
|
+
1. Elaborate on the first finding (index 0) from `todos.json`, assuming the API key is set as an environment variable (`GOOGLE_API_KEY`) or in a `config.py` / `.env` file:
|
|
142
|
+
```bash
|
|
143
|
+
mcp-searcher elaborate --report-file todos.json --finding-id 0
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
2. Elaborate on the third finding (index 2) from `search_results.json`, providing the API key directly and specifying 15 lines of context for the LLM:
|
|
147
|
+
```bash
|
|
148
|
+
mcp-searcher elaborate --report-file search_results.json --finding-id 2 --api-key "AIzaSyXXXXXXXXXXXXXXXXXXX" --context-lines 15
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
3. Elaborate on the sixth finding (index 5) from `project_report.json`, using an API key stored in a custom configuration file named `.my_gemini_config.json` located in the user's home directory:
|
|
152
|
+
```bash
|
|
153
|
+
mcp-searcher elaborate --report-file project_report.json --finding-id 5 --config-file ~/.my_gemini_config.json
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
## Output Formats
|
|
157
|
+
|
|
158
|
+
The `search` command can output results in several formats using the `--output-format` option:
|
|
159
|
+
|
|
160
|
+
* **`console` (default):** Prints results directly to the terminal in a human-readable format. Each match includes the file path, line number, and the line containing the match with the matched text highlighted (e.g., `>>>matched text<<<`). Context lines, if requested, are shown above and below the match line.
|
|
161
|
+
|
|
162
|
+
*Example Console Output (simplified):*
|
|
163
|
+
```text
|
|
164
|
+
path/to/your/file.py:42
|
|
165
|
+
Context line 1 before match
|
|
166
|
+
>>>The line with the matched text<<<
|
|
167
|
+
Context line 1 after match
|
|
168
|
+
---
|
|
169
|
+
another/file.txt:101
|
|
170
|
+
Just the >>>matched line<<< if no context
|
|
171
|
+
---
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
* **`json`:** Outputs results as a JSON array. Each object in the array represents a single match and contains the following fields:
|
|
175
|
+
* `file_path`: Absolute path to the file containing the match.
|
|
176
|
+
* `line_number`: The 1-based line number where the match occurred.
|
|
177
|
+
* `match_text`: The actual text that was matched.
|
|
178
|
+
* `snippet`: A string containing the line with the match and any surrounding context lines requested. The matched text within the snippet is highlighted with `>>> <<<`.
|
|
179
|
+
* `char_start_in_line`: The 0-based starting character offset of the match within its line.
|
|
180
|
+
* `char_end_in_line`: The 0-based ending character offset of the match within its line.
|
|
181
|
+
|
|
182
|
+
*Example JSON Output (for one match):*
|
|
183
|
+
```json
|
|
184
|
+
[
|
|
185
|
+
{
|
|
186
|
+
"file_path": "/path/to/your/file.py",
|
|
187
|
+
"line_number": 42,
|
|
188
|
+
"match_text": "matched text",
|
|
189
|
+
"snippet": " Context line 1 before match\n >>>The line with the matched text<<<\n Context line 1 after match",
|
|
190
|
+
"char_start_in_line": 25,
|
|
191
|
+
"char_end_in_line": 37
|
|
192
|
+
}
|
|
193
|
+
// ... more matches ...
|
|
194
|
+
]
|
|
195
|
+
```
|
|
196
|
+
This format is ideal for programmatic processing and is required as input for the `elaborate` command.
|
|
197
|
+
|
|
198
|
+
* **`md` or `markdown`:** Outputs results in Markdown format. Each match is typically presented with the file path as a heading or bolded, followed by the line number and the snippet (often as a preformatted text block).
|
|
199
|
+
|
|
200
|
+
*Example Markdown Output (simplified):*
|
|
201
|
+
````markdown
|
|
202
|
+
**path/to/your/file.py:42**
|
|
203
|
+
```text
|
|
204
|
+
Context line 1 before match
|
|
205
|
+
>>>The line with the matched text<<<
|
|
206
|
+
Context line 1 after match
|
|
207
|
+
```
|
|
208
|
+
---
|
|
209
|
+
**another/file.txt:101**
|
|
210
|
+
```text
|
|
211
|
+
Just the >>>matched line<<< if no context
|
|
212
|
+
```
|
|
213
|
+
````
|
|
214
|
+
This format is suitable for generating reports or for easy pasting into documents that support Markdown.
|
|
215
|
+
|
|
216
|
+
## Building
|
|
217
|
+
|
|
218
|
+
To build the package (wheel and source distribution):
|
|
219
|
+
|
|
220
|
+
1. Ensure you have the necessary build tools:
|
|
221
|
+
```bash
|
|
222
|
+
pip install build
|
|
223
|
+
```
|
|
224
|
+
2. Run the build command from the project root:
|
|
225
|
+
```bash
|
|
226
|
+
python -m build
|
|
227
|
+
```
|
|
228
|
+
This will create `sdist` and `wheel` files in a `dist/` directory.
|
|
229
|
+
|
|
230
|
+
## Running Tests
|
|
231
|
+
|
|
232
|
+
1. Ensure test dependencies are installed (if any beyond main dependencies).
|
|
233
|
+
2. Run tests using unittest discovery from the project root:
|
|
234
|
+
```bash
|
|
235
|
+
python -m unittest discover -s tests
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
## Contributing
|
|
239
|
+
|
|
240
|
+
Contributions are welcome! Please open an issue or submit a pull request.
|
|
241
|
+
|
|
242
|
+
## License
|
|
243
|
+
|
|
244
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
245
|
+
|
|
246
|
+
## Troubleshooting
|
|
247
|
+
|
|
248
|
+
Here are some common issues and how to resolve them:
|
|
249
|
+
|
|
250
|
+
* **Command not found (`mcp-searcher: command not found`):**
|
|
251
|
+
* Ensure you have activated the virtual environment where the package was installed: `source venv/bin/activate` (or `venv\Scripts\activate` on Windows).
|
|
252
|
+
* If installed in editable mode (`pip install -e .`), ensure you are in the project root or that the project root is in your `PYTHONPATH`.
|
|
253
|
+
* If installed via wheel, ensure the virtual environment's `bin` (or `Scripts`) directory is in your system's `PATH`.
|
|
254
|
+
|
|
255
|
+
* **ModuleNotFoundError (e.g., `No module named 'google.generativeai'`):**
|
|
256
|
+
* Make sure all dependencies are installed correctly within your active virtual environment. For an installed package, dependencies should be handled automatically; if they are missing, force a reinstall with `pip install --force-reinstall mcp-codebase-searcher` (installed from the package index), `pip install --force-reinstall dist/mcp_codebase_searcher-*.whl` (installed from a locally built wheel), or `pip install --force-reinstall -r requirements.txt` (running from source, if you have a `requirements.txt`).
|
|
257
|
+
* Ensure you are using the Python interpreter from your activated virtual environment.
|
|
258
|
+
|
|
259
|
+
* **API Key Errors (for `elaborate` command):**
|
|
260
|
+
* **"Could not initialize GenerativeModel... API key not found."**: This means the Google API key was not found through any of the supported methods (argument, config file, environment variable, `config.py`). Double-check the "API Key (for Elaboration)" step under [Installation](#installation).
|
|
261
|
+
* **"Could not initialize GenerativeModel... Invalid API key."**: The key was found but is incorrect or unauthorized for the Gemini API.
|
|
262
|
+
* Ensure your `.env` file (if used) is in the correct location (project root if running from source) and correctly formatted (`GOOGLE_API_KEY="YOUR_KEY"`).
|
|
263
|
+
* Verify that the environment variable `GOOGLE_API_KEY` is set and exported in your current shell session if not using an `.env` file with `python-dotenv` support.
|
|
264
|
+
|
|
265
|
+
* **File/Directory Not Found (for `search` or `elaborate --report-file`):**
|
|
266
|
+
* Double-check that the paths provided to the `search` command or the `--report-file` argument are correct and accessible.
|
|
267
|
+
* Relative paths are resolved from the current working directory where you run the command.
|
|
268
|
+
|
|
269
|
+
* **Permission Denied Errors:**
|
|
270
|
+
* Ensure you have read permissions for the files/directories you are trying to search, and write permissions if using `--output-file` to a restricted location.
|
|
271
|
+
|
|
272
|
+
* **Invalid Regular Expression (for `search --regex`):**
|
|
273
|
+
* The tool will output an error if the regex pattern is invalid. Test your regex pattern with online tools or Python's `re` module separately.
|
|
274
|
+
* Remember to quote your regex pattern properly in the shell, especially if it contains special characters like `*`, `(`, `)`, `|`, etc. Single quotes (`'pattern'`) are often safer than double quotes in bash/zsh for complex patterns.
|
|
275
|
+
|
|
276
|
+
* **No Matches Found:**
|
|
277
|
+
* Verify your query term or regex pattern. Try a simpler, broader query first.
|
|
278
|
+
* Check your `--case-sensitive` flag. Search is case-insensitive by default.
|
|
279
|
+
* Review your exclusion patterns (`--exclude-dirs`, `--exclude-files`). You might be unintentionally excluding the files containing matches.
|
|
280
|
+
* Ensure the target files are not binary or are of a type the tool can read (primarily text-based).
|
|
281
|
+
* If searching hidden files, ensure `--include-hidden` is used.
|
|
282
|
+
|
|
283
|
+
* **Incorrect JSON in Report File (for `elaborate` command):**
|
|
284
|
+
* The `elaborate` command expects a JSON file in the format produced by `mcp-searcher search --output-format json`. If the file is malformed or not a valid JSON array of search results, elaboration will fail.
|
|
285
|
+
* Error messages like "Could not decode JSON from report file" or "Finding ID ... is out of range" point to issues with the report file or the provided ID.
|
|
286
|
+
|
|
287
|
+
* **Shell Quoting Issues for Query:**
|
|
288
|
+
* If your search query contains spaces or special shell characters (e.g., `!`, `*`, `$`, `&`), ensure it's properly quoted. Single quotes (`'your query'`) are generally safest to prevent shell expansion.
|
|
289
|
+
```bash
|
|
290
|
+
mcp-searcher search 'my exact phrase with spaces!' .
|
|
291
|
+
mcp-searcher search 'pattern_with_$(dollar_sign_and_parens)' . --regex
|
|
292
|
+
```
|
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
# MCP Codebase Searcher
|
|
2
|
+
|
|
3
|
+
MCP Codebase Searcher is a Python tool designed to scan codebases, search for text or regular expression patterns, and optionally elaborate on the findings using Google Gemini.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
* Search for exact strings or regular expression patterns.
|
|
8
|
+
* Case-sensitive or case-insensitive searching.
|
|
9
|
+
* Specify context lines to display around matches.
|
|
10
|
+
* Exclude specific directories and file patterns.
|
|
11
|
+
* Option to include/exclude hidden files and directories.
|
|
12
|
+
* Output results in console, JSON, or Markdown format.
|
|
13
|
+
* Save search results to a file.
|
|
14
|
+
* Elaborate on individual findings from a JSON report using Google Gemini.
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
This project uses Python 3.8+.
|
|
19
|
+
|
|
20
|
+
1. **Clone the repository (if applicable):**
|
|
21
|
+
```bash
|
|
22
|
+
git clone <repository_url>
|
|
23
|
+
cd mcp-codebase-searcher
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
2. **Create and activate a virtual environment:**
|
|
27
|
+
```bash
|
|
28
|
+
python3 -m venv venv
|
|
29
|
+
source venv/bin/activate # On Windows use `venv\Scripts\activate`
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
3. **Install the package:**
|
|
33
|
+
Once the package is built (see Building section below), you can install it using pip:
|
|
34
|
+
```bash
|
|
35
|
+
pip install dist/mcp_codebase_searcher-*.whl
|
|
36
|
+
```
|
|
37
|
+
Alternatively, for development, install in editable mode from the project root:
|
|
38
|
+
```bash
|
|
39
|
+
pip install -e .
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
4. **API Key (for Elaboration):**
|
|
43
|
+
To use the elaboration feature, you need a Google API key for Gemini. You can provide it via:
|
|
44
|
+
* The `--api-key` argument when using the `elaborate` command.
|
|
45
|
+
* A JSON configuration file specified with `--config-file` (containing `{"GOOGLE_API_KEY": "YOUR_KEY"}`).
|
|
46
|
+
* An environment variable `GOOGLE_API_KEY`.
|
|
47
|
+
* A `config.py` file in the project root (if running from source) that has a `load_api_key()` function returning the key.
|
|
48
|
+
|
|
49
|
+
The API key is sourced with the following precedence: `--api-key` argument > `--config-file` > `GOOGLE_API_KEY` environment variable > `config.py` module.
|
|
50
|
+
|
|
51
|
+
Create a `.env` file in the project root for local development if using environment variables:
|
|
52
|
+
```
|
|
53
|
+
GOOGLE_API_KEY="YOUR_API_KEY_HERE"
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Usage
|
|
57
|
+
|
|
58
|
+
The tool provides two main commands: `search` and `elaborate`.
|
|
59
|
+
|
|
60
|
+
### Search
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
mcp-searcher search "your_query" path/to/search [--regex] [--case-sensitive] [--context LINES] [--exclude-dirs .git,node_modules] [--exclude-files *.log] [--include-hidden] [--output-format json] [--output-file results.json]
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
**Arguments:**
|
|
67
|
+
|
|
68
|
+
* `query`: The search term or regex pattern.
|
|
69
|
+
* `paths`: One or more file or directory paths to search within.
|
|
70
|
+
* `--regex`, `-r`: Treat the `query` as a Python regular expression pattern.
|
|
71
|
+
* `--case-sensitive`, `-c`: Perform a case-sensitive search. By default, search is case-insensitive.
|
|
72
|
+
* `--context LINES`, `-C LINES`: Number of context lines to show around each match (default: 3). Set to 0 for no context.
|
|
73
|
+
* `--exclude-dirs PATTERNS`: Comma-separated list of directory name patterns (using `fnmatch` wildcards like `*`, `?`) to exclude (e.g., `.git,node_modules,build,*cache*`).
|
|
74
|
+
* `--exclude-files PATTERNS`: Comma-separated list of file name patterns (using `fnmatch` wildcards) to exclude (e.g., `*.log,*.tmp,temp_*`).
|
|
75
|
+
* `--include-hidden`: Include hidden files and directories (those starting with a period `.`) in the scan. By default, they are excluded unless they are explicitly provided in `paths`.
|
|
76
|
+
* `--output-format FORMAT`: Format for the output. Choices: `console` (default), `json`, `md` (or `markdown`).
|
|
77
|
+
* `--output-file FILE`: Path to save the output. If not provided, prints to the console.
|
|
78
|
+
|
|
79
|
+
**Examples:**
|
|
80
|
+
|
|
81
|
+
1. Search for "TODO" (case-insensitive) in the `src` directory and its subdirectories, excluding `__pycache__` directories and any `.tmp` or `.log` files, and save the results as JSON:
|
|
82
|
+
```bash
|
|
83
|
+
mcp-searcher search "TODO" src --exclude-dirs __pycache__ --exclude-files "*.tmp,*.log" --output-format json --output-file todos.json
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
2. Search for Python function definitions (e.g., `def my_function(`) using a regular expression in the current directory (`.`) and its subdirectories. The documented options provide no include-only filter, so every non-excluded file is scanned, not just `.py` files:
|
|
87
|
+
```bash
|
|
88
|
+
mcp-searcher search "^\s*def\s+\w+\s*\(.*\):" . --regex --exclude-files "!*.py" # Caution: this only restricts the search to .py files if --exclude-files treats "!*.py" as an include pattern; fnmatch wildcards have no "!" negation, so do not rely on it.
|
|
89
|
+
# A better way if FileScanner doesn't support include patterns in exclude-files:
|
|
90
|
+
# Find .py files first, then pass to mcp-searcher, or rely on mcp-searcher scanning all and then filtering if it did.
|
|
91
|
+
# For this tool, it scans all non-excluded, so to search only .py, you'd typically not exclude others unless they are binaries etc.
|
|
92
|
+
# Corrected Example for just regex:
|
|
93
|
+
mcp-searcher search "^\s*def\s+\w+\s*\(.*\):" . --regex
|
|
94
|
+
```
|
|
95
|
+
*Note: Ensure your regex is quoted correctly for your shell, especially if it contains special characters.*
|
|
96
|
+
|
|
97
|
+
3. Perform a case-sensitive search for the exact string "ErrorLog" in all files in `/var/log`, include hidden files, and output to a Markdown file:
|
|
98
|
+
```bash
|
|
99
|
+
mcp-searcher search "ErrorLog" /var/log --case-sensitive --include-hidden --output-format md --output-file errors_report.md
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### Elaborate
|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
mcp-searcher elaborate --report-file path/to/report.json --finding-id INDEX [--api-key YOUR_KEY] [--config-file path/to/config.json] [--context-lines LINES]
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
**Arguments:**
|
|
109
|
+
|
|
110
|
+
* `--report-file FILE`: (Required) Path to the JSON search report file generated by the `search` command.
|
|
111
|
+
* `--finding-id INDEX`: (Required) The 0-based index (ID) of the specific finding within the report file that you want to elaborate on.
|
|
112
|
+
* `--api-key KEY`: Your Google API key for Gemini. If provided, this takes precedence over other key sources.
|
|
113
|
+
* `--config-file FILE`: Path to an optional JSON configuration file containing your `GOOGLE_API_KEY` (e.g., `{"GOOGLE_API_KEY": "YOUR_KEY"}`).
|
|
114
|
+
* `--context-lines LINES`: Number of lines of broader context from the source file (surrounding the original snippet) to provide to the LLM for better understanding (default: 10).
|
|
115
|
+
|
|
116
|
+
**Examples:**
|
|
117
|
+
|
|
118
|
+
1. Elaborate on the first finding (index 0) from `todos.json`, assuming the API key is set as an environment variable (`GOOGLE_API_KEY`) or in a `config.py` / `.env` file:
|
|
119
|
+
```bash
|
|
120
|
+
mcp-searcher elaborate --report-file todos.json --finding-id 0
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
2. Elaborate on the third finding (index 2) from `search_results.json`, providing the API key directly and specifying 15 lines of context for the LLM:
|
|
124
|
+
```bash
|
|
125
|
+
mcp-searcher elaborate --report-file search_results.json --finding-id 2 --api-key "AIzaSyXXXXXXXXXXXXXXXXXXX" --context-lines 15
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
3. Elaborate on the sixth finding (index 5) from `project_report.json`, using an API key stored in a custom configuration file named `.my_gemini_config.json` located in the user's home directory:
|
|
129
|
+
```bash
|
|
130
|
+
mcp-searcher elaborate --report-file project_report.json --finding-id 5 --config-file ~/.my_gemini_config.json
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
## Output Formats
|
|
134
|
+
|
|
135
|
+
The `search` command can output results in several formats using the `--output-format` option:
|
|
136
|
+
|
|
137
|
+
* **`console` (default):** Prints results directly to the terminal in a human-readable format. Each match includes the file path, line number, and the line containing the match with the matched text highlighted (e.g., `>>>matched text<<<`). Context lines, if requested, are shown above and below the match line.
|
|
138
|
+
|
|
139
|
+
*Example Console Output (simplified):*
|
|
140
|
+
```text
|
|
141
|
+
path/to/your/file.py:42
|
|
142
|
+
Context line 1 before match
|
|
143
|
+
>>>The line with the matched text<<<
|
|
144
|
+
Context line 1 after match
|
|
145
|
+
---
|
|
146
|
+
another/file.txt:101
|
|
147
|
+
Just the >>>matched line<<< if no context
|
|
148
|
+
---
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
* **`json`:** Outputs results as a JSON array. Each object in the array represents a single match and contains the following fields:
|
|
152
|
+
* `file_path`: Absolute path to the file containing the match.
|
|
153
|
+
* `line_number`: The 1-based line number where the match occurred.
|
|
154
|
+
* `match_text`: The actual text that was matched.
|
|
155
|
+
* `snippet`: A string containing the line with the match and any surrounding context lines requested. The matched text within the snippet is highlighted with `>>> <<<`.
|
|
156
|
+
* `char_start_in_line`: The 0-based starting character offset of the match within its line.
|
|
157
|
+
* `char_end_in_line`: The 0-based ending character offset of the match within its line.
|
|
158
|
+
|
|
159
|
+
*Example JSON Output (for one match):*
|
|
160
|
+
```json
|
|
161
|
+
[
|
|
162
|
+
{
|
|
163
|
+
"file_path": "/path/to/your/file.py",
|
|
164
|
+
"line_number": 42,
|
|
165
|
+
"match_text": "matched text",
|
|
166
|
+
"snippet": " Context line 1 before match\n >>>The line with the matched text<<<\n Context line 1 after match",
|
|
167
|
+
"char_start_in_line": 25,
|
|
168
|
+
"char_end_in_line": 37
|
|
169
|
+
}
|
|
170
|
+
// ... more matches ...
|
|
171
|
+
]
|
|
172
|
+
```
|
|
173
|
+
This format is ideal for programmatic processing and is required as input for the `elaborate` command.
|
|
174
|
+
|
|
175
|
+
* **`md` or `markdown`:** Outputs results in Markdown format. Each match is typically presented with the file path as a heading or bolded, followed by the line number and the snippet (often as a preformatted text block).
|
|
176
|
+
|
|
177
|
+
*Example Markdown Output (simplified):*
|
|
178
|
+
````markdown
|
|
179
|
+
**path/to/your/file.py:42**
|
|
180
|
+
```text
|
|
181
|
+
Context line 1 before match
|
|
182
|
+
>>>The line with the matched text<<<
|
|
183
|
+
Context line 1 after match
|
|
184
|
+
```
|
|
185
|
+
---
|
|
186
|
+
**another/file.txt:101**
|
|
187
|
+
```text
|
|
188
|
+
Just the >>>matched line<<< if no context
|
|
189
|
+
```
|
|
190
|
+
````
|
|
191
|
+
This format is suitable for generating reports or for easy pasting into documents that support Markdown.
|
|
192
|
+
|
|
193
|
+
## Building
|
|
194
|
+
|
|
195
|
+
To build the package (wheel and source distribution):
|
|
196
|
+
|
|
197
|
+
1. Ensure you have the necessary build tools:
|
|
198
|
+
```bash
|
|
199
|
+
pip install build
|
|
200
|
+
```
|
|
201
|
+
2. Run the build command from the project root:
|
|
202
|
+
```bash
|
|
203
|
+
python -m build
|
|
204
|
+
```
|
|
205
|
+
This will create `sdist` and `wheel` files in a `dist/` directory.
|
|
206
|
+
|
|
207
|
+
## Running Tests
|
|
208
|
+
|
|
209
|
+
1. Ensure test dependencies are installed (if any beyond main dependencies).
|
|
210
|
+
2. Run tests using unittest discovery from the project root:
|
|
211
|
+
```bash
|
|
212
|
+
python -m unittest discover -s tests
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
## Contributing
|
|
216
|
+
|
|
217
|
+
Contributions are welcome! Please open an issue or submit a pull request.
|
|
218
|
+
|
|
219
|
+
## License
|
|
220
|
+
|
|
221
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
222
|
+
|
|
223
|
+
## Troubleshooting
|
|
224
|
+
|
|
225
|
+
Here are some common issues and how to resolve them:
|
|
226
|
+
|
|
227
|
+
* **Command not found (`mcp-searcher: command not found`):**
|
|
228
|
+
* Ensure you have activated the virtual environment where the package was installed: `source venv/bin/activate` (or `venv\Scripts\activate` on Windows).
|
|
229
|
+
* If installed in editable mode (`pip install -e .`), ensure you are in the project root or that the project root is in your `PYTHONPATH`.
|
|
230
|
+
* If installed via wheel, ensure the virtual environment's `bin` (or `Scripts`) directory is in your system's `PATH`.
|
|
231
|
+
|
|
232
|
+
* **ModuleNotFoundError (e.g., `No module named 'google.generativeai'`):**
|
|
233
|
+
* Make sure all dependencies are installed correctly within your active virtual environment. Dependencies are installed automatically together with the package, so a forced reinstall usually fixes this. Use `pip install --force-reinstall mcp-codebase-searcher` if you installed from the package index, `pip install --force-reinstall dist/mcp_codebase_searcher-*.whl` if you installed a locally built wheel, or `pip install --force-reinstall -r requirements.txt` if you are working from source and have a requirements file.
|
|
234
|
+
* Ensure you are using the Python interpreter from your activated virtual environment.
|
|
235
|
+
|
|
236
|
+
* **API Key Errors (for `elaborate` command):**
|
|
237
|
+
* **"Could not initialize GenerativeModel... API key not found."**: This means the Google API key was not found through any of the supported methods (argument, config file, environment variable, `config.py`). Double-check the [API Key section under Installation](#api-key-for-elaboration).
|
|
238
|
+
* **"Could not initialize GenerativeModel... Invalid API key."**: The key was found but is incorrect or unauthorized for the Gemini API.
|
|
239
|
+
* Ensure your `.env` file (if used) is in the correct location (project root if running from source) and correctly formatted (`GOOGLE_API_KEY="YOUR_KEY"`).
|
|
240
|
+
* Verify that the environment variable `GOOGLE_API_KEY` is set and exported in your current shell session if you are not using a `.env` file with `python-dotenv` support.
|
|
241
|
+
|
|
242
|
+
* **File/Directory Not Found (for `search` or `elaborate --report-file`):**
|
|
243
|
+
* Double-check that the paths provided to the `search` command or the `--report-file` argument are correct and accessible.
|
|
244
|
+
* Relative paths are resolved from the current working directory where you run the command.
|
|
245
|
+
|
|
246
|
+
* **Permission Denied Errors:**
|
|
247
|
+
* Ensure you have read permissions for the files/directories you are trying to search, and write permissions if using `--output-file` to a restricted location.
|
|
248
|
+
|
|
249
|
+
* **Invalid Regular Expression (for `search --regex`):**
|
|
250
|
+
* The tool will output an error if the regex pattern is invalid. Test your regex pattern with online tools or Python's `re` module separately.
|
|
251
|
+
* Remember to quote your regex pattern properly in the shell, especially if it contains special characters like `*`, `(`, `)`, `|`, etc. Single quotes (`'pattern'`) are often safer than double quotes in bash/zsh for complex patterns.
|
|
252
|
+
|
|
253
|
+
* **No Matches Found:**
|
|
254
|
+
* Verify your query term or regex pattern. Try a simpler, broader query first.
|
|
255
|
+
* Check your `--case-sensitive` flag. Search is case-insensitive by default.
|
|
256
|
+
* Review your exclusion patterns (`--exclude-dirs`, `--exclude-files`). You might be unintentionally excluding the files containing matches.
|
|
257
|
+
* Ensure the target files are text-based files that the tool can read, not binary files.
|
|
258
|
+
* If searching hidden files, ensure `--include-hidden` is used.
|
|
259
|
+
|
|
260
|
+
* **Incorrect JSON in Report File (for `elaborate` command):**
|
|
261
|
+
* The `elaborate` command expects a JSON file in the format produced by `mcp-searcher search --output-format json`. If the file is malformed or not a valid JSON array of search results, elaboration will fail.
|
|
262
|
+
* Error messages like "Could not decode JSON from report file" or "Finding ID ... is out of range" point to issues with the report file or the provided ID.
|
|
263
|
+
|
|
264
|
+
* **Shell Quoting Issues for Query:**
|
|
265
|
+
* If your search query contains spaces or special shell characters (e.g., `!`, `*`, `$`, `&`), ensure it's properly quoted. Single quotes (`'your query'`) are generally safest to prevent shell expansion.
|
|
266
|
+
```bash
|
|
267
|
+
mcp-searcher search 'my exact phrase with spaces!' .
|
|
268
|
+
mcp-searcher search 'pattern_with_$(dollar_sign_and_parens)' . --regex
|
|
269
|
+
```
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "mcp_codebase_searcher"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
authors = [
|
|
9
|
+
{ name="Sakilmostak", email="skmahim71@gmail.com" },
|
|
10
|
+
]
|
|
11
|
+
description = "A Python tool to scan codebases, search for text/regex patterns, and elaborate on findings using Google Gemini."
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
requires-python = ">=3.8"
|
|
14
|
+
license = "MIT"
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"Operating System :: OS Independent",
|
|
18
|
+
"Development Status :: 3 - Alpha",
|
|
19
|
+
"Environment :: Console",
|
|
20
|
+
"Intended Audience :: Developers",
|
|
21
|
+
"Topic :: Software Development :: Build Tools",
|
|
22
|
+
"Topic :: Text Processing :: Indexing",
|
|
23
|
+
"Topic :: Utilities",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
# Runtime dependencies
|
|
27
|
+
dependencies = [
|
|
28
|
+
"python-dotenv",
|
|
29
|
+
"google-generativeai",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
# py-modules is configured under [tool.setuptools] below
|
|
33
|
+
|
|
34
|
+
[project.scripts]
|
|
35
|
+
# Console entry point: the `mcp-searcher` command calls main() in the mcp_searcher module
|
|
36
|
+
mcp-searcher = "mcp_searcher:main"
|
|
37
|
+
|
|
38
|
+
[project.urls]
|
|
39
|
+
"Homepage" = "https://github.com/sakilmostak/mcp-codebase-searcher" # Placeholder
|
|
40
|
+
"Bug Tracker" = "https://github.com/sakilmostak/mcp-codebase-searcher/issues" # Placeholder
|
|
41
|
+
|
|
42
|
+
[tool.setuptools]
|
|
43
|
+
package-dir = {"" = "src"}
|
|
44
|
+
py-modules = [
|
|
45
|
+
"mcp_searcher",
|
|
46
|
+
"config",
|
|
47
|
+
"file_scanner",
|
|
48
|
+
"mcp_elaborate",
|
|
49
|
+
"mcp_search",
|
|
50
|
+
"output_generator",
|
|
51
|
+
"report_elaborator"
|
|
52
|
+
]
|
|
53
|
+
# Using py-modules explicitly, so find is not strictly needed but can be kept for safety/explicitness
|
|
54
|
+
[tool.setuptools.packages.find]
|
|
55
|
+
where = ["src"]
|
|
56
|
+
exclude = ["tests", "tests.*"]
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from dotenv import load_dotenv
|
|
3
|
+
|
|
4
|
+
def load_api_key():
    """Return the Google API key, or a falsy value when none is configured.

    ``load_dotenv()`` is called first so that variables defined in a ``.env``
    file become visible, then ``GOOGLE_API_KEY`` is read from the process
    environment. The key may therefore come either from a ``.env`` file or
    from a variable already exported in the environment.

    A missing or empty key is not treated as an error: a warning is printed
    and the value obtained from the environment (``None`` or ``""``) is
    returned unchanged, leaving the decision to the caller.

    NOTE(review): with no arguments, ``load_dotenv()`` presumably locates the
    ``.env`` file by searching upward from this module's directory rather than
    from the current working directory -- confirm this is what is wanted for
    an installed package.
    """
    load_dotenv()
    google_api_key = os.getenv("GOOGLE_API_KEY")
    if google_api_key:
        return google_api_key

    # Deliberately lenient: warn instead of raising so that commands which
    # never need the key keep working.
    print("Warning: GOOGLE_API_KEY not found in .env file or environment variables.")
    return google_api_key
|
|
13
|
+
|
|
14
|
+
if __name__ == '__main__':
    # Manual smoke check: run this module directly to see whether a key can
    # be resolved from the .env file or the environment.
    key = load_api_key()
    if not key:
        print("API key not loaded. Please check your .env file for GOOGLE_API_KEY.")
    else:
        # Only a short prefix is echoed so the full secret is never printed.
        print(f"Successfully loaded API key (first 5 chars): {key[:5]}...")
|