beswarm 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- beswarm/queries/tree-sitter-language-pack/README.md +7 -0
- beswarm/queries/tree-sitter-language-pack/arduino-tags.scm +5 -0
- beswarm/queries/tree-sitter-language-pack/c-tags.scm +9 -0
- beswarm/queries/tree-sitter-language-pack/chatito-tags.scm +16 -0
- beswarm/queries/tree-sitter-language-pack/commonlisp-tags.scm +122 -0
- beswarm/queries/tree-sitter-language-pack/cpp-tags.scm +15 -0
- beswarm/queries/tree-sitter-language-pack/csharp-tags.scm +26 -0
- beswarm/queries/tree-sitter-language-pack/d-tags.scm +26 -0
- beswarm/queries/tree-sitter-language-pack/dart-tags.scm +92 -0
- beswarm/queries/tree-sitter-language-pack/elisp-tags.scm +5 -0
- beswarm/queries/tree-sitter-language-pack/elixir-tags.scm +54 -0
- beswarm/queries/tree-sitter-language-pack/elm-tags.scm +19 -0
- beswarm/queries/tree-sitter-language-pack/gleam-tags.scm +41 -0
- beswarm/queries/tree-sitter-language-pack/go-tags.scm +42 -0
- beswarm/queries/tree-sitter-language-pack/java-tags.scm +20 -0
- beswarm/queries/tree-sitter-language-pack/javascript-tags.scm +88 -0
- beswarm/queries/tree-sitter-language-pack/lua-tags.scm +34 -0
- beswarm/queries/tree-sitter-language-pack/pony-tags.scm +39 -0
- beswarm/queries/tree-sitter-language-pack/properties-tags.scm +5 -0
- beswarm/queries/tree-sitter-language-pack/python-tags.scm +14 -0
- beswarm/queries/tree-sitter-language-pack/r-tags.scm +21 -0
- beswarm/queries/tree-sitter-language-pack/racket-tags.scm +12 -0
- beswarm/queries/tree-sitter-language-pack/ruby-tags.scm +64 -0
- beswarm/queries/tree-sitter-language-pack/rust-tags.scm +60 -0
- beswarm/queries/tree-sitter-language-pack/solidity-tags.scm +43 -0
- beswarm/queries/tree-sitter-language-pack/swift-tags.scm +51 -0
- beswarm/queries/tree-sitter-language-pack/udev-tags.scm +20 -0
- beswarm/queries/tree-sitter-languages/README.md +23 -0
- beswarm/queries/tree-sitter-languages/c-tags.scm +9 -0
- beswarm/queries/tree-sitter-languages/c_sharp-tags.scm +46 -0
- beswarm/queries/tree-sitter-languages/cpp-tags.scm +15 -0
- beswarm/queries/tree-sitter-languages/dart-tags.scm +91 -0
- beswarm/queries/tree-sitter-languages/elisp-tags.scm +8 -0
- beswarm/queries/tree-sitter-languages/elixir-tags.scm +54 -0
- beswarm/queries/tree-sitter-languages/elm-tags.scm +19 -0
- beswarm/queries/tree-sitter-languages/go-tags.scm +30 -0
- beswarm/queries/tree-sitter-languages/hcl-tags.scm +77 -0
- beswarm/queries/tree-sitter-languages/java-tags.scm +20 -0
- beswarm/queries/tree-sitter-languages/javascript-tags.scm +88 -0
- beswarm/queries/tree-sitter-languages/kotlin-tags.scm +27 -0
- beswarm/queries/tree-sitter-languages/ocaml-tags.scm +115 -0
- beswarm/queries/tree-sitter-languages/php-tags.scm +26 -0
- beswarm/queries/tree-sitter-languages/python-tags.scm +12 -0
- beswarm/queries/tree-sitter-languages/ql-tags.scm +26 -0
- beswarm/queries/tree-sitter-languages/ruby-tags.scm +64 -0
- beswarm/queries/tree-sitter-languages/rust-tags.scm +60 -0
- beswarm/queries/tree-sitter-languages/scala-tags.scm +65 -0
- beswarm/queries/tree-sitter-languages/typescript-tags.scm +41 -0
- beswarm/tools/__init__.py +13 -0
- beswarm/tools/edit_file.py +162 -0
- beswarm/tools/planner.py +33 -0
- beswarm/tools/repomap.py +1289 -0
- beswarm/tools/search_arxiv.py +206 -0
- beswarm/tools/think.py +40 -0
- beswarm/tools/worker.py +118 -0
- {beswarm-0.1.2.dist-info → beswarm-0.1.3.dist-info}/METADATA +1 -1
- beswarm-0.1.3.dist-info/RECORD +60 -0
- beswarm-0.1.2.dist-info/RECORD +0 -5
- {beswarm-0.1.2.dist-info → beswarm-0.1.3.dist-info}/WHEEL +0 -0
- {beswarm-0.1.2.dist-info → beswarm-0.1.3.dist-info}/top_level.txt +0 -0
beswarm/tools/repomap.py
ADDED
@@ -0,0 +1,1289 @@
|
|
1
|
+
import os
|
2
|
+
import math
|
3
|
+
import time
|
4
|
+
import random
|
5
|
+
import shutil
|
6
|
+
import sqlite3
|
7
|
+
import colorsys
|
8
|
+
import warnings
|
9
|
+
from pathlib import Path
|
10
|
+
from collections import Counter, defaultdict, namedtuple
|
11
|
+
|
12
|
+
from aient.plugins import register_tool
|
13
|
+
|
14
|
+
from tqdm import tqdm
|
15
|
+
from diskcache import Cache
|
16
|
+
from grep_ast import TreeContext, filename_to_lang
|
17
|
+
from pygments.token import Token
|
18
|
+
from pygments.lexers import guess_lexer_for_filename
|
19
|
+
|
20
|
+
# File names that are conventionally important at the root of a repository.
# Used (via NORMALIZED_ROOT_IMPORTANT_FILES) by is_important() to prioritize
# these files in the repo map.  Fix: "codecov.yml" appeared twice (once under
# CI/CD and again under "Quality and metrics"); the duplicate was removed.
ROOT_IMPORTANT_FILES = [
    # Version Control
    ".gitignore",
    ".gitattributes",
    # Documentation
    "README",
    "README.md",
    "README.txt",
    "README.rst",
    "CONTRIBUTING",
    "CONTRIBUTING.md",
    "CONTRIBUTING.txt",
    "CONTRIBUTING.rst",
    "LICENSE",
    "LICENSE.md",
    "LICENSE.txt",
    "CHANGELOG",
    "CHANGELOG.md",
    "CHANGELOG.txt",
    "CHANGELOG.rst",
    "SECURITY",
    "SECURITY.md",
    "SECURITY.txt",
    "CODEOWNERS",
    # Package Management and Dependencies
    "requirements.txt",
    "Pipfile",
    "Pipfile.lock",
    "pyproject.toml",
    "setup.py",
    "setup.cfg",
    "package.json",
    "package-lock.json",
    "yarn.lock",
    "npm-shrinkwrap.json",
    "Gemfile",
    "Gemfile.lock",
    "composer.json",
    "composer.lock",
    "pom.xml",
    "build.gradle",
    "build.gradle.kts",
    "build.sbt",
    "go.mod",
    "go.sum",
    "Cargo.toml",
    "Cargo.lock",
    "mix.exs",
    "rebar.config",
    "project.clj",
    "Podfile",
    "Cartfile",
    "dub.json",
    "dub.sdl",
    # Configuration and Settings
    ".env",
    ".env.example",
    ".editorconfig",
    "tsconfig.json",
    "jsconfig.json",
    ".babelrc",
    "babel.config.js",
    ".eslintrc",
    ".eslintignore",
    ".prettierrc",
    ".stylelintrc",
    "tslint.json",
    ".pylintrc",
    ".flake8",
    ".rubocop.yml",
    ".scalafmt.conf",
    ".dockerignore",
    ".gitpod.yml",
    "sonar-project.properties",
    "renovate.json",
    "dependabot.yml",
    ".pre-commit-config.yaml",
    "mypy.ini",
    "tox.ini",
    ".yamllint",
    "pyrightconfig.json",
    # Build and Compilation
    "webpack.config.js",
    "rollup.config.js",
    "parcel.config.js",
    "gulpfile.js",
    "Gruntfile.js",
    "build.xml",
    "build.boot",
    "project.json",
    "build.cake",
    "MANIFEST.in",
    # Testing
    "pytest.ini",
    "phpunit.xml",
    "karma.conf.js",
    "jest.config.js",
    "cypress.json",
    ".nycrc",
    ".nycrc.json",
    # CI/CD
    ".travis.yml",
    ".gitlab-ci.yml",
    "Jenkinsfile",
    "azure-pipelines.yml",
    "bitbucket-pipelines.yml",
    "appveyor.yml",
    "circle.yml",
    ".circleci/config.yml",
    ".github/dependabot.yml",
    "codecov.yml",
    ".coveragerc",
    # Docker and Containers
    "Dockerfile",
    "docker-compose.yml",
    "docker-compose.override.yml",
    # Cloud and Serverless
    "serverless.yml",
    "firebase.json",
    "now.json",
    "netlify.toml",
    "vercel.json",
    "app.yaml",
    "terraform.tf",
    "main.tf",
    "cloudformation.yaml",
    "cloudformation.json",
    "ansible.cfg",
    "kubernetes.yaml",
    "k8s.yaml",
    # Database
    "schema.sql",
    "liquibase.properties",
    "flyway.conf",
    # Framework-specific
    "next.config.js",
    "nuxt.config.js",
    "vue.config.js",
    "angular.json",
    "gatsby-config.js",
    "gridsome.config.js",
    # API Documentation
    "swagger.yaml",
    "swagger.json",
    "openapi.yaml",
    "openapi.json",
    # Development environment
    ".nvmrc",
    ".ruby-version",
    ".python-version",
    "Vagrantfile",
    # Quality and metrics
    ".codeclimate.yml",
    # Documentation
    "mkdocs.yml",
    "_config.yml",
    "book.toml",
    "readthedocs.yml",
    ".readthedocs.yaml",
    # Package registries
    ".npmrc",
    ".yarnrc",
    # Linting and formatting
    ".isort.cfg",
    ".markdownlint.json",
    ".markdownlint.yaml",
    # Security
    ".bandit",
    ".secrets.baseline",
    # Misc
    ".pypirc",
    ".gitkeep",
    ".npmignore",
]


# Normalize the list once so is_important() can do O(1) membership tests
# with OS-native path separators.
NORMALIZED_ROOT_IMPORTANT_FILES = set(os.path.normpath(path) for path in ROOT_IMPORTANT_FILES)
|
199
|
+
|
200
|
+
|
201
|
+
def is_important(file_path):
    """Return True if file_path is a conventionally-important repo file.

    A path is important when it is a GitHub Actions workflow file or its
    normalized form appears in NORMALIZED_ROOT_IMPORTANT_FILES.

    :param file_path: Path (string) to test, relative to the repo root.
    :return: bool
    """
    file_name = os.path.basename(file_path)
    dir_name = os.path.normpath(os.path.dirname(file_path))
    normalized_path = os.path.normpath(file_path)

    # Check for GitHub Actions workflow files.  GitHub accepts both the
    # .yml and .yaml spellings; the original check missed .yaml.
    if dir_name == os.path.normpath(".github/workflows") and file_name.endswith(
        (".yml", ".yaml")
    ):
        return True

    return normalized_path in NORMALIZED_ROOT_IMPORTANT_FILES
|
211
|
+
|
212
|
+
|
213
|
+
def filter_important_files(file_paths):
    """
    Filter a list of file paths to return only those that are commonly important in codebases.

    :param file_paths: List of file paths to check
    :return: List of file paths that match important file patterns
    """
    return [path for path in file_paths if is_important(path)]
|
221
|
+
|
222
|
+
import os
|
223
|
+
import base64
|
224
|
+
from pathlib import Path
|
225
|
+
|
226
|
+
# File extensions treated as binary images (PDF included because it is
# also read via base64, not as text).
IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp", ".pdf"}


def is_image_file(file_name):
    """
    Check if the given file name has an image file extension.

    The comparison is case-insensitive, so "photo.PNG" is recognized too
    (the original check was case-sensitive and missed uppercase suffixes).

    :param file_name: The name of the file to check (str or Path).
    :return: True if the file is an image, False otherwise.
    """
    # Convert to string so Path objects work, then normalize case.
    return str(file_name).lower().endswith(tuple(IMAGE_EXTENSIONS))
|
237
|
+
|
238
|
+
def ensure_hash_prefix(color):
    """Ensure hex color values have a # prefix.

    Non-string values, empty/falsy values, named colors ("red") and
    strings that already start with "#" are returned unchanged.
    """
    if not color:
        return color

    needs_prefix = (
        isinstance(color, str) and bool(color.strip()) and not color.startswith("#")
    )
    if needs_prefix:
        hex_digits = set("0123456789ABCDEFabcdef")
        # Only prefix plausible hex colors: 3 or 6 hex digits.
        if set(color) <= hex_digits and len(color) in (3, 6):
            return f"#{color}"

    return color
|
247
|
+
|
248
|
+
class InputOutput:
    """Console/file I/O helper (trimmed copy of aider's InputOutput).

    Only what repomap needs is present: color/encoding configuration in
    __init__, plus read_text()/read_image() for loading file contents.

    NOTE(review): tool_error(), tool_warning() and
    get_default_notification_command() are referenced by this class but are
    not defined in this excerpt -- confirm they exist elsewhere before
    relying on the error-reporting paths.
    """

    # Class-level defaults; some are shadowed per-instance in __init__.
    num_error_outputs = 0
    num_user_asks = 0
    clipboard_watcher = None
    bell_on_next_input = False
    notifications_command = None

    def __init__(
        self,
        pretty=True,
        yes=None,
        input_history_file=None,
        chat_history_file=None,
        input=None,
        output=None,
        user_input_color="blue",
        tool_output_color=None,
        tool_error_color="red",
        tool_warning_color="#FFA500",
        assistant_output_color="blue",
        completion_menu_color=None,
        completion_menu_bg_color=None,
        completion_menu_current_color=None,
        completion_menu_current_bg_color=None,
        code_theme="default",
        encoding="utf-8",
        line_endings="platform",
        dry_run=False,
        llm_history_file=None,
        # editingmode=EditingMode.EMACS,
        fancy_input=True,
        file_watcher=None,
        multiline_mode=False,
        root=".",
        notifications=False,
        notifications_command=None,
    ):
        """Configure colors, encoding, history files and misc I/O settings.

        :param line_endings: one of "platform", "lf", "crlf"; anything else
            raises ValueError.
        :raises ValueError: on an invalid line_endings value.
        """
        self.placeholder = None
        self.interrupted = False
        self.never_prompts = set()
        # self.editingmode = editingmode
        self.multiline_mode = multiline_mode
        self.bell_on_next_input = False
        self.notifications = notifications
        if notifications and notifications_command is None:
            self.notifications_command = self.get_default_notification_command()
        else:
            self.notifications_command = notifications_command

        # Honor the NO_COLOR convention: any non-empty value disables color.
        no_color = os.environ.get("NO_COLOR")
        if no_color is not None and no_color != "":
            pretty = False

        self.user_input_color = ensure_hash_prefix(user_input_color) if pretty else None
        self.tool_output_color = ensure_hash_prefix(tool_output_color) if pretty else None
        self.tool_error_color = ensure_hash_prefix(tool_error_color) if pretty else None
        self.tool_warning_color = ensure_hash_prefix(tool_warning_color) if pretty else None
        self.assistant_output_color = ensure_hash_prefix(assistant_output_color)
        self.completion_menu_color = ensure_hash_prefix(completion_menu_color) if pretty else None
        self.completion_menu_bg_color = (
            ensure_hash_prefix(completion_menu_bg_color) if pretty else None
        )
        self.completion_menu_current_color = (
            ensure_hash_prefix(completion_menu_current_color) if pretty else None
        )
        self.completion_menu_current_bg_color = (
            ensure_hash_prefix(completion_menu_current_bg_color) if pretty else None
        )

        self.code_theme = code_theme

        self.input = input
        self.output = output

        # Redirected output implies plain (non-pretty) rendering.
        self.pretty = pretty
        if self.output:
            self.pretty = False

        self.yes = yes

        self.input_history_file = input_history_file
        self.llm_history_file = llm_history_file
        if chat_history_file is not None:
            self.chat_history_file = Path(chat_history_file)
        else:
            self.chat_history_file = None

        self.encoding = encoding
        valid_line_endings = {"platform", "lf", "crlf"}
        if line_endings not in valid_line_endings:
            raise ValueError(
                f"Invalid line_endings value: {line_endings}. "
                f"Must be one of: {', '.join(valid_line_endings)}"
            )
        # None lets open() pick the platform default newline translation.
        self.newline = (
            None if line_endings == "platform" else "\n" if line_endings == "lf" else "\r\n"
        )
        self.dry_run = dry_run

        self.prompt_session = None

        self.file_watcher = file_watcher
        self.root = root

    def read_image(self, filename):
        """Read a file as binary and return its contents base64-encoded.

        :param filename: path to the file (str or Path).
        :return: base64 string, or None after reporting an error.
        """
        try:
            with open(str(filename), "rb") as image_file:
                encoded_string = base64.b64encode(image_file.read())
                return encoded_string.decode("utf-8")
        # FileNotFoundError and IsADirectoryError are subclasses of OSError,
        # so they must be caught first.  The original code listed OSError
        # first, which made the two specific handlers unreachable.
        except FileNotFoundError:
            self.tool_error("(unknown): file not found error")
            return
        except IsADirectoryError:
            self.tool_error("(unknown): is a directory")
            return
        except OSError as err:
            self.tool_error(f"(unknown): unable to read: {err}")
            return
        except Exception as e:
            self.tool_error(f"(unknown): {e}")
            return

    def read_text(self, filename, silent=False):
        """Return the decoded text of filename, or its base64 for images.

        :param filename: path to the file (str or Path).
        :param silent: when True, suppress error reporting.
        :return: file contents, or None on any error.
        """
        if is_image_file(filename):
            return self.read_image(filename)

        try:
            with open(str(filename), "r", encoding=self.encoding) as f:
                return f.read()
        except FileNotFoundError:
            if not silent:
                self.tool_error("(unknown): file not found error")
            return
        except IsADirectoryError:
            if not silent:
                self.tool_error("(unknown): is a directory")
            return
        except OSError as err:
            if not silent:
                self.tool_error(f"(unknown): unable to read: {err}")
            return
        except UnicodeError as e:
            if not silent:
                self.tool_error(f"(unknown): {e}")
                self.tool_error("Use --encoding to set the unicode encoding.")
            return
|
394
|
+
|
395
|
+
|
396
|
+
# tree_sitter is throwing a FutureWarning; silence it process-wide before
# grep_ast imports tree_sitter below.
warnings.simplefilter("ignore", category=FutureWarning)
from grep_ast.tsl import USING_TSL_PACK, get_language, get_parser  # noqa: E402

# One definition or reference occurrence extracted from a source file.
# kind is "def" or "ref"; line is 0-based (-1 for pygments-backfilled refs).
Tag = namedtuple("Tag", "rel_fname fname line name kind".split())


# Errors the diskcache/sqlite-backed tags cache can raise; OSError covers
# filesystem-level failures.
SQLITE_ERRORS = (sqlite3.OperationalError, sqlite3.DatabaseError, OSError)


# Version suffix for the on-disk tags cache directory; bump to invalidate
# caches written with an incompatible format.
CACHE_VERSION = 3
if USING_TSL_PACK:
    # The tree-sitter language pack yields different tags, so its caches
    # must not be mixed with the non-pack ones.
    CACHE_VERSION = 4
|
409
|
+
|
410
|
+
|
411
|
+
class RepoMap:
    """Builds a token-budgeted "repo map": a ranked summary of the most
    relevant definitions across a repository, using tree-sitter tags and
    personalized PageRank over the definition/reference graph.
    """

    # Directory (under the repo root) holding the persistent tags cache;
    # the CACHE_VERSION suffix keeps incompatible formats separate.
    TAGS_CACHE_DIR = f".beswarm.tags.cache.v{CACHE_VERSION}"

    # Files already warned about; class-level, shared across instances.
    warned_files = set()
|
415
|
+
|
416
|
+
    def __init__(
        self,
        map_tokens=8192,
        root=None,
        main_model=None,
        io=None,
        repo_content_prefix=None,
        verbose=False,
        max_context_window=None,
        map_mul_no_files=8,
        refresh="auto",
    ):
        """Create a RepoMap.

        :param map_tokens: token budget for the generated map (<=0 disables).
        :param root: repository root; defaults to the current directory.
        :param main_model: model used for token counting (see token_count).
        :param io: I/O helper providing tool_output/tool_warning/read_text.
        :param repo_content_prefix: format string prepended to the map;
            may contain an ``{other}`` placeholder.
        :param verbose: emit progress/diagnostic output via ``io``.
        :param max_context_window: model context size used to cap expansion.
        :param map_mul_no_files: budget multiplier when no files are in chat.
        :param refresh: caching policy: "auto", "always", "files" or "manual".
        """
        self.io = io
        self.verbose = verbose
        self.refresh = refresh

        if not root:
            root = os.getcwd()
        self.root = root

        # Opens (or falls back from) the on-disk tags cache; must happen
        # after self.root is set.
        self.load_tags_cache()
        self.cache_threshold = 0.95

        self.max_map_tokens = map_tokens
        self.map_mul_no_files = map_mul_no_files
        self.max_context_window = max_context_window

        self.repo_content_prefix = repo_content_prefix

        self.main_model = main_model

        # In-memory caches for rendered trees and generated maps.
        self.tree_cache = {}
        self.tree_context_cache = {}
        self.map_cache = {}
        self.map_processing_time = 0
        self.last_map = None

        if self.verbose:
            self.io.tool_output(
                f"RepoMap initialized with map_mul_no_files: {self.map_mul_no_files}"
            )
|
457
|
+
|
458
|
+
def token_count(self, text):
|
459
|
+
len_text = len(text)
|
460
|
+
return len_text / 4
|
461
|
+
if len_text < 200:
|
462
|
+
return self.main_model.token_count(text)
|
463
|
+
|
464
|
+
lines = text.splitlines(keepends=True)
|
465
|
+
num_lines = len(lines)
|
466
|
+
step = num_lines // 100 or 1
|
467
|
+
lines = lines[::step]
|
468
|
+
sample_text = "".join(lines)
|
469
|
+
sample_tokens = self.main_model.token_count(sample_text)
|
470
|
+
est_tokens = sample_tokens / len(sample_text) * len_text
|
471
|
+
return est_tokens
|
472
|
+
|
473
|
+
    def get_repo_map(
        self,
        chat_files,
        other_files,
        mentioned_fnames=None,
        mentioned_idents=None,
        force_refresh=False,
    ):
        """Return the formatted repo map string, or None when disabled,
        when there are no other files, or when nothing could be ranked.

        :param chat_files: files already in the chat (get full attention).
        :param other_files: candidate files to summarize.
        :param mentioned_fnames: file names mentioned in conversation.
        :param mentioned_idents: identifiers mentioned in conversation.
        :param force_refresh: bypass the map cache.
        """
        if self.max_map_tokens <= 0:
            return
        if not other_files:
            return
        if not mentioned_fnames:
            mentioned_fnames = set()
        if not mentioned_idents:
            mentioned_idents = set()

        max_map_tokens = self.max_map_tokens

        # With no files in the chat, give a bigger view of the entire repo:
        # scale the budget up, but never closer than `padding` tokens to the
        # model's context window.
        padding = 4096
        if max_map_tokens and self.max_context_window:
            target = min(
                int(max_map_tokens * self.map_mul_no_files),
                self.max_context_window - padding,
            )
        else:
            target = 0
        if not chat_files and self.max_context_window and target > 0:
            max_map_tokens = target

        try:
            files_listing = self.get_ranked_tags_map(
                chat_files,
                other_files,
                max_map_tokens,
                mentioned_fnames,
                mentioned_idents,
                force_refresh,
            )
        except RecursionError:
            # Ranking recursed too deep; permanently disable the map for
            # this instance by zeroing the budget.
            self.io.tool_error("Disabling repo map, git repo too large?")
            self.max_map_tokens = 0
            return

        if not files_listing:
            return

        if self.verbose:
            num_tokens = self.token_count(files_listing)
            self.io.tool_output(f"Repo-map: {num_tokens / 1024:.1f} k-tokens")

        # The prefix wording differs depending on whether the map describes
        # "other" files (relative to the chat) or the whole repo.
        if chat_files:
            other = "other "
        else:
            other = ""

        if self.repo_content_prefix:
            repo_content = self.repo_content_prefix.format(other=other)
        else:
            repo_content = ""

        repo_content += files_listing

        return repo_content
|
538
|
+
|
539
|
+
def get_rel_fname(self, fname):
|
540
|
+
try:
|
541
|
+
return os.path.relpath(fname, self.root)
|
542
|
+
except ValueError:
|
543
|
+
# Issue #1288: ValueError: path is on mount 'C:', start on mount 'D:'
|
544
|
+
# Just return the full fname.
|
545
|
+
return fname
|
546
|
+
|
547
|
+
    def tags_cache_error(self, original_error=None):
        """Handle SQLite errors by trying to recreate cache, falling back to dict if needed.

        Called whenever a TAGS_CACHE operation raises one of SQLITE_ERRORS.
        After this returns, self.TAGS_CACHE is usable again: either a fresh
        on-disk Cache or a plain in-memory dict.
        """

        if self.verbose and original_error:
            self.io.tool_warning(f"Tags cache error: {str(original_error)}")

        # Already on the in-memory fallback; nothing more can be done.
        if isinstance(getattr(self, "TAGS_CACHE", None), dict):
            return

        path = Path(self.root) / self.TAGS_CACHE_DIR

        # Try to recreate the cache
        try:
            # Delete existing cache dir
            if path.exists():
                shutil.rmtree(path)

            # Try to create new cache
            new_cache = Cache(path)

            # Test that it works with a write/read/delete round trip.
            test_key = "test"
            new_cache[test_key] = "test"
            _ = new_cache[test_key]
            del new_cache[test_key]

            # If we got here, the new cache works
            self.TAGS_CACHE = new_cache
            return

        except SQLITE_ERRORS as e:
            # If anything goes wrong, warn and fall back to dict
            self.io.tool_warning(
                f"Unable to use tags cache at {path}, falling back to memory cache"
            )
            if self.verbose:
                self.io.tool_warning(f"Cache recreation error: {str(e)}")

        self.TAGS_CACHE = dict()
|
586
|
+
|
587
|
+
def load_tags_cache(self):
|
588
|
+
path = Path(self.root) / self.TAGS_CACHE_DIR
|
589
|
+
try:
|
590
|
+
self.TAGS_CACHE = Cache(path)
|
591
|
+
except SQLITE_ERRORS as e:
|
592
|
+
self.tags_cache_error(e)
|
593
|
+
|
594
|
+
    def save_tags_cache(self):
        # Intentional no-op: the diskcache-backed TAGS_CACHE presumably
        # persists writes as they happen, so there is nothing to flush --
        # TODO confirm.  Kept so callers have a stable hook.
        pass
|
596
|
+
|
597
|
+
def get_mtime(self, fname):
|
598
|
+
try:
|
599
|
+
return os.path.getmtime((self.root / Path(fname)))
|
600
|
+
except FileNotFoundError:
|
601
|
+
self.io.tool_warning(f"File not found error: {fname}")
|
602
|
+
|
603
|
+
    def get_tags(self, fname, rel_fname):
        """Return the list of Tag tuples for fname, memoized by mtime.

        Each SQLITE_ERRORS failure triggers tags_cache_error() (which swaps
        in a working cache) and then retries the operation exactly once --
        the statement order here is deliberate.

        :param fname: file path relative to the repo root.
        :param rel_fname: path to record inside each Tag.
        :return: list of Tag (empty if the file is missing).
        """
        # Check if the file is in the cache and if the modification time has not changed
        file_mtime = self.get_mtime(fname)
        if file_mtime is None:
            # File vanished; get_mtime already warned.
            return []
        cache_key = fname
        try:
            val = self.TAGS_CACHE.get(cache_key)  # Issue #1308
        except SQLITE_ERRORS as e:
            self.tags_cache_error(e)
            val = self.TAGS_CACHE.get(cache_key)

        # Cache hit only counts when the stored mtime matches the file's.
        if val is not None and val.get("mtime") == file_mtime:
            try:
                return self.TAGS_CACHE[cache_key]["data"]
            except SQLITE_ERRORS as e:
                self.tags_cache_error(e)
                return self.TAGS_CACHE[cache_key]["data"]

        # miss!
        data = list(self.get_tags_raw(fname, rel_fname))

        # Update the cache
        try:
            self.TAGS_CACHE[cache_key] = {"mtime": file_mtime, "data": data}
            self.save_tags_cache()
        except SQLITE_ERRORS as e:
            self.tags_cache_error(e)
            self.TAGS_CACHE[cache_key] = {"mtime": file_mtime, "data": data}

        return data
|
635
|
+
|
636
|
+
    def get_tags_raw(self, fname, rel_fname):
        """Yield Tag tuples for fname by running the language's tree-sitter
        tags query; falls back to pygments for references when the query
        only produced definitions.

        Yields nothing (returns early) when the language is unknown, the
        parser/query is unavailable, or the file has no readable content.
        """
        lang = filename_to_lang(str(self.root / Path(fname)))
        if not lang:
            return

        try:
            language = get_language(lang)
            parser = get_parser(lang)
        except Exception as err:
            print(f"Skipping file {fname}: {err}")
            return

        # get_scm_fname() is presumably defined elsewhere in this module;
        # it maps a language to its *-tags.scm query file -- TODO confirm.
        query_scm = get_scm_fname(lang)
        if not query_scm.exists():
            return
        query_scm = query_scm.read_text()

        code = self.io.read_text(str(self.root / Path(fname)))
        if not code:
            return
        tree = parser.parse(bytes(code, "utf-8"))

        # Run the tags queries
        query = language.query(query_scm)
        captures = query.captures(tree.root_node)

        saw = set()
        # The two tree-sitter backends return captures in different shapes:
        # the language pack maps tag -> nodes, the legacy API yields pairs.
        if USING_TSL_PACK:
            all_nodes = []
            for tag, nodes in captures.items():
                all_nodes += [(node, tag) for node in nodes]
        else:
            all_nodes = list(captures)

        for node, tag in all_nodes:
            if tag.startswith("name.definition."):
                kind = "def"
            elif tag.startswith("name.reference."):
                kind = "ref"
            else:
                continue

            saw.add(kind)

            result = Tag(
                rel_fname=rel_fname,
                fname=fname,
                name=node.text.decode("utf-8"),
                kind=kind,
                line=node.start_point[0],
            )

            yield result

        if "ref" in saw:
            return
        if "def" not in saw:
            return

        # We saw defs, without any refs
        # Some tags files only provide defs (cpp, for example)
        # Use pygments to backfill refs

        try:
            lexer = guess_lexer_for_filename(fname, code)
        except Exception:  # On Windows, bad ref to time.clock which is deprecated?
            return

        tokens = list(lexer.get_tokens(code))
        tokens = [token[1] for token in tokens if token[0] in Token.Name]

        for token in tokens:
            # line=-1 marks refs that have no real source position.
            yield Tag(
                rel_fname=rel_fname,
                fname=fname,
                name=token,
                kind="ref",
                line=-1,
            )
|
720
|
+
|
721
|
+
def get_ranked_tags(
|
722
|
+
self, chat_fnames, other_fnames, mentioned_fnames, mentioned_idents, progress=None
|
723
|
+
):
|
724
|
+
import networkx as nx
|
725
|
+
|
726
|
+
defines = defaultdict(set)
|
727
|
+
references = defaultdict(list)
|
728
|
+
definitions = defaultdict(set)
|
729
|
+
|
730
|
+
personalization = dict()
|
731
|
+
|
732
|
+
fnames = set(chat_fnames).union(set(other_fnames))
|
733
|
+
chat_rel_fnames = set()
|
734
|
+
|
735
|
+
fnames = sorted(fnames)
|
736
|
+
|
737
|
+
# Default personalization for unspecified files is 1/num_nodes
|
738
|
+
# https://networkx.org/documentation/stable/_modules/networkx/algorithms/link_analysis/pagerank_alg.html#pagerank
|
739
|
+
personalize = 100 / len(fnames)
|
740
|
+
|
741
|
+
try:
|
742
|
+
cache_size = len(self.TAGS_CACHE)
|
743
|
+
except SQLITE_ERRORS as e:
|
744
|
+
self.tags_cache_error(e)
|
745
|
+
cache_size = len(self.TAGS_CACHE)
|
746
|
+
|
747
|
+
if len(fnames) - cache_size > 100:
|
748
|
+
# self.io.tool_output(
|
749
|
+
# "Initial repo scan can be slow in larger repos, but only happens once."
|
750
|
+
# )
|
751
|
+
fnames = tqdm(fnames, desc="Scanning repo")
|
752
|
+
showing_bar = True
|
753
|
+
else:
|
754
|
+
showing_bar = False
|
755
|
+
|
756
|
+
for fname in fnames:
|
757
|
+
if self.verbose:
|
758
|
+
self.io.tool_output(f"Processing {fname}")
|
759
|
+
# if progress and not showing_bar:
|
760
|
+
# progress()
|
761
|
+
|
762
|
+
try:
|
763
|
+
file_ok = (self.root / Path(fname)).is_file()
|
764
|
+
except OSError:
|
765
|
+
file_ok = False
|
766
|
+
|
767
|
+
if not file_ok:
|
768
|
+
# print(f"file_ok: {file_ok}, fname: {self.root / Path(fname)}")
|
769
|
+
# if fname not in self.warned_files:
|
770
|
+
# self.io.tool_warning(f"Repo-map can't include {fname}")
|
771
|
+
# self.io.tool_output(
|
772
|
+
# "Has it been deleted from the file system but not from git?"
|
773
|
+
# )
|
774
|
+
# self.warned_files.add(fname)
|
775
|
+
continue
|
776
|
+
|
777
|
+
# dump(fname)
|
778
|
+
# print(f"self.root: {self.root}")
|
779
|
+
rel_fname = self.get_rel_fname((self.root / Path(fname)))
|
780
|
+
current_pers = 0.0 # Start with 0 personalization score
|
781
|
+
|
782
|
+
if fname in chat_fnames:
|
783
|
+
current_pers += personalize
|
784
|
+
chat_rel_fnames.add(rel_fname)
|
785
|
+
|
786
|
+
if rel_fname in mentioned_fnames:
|
787
|
+
# Use max to avoid double counting if in chat_fnames and mentioned_fnames
|
788
|
+
current_pers = max(current_pers, personalize)
|
789
|
+
|
790
|
+
# Check path components against mentioned_idents
|
791
|
+
path_obj = self.root / Path(rel_fname)
|
792
|
+
# print(f"path_obj: {path_obj.absolute()}")
|
793
|
+
path_components = set(path_obj.parts)
|
794
|
+
basename_with_ext = path_obj.name
|
795
|
+
basename_without_ext, _ = os.path.splitext(basename_with_ext)
|
796
|
+
components_to_check = path_components.union({basename_with_ext, basename_without_ext})
|
797
|
+
|
798
|
+
matched_idents = components_to_check.intersection(mentioned_idents)
|
799
|
+
if matched_idents:
|
800
|
+
# Add personalization *once* if any path component matches a mentioned ident
|
801
|
+
current_pers += personalize
|
802
|
+
|
803
|
+
if current_pers > 0:
|
804
|
+
personalization[rel_fname] = current_pers # Assign the final calculated value
|
805
|
+
|
806
|
+
tags = list(self.get_tags(fname, rel_fname))
|
807
|
+
if tags is None:
|
808
|
+
continue
|
809
|
+
|
810
|
+
for tag in tags:
|
811
|
+
if tag.kind == "def":
|
812
|
+
defines[tag.name].add(rel_fname)
|
813
|
+
key = (rel_fname, tag.name)
|
814
|
+
definitions[key].add(tag)
|
815
|
+
|
816
|
+
elif tag.kind == "ref":
|
817
|
+
references[tag.name].append(rel_fname)
|
818
|
+
|
819
|
+
##
|
820
|
+
# dump(defines)
|
821
|
+
# dump(references)
|
822
|
+
# dump(personalization)
|
823
|
+
|
824
|
+
if not references:
|
825
|
+
references = dict((k, list(v)) for k, v in defines.items())
|
826
|
+
|
827
|
+
idents = set(defines.keys()).intersection(set(references.keys()))
|
828
|
+
|
829
|
+
G = nx.MultiDiGraph()
|
830
|
+
|
831
|
+
# Add a small self-edge for every definition that has no references
|
832
|
+
# Helps with tree-sitter 0.23.2 with ruby, where "def greet(name)"
|
833
|
+
# isn't counted as a def AND a ref. tree-sitter 0.24.0 does.
|
834
|
+
for ident in defines.keys():
|
835
|
+
if ident in references:
|
836
|
+
continue
|
837
|
+
for definer in defines[ident]:
|
838
|
+
G.add_edge(definer, definer, weight=0.1, ident=ident)
|
839
|
+
# print(f"self.root: {self.root}")
|
840
|
+
for ident in idents:
|
841
|
+
if progress:
|
842
|
+
progress()
|
843
|
+
|
844
|
+
definers = defines[ident]
|
845
|
+
|
846
|
+
mul = 1.0
|
847
|
+
|
848
|
+
is_snake = ("_" in ident) and any(c.isalpha() for c in ident)
|
849
|
+
is_camel = any(c.isupper() for c in ident) and any(c.islower() for c in ident)
|
850
|
+
if ident in mentioned_idents:
|
851
|
+
mul *= 10
|
852
|
+
if (is_snake or is_camel) and len(ident) >= 8:
|
853
|
+
mul *= 10
|
854
|
+
if ident.startswith("_"):
|
855
|
+
mul *= 0.1
|
856
|
+
if len(defines[ident]) > 5:
|
857
|
+
mul *= 0.1
|
858
|
+
|
859
|
+
for referencer, num_refs in Counter(references[ident]).items():
|
860
|
+
for definer in definers:
|
861
|
+
# dump(referencer, definer, num_refs, mul)
|
862
|
+
# if referencer == definer:
|
863
|
+
# continue
|
864
|
+
|
865
|
+
use_mul = mul
|
866
|
+
if referencer in chat_rel_fnames:
|
867
|
+
use_mul *= 50
|
868
|
+
|
869
|
+
# scale down so high freq (low value) mentions don't dominate
|
870
|
+
num_refs = math.sqrt(num_refs)
|
871
|
+
|
872
|
+
G.add_edge(referencer, definer, weight=use_mul * num_refs, ident=ident)
|
873
|
+
|
874
|
+
if not references:
|
875
|
+
pass
|
876
|
+
|
877
|
+
if personalization:
|
878
|
+
pers_args = dict(personalization=personalization, dangling=personalization)
|
879
|
+
else:
|
880
|
+
pers_args = dict()
|
881
|
+
|
882
|
+
try:
|
883
|
+
ranked = nx.pagerank(G, weight="weight", **pers_args)
|
884
|
+
except ZeroDivisionError:
|
885
|
+
# Issue #1536
|
886
|
+
try:
|
887
|
+
ranked = nx.pagerank(G, weight="weight")
|
888
|
+
except ZeroDivisionError:
|
889
|
+
return []
|
890
|
+
|
891
|
+
# distribute the rank from each source node, across all of its out edges
|
892
|
+
ranked_definitions = defaultdict(float)
|
893
|
+
for src in G.nodes:
|
894
|
+
if progress:
|
895
|
+
progress()
|
896
|
+
|
897
|
+
src_rank = ranked[src]
|
898
|
+
total_weight = sum(data["weight"] for _src, _dst, data in G.out_edges(src, data=True))
|
899
|
+
# dump(src, src_rank, total_weight)
|
900
|
+
for _src, dst, data in G.out_edges(src, data=True):
|
901
|
+
data["rank"] = src_rank * data["weight"] / total_weight
|
902
|
+
ident = data["ident"]
|
903
|
+
ranked_definitions[(dst, ident)] += data["rank"]
|
904
|
+
|
905
|
+
ranked_tags = []
|
906
|
+
ranked_definitions = sorted(
|
907
|
+
ranked_definitions.items(), reverse=True, key=lambda x: (x[1], x[0])
|
908
|
+
)
|
909
|
+
|
910
|
+
# dump(ranked_definitions)
|
911
|
+
|
912
|
+
for (fname, ident), rank in ranked_definitions:
|
913
|
+
# print(f"{rank:.03f} {fname} {ident}")
|
914
|
+
if fname in chat_rel_fnames:
|
915
|
+
continue
|
916
|
+
ranked_tags += list(definitions.get((fname, ident), []))
|
917
|
+
# print(f"self.root: {self.root}")
|
918
|
+
rel_other_fnames_without_tags = set(self.get_rel_fname((self.root / Path(fname))) for fname in other_fnames)
|
919
|
+
# print(f"self.root: {self.root}")
|
920
|
+
fnames_already_included = set(rt[0] for rt in ranked_tags)
|
921
|
+
|
922
|
+
top_rank = sorted([(rank, node) for (node, rank) in ranked.items()], reverse=True)
|
923
|
+
for rank, fname in top_rank:
|
924
|
+
if fname in rel_other_fnames_without_tags:
|
925
|
+
rel_other_fnames_without_tags.remove(fname)
|
926
|
+
if fname not in fnames_already_included:
|
927
|
+
ranked_tags.append((fname,))
|
928
|
+
# print(f"self.root: {self.root}")
|
929
|
+
|
930
|
+
for fname in rel_other_fnames_without_tags:
|
931
|
+
# print(f"fname: {fname}")
|
932
|
+
# print(f"self.root / Path(fname).absolute(): {self.root / Path(fname)}")
|
933
|
+
ranked_tags.append((str(self.root / Path(fname)),))
|
934
|
+
# if "main.py" in fname:
|
935
|
+
# print(f"tags: {fname}, {tags}")
|
936
|
+
# print(f"ranked_tags: {ranked_tags}")
|
937
|
+
return ranked_tags
|
938
|
+
|
939
|
+
def get_ranked_tags_map(
|
940
|
+
self,
|
941
|
+
chat_fnames,
|
942
|
+
other_fnames=None,
|
943
|
+
max_map_tokens=None,
|
944
|
+
mentioned_fnames=None,
|
945
|
+
mentioned_idents=None,
|
946
|
+
force_refresh=False,
|
947
|
+
):
|
948
|
+
# Create a cache key
|
949
|
+
cache_key = [
|
950
|
+
tuple(sorted(chat_fnames)) if chat_fnames else None,
|
951
|
+
tuple(sorted(other_fnames)) if other_fnames else None,
|
952
|
+
max_map_tokens,
|
953
|
+
]
|
954
|
+
# print("cache_key", cache_key)
|
955
|
+
|
956
|
+
if self.refresh == "auto":
|
957
|
+
cache_key += [
|
958
|
+
tuple(sorted(mentioned_fnames)) if mentioned_fnames else None,
|
959
|
+
tuple(sorted(mentioned_idents)) if mentioned_idents else None,
|
960
|
+
]
|
961
|
+
cache_key = tuple(cache_key)
|
962
|
+
|
963
|
+
use_cache = False
|
964
|
+
if not force_refresh:
|
965
|
+
if self.refresh == "manual" and self.last_map:
|
966
|
+
return self.last_map
|
967
|
+
|
968
|
+
if self.refresh == "always":
|
969
|
+
use_cache = False
|
970
|
+
elif self.refresh == "files":
|
971
|
+
use_cache = True
|
972
|
+
elif self.refresh == "auto":
|
973
|
+
use_cache = self.map_processing_time > 1.0
|
974
|
+
|
975
|
+
# Check if the result is in the cache
|
976
|
+
if use_cache and cache_key in self.map_cache:
|
977
|
+
return self.map_cache[cache_key]
|
978
|
+
|
979
|
+
# If not in cache or force_refresh is True, generate the map
|
980
|
+
start_time = time.time()
|
981
|
+
result = self.get_ranked_tags_map_uncached(
|
982
|
+
chat_fnames, other_fnames, max_map_tokens, mentioned_fnames, mentioned_idents
|
983
|
+
)
|
984
|
+
# print(f"result: {result}")
|
985
|
+
end_time = time.time()
|
986
|
+
self.map_processing_time = end_time - start_time
|
987
|
+
|
988
|
+
# Store the result in the cache
|
989
|
+
self.map_cache[cache_key] = result
|
990
|
+
self.last_map = result
|
991
|
+
|
992
|
+
# print(f"result: {result}")
|
993
|
+
return result
|
994
|
+
|
995
|
+
def get_ranked_tags_map_uncached(
|
996
|
+
self,
|
997
|
+
chat_fnames,
|
998
|
+
other_fnames=None,
|
999
|
+
max_map_tokens=None,
|
1000
|
+
mentioned_fnames=None,
|
1001
|
+
mentioned_idents=None,
|
1002
|
+
):
|
1003
|
+
if not other_fnames:
|
1004
|
+
other_fnames = list()
|
1005
|
+
if not max_map_tokens:
|
1006
|
+
max_map_tokens = self.max_map_tokens
|
1007
|
+
if not mentioned_fnames:
|
1008
|
+
mentioned_fnames = set()
|
1009
|
+
if not mentioned_idents:
|
1010
|
+
mentioned_idents = set()
|
1011
|
+
|
1012
|
+
# spin = Spinner("Updating repo map")
|
1013
|
+
|
1014
|
+
ranked_tags = self.get_ranked_tags(
|
1015
|
+
chat_fnames,
|
1016
|
+
other_fnames,
|
1017
|
+
mentioned_fnames,
|
1018
|
+
mentioned_idents,
|
1019
|
+
# progress=spin.step,
|
1020
|
+
)
|
1021
|
+
|
1022
|
+
other_rel_fnames = sorted(set(self.get_rel_fname(fname) for fname in other_fnames))
|
1023
|
+
special_fnames = filter_important_files(other_rel_fnames)
|
1024
|
+
ranked_tags_fnames = set(tag[0] for tag in ranked_tags)
|
1025
|
+
special_fnames = [fn for fn in special_fnames if fn not in ranked_tags_fnames]
|
1026
|
+
special_fnames = [(fn,) for fn in special_fnames]
|
1027
|
+
|
1028
|
+
ranked_tags = special_fnames + ranked_tags
|
1029
|
+
# print("ranked_tags", ranked_tags)
|
1030
|
+
|
1031
|
+
# spin.step()
|
1032
|
+
|
1033
|
+
num_tags = len(ranked_tags)
|
1034
|
+
lower_bound = 0
|
1035
|
+
upper_bound = num_tags
|
1036
|
+
best_tree = None
|
1037
|
+
best_tree_tokens = 0
|
1038
|
+
|
1039
|
+
chat_rel_fnames = set(self.get_rel_fname(fname) for fname in chat_fnames)
|
1040
|
+
|
1041
|
+
self.tree_cache = dict()
|
1042
|
+
|
1043
|
+
middle = min(int(max_map_tokens // 25), num_tags)
|
1044
|
+
# print(f"max_map_tokens: {max_map_tokens}")
|
1045
|
+
while lower_bound <= upper_bound:
|
1046
|
+
# dump(lower_bound, middle, upper_bound)
|
1047
|
+
|
1048
|
+
# spin.step()
|
1049
|
+
|
1050
|
+
tree = self.to_tree(ranked_tags[:middle], chat_rel_fnames)
|
1051
|
+
# print("tree", tree)
|
1052
|
+
num_tokens = self.token_count(tree)
|
1053
|
+
|
1054
|
+
pct_err = abs(num_tokens - max_map_tokens) / max_map_tokens
|
1055
|
+
ok_err = 0.15
|
1056
|
+
if (num_tokens <= max_map_tokens and num_tokens > best_tree_tokens) or pct_err < ok_err:
|
1057
|
+
best_tree = tree
|
1058
|
+
best_tree_tokens = num_tokens
|
1059
|
+
|
1060
|
+
if pct_err < ok_err:
|
1061
|
+
break
|
1062
|
+
|
1063
|
+
if num_tokens < max_map_tokens:
|
1064
|
+
lower_bound = middle + 1
|
1065
|
+
else:
|
1066
|
+
upper_bound = middle - 1
|
1067
|
+
|
1068
|
+
middle = int((lower_bound + upper_bound) // 2)
|
1069
|
+
|
1070
|
+
# spin.end()
|
1071
|
+
# print("best_tree", repr(best_tree))
|
1072
|
+
return best_tree
|
1073
|
+
|
1074
|
+
tree_cache = dict()
|
1075
|
+
|
1076
|
+
def render_tree(self, abs_fname, rel_fname, lois):
|
1077
|
+
mtime = self.get_mtime(abs_fname)
|
1078
|
+
key = (rel_fname, tuple(sorted(lois)), mtime)
|
1079
|
+
|
1080
|
+
# print(f"key: {key}")
|
1081
|
+
# print(f"self.tree_cache: {self.tree_cache}")
|
1082
|
+
if key in self.tree_cache:
|
1083
|
+
return self.tree_cache[key]
|
1084
|
+
# print(f"abs_fname: {abs_fname}")
|
1085
|
+
# print(f"rel_fname: {rel_fname}")
|
1086
|
+
# print(f"mtime: {mtime}")
|
1087
|
+
# print(f"self.tree_context_cache: {self.tree_context_cache}")
|
1088
|
+
if (
|
1089
|
+
rel_fname not in self.tree_context_cache
|
1090
|
+
or self.tree_context_cache[rel_fname]["mtime"] != mtime
|
1091
|
+
):
|
1092
|
+
# print(f"abs_fname: {abs_fname}")
|
1093
|
+
code = self.io.read_text(abs_fname) or ""
|
1094
|
+
# print(f"code: {code}")
|
1095
|
+
if not code.endswith("\n"):
|
1096
|
+
code += "\n"
|
1097
|
+
|
1098
|
+
context = TreeContext(
|
1099
|
+
rel_fname,
|
1100
|
+
code,
|
1101
|
+
color=False,
|
1102
|
+
line_number=False,
|
1103
|
+
child_context=False,
|
1104
|
+
last_line=False,
|
1105
|
+
margin=0,
|
1106
|
+
mark_lois=False,
|
1107
|
+
loi_pad=0,
|
1108
|
+
# header_max=30,
|
1109
|
+
show_top_of_file_parent_scope=False,
|
1110
|
+
)
|
1111
|
+
self.tree_context_cache[rel_fname] = {"context": context, "mtime": mtime}
|
1112
|
+
|
1113
|
+
context = self.tree_context_cache[rel_fname]["context"]
|
1114
|
+
context.lines_of_interest = set()
|
1115
|
+
context.add_lines_of_interest(lois)
|
1116
|
+
context.add_context()
|
1117
|
+
res = context.format()
|
1118
|
+
self.tree_cache[key] = res
|
1119
|
+
return res
|
1120
|
+
|
1121
|
+
def to_tree(self, tags, chat_rel_fnames):
|
1122
|
+
# print("tags", tags)
|
1123
|
+
# print("chat_rel_fnames", chat_rel_fnames)
|
1124
|
+
if not tags:
|
1125
|
+
return ""
|
1126
|
+
|
1127
|
+
cur_fname = None
|
1128
|
+
cur_abs_fname = None
|
1129
|
+
lois = None
|
1130
|
+
output = ""
|
1131
|
+
|
1132
|
+
# add a bogus tag at the end so we trip the this_fname != cur_fname...
|
1133
|
+
dummy_tag = (None,)
|
1134
|
+
for tag in sorted(tags) + [dummy_tag]:
|
1135
|
+
this_rel_fname = tag[0]
|
1136
|
+
if this_rel_fname in chat_rel_fnames:
|
1137
|
+
continue
|
1138
|
+
|
1139
|
+
# ... here ... to output the final real entry in the list
|
1140
|
+
if this_rel_fname != cur_fname:
|
1141
|
+
# print("this_rel_fname", this_rel_fname)
|
1142
|
+
# print("lois", lois, tag, type(tag), type(tag) is Tag)
|
1143
|
+
if lois is not None:
|
1144
|
+
output += "\n"
|
1145
|
+
output += str(self.root / Path(cur_fname)) + ":\n"
|
1146
|
+
# print(f"cur_abs_fname: {cur_abs_fname}, {type(cur_abs_fname)}")
|
1147
|
+
output += self.render_tree(self.root / Path(cur_abs_fname), cur_fname, lois)
|
1148
|
+
lois = None
|
1149
|
+
elif cur_fname:
|
1150
|
+
output += "\n" + cur_fname + "\n"
|
1151
|
+
if type(tag) is Tag:
|
1152
|
+
lois = []
|
1153
|
+
cur_abs_fname = tag.fname
|
1154
|
+
cur_fname = this_rel_fname
|
1155
|
+
|
1156
|
+
if lois is not None:
|
1157
|
+
lois.append(tag.line)
|
1158
|
+
|
1159
|
+
# truncate long lines, in case we get minified js or something else crazy
|
1160
|
+
output = "\n".join([line[:100] for line in output.splitlines()]) + "\n"
|
1161
|
+
|
1162
|
+
return output
|
1163
|
+
|
1164
|
+
|
1165
|
+
def find_src_files(directory):
    """Return every file path under ``directory``.

    A path that is not a directory is returned as a one-element list, so
    callers can treat files and directories uniformly.
    """
    if not os.path.isdir(directory):
        return [directory]

    return [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(directory)
        for name in names
    ]
|
1174
|
+
|
1175
|
+
|
1176
|
+
def get_random_color():
    """Return a random fully-saturated color as a ``#rrggbb`` hex string."""
    hue = random.random()
    rgb = colorsys.hsv_to_rgb(hue, 1, 0.75)
    red, green, blue = (int(channel * 255) for channel in rgb)
    return f"#{red:02x}{green:02x}{blue:02x}"
|
1181
|
+
|
1182
|
+
|
1183
|
+
def get_scm_fname(lang):
    """Return the path of the tree-sitter tags query (.scm) for ``lang``.

    When the tree-sitter-language-pack is in use, its query file is preferred
    if it exists on disk; otherwise the tree-sitter-languages query path is
    returned unconditionally (callers check existence themselves).

    The original wrapped plain Path construction in ``try/except KeyError`` —
    a leftover from a ``resources.files`` implementation; Path operations
    cannot raise KeyError, so the dead handlers are removed here.
    """
    queries_dir = Path(__file__).parent.parent / "queries"

    if USING_TSL_PACK:
        path = queries_dir / "tree-sitter-language-pack" / f"{lang}-tags.scm"
        if path.exists():
            return path

    # Fall back to the tree-sitter-languages queries.
    return queries_dir / "tree-sitter-languages" / f"{lang}-tags.scm"
|
1212
|
+
|
1213
|
+
|
1214
|
+
def get_supported_languages_md():
    """Build a markdown table of languages with repo-map and linter support.

    Repo-map support means a tags query file exists for the language; linter
    support is marked unconditionally here.
    """
    from grep_ast.parsers import PARSERS

    res = """
| Language | File extension | Repo map | Linter |
|:--------:|:--------------:|:--------:|:------:|
"""
    # PARSERS maps extension -> language; sort rows by language name.
    data = sorted((lang, ex) for ex, lang in PARSERS.items())

    for lang, ext in data:
        fn = get_scm_fname(lang)
        # get_scm_fname may return None (no query file found); guard so we
        # never call Path(None), which raises TypeError.
        repo_map = "✓" if fn and Path(fn).exists() else ""
        linter_support = "✓"
        res += f"| {lang:20} | {ext:20} | {repo_map:^8} | {linter_support:^6} |\n"

    res += "\n"

    return res
|
1232
|
+
|
1233
|
+
def find_all_files(dir_path):
    """Recursively collect relative paths of all files under ``dir_path``,
    skipping common junk directories (.git, caches, virtualenvs, ...)."""
    excluded_dirs = {'.git', '__pycache__', '.venv', '.env', 'node_modules'}

    collected = []
    for root, dirs, files in os.walk(dir_path):
        # Prune excluded directories in place so os.walk never descends into them.
        dirs[:] = [d for d in dirs if d not in excluded_dirs]
        collected.extend(
            os.path.relpath(os.path.join(root, name), dir_path) for name in files
        )
    return collected
|
1244
|
+
|
1245
|
+
|
1246
|
+
@register_tool()
def get_code_repo_map(dir_path):
    """
    Get a structural map or summary of the given code repository. This tool
    should be preferred first when studying a code repository.

    This tool analyzes the repository under the given directory, scans the
    source files, identifies key definitions (such as functions and classes)
    and the references between them, and produces a code-structure summary
    ranked by importance. This helps quickly understand the organization and
    core components of a large codebase.

    Args:
        dir_path: str - Root directory path of the code repository to analyze.

    Returns:
        str - A string containing the repository's structure map or summary.
              May return an error message or an empty string if the directory
              is invalid or an error occurs during analysis.
              The map typically contains the paths of important files plus the
              most relevant code snippets (definitions) from those files.
    """
    # Build a RepoMap rooted at the target directory; InputOutput handles
    # file reads (see render_tree's use of self.io.read_text).
    rm = RepoMap(root=dir_path, io=InputOutput())
    # Every non-excluded file in the tree is a candidate for the map.
    other_fnames = find_all_files(dir_path)
    # No chat files: rank purely across the repository's own files.
    repo_map = rm.get_ranked_tags_map([], other_fnames)
    return repo_map
|
1267
|
+
|
1268
|
+
if __name__ == "__main__":
    import sys

    # The original hard-coded a machine-specific absolute path
    # ("/Users/yanyuming/..."); accept the target repo on the command line
    # instead, defaulting to the current directory.
    target_dir = sys.argv[1] if len(sys.argv) > 1 else "."
    print(get_code_repo_map(target_dir))
|