lgit-cli 3.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lgit/__init__.py +75 -0
- lgit/__main__.py +8 -0
- lgit/analysis.py +326 -0
- lgit/api.py +1077 -0
- lgit/cache.py +338 -0
- lgit/changelog.py +523 -0
- lgit/cli.py +1104 -0
- lgit/compose.py +2110 -0
- lgit/config.py +437 -0
- lgit/diffing.py +384 -0
- lgit/errors.py +137 -0
- lgit/git.py +852 -0
- lgit/map_reduce.py +508 -0
- lgit/markdown_output.py +709 -0
- lgit/models.py +924 -0
- lgit/normalization.py +411 -0
- lgit/patch.py +784 -0
- lgit/profile.py +426 -0
- lgit/py.typed +0 -0
- lgit/repo.py +287 -0
- lgit/resources/__init__.py +1 -0
- lgit/resources/commit_types.json +242 -0
- lgit/resources/prompts/analysis/default.md +237 -0
- lgit/resources/prompts/analysis/markdown.md +112 -0
- lgit/resources/prompts/changelog/default.md +89 -0
- lgit/resources/prompts/changelog/markdown.md +60 -0
- lgit/resources/prompts/compose-bind/default.md +40 -0
- lgit/resources/prompts/compose-bind/markdown.md +41 -0
- lgit/resources/prompts/compose-intent/default.md +63 -0
- lgit/resources/prompts/compose-intent/markdown.md +59 -0
- lgit/resources/prompts/fast/default.md +46 -0
- lgit/resources/prompts/fast/markdown.md +51 -0
- lgit/resources/prompts/map/default.md +67 -0
- lgit/resources/prompts/map/markdown.md +63 -0
- lgit/resources/prompts/reduce/default.md +81 -0
- lgit/resources/prompts/reduce/markdown.md +68 -0
- lgit/resources/prompts/summary/default.md +74 -0
- lgit/resources/prompts/summary/markdown.md +77 -0
- lgit/resources/validation_data.json +1 -0
- lgit/rewrite.py +392 -0
- lgit/style.py +295 -0
- lgit/templates.py +385 -0
- lgit/testing/__init__.py +62 -0
- lgit/testing/compare.py +57 -0
- lgit/testing/fixture.py +386 -0
- lgit/testing/report.py +201 -0
- lgit/testing/runner.py +256 -0
- lgit/tokens.py +90 -0
- lgit/validation.py +545 -0
- lgit_cli-3.7.0.dist-info/METADATA +288 -0
- lgit_cli-3.7.0.dist-info/RECORD +54 -0
- lgit_cli-3.7.0.dist-info/WHEEL +4 -0
- lgit_cli-3.7.0.dist-info/entry_points.txt +2 -0
- lgit_cli-3.7.0.dist-info/licenses/LICENSE +21 -0
lgit/map_reduce.py
ADDED
|
@@ -0,0 +1,508 @@
|
|
|
1
|
+
"""Map-reduce analysis for large git diffs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import json
|
|
7
|
+
from collections.abc import Iterable, Mapping, Sequence
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from .api import (
|
|
13
|
+
OneShotSpec,
|
|
14
|
+
build_analysis_schema,
|
|
15
|
+
format_types_description,
|
|
16
|
+
render_prompt,
|
|
17
|
+
run_oneshot,
|
|
18
|
+
strict_json_schema,
|
|
19
|
+
)
|
|
20
|
+
from .diffing import FileDiff, parse_diff, reconstruct_diff
|
|
21
|
+
from .markdown_output import analysis_from_mapping, fallback_summary, parse_conventional_analysis_markdown
|
|
22
|
+
from .models import AnalysisDetail, ConventionalAnalysis, resolve_model_name
|
|
23
|
+
from .tokens import create_token_counter
|
|
24
|
+
|
|
25
|
+
MAX_FILE_TOKENS = 50_000
|
|
26
|
+
MAP_PHASE_CONCURRENCY = 16
|
|
27
|
+
MAX_CONTEXT_FILES = 20
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass(frozen=True, slots=True)
|
|
31
|
+
class FileObservation:
|
|
32
|
+
"""Factual observations extracted for one changed file."""
|
|
33
|
+
|
|
34
|
+
file: str
|
|
35
|
+
observations: tuple[str, ...]
|
|
36
|
+
additions: int = 0
|
|
37
|
+
deletions: int = 0
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def should_use_map_reduce(diff: str, config: Any, counter: Any | None = None) -> bool:
    """Return whether ``diff`` should be analyzed with map-reduce.

    Map-reduce is selected when it is enabled on ``config`` (attribute
    ``map_reduce_enabled``, default ``True``) and either

    * any single included file is estimated above ``MAX_FILE_TOKENS``, or
    * the running token total of the included files reaches
      ``config.map_reduce_threshold`` (default ``5000``).

    Args:
        diff: Raw diff text; parsed with ``parse_diff``.
        config: Configuration object; read via ``getattr`` with defaults.
        counter: Optional token counter. When falsy, one is created from
            ``config`` with ``create_token_counter``.

    Returns:
        ``True`` when map-reduce should be used, ``False`` otherwise
        (including when no file survives the exclusion filter).
    """

    if not bool(getattr(config, "map_reduce_enabled", True)):
        return False
    counter = counter or create_token_counter(config)
    files = _included_files(parse_diff(diff), config)
    if not files:
        return False
    # The threshold does not change while scanning, so read it once.
    threshold = int(getattr(config, "map_reduce_threshold", 5000))
    total_tokens = 0
    for file in files:
        file_tokens = file.token_estimate(counter)
        if file_tokens > MAX_FILE_TOKENS:
            return True
        total_tokens += file_tokens
        if total_tokens >= threshold:
            return True
    # Every path that reaches the threshold returns inside the loop, so
    # falling through means the diff stayed below it.
    return False
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def build_file_batches(files: Sequence[FileDiff], counter: Any, budget: int) -> list[list[int]]:
    """Batch every file index of ``files`` under a per-batch token budget.

    Delegates to ``_build_file_batches_for_indices`` with the full index
    range ``0 .. len(files) - 1``.
    """

    every_index = range(len(files))
    return _build_file_batches_for_indices(files, every_index, counter, budget)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def build_llm_file_batches(files: Sequence[FileDiff], counter: Any, budget: int) -> list[list[int]]:
    """Batch the indices of non-binary files under a per-batch token budget.

    Files whose ``is_binary`` flag is truthy are left out; the remaining
    indices (in their original order) are handed to
    ``_build_file_batches_for_indices``.
    """

    text_indices: list[int] = []
    for position, entry in enumerate(files):
        if entry.is_binary:
            continue
        text_indices.append(position)
    return _build_file_batches_for_indices(files, text_indices, counter, budget)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
async def observe_diff_files(
    diff: str, map_model_name: str, config: Any, counter: Any | None = None
) -> list[FileObservation]:
    """Parse ``diff``, drop excluded files and run the map phase on the rest.

    Args:
        diff: Raw diff text.
        map_model_name: Model name forwarded to the map phase.
        config: Configuration object (exclusions, batching budget, ...).
        counter: Optional token counter; created from ``config`` when falsy.

    Returns:
        One ``FileObservation`` per included file.

    Raises:
        ValueError: If no file remains after the exclusion filter.
    """

    token_counter = counter or create_token_counter(config)
    relevant = _included_files(parse_diff(diff), config)
    if relevant:
        return await _map_phase(relevant, map_model_name, config, token_counter)
    raise ValueError("No relevant files to summarize after filtering")
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
async def reduce_phase(
    observations: Sequence[FileObservation], stat: str, scope_candidates: str, model_name: str, config: Any
) -> ConventionalAnalysis:
    """Synthesize map observations into final conventional analysis.

    The observations are serialized to JSON, rendered into the "reduce"
    prompt (markdown or default variant, per ``config.markdown_output``) and
    sent through ``run_oneshot``. The response is interpreted in this order:

    1. an already-built ``ConventionalAnalysis`` is returned as-is;
    2. a mapping is converted with ``analysis_from_mapping``;
    3. non-empty ``text_content`` is parsed as markdown;
    4. otherwise a local fallback analysis is built from the observations.

    Args:
        observations: Per-file observations from the map phase.
        stat: Diff stat text included in the prompt and in the fallback summary.
        scope_candidates: Scope candidates text included in the prompt.
        model_name: Model used for the reduce call (resolved via
            ``resolve_model_name``).
        config: Configuration object.

    Returns:
        The resulting ``ConventionalAnalysis``.
    """

    # Fall back to a single "chore" type when the config defines no types.
    type_enum = list(getattr(config, "types", {}) or {"chore": None})
    observations_json = json.dumps(
        [_observation_to_mapping(item) for item in observations], ensure_ascii=False, indent=2
    )
    variant = "markdown" if bool(getattr(config, "markdown_output", True)) else "default"
    system_prompt, user_prompt = _render_reduce_prompt(
        variant,
        observations_json,
        stat,
        scope_candidates,
        format_types_description(config),
    )
    response = await run_oneshot(
        config,
        OneShotSpec(
            operation="map-reduce/reduce",
            model=resolve_model_name(model_name),
            prompt_family="reduce",
            prompt_variant=variant,
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            tool_name="create_conventional_analysis",
            tool_description="Analyze file observations and classify as a conventional commit",
            schema=build_analysis_schema(type_enum, config),
            progress_label="reduce file observations",
            cacheable=True,
        ),
    )
    # ``run_oneshot`` may hand back either a response wrapper or the bare output.
    output = response.output if hasattr(response, "output") else response
    default_type = type_enum[0] if type_enum else "chore"
    if isinstance(output, ConventionalAnalysis):
        return output
    if isinstance(output, Mapping):
        return analysis_from_mapping(output, default_type=default_type)
    text_content = getattr(response, "text_content", None)
    if text_content:
        try:
            return parse_conventional_analysis_markdown(text_content, default_type=default_type)
        except ValueError:
            # Unparseable markdown: fall through to the local fallback below.
            pass
    # Forward ``stat`` so the fallback summary can use it; it was previously
    # dropped and the helper always saw its empty default.
    return _fallback_reduce_analysis(observations, config, stat)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
async def run_map_reduce(*args: Any, **kwargs: Any) -> ConventionalAnalysis:
    """Run the map phase followed by the reduce phase for a large diff.

    Two calling conventions are accepted and told apart by the type of the
    first positional argument:

    * Rust-port order (first positional is a ``str``):
      ``(diff, stat, scope_candidates, model_name, config, counter)``.
    * Python order (anything else):
      ``(config, stat, diff, scope_candidates)`` with ``model_name`` and
      ``counter`` taken from keyword arguments only.

    Missing positionals are looked up in ``kwargs``; ``config`` is mandatory
    and raises ``KeyError`` when absent from both.
    """

    def positional(slot: int, key: str, default: Any = None) -> Any:
        # Prefer the positional slot, otherwise fall back to the keyword.
        return args[slot] if len(args) > slot else kwargs.get(key, default)

    rust_order = bool(args) and isinstance(args[0], str)
    if rust_order:
        diff = args[0]
        stat = positional(1, "stat", "")
        scope_candidates = positional(2, "scope_candidates", "")
        model_name = positional(3, "model_name")
        config = args[4] if len(args) > 4 else kwargs["config"]
        counter = positional(5, "counter")
    else:
        config = args[0] if args else kwargs["config"]
        stat = positional(1, "stat", "")
        diff = positional(2, "diff", "")
        scope_candidates = positional(3, "scope_candidates", "")
        model_name = kwargs.get("model_name")
        counter = kwargs.get("counter")

    counter = counter or create_token_counter(config)
    # Explicit model name wins; otherwise config.analysis_model, then
    # config.model, then the hard-coded default.
    reduce_choice = model_name or getattr(config, "analysis_model", getattr(config, "model", "claude-opus-4.5"))
    reduce_model = resolve_model_name(str(reduce_choice))
    # The map phase prefers config.summary_model, then config.model, then the
    # reduce model.
    map_choice = getattr(config, "summary_model", getattr(config, "model", reduce_model))
    map_model = resolve_model_name(str(map_choice))

    observations = await observe_diff_files(str(diff), map_model, config, counter)
    return await reduce_phase(observations, str(stat), str(scope_candidates), reduce_model, config)
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
async def _map_phase(
    files: Sequence[FileDiff], map_model_name: str, config: Any, counter: Any
) -> list[FileObservation]:
    """Produce one observation per file, querying the model in batches.

    Binary files get a fixed "Binary file changed." observation without a
    model call. The remaining files are grouped by
    ``config.map_batch_token_budget`` (default 16000) and the batches run
    concurrently, limited by ``MAP_PHASE_CONCURRENCY``.

    Returns:
        Observations in the same order as ``files``.

    Raises:
        RuntimeError: If any file ends up without an observation.
    """
    headers = _ContextHeaders(files)
    budget = int(getattr(config, "map_batch_token_budget", 16000))
    batches = build_llm_file_batches(files, counter, budget)

    # One slot per input file; binary files are resolved up front.
    slots: list[FileObservation | None] = [
        FileObservation(entry.filename, ("Binary file changed.",), entry.additions, entry.deletions)
        if entry.is_binary
        else None
        for entry in files
    ]

    limiter = asyncio.Semaphore(MAP_PHASE_CONCURRENCY)

    async def run_batch(batch_idx: int, batch_indices: list[int]) -> list[tuple[int, FileObservation]]:
        async with limiter:
            selected = [files[position] for position in batch_indices]
            header = headers.header_for_files([entry.filename for entry in selected])
            label = f"map batch {batch_idx + 1}/{len(batches)} ({len(selected)} files)"
            produced = await _map_file_batch(selected, header, map_model_name, config, counter, label)
            # strict=True: a batch must return exactly one observation per file.
            return list(zip(batch_indices, produced, strict=True))

    pending = [run_batch(number, members) for number, members in enumerate(batches)]
    for pairs in await asyncio.gather(*pending):
        for position, observation in pairs:
            slots[position] = observation

    completed: list[FileObservation] = []
    for position, observation in enumerate(slots):
        if observation is None:
            raise RuntimeError(f"Missing map observation for {files[position].filename}")
        completed.append(observation)
    return completed
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
async def _map_file_batch(
    files: Sequence[FileDiff], context_header: str, model_name: str, config: Any, counter: Any, progress_label: str
) -> list[FileObservation]:
    """Send one batch of files to the model and convert the reply.

    Each file is rendered to diff text, the "map" prompt is built (markdown
    or default variant per ``config.markdown_output``) and a single
    ``run_oneshot`` call is made. The reply is turned into one
    ``FileObservation`` per input file.
    """
    prompt_files = [
        {"path": entry.filename, "diff": _render_file_diff_for_batch(entry, counter)} for entry in files
    ]
    if bool(getattr(config, "markdown_output", True)):
        variant = "markdown"
    else:
        variant = "default"
    system_prompt, user_prompt = _render_map_prompt(variant, prompt_files, context_header)

    spec = OneShotSpec(
        operation="map-reduce/map",
        model=resolve_model_name(model_name),
        prompt_family="map",
        prompt_variant=variant,
        system_prompt=system_prompt,
        user_prompt=user_prompt,
        tool_name="create_file_observations",
        tool_description="Extract observations from a batch of file changes",
        schema=_batch_observation_schema(),
        progress_label=progress_label,
        cacheable=True,
    )
    response = await run_oneshot(config, spec)

    # The response may be a wrapper exposing ``output`` or the bare output.
    output = getattr(response, "output", response)
    return _map_batch_response_to_observations(
        files,
        output,
        getattr(response, "text_content", None),
        getattr(response, "stop_reason", None),
    )
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def _map_batch_response_to_observations(
    files: Sequence[FileDiff], output: Any, text_content: str | None, stop_reason: str | None
) -> list[FileObservation]:
    """Pair each batch file with its entry in the model output.

    Files with no matching entry receive a generic fallback observation. A
    matched entry with no usable observations also gets the fallback text,
    but only when the model stopped because of ``max_tokens``; otherwise the
    file keeps an empty observation tuple.

    Returns:
        One ``FileObservation`` per element of ``files``, in order.
    """
    entries = _observation_entries(output)
    has_text = bool(text_content and text_content.strip())
    if has_text and not entries:
        # Only free text came back: nothing structured to match against.
        return [_fallback_file_observation(entry_file) for entry_file in files]

    consumed = [False] * len(entries)
    truncated = stop_reason == "max_tokens"
    results: list[FileObservation] = []
    for entry_file in files:
        match = _find_observation_entry(entry_file.filename, entries, consumed, files)
        if match is None:
            results.append(_fallback_file_observation(entry_file))
        else:
            # Mark the entry so it cannot be assigned to a second file.
            consumed[match] = True
            notes = _parse_observations(entries[match].get("observations", []))
            if truncated and not notes:
                notes = [_fallback_observation_text(entry_file.filename)]
            results.append(
                FileObservation(entry_file.filename, tuple(notes), entry_file.additions, entry_file.deletions)
            )
    return results
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def _observation_entries(output: Any) -> list[dict[str, Any]]:
|
|
257
|
+
if isinstance(output, Mapping):
|
|
258
|
+
raw = output.get("files", [])
|
|
259
|
+
elif isinstance(output, list):
|
|
260
|
+
raw = output
|
|
261
|
+
else:
|
|
262
|
+
raw = []
|
|
263
|
+
return [dict(item) for item in raw if isinstance(item, Mapping)]
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def _find_observation_entry(
    filename: str, entries: Sequence[Mapping[str, Any]], used: Sequence[bool], batch_files: Sequence[FileDiff]
) -> int | None:
    """Return the index of the unused entry that best matches ``filename``.

    Matching strategies are tried in decreasing strictness, each over all
    unused entries before moving to the next:

    1. exact path equality;
    2. equal basename, only when that basename is unique within the batch;
    3. one path being a directory-boundary suffix of the other.

    Returns ``None`` when no strategy finds an unused entry.
    """
    target_name = _path_basename(filename)
    same_name_count = 0
    for candidate in batch_files:
        if _path_basename(candidate.filename) == target_name:
            same_name_count += 1
    name_is_unique = same_name_count == 1

    def entry_path(entry: Mapping[str, Any]) -> str:
        return str(entry.get("path", ""))

    def exact(entry: Mapping[str, Any]) -> bool:
        return entry_path(entry) == filename

    def by_basename(entry: Mapping[str, Any]) -> bool:
        return name_is_unique and _path_basename(entry_path(entry)) == target_name

    def by_suffix(entry: Mapping[str, Any]) -> bool:
        return _path_suffix_matches(entry_path(entry), filename)

    for matches in (exact, by_basename, by_suffix):
        for position, entry in enumerate(entries):
            if used[position]:
                continue
            if matches(entry):
                return position
    return None
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def _parse_observations(value: Any) -> list[str]:
|
|
284
|
+
if isinstance(value, str):
|
|
285
|
+
stripped = value.strip()
|
|
286
|
+
if stripped.startswith("["):
|
|
287
|
+
try:
|
|
288
|
+
decoded = json.loads(stripped)
|
|
289
|
+
if isinstance(decoded, list):
|
|
290
|
+
return [str(item).strip() for item in decoded if str(item).strip()]
|
|
291
|
+
except json.JSONDecodeError:
|
|
292
|
+
pass
|
|
293
|
+
return [line.lstrip("-*• ").strip() for line in stripped.splitlines() if line.lstrip("-*• ").strip()]
|
|
294
|
+
if isinstance(value, Iterable):
|
|
295
|
+
return [str(item).strip() for item in value if str(item).strip()]
|
|
296
|
+
return []
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def _build_file_batches_for_indices(
|
|
300
|
+
files: Sequence[FileDiff], indices: Iterable[int], counter: Any, budget: int
|
|
301
|
+
) -> list[list[int]]:
|
|
302
|
+
token_budget = max(1, int(budget))
|
|
303
|
+
batches: list[list[int]] = []
|
|
304
|
+
current: list[int] = []
|
|
305
|
+
current_tokens = 0
|
|
306
|
+
for idx in indices:
|
|
307
|
+
file_tokens = files[idx].token_estimate(counter)
|
|
308
|
+
if file_tokens > token_budget:
|
|
309
|
+
if current:
|
|
310
|
+
batches.append(current)
|
|
311
|
+
current = []
|
|
312
|
+
current_tokens = 0
|
|
313
|
+
batches.append([idx])
|
|
314
|
+
continue
|
|
315
|
+
if current and current_tokens + file_tokens > token_budget:
|
|
316
|
+
batches.append(current)
|
|
317
|
+
current = []
|
|
318
|
+
current_tokens = 0
|
|
319
|
+
current.append(idx)
|
|
320
|
+
current_tokens += file_tokens
|
|
321
|
+
if current:
|
|
322
|
+
batches.append(current)
|
|
323
|
+
return batches
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def _included_files(files: Sequence[FileDiff], config: Any) -> list[FileDiff]:
|
|
327
|
+
excluded = tuple(str(item) for item in getattr(config, "excluded_files", ()))
|
|
328
|
+
return [file for file in files if not any(file.filename.endswith(pattern) for pattern in excluded)]
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def _render_file_diff_for_batch(file: FileDiff, counter: Any) -> str:
    """Render one file's diff text for a map prompt, truncating huge files.

    Files estimated at or below ``MAX_FILE_TOKENS`` are rendered unchanged.
    Larger files are copied, the copy is truncated with
    ``truncate(MAX_FILE_TOKENS * 4)`` and rendered through
    ``reconstruct_diff``; the input ``file`` itself is not modified.
    """
    if file.token_estimate(counter) > MAX_FILE_TOKENS:
        # Work on a copy so truncation does not affect the caller's object.
        shortened = FileDiff(
            file.filename, file.header, file.content, file.additions, file.deletions, file.is_binary
        )
        shortened.truncate(MAX_FILE_TOKENS * 4)
        return reconstruct_diff([shortened])
    return _reconstruct_single_file_diff(file)
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
def _reconstruct_single_file_diff(file: FileDiff) -> str:
|
|
340
|
+
return f"{file.header}\n{file.content}" if file.content else file.header
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
def _fallback_file_observation(file: FileDiff) -> FileObservation:
    """Build a generic single-observation record for ``file``.

    Used when the model output offers nothing usable for the file; the line
    counts are still carried over from the diff entry.
    """
    generic_note = _fallback_observation_text(file.filename)
    return FileObservation(file.filename, (generic_note,), file.additions, file.deletions)
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def _fallback_observation_text(filename: str) -> str:
    """Return the generic observation sentence for ``filename``.

    Only the final path component is mentioned in the sentence.
    """
    short_name = _path_basename(filename)
    return f"Updated {short_name}."
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
def _fallback_reduce_analysis(
    observations: Sequence[FileObservation], config: Any, stat: str = ""
) -> ConventionalAnalysis:
    """Build an analysis locally, without a model, from raw observations.

    All non-empty observation strings are flattened in file order. The
    summary comes from ``fallback_summary`` (limited to
    ``config.summary_hard_limit``, default 128), the commit type is always
    ``"chore"`` and at most the first six observations become details.
    """
    details: list[str] = []
    for item in observations:
        details.extend(text for text in item.observations if text)

    limit = int(getattr(config, "summary_hard_limit", 128))
    summary = fallback_summary(stat=stat, details=details, limit=limit)
    leading_details = tuple(AnalysisDetail.simple(_ensure_sentence(text)) for text in details[:6])
    return ConventionalAnalysis(
        commit_type="chore",
        summary=summary,
        details=leading_details,
        issue_refs=(),
    )
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
def _ensure_sentence(text: str) -> str:
|
|
365
|
+
stripped = text.strip()
|
|
366
|
+
return stripped if not stripped or stripped.endswith((".", "!", "?")) else f"{stripped}."
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
def _batch_observation_schema() -> dict[str, Any]:
    """Return the tool schema for a map-batch reply.

    The schema requires a top-level ``files`` array whose items are objects
    with exactly two required fields: ``path`` (string) and ``observations``
    (array of strings). It is wrapped with ``strict_json_schema``.
    """
    per_file_item = {
        "type": "object",
        "properties": {
            "path": {"type": "string", "description": "Exact input file path."},
            "observations": {"type": "array", "items": {"type": "string"}},
        },
        "required": ["path", "observations"],
        "additionalProperties": False,
    }
    properties = {
        "files": {
            "type": "array",
            "description": "Per-file observations for every file in the map batch.",
            "items": per_file_item,
        }
    }
    return strict_json_schema(properties, ["files"])
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
def _render_map_prompt(variant: str, files: Sequence[Mapping[str, str]], context_header: str) -> tuple[str, str]:
    """Render the (system, user) prompt pair for a map batch.

    The renderer from ``.templates`` is tried first. If importing or calling
    it raises any ``Exception``, the generic ``render_prompt`` is used with
    the same inputs instead.
    """
    try:
        from .templates import render_map_prompt

        rendered = render_map_prompt(variant, files, context_header)
        system_text, user_text = rendered.system, rendered.user
    except Exception:
        # Best-effort fallback to the generic prompt renderer.
        context = {"files": files, "context_header": context_header}
        return render_prompt("map", variant, context)
    return system_text, user_text
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
def _render_reduce_prompt(
    variant: str, observations: str, stat: str, scope_candidates: str, types_description: str
) -> tuple[str, str]:
    """Render the (system, user) prompt pair for the reduce call.

    The renderer from ``.templates`` is tried first. If importing or calling
    it raises any ``Exception``, the generic ``render_prompt`` is used with
    the same inputs instead.
    """
    try:
        from .templates import render_reduce_prompt

        rendered = render_reduce_prompt(variant, observations, stat, scope_candidates, types_description)
        system_text, user_text = rendered.system, rendered.user
    except Exception:
        # Best-effort fallback to the generic prompt renderer.
        context = {
            "observations": observations,
            "stat": stat,
            "scope_candidates": scope_candidates,
            "types_description": types_description,
        }
        return render_prompt("reduce", variant, context)
    return system_text, user_text
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
def _observation_to_mapping(item: FileObservation) -> dict[str, Any]:
|
|
422
|
+
return {
|
|
423
|
+
"file": item.file,
|
|
424
|
+
"observations": list(item.observations),
|
|
425
|
+
"additions": item.additions,
|
|
426
|
+
"deletions": item.deletions,
|
|
427
|
+
}
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
def _path_basename(path: str) -> str:
|
|
431
|
+
return Path(path).name or path
|
|
432
|
+
|
|
433
|
+
|
|
434
|
+
def _path_suffix_matches(left: str, right: str) -> bool:
    """Return whether either path is a suffix of the other.

    The check is symmetric: ``_path_has_suffix`` is tried in both directions.
    """
    orderings = ((left, right), (right, left))
    return any(_path_has_suffix(longer, shorter) for longer, shorter in orderings)
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
def _path_has_suffix(path: str, suffix: str) -> bool:
|
|
439
|
+
return path == suffix or path.endswith(f"/{suffix}") or path.endswith(f"\\{suffix}")
|
|
440
|
+
|
|
441
|
+
|
|
442
|
+
class _ContextHeaders:
|
|
443
|
+
def __init__(self, files: Sequence[FileDiff]) -> None:
|
|
444
|
+
self.large_commit_header = f"(Large commit with {len(files)} total files)" if len(files) > 100 else None
|
|
445
|
+
self.files = (
|
|
446
|
+
[
|
|
447
|
+
(
|
|
448
|
+
_file.filename,
|
|
449
|
+
_file.additions + _file.deletions,
|
|
450
|
+
_infer_file_description(_file.filename, _file.content),
|
|
451
|
+
)
|
|
452
|
+
for _file in files
|
|
453
|
+
]
|
|
454
|
+
if self.large_commit_header is None
|
|
455
|
+
else []
|
|
456
|
+
)
|
|
457
|
+
|
|
458
|
+
def header_for_files(self, current_files: Sequence[str]) -> str:
|
|
459
|
+
if self.large_commit_header:
|
|
460
|
+
return self.large_commit_header
|
|
461
|
+
current = set(current_files)
|
|
462
|
+
others = [item for item in self.files if item[0] not in current]
|
|
463
|
+
if not others:
|
|
464
|
+
return ""
|
|
465
|
+
shown = sorted(others, key=lambda item: item[1], reverse=True)[:MAX_CONTEXT_FILES]
|
|
466
|
+
lines = ["OTHER FILES IN THIS CHANGE:", *(f"- {path} ({size} lines): {desc}" for path, size, desc in shown)]
|
|
467
|
+
if len(shown) < len(others):
|
|
468
|
+
lines.append(f"... and {len(others) - len(shown)} more files")
|
|
469
|
+
return "\n".join(lines)
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
def _infer_file_description(filename: str, content: str) -> str:
|
|
473
|
+
lower = filename.lower()
|
|
474
|
+
suffix = Path(filename).suffix.lower()
|
|
475
|
+
if "test" in lower:
|
|
476
|
+
return "test file"
|
|
477
|
+
if "prompt" in lower or "system" in lower:
|
|
478
|
+
return "prompt template"
|
|
479
|
+
if suffix == ".md":
|
|
480
|
+
return "documentation"
|
|
481
|
+
if "config" in lower or suffix in {".toml", ".yaml", ".yml"}:
|
|
482
|
+
return "configuration"
|
|
483
|
+
if "error" in lower:
|
|
484
|
+
return "error definitions"
|
|
485
|
+
if "type" in lower:
|
|
486
|
+
return "type definitions"
|
|
487
|
+
if lower.endswith(("mod.rs", "lib.rs")):
|
|
488
|
+
return "module exports"
|
|
489
|
+
if lower.endswith(("main.rs", "main.go", "main.py")):
|
|
490
|
+
return "entry point"
|
|
491
|
+
if "class " in content or "def " in content or "fn " in content:
|
|
492
|
+
return "implementation"
|
|
493
|
+
if "struct " in content or "enum " in content:
|
|
494
|
+
return "type definitions"
|
|
495
|
+
if "async " in content or "await" in content:
|
|
496
|
+
return "async code"
|
|
497
|
+
return "source code"
|
|
498
|
+
|
|
499
|
+
|
|
500
|
+
__all__ = [
|
|
501
|
+
"FileObservation",
|
|
502
|
+
"build_file_batches",
|
|
503
|
+
"build_llm_file_batches",
|
|
504
|
+
"observe_diff_files",
|
|
505
|
+
"reduce_phase",
|
|
506
|
+
"run_map_reduce",
|
|
507
|
+
"should_use_map_reduce",
|
|
508
|
+
]
|