@pmaddire/gcie 0.1.13 → 0.1.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,67 +1,84 @@
1
- """Context builder for minimal LLM prompts."""
2
-
3
- from __future__ import annotations
4
-
5
- from dataclasses import dataclass
6
-
7
- from .snippet_selector import RankedSnippet, estimate_tokens, select_snippets
8
-
9
-
10
- @dataclass(frozen=True, slots=True)
11
- class ContextPayload:
12
- query: str
13
- snippets: tuple[RankedSnippet, ...]
14
- total_tokens_estimate: int
15
-
16
-
17
-
18
-
19
- _INTENT_BASE = {
20
- "edit": 300,
21
- "refactor": 600,
22
- "debug": 500,
23
- "explore": 400,
24
- }
25
-
26
-
27
- def _detect_intent(query: str) -> str:
28
- text = query.lower()
29
- if any(word in text for word in ("refactor", "rewrite", "migrate", "restructure")):
30
- return "refactor"
31
- if any(word in text for word in ("debug", "why", "error", "fail", "bug", "trace")):
32
- return "debug"
33
- if any(word in text for word in ("add", "change", "update", "extend", "modify", "remove", "rename")):
34
- return "edit"
35
- return "explore"
36
-
37
-
38
- def _auto_budget(query: str, ranked_snippets: list[RankedSnippet], intent: str) -> int:
39
- """Compute a context budget that scales with intent, query, and candidate size."""
40
- q_tokens = estimate_tokens(query)
41
- count = len(ranked_snippets)
42
-
43
- base = _INTENT_BASE.get(intent, 400)
44
- budget = base + min(300, q_tokens * 10) + min(400, count * 30)
45
- return max(200, min(1600, budget))
46
-
47
-
48
- def build_context(
49
- query: str,
50
- ranked_snippets: list[RankedSnippet],
51
- *,
52
- token_budget: int | None = 800,
53
- mandatory_node_ids: set[str] | None = None,
54
- intent: str | None = None,
55
- ) -> ContextPayload:
56
- """Build minimal context payload for LLM usage."""
57
- if token_budget is None:
58
- detected_intent = _detect_intent(query) if intent is None else intent
59
- token_budget = _auto_budget(query, ranked_snippets, detected_intent)
60
-
61
- selected = select_snippets(
62
- ranked_snippets,
63
- token_budget=token_budget,
64
- mandatory_node_ids=mandatory_node_ids,
1
+ """Context builder for minimal LLM prompts."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+
7
+ from .snippet_selector import (
8
+ RankedSnippet,
9
+ SelectionMetrics,
10
+ estimate_tokens,
11
+ select_snippets_with_diagnostics,
12
+ )
13
+
14
+
15
+ @dataclass(frozen=True, slots=True)
16
+ class ContextPayload:
17
+ query: str
18
+ snippets: tuple[RankedSnippet, ...]
19
+ total_tokens_estimate: int
20
+ selection_confidence: float = 0.0
21
+ selection_metrics: SelectionMetrics | None = None
22
+
23
+
24
# Base token budget per intent; _auto_budget adds query- and candidate-dependent
# terms on top and falls back to 400 for an intent that is not listed here.
_INTENT_BASE: dict[str, int] = {
    "edit": 300,
    "refactor": 600,
    "debug": 500,
    "explore": 400,
}
30
+
31
+
32
def _detect_intent(query: str) -> str:
    """Classify *query* as "refactor", "debug", "edit" or "explore".

    Matching is a case-insensitive substring test, so a keyword also fires
    when it appears inside a longer word. The groups are tried in the order
    listed and the first group with any hit wins; "explore" is the fallback
    when nothing matches.
    """
    lowered = query.lower()
    keyword_groups = (
        ("refactor", ("refactor", "rewrite", "migrate", "restructure")),
        ("debug", ("debug", "why", "error", "fail", "bug", "trace")),
        ("edit", ("add", "change", "update", "extend", "modify", "remove", "rename")),
    )
    for intent, keywords in keyword_groups:
        for keyword in keywords:
            if keyword in lowered:
                return intent
    return "explore"
41
+
42
+
43
def _auto_budget(query: str, ranked_snippets: list[RankedSnippet], intent: str) -> int:
    """Derive a token budget from the intent, the query length and the candidate count.

    The intent's base value (400 when the intent is not in ``_INTENT_BASE``)
    grows by 10 per estimated query token, capped at 300, and by 30 per
    candidate snippet, capped at 400. The total is clamped to 200..1600.
    """
    intent_base = _INTENT_BASE.get(intent, 400)
    query_bonus = min(300, estimate_tokens(query) * 10)
    candidate_bonus = min(400, len(ranked_snippets) * 30)

    capped = min(1600, intent_base + query_bonus + candidate_bonus)
    return max(200, capped)
51
+
52
+
53
def should_expand_context(payload: ContextPayload, *, minimum_confidence: float = 0.7) -> bool:
    """Tell the caller whether retrieval should be widened for *payload*.

    True exactly when ``payload.selection_confidence`` is strictly below
    *minimum_confidence*.
    """
    confidence = payload.selection_confidence
    return confidence < minimum_confidence
56
+
57
+
58
def build_context(
    query: str,
    ranked_snippets: list[RankedSnippet],
    *,
    token_budget: int | None = 800,
    mandatory_node_ids: set[str] | None = None,
    intent: str | None = None,
) -> ContextPayload:
    """Assemble the prompt context for *query* from ranked candidate snippets.

    Args:
        query: Text the context is built for; stored on the payload unchanged.
        ranked_snippets: Candidate snippets passed to the selector.
        token_budget: Budget passed to the selector. ``None`` means the
            budget is computed by ``_auto_budget``.
        mandatory_node_ids: Forwarded unchanged to the selector.
        intent: Only consulted when *token_budget* is ``None``; if it is also
            ``None`` the intent is detected from the query text.

    Returns:
        A ``ContextPayload`` carrying the selected snippets, their summed
        token estimate, and the selector's confidence and metrics.
    """
    budget = token_budget
    if budget is None:
        effective_intent = intent if intent is not None else _detect_intent(query)
        budget = _auto_budget(query, ranked_snippets, effective_intent)

    result = select_snippets_with_diagnostics(
        ranked_snippets,
        token_budget=budget,
        mandatory_node_ids=mandatory_node_ids,
    )
    chosen = result.snippets
    metrics = result.metrics

    token_total = 0
    for snippet in chosen:
        token_total += estimate_tokens(snippet.content)

    return ContextPayload(
        query=query,
        snippets=chosen,
        total_tokens_estimate=token_total,
        selection_confidence=metrics.confidence,
        selection_metrics=metrics,
    )
66
- total = sum(estimate_tokens(item.content) for item in selected)
67
- return ContextPayload(query=query, snippets=selected, total_tokens_estimate=total)
84
+
@@ -12,46 +12,177 @@ class RankedSnippet:
12
12
  score: float
13
13
 
14
14
 
15
+ @dataclass(frozen=True, slots=True)
16
+ class SelectionMetrics:
17
+ token_budget: int
18
+ used_tokens: int
19
+ selected_count: int
20
+ mandatory_requested_count: int
21
+ mandatory_selected_count: int
22
+ mandatory_coverage_ratio: float
23
+ score_spread: float
24
+ confidence: float
25
+
26
+
27
+ @dataclass(frozen=True, slots=True)
28
+ class SelectionResult:
29
+ snippets: tuple[RankedSnippet, ...]
30
+ metrics: SelectionMetrics
31
+
32
+
15
33
def estimate_tokens(text: str) -> int:
    """Approximate the token count of *text* by its whitespace-separated word count.

    The result is never below 1, so empty or blank text still costs one token.
    """
    word_count = len(text.split())
    return word_count if word_count > 1 else 1
18
36
 
19
37
 
20
- def select_snippets(
38
def _candidate_sort_key(item: RankedSnippet) -> tuple[float, str, str]:
    """Build a sort key: highest score first, ties broken by node id and then content."""
    descending_score = -item.score
    return descending_score, item.node_id, item.content
40
+
41
+
42
def _dedupe_ranked_snippets(
    ranked: list[RankedSnippet],
    *,
    mandatory_node_ids: set[str],
) -> list[RankedSnippet]:
    """Keep one snippet per distinct content string.

    Among snippets that share the same content, the survivor is decided by,
    in order of priority:

    1. a snippet whose node id is in *mandatory_node_ids* beats one that is not;
    2. otherwise the higher score wins;
    3. on an exact score tie the snippet seen first in *ranked* is kept.

    A mandatory snippet is therefore never displaced by a non-mandatory
    duplicate, whatever their scores.

    Args:
        ranked: Candidate snippets in their incoming order.
        mandatory_node_ids: Node ids whose snippets take precedence.

    Returns:
        The surviving snippets, ordered by the position each survivor had in
        *ranked*.
    """
    best_by_content: dict[str, tuple[int, RankedSnippet]] = {}
    for index, item in enumerate(ranked):
        current = best_by_content.get(item.content)
        if current is None:
            best_by_content[item.content] = (index, item)
            continue

        current_item = current[1]
        item_is_mandatory = item.node_id in mandatory_node_ids
        current_is_mandatory = current_item.node_id in mandatory_node_ids

        if item_is_mandatory != current_is_mandatory:
            # Mandatory status outranks score; only swap when the newcomer is
            # the mandatory one.
            replace = item_is_mandatory
        else:
            # Items are visited in order, so a strict comparison keeps the
            # earlier snippet on a score tie.
            replace = item.score > current_item.score

        if replace:
            best_by_content[item.content] = (index, item)

    ordered = sorted(best_by_content.values(), key=lambda pair: pair[0])
    return [item for _, item in ordered]
72
+
73
+
74
def _score_spread(selected: tuple[RankedSnippet, ...]) -> float:
    """Return the gap between the highest and lowest score in *selected*.

    The gap is taken over all scores rather than first-minus-last, so the
    result does not depend on the order of *selected*.

    Returns:
        0.0 for an empty selection; the snippet's own score (floored at 0.0)
        for a single snippet; otherwise ``max(score) - min(score)``.
    """
    if not selected:
        return 0.0

    scores = [item.score for item in selected]
    if len(scores) == 1:
        return max(0.0, scores[0])
    return max(0.0, max(scores) - min(scores))
80
+
81
+
82
def _selection_confidence(
    *,
    selected_count: int,
    total_candidates: int,
    mandatory_requested_count: int,
    mandatory_selected_count: int,
    score_spread: float,
) -> float:
    """Blend coverage, score spread and selection density into one confidence value.

    Weights are 0.65 for mandatory coverage (1.0 when nothing mandatory was
    requested), 0.2 for the spread (saturating at a spread of 0.5) and 0.15
    for the share of candidates selected (the denominator is at least 3).
    Incomplete mandatory coverage scales the blend by a further 0.9.

    Returns:
        0.0 when nothing was selected; otherwise the blend rounded to three
        decimals and clamped to 0.0..1.0.
    """
    if selected_count <= 0:
        return 0.0

    has_mandatory = mandatory_requested_count > 0
    mandatory_coverage = (
        mandatory_selected_count / mandatory_requested_count if has_mandatory else 1.0
    )
    spread_score = min(1.0, score_spread / 0.5)
    density_base = max(3, total_candidates or 1)
    selected_density = min(1.0, selected_count / density_base)

    confidence = 0.65 * mandatory_coverage + 0.2 * spread_score + 0.15 * selected_density
    if has_mandatory and mandatory_coverage < 1.0:
        confidence *= 0.9

    rounded = round(confidence, 3)
    return max(0.0, min(1.0, rounded))
110
+
111
+
112
def select_snippets_with_diagnostics(
    ranked: list[RankedSnippet],
    *,
    token_budget: int,
    mandatory_node_ids: set[str] | None = None,
) -> SelectionResult:
    """Pick snippets that fit *token_budget* and report how the selection went.

    Candidates are de-duplicated by content and split into a mandatory pool
    (node id in *mandatory_node_ids*) and an optional pool, each sorted with
    ``_candidate_sort_key``. The mandatory pool is walked first, then the
    optional pool. A snippet is taken whenever it still fits the remaining
    budget; one that does not fit is skipped and the walk continues.

    Args:
        ranked: Candidate snippets.
        token_budget: Maximum total of the per-snippet token estimates.
        mandatory_node_ids: Node ids to prioritise; ``None`` means none.

    Returns:
        A ``SelectionResult`` with the chosen snippets (mandatory ones first)
        and the ``SelectionMetrics`` for this run.
    """
    required_ids = mandatory_node_ids or set()
    candidates = _dedupe_ranked_snippets(ranked, mandatory_node_ids=required_ids)

    mandatory_pool = [c for c in candidates if c.node_id in required_ids]
    mandatory_pool.sort(key=_candidate_sort_key)
    optional_pool = [c for c in candidates if c.node_id not in required_ids]
    optional_pool.sort(key=_candidate_sort_key)

    chosen: list[RankedSnippet] = []
    taken_contents: set[str] = set()
    spent = 0

    # Mandatory candidates get first claim on the budget, best score first.
    for candidate in [*mandatory_pool, *optional_pool]:
        if candidate.content in taken_contents:
            continue
        cost = estimate_tokens(candidate.content)
        if spent + cost > token_budget:
            continue
        chosen.append(candidate)
        taken_contents.add(candidate.content)
        spent += cost

    snippets = tuple(chosen)

    # NOTE(review): "requested" is the number of mandatory candidates left
    # after de-duplication, not len(mandatory_node_ids). A mandatory id with
    # no candidate at all does not lower the coverage ratio — confirm that
    # this is intended.
    requested = len(mandatory_pool)
    picked = sum(1 for snippet in snippets if snippet.node_id in required_ids)
    spread = _score_spread(snippets)
    coverage = picked / requested if requested else 1.0
    confidence = _selection_confidence(
        selected_count=len(snippets),
        total_candidates=len(candidates),
        mandatory_requested_count=requested,
        mandatory_selected_count=picked,
        score_spread=spread,
    )

    metrics = SelectionMetrics(
        token_budget=token_budget,
        used_tokens=spent,
        selected_count=len(snippets),
        mandatory_requested_count=requested,
        mandatory_selected_count=picked,
        mandatory_coverage_ratio=coverage,
        score_spread=spread,
        confidence=confidence,
    )
    return SelectionResult(snippets=snippets, metrics=metrics)
56
175
 
57
- return tuple(selected)
176
+
177
def select_snippets(
    ranked: list[RankedSnippet],
    *,
    token_budget: int,
    mandatory_node_ids: set[str] | None = None,
) -> tuple[RankedSnippet, ...]:
    """Return only the snippets chosen by ``select_snippets_with_diagnostics``.

    All arguments are forwarded unchanged; the diagnostics are discarded.
    """
    result = select_snippets_with_diagnostics(
        ranked,
        token_budget=token_budget,
        mandatory_node_ids=mandatory_node_ids,
    )
    return result.snippets
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pmaddire/gcie",
3
- "version": "0.1.13",
3
+ "version": "0.1.15",
4
4
  "description": "GraphCode Intelligence Engine one-command setup and context CLI",
5
5
  "bin": {
6
6
  "gcie": "bin/gcie.js",