recursive-llm-ts 4.5.0 → 4.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +74 -4
- package/bin/rlm-go +0 -0
- package/dist/bridge-interface.d.ts +14 -0
- package/dist/errors.d.ts +10 -0
- package/dist/errors.js +25 -1
- package/dist/index.d.ts +2 -2
- package/dist/index.js +2 -1
- package/dist/rlm.d.ts +3 -1
- package/dist/rlm.js +5 -0
- package/go/README.md +9 -1
- package/go/rlm/context_overflow.go +572 -0
- package/go/rlm/context_overflow_test.go +901 -0
- package/go/rlm/errors.go +185 -1
- package/go/rlm/rlm.go +10 -0
- package/go/rlm/structured.go +60 -7
- package/go/rlm/textrank.go +273 -0
- package/go/rlm/textrank_test.go +335 -0
- package/go/rlm/tfidf.go +225 -0
- package/go/rlm/tfidf_test.go +272 -0
- package/go/rlm/types.go +25 -2
- package/package.json +1 -1
|
@@ -0,0 +1,335 @@
|
|
|
1
|
+
package rlm
|
|
2
|
+
|
|
3
|
+
import (
|
|
4
|
+
"math"
|
|
5
|
+
"strings"
|
|
6
|
+
"testing"
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
// ─── CosineSimilarity Tests ─────────────────────────────────────────────────
|
|
10
|
+
|
|
11
|
+
func TestCosineSimilarity_IdenticalVectors(t *testing.T) {
|
|
12
|
+
v := tfidfVector{
|
|
13
|
+
terms: map[string]float64{"hello": 1.0, "world": 2.0},
|
|
14
|
+
norm: math.Sqrt(5.0),
|
|
15
|
+
}
|
|
16
|
+
sim := cosineSimilarity(v, v)
|
|
17
|
+
|
|
18
|
+
if sim < 0.99 || sim > 1.01 {
|
|
19
|
+
t.Errorf("expected cosine similarity ~1.0 for identical vectors, got %f", sim)
|
|
20
|
+
}
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
func TestCosineSimilarity_OrthogonalVectors(t *testing.T) {
|
|
24
|
+
a := tfidfVector{
|
|
25
|
+
terms: map[string]float64{"hello": 1.0},
|
|
26
|
+
norm: 1.0,
|
|
27
|
+
}
|
|
28
|
+
b := tfidfVector{
|
|
29
|
+
terms: map[string]float64{"world": 1.0},
|
|
30
|
+
norm: 1.0,
|
|
31
|
+
}
|
|
32
|
+
sim := cosineSimilarity(a, b)
|
|
33
|
+
|
|
34
|
+
if sim != 0 {
|
|
35
|
+
t.Errorf("expected cosine similarity 0 for orthogonal vectors, got %f", sim)
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
func TestCosineSimilarity_ZeroVector(t *testing.T) {
|
|
40
|
+
a := tfidfVector{terms: map[string]float64{}, norm: 0}
|
|
41
|
+
b := tfidfVector{terms: map[string]float64{"hello": 1.0}, norm: 1.0}
|
|
42
|
+
|
|
43
|
+
sim := cosineSimilarity(a, b)
|
|
44
|
+
if sim != 0 {
|
|
45
|
+
t.Errorf("expected 0 for zero vector, got %f", sim)
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
func TestCosineSimilarity_PartialOverlap(t *testing.T) {
|
|
50
|
+
a := tfidfVector{
|
|
51
|
+
terms: map[string]float64{"hello": 1.0, "world": 1.0},
|
|
52
|
+
norm: math.Sqrt(2.0),
|
|
53
|
+
}
|
|
54
|
+
b := tfidfVector{
|
|
55
|
+
terms: map[string]float64{"hello": 1.0, "foo": 1.0},
|
|
56
|
+
norm: math.Sqrt(2.0),
|
|
57
|
+
}
|
|
58
|
+
sim := cosineSimilarity(a, b)
|
|
59
|
+
|
|
60
|
+
// dot = 1*1 = 1, norm product = sqrt(2)*sqrt(2) = 2, sim = 0.5
|
|
61
|
+
if sim < 0.49 || sim > 0.51 {
|
|
62
|
+
t.Errorf("expected cosine similarity ~0.5, got %f", sim)
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
// ─── BuildSimilarityGraph Tests ─────────────────────────────────────────────
|
|
67
|
+
|
|
68
|
+
func TestBuildSimilarityGraph_SimilarSentences(t *testing.T) {
|
|
69
|
+
sentences := []string{
|
|
70
|
+
"The machine learning model processes data efficiently.",
|
|
71
|
+
"The deep learning model processes information quickly.",
|
|
72
|
+
"The weather today is sunny and warm.",
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
config := DefaultTextRankConfig()
|
|
76
|
+
config.MinSimilarity = 0.01 // Very low threshold to ensure edges
|
|
77
|
+
graph := BuildSimilarityGraph(sentences, config)
|
|
78
|
+
|
|
79
|
+
if len(graph) != 3 {
|
|
80
|
+
t.Fatalf("expected 3x3 graph, got %d rows", len(graph))
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
// Sentences 0 and 1 share terms (model, processes, learning) - should have higher similarity
|
|
84
|
+
// Sentence 2 is about weather - should have lower similarity with 0 and 1
|
|
85
|
+
sim01 := graph[0][1]
|
|
86
|
+
sim02 := graph[0][2]
|
|
87
|
+
|
|
88
|
+
if sim01 <= 0 {
|
|
89
|
+
t.Error("expected positive similarity between similar sentences 0 and 1")
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
// 0-1 should be more similar than 0-2
|
|
93
|
+
if sim01 <= sim02 {
|
|
94
|
+
t.Errorf("expected sentences 0,1 to be more similar than 0,2: sim01=%f, sim02=%f", sim01, sim02)
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
func TestBuildSimilarityGraph_Symmetric(t *testing.T) {
|
|
99
|
+
sentences := []string{"First sentence here.", "Second sentence there.", "Third different topic."}
|
|
100
|
+
config := DefaultTextRankConfig()
|
|
101
|
+
config.MinSimilarity = 0.0
|
|
102
|
+
graph := BuildSimilarityGraph(sentences, config)
|
|
103
|
+
|
|
104
|
+
for i := 0; i < len(graph); i++ {
|
|
105
|
+
for j := 0; j < len(graph); j++ {
|
|
106
|
+
if graph[i][j] != graph[j][i] {
|
|
107
|
+
t.Errorf("graph not symmetric: [%d][%d]=%f != [%d][%d]=%f",
|
|
108
|
+
i, j, graph[i][j], j, i, graph[j][i])
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
}
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
func TestBuildSimilarityGraph_DiagonalZero(t *testing.T) {
|
|
115
|
+
sentences := []string{"First.", "Second.", "Third."}
|
|
116
|
+
config := DefaultTextRankConfig()
|
|
117
|
+
graph := BuildSimilarityGraph(sentences, config)
|
|
118
|
+
|
|
119
|
+
for i := 0; i < len(graph); i++ {
|
|
120
|
+
if graph[i][i] != 0 {
|
|
121
|
+
t.Errorf("expected zero self-similarity, got %f for sentence %d", graph[i][i], i)
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
// ─── PageRank Tests ─────────────────────────────────────────────────────────
|
|
127
|
+
|
|
128
|
+
func TestPageRank_UniformGraph(t *testing.T) {
|
|
129
|
+
// Fully connected graph with equal weights -> all scores should be equal
|
|
130
|
+
n := 4
|
|
131
|
+
graph := make([][]float64, n)
|
|
132
|
+
for i := range graph {
|
|
133
|
+
graph[i] = make([]float64, n)
|
|
134
|
+
for j := range graph[i] {
|
|
135
|
+
if i != j {
|
|
136
|
+
graph[i][j] = 1.0
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
config := DefaultTextRankConfig()
|
|
142
|
+
scores := PageRank(graph, config)
|
|
143
|
+
|
|
144
|
+
if len(scores) != n {
|
|
145
|
+
t.Fatalf("expected %d scores, got %d", n, len(scores))
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
// All scores should be approximately equal
|
|
149
|
+
for i := 1; i < n; i++ {
|
|
150
|
+
if math.Abs(scores[i]-scores[0]) > 0.01 {
|
|
151
|
+
t.Errorf("expected uniform scores, got %v", scores)
|
|
152
|
+
break
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
func TestPageRank_StarGraph(t *testing.T) {
|
|
158
|
+
// Star graph: node 0 connected to all others, others only connected to 0
|
|
159
|
+
// Node 0 should have the highest score
|
|
160
|
+
n := 5
|
|
161
|
+
graph := make([][]float64, n)
|
|
162
|
+
for i := range graph {
|
|
163
|
+
graph[i] = make([]float64, n)
|
|
164
|
+
}
|
|
165
|
+
for i := 1; i < n; i++ {
|
|
166
|
+
graph[0][i] = 1.0
|
|
167
|
+
graph[i][0] = 1.0
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
config := DefaultTextRankConfig()
|
|
171
|
+
scores := PageRank(graph, config)
|
|
172
|
+
|
|
173
|
+
// Node 0 (hub) should have the highest score
|
|
174
|
+
for i := 1; i < n; i++ {
|
|
175
|
+
if scores[0] <= scores[i] {
|
|
176
|
+
t.Errorf("expected hub node (0) to have highest score: scores[0]=%f, scores[%d]=%f",
|
|
177
|
+
scores[0], i, scores[i])
|
|
178
|
+
}
|
|
179
|
+
}
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
func TestPageRank_EmptyGraph(t *testing.T) {
|
|
183
|
+
scores := PageRank(nil, DefaultTextRankConfig())
|
|
184
|
+
if scores != nil {
|
|
185
|
+
t.Errorf("expected nil for empty graph, got %v", scores)
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
func TestPageRank_DisconnectedGraph(t *testing.T) {
|
|
190
|
+
// Graph with no edges -> all scores should be equal (from the (1-d)/n term)
|
|
191
|
+
n := 3
|
|
192
|
+
graph := make([][]float64, n)
|
|
193
|
+
for i := range graph {
|
|
194
|
+
graph[i] = make([]float64, n)
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
config := DefaultTextRankConfig()
|
|
198
|
+
scores := PageRank(graph, config)
|
|
199
|
+
|
|
200
|
+
for i := 1; i < n; i++ {
|
|
201
|
+
if math.Abs(scores[i]-scores[0]) > 0.001 {
|
|
202
|
+
t.Errorf("expected equal scores for disconnected graph, got %v", scores)
|
|
203
|
+
break
|
|
204
|
+
}
|
|
205
|
+
}
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
func TestPageRank_Convergence(t *testing.T) {
|
|
209
|
+
// PageRank should converge (scores sum to approximately 1)
|
|
210
|
+
n := 4
|
|
211
|
+
graph := make([][]float64, n)
|
|
212
|
+
for i := range graph {
|
|
213
|
+
graph[i] = make([]float64, n)
|
|
214
|
+
for j := range graph[i] {
|
|
215
|
+
if i != j {
|
|
216
|
+
graph[i][j] = float64((i + j) % 3) // asymmetric weights
|
|
217
|
+
}
|
|
218
|
+
}
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
config := DefaultTextRankConfig()
|
|
222
|
+
scores := PageRank(graph, config)
|
|
223
|
+
|
|
224
|
+
sum := 0.0
|
|
225
|
+
for _, s := range scores {
|
|
226
|
+
sum += s
|
|
227
|
+
if s < 0 {
|
|
228
|
+
t.Errorf("negative PageRank score: %f", s)
|
|
229
|
+
}
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
// Sum should be approximately 1.0
|
|
233
|
+
if sum < 0.9 || sum > 1.1 {
|
|
234
|
+
t.Errorf("expected PageRank scores to sum to ~1.0, got %f", sum)
|
|
235
|
+
}
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
// ─── CompressContextTextRank Tests ──────────────────────────────────────────
|
|
239
|
+
|
|
240
|
+
func TestCompressContextTextRank_NoCompressionNeeded(t *testing.T) {
|
|
241
|
+
text := "Short text that fits."
|
|
242
|
+
result := CompressContextTextRank(text, 1000)
|
|
243
|
+
|
|
244
|
+
if result != text {
|
|
245
|
+
t.Errorf("expected unchanged text, got %q", result)
|
|
246
|
+
}
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
func TestCompressContextTextRank_CompressesLargeText(t *testing.T) {
|
|
250
|
+
var parts []string
|
|
251
|
+
for i := 0; i < 30; i++ {
|
|
252
|
+
parts = append(parts, "Machine learning algorithms process large datasets to find patterns in the data.")
|
|
253
|
+
}
|
|
254
|
+
parts = append(parts, "The stock price of AAPL rose 15% to $198.50 after the earnings report.")
|
|
255
|
+
parts = append(parts, "Quantum computing achieved 99.9% gate fidelity using topological qubits.")
|
|
256
|
+
text := strings.Join(parts, " ")
|
|
257
|
+
|
|
258
|
+
originalTokens := EstimateTokens(text)
|
|
259
|
+
targetTokens := originalTokens / 3
|
|
260
|
+
|
|
261
|
+
result := CompressContextTextRank(text, targetTokens)
|
|
262
|
+
|
|
263
|
+
resultTokens := EstimateTokens(result)
|
|
264
|
+
if resultTokens > targetTokens+10 {
|
|
265
|
+
t.Errorf("compressed result (%d tokens) exceeds target (%d tokens)", resultTokens, targetTokens)
|
|
266
|
+
}
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
func TestCompressContextTextRank_PreservesOrder(t *testing.T) {
|
|
270
|
+
text := "Alpha is the first Greek letter. Beta comes after alpha. Gamma is the third letter. Delta is fourth. Epsilon follows delta."
|
|
271
|
+
result := CompressContextTextRank(text, 30)
|
|
272
|
+
|
|
273
|
+
sentences := SplitSentences(result)
|
|
274
|
+
for i := 1; i < len(sentences); i++ {
|
|
275
|
+
posI := strings.Index(text, sentences[i])
|
|
276
|
+
posPrev := strings.Index(text, sentences[i-1])
|
|
277
|
+
if posI < posPrev {
|
|
278
|
+
t.Errorf("order not preserved: %q before %q", sentences[i-1], sentences[i])
|
|
279
|
+
}
|
|
280
|
+
}
|
|
281
|
+
}
|
|
282
|
+
|
|
283
|
+
func TestCompressContextTextRank_Empty(t *testing.T) {
|
|
284
|
+
result := CompressContextTextRank("", 100)
|
|
285
|
+
if result != "" {
|
|
286
|
+
t.Errorf("expected empty result, got %q", result)
|
|
287
|
+
}
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
func TestCompressContextTextRank_WithConfig(t *testing.T) {
|
|
291
|
+
text := "Machine learning models. Deep learning networks. Neural network architectures. Data processing pipelines. Cloud computing infrastructure."
|
|
292
|
+
|
|
293
|
+
config := TextRankConfig{
|
|
294
|
+
DampingFactor: 0.85,
|
|
295
|
+
MaxIterations: 50,
|
|
296
|
+
ConvergenceThreshold: 0.001,
|
|
297
|
+
MinSimilarity: 0.0, // Allow all edges
|
|
298
|
+
}
|
|
299
|
+
|
|
300
|
+
result := CompressContextTextRankWithConfig(text, 15, config)
|
|
301
|
+
|
|
302
|
+
if len(result) == 0 {
|
|
303
|
+
t.Error("expected non-empty result")
|
|
304
|
+
}
|
|
305
|
+
if EstimateTokens(result) > 20 { // Some slack
|
|
306
|
+
t.Errorf("result exceeds budget: %d tokens", EstimateTokens(result))
|
|
307
|
+
}
|
|
308
|
+
}
|
|
309
|
+
|
|
310
|
+
// ─── TextRank vs TF-IDF Comparison ──────────────────────────────────────────
|
|
311
|
+
|
|
312
|
+
func TestTextRank_DifferentFromTFIDF(t *testing.T) {
|
|
313
|
+
// TextRank and TF-IDF should produce different rankings because TextRank
|
|
314
|
+
// considers inter-sentence relationships while TF-IDF only considers term rarity
|
|
315
|
+
text := "Machine learning processes data. " +
|
|
316
|
+
"Deep learning is a subset of machine learning. " +
|
|
317
|
+
"Neural networks power deep learning. " +
|
|
318
|
+
"The weather is sunny today. " +
|
|
319
|
+
"Rain is expected tomorrow."
|
|
320
|
+
|
|
321
|
+
tfidfResult := CompressContextTFIDF(text, 20)
|
|
322
|
+
textrankResult := CompressContextTextRank(text, 20)
|
|
323
|
+
|
|
324
|
+
// They should both produce non-empty results
|
|
325
|
+
if len(tfidfResult) == 0 {
|
|
326
|
+
t.Error("TF-IDF result is empty")
|
|
327
|
+
}
|
|
328
|
+
if len(textrankResult) == 0 {
|
|
329
|
+
t.Error("TextRank result is empty")
|
|
330
|
+
}
|
|
331
|
+
|
|
332
|
+
// They CAN be the same for simple inputs, but both should work
|
|
333
|
+
t.Logf("TF-IDF: %q", tfidfResult)
|
|
334
|
+
t.Logf("TextRank: %q", textrankResult)
|
|
335
|
+
}
|
package/go/rlm/tfidf.go
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
package rlm
|
|
2
|
+
|
|
3
|
+
import (
|
|
4
|
+
"math"
|
|
5
|
+
"sort"
|
|
6
|
+
"strings"
|
|
7
|
+
"unicode"
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
// ─── TF-IDF Extractive Context Compression ──────────────────────────────────
|
|
11
|
+
//
|
|
12
|
+
// Pure Go, zero external dependencies, zero API calls.
|
|
13
|
+
// Scores sentences by Term Frequency - Inverse Document Frequency,
|
|
14
|
+
// selects top-K sentences that fit within a token budget,
|
|
15
|
+
// and preserves original document order.
|
|
16
|
+
|
|
17
|
+
// ScoredSentence pairs a sentence with a relevance score and the position
// the sentence had in the source document.
type ScoredSentence struct {
	// Text is the sentence itself.
	Text string
	// Score is the sentence's relevance score (higher ranks first).
	Score float64
	// Index is the sentence's original position in the document, kept so the
	// document order can be restored after ranking.
	Index int
}
|
|
23
|
+
|
|
24
|
+
// SplitSentences cuts text into trimmed, non-empty sentences.
//
// A sentence ends at '.', '!' or '?' when that character is the last one in
// the text or is followed by a space, newline, carriage return or tab. Two
// consecutive newlines (a paragraph break) also end the current sentence.
// Any trailing text without a terminator becomes the final sentence.
// Blank input yields nil.
func SplitSentences(text string) []string {
	if strings.TrimSpace(text) == "" {
		return nil
	}

	runes := []rune(text)
	last := len(runes) - 1

	var out []string
	start := 0 // index of the first rune of the sentence being scanned

	// emit appends the trimmed span runes[start:end] if it is non-empty.
	emit := func(end int) {
		if piece := strings.TrimSpace(string(runes[start:end])); piece != "" {
			out = append(out, piece)
		}
	}

	for pos := 0; pos <= last; pos++ {
		switch runes[pos] {
		case '\n':
			// Paragraph boundary: close the sentence and swallow the second
			// newline so it does not start the next one.
			if pos < last && runes[pos+1] == '\n' {
				emit(pos + 1)
				pos++
				start = pos + 1
			}
		case '.', '!', '?':
			// Punctuation only terminates a sentence at end of text or
			// before whitespace, so "3.14" is not split.
			if pos == last || strings.ContainsRune(" \n\r\t", runes[pos+1]) {
				emit(pos + 1)
				start = pos + 1
			}
		}
	}

	// Flush whatever follows the last terminator.
	emit(len(runes))

	return out
}
|
|
79
|
+
|
|
80
|
+
// TokenizeWords lowercases text and splits it into tokens made only of
// Unicode letters and digits; every other character acts as a separator and
// is dropped.
func TokenizeWords(text string) []string {
	isSeparator := func(r rune) bool {
		return !(unicode.IsLetter(r) || unicode.IsDigit(r))
	}
	lowered := strings.ToLower(text)
	return strings.FieldsFunc(lowered, isSeparator)
}
|
|
86
|
+
|
|
87
|
+
// stopWords is the set of common English stop words that FilterStopWords
// removes. Keys are lowercase; every listed word maps to true.
var stopWords = func() map[string]bool {
	words := [...]string{
		"a", "an", "the", "and", "or", "but",
		"in", "on", "at", "to", "for", "of",
		"with", "by", "from", "is", "are", "was",
		"were", "be", "been", "being", "have", "has",
		"had", "do", "does", "did", "will", "would",
		"could", "should", "may", "might", "shall",
		"can", "this", "that", "these", "those",
		"it", "its", "i", "you", "he", "she",
		"we", "they", "me", "him", "her", "us",
		"them", "my", "your", "his", "our", "their",
		"not", "no", "if", "as", "so", "than",
		"then", "also", "just", "about", "into",
	}
	set := make(map[string]bool, len(words))
	for _, w := range words {
		set[w] = true
	}
	return set
}()

// FilterStopWords returns a new slice holding the words that are not in
// stopWords and are longer than one byte. The input slice is not modified,
// and the result is never nil.
func FilterStopWords(words []string) []string {
	kept := make([]string, 0, len(words))
	for i := range words {
		token := words[i]
		if len(token) <= 1 || stopWords[token] {
			continue
		}
		kept = append(kept, token)
	}
	return kept
}
|
|
113
|
+
|
|
114
|
+
// ComputeTFIDF computes TF-IDF scores for each sentence in a document.
|
|
115
|
+
// Returns ScoredSentence slice with scores and original indices.
|
|
116
|
+
func ComputeTFIDF(sentences []string) []ScoredSentence {
|
|
117
|
+
if len(sentences) == 0 {
|
|
118
|
+
return nil
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
// Tokenize each sentence
|
|
122
|
+
docWords := make([][]string, len(sentences))
|
|
123
|
+
for i, s := range sentences {
|
|
124
|
+
words := TokenizeWords(s)
|
|
125
|
+
docWords[i] = FilterStopWords(words)
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
// Compute document frequency (how many sentences contain each word)
|
|
129
|
+
df := make(map[string]int)
|
|
130
|
+
for _, words := range docWords {
|
|
131
|
+
seen := make(map[string]bool)
|
|
132
|
+
for _, w := range words {
|
|
133
|
+
if !seen[w] {
|
|
134
|
+
df[w]++
|
|
135
|
+
seen[w] = true
|
|
136
|
+
}
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
n := float64(len(sentences))
|
|
141
|
+
|
|
142
|
+
// Score each sentence by TF-IDF
|
|
143
|
+
scored := make([]ScoredSentence, len(sentences))
|
|
144
|
+
for i, words := range docWords {
|
|
145
|
+
score := 0.0
|
|
146
|
+
tf := make(map[string]int)
|
|
147
|
+
for _, w := range words {
|
|
148
|
+
tf[w]++
|
|
149
|
+
}
|
|
150
|
+
for word, freq := range tf {
|
|
151
|
+
// TF: term frequency in this sentence
|
|
152
|
+
// IDF: log(N / df) where N = number of sentences
|
|
153
|
+
idf := math.Log(n / float64(df[word]))
|
|
154
|
+
score += float64(freq) * idf
|
|
155
|
+
}
|
|
156
|
+
// Normalize by sentence length to avoid bias toward long sentences
|
|
157
|
+
if len(words) > 0 {
|
|
158
|
+
score /= math.Sqrt(float64(len(words)))
|
|
159
|
+
}
|
|
160
|
+
scored[i] = ScoredSentence{
|
|
161
|
+
Text: sentences[i],
|
|
162
|
+
Score: score,
|
|
163
|
+
Index: i,
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
return scored
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
// CompressContextTFIDF reduces context to fit within a token budget using
|
|
171
|
+
// extractive summarization via TF-IDF sentence scoring.
|
|
172
|
+
// Preserves original sentence order in the output.
|
|
173
|
+
// Returns the original context unchanged if it already fits.
|
|
174
|
+
func CompressContextTFIDF(text string, targetTokens int) string {
|
|
175
|
+
if EstimateTokens(text) <= targetTokens {
|
|
176
|
+
return text
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
sentences := SplitSentences(text)
|
|
180
|
+
if len(sentences) == 0 {
|
|
181
|
+
return text
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
scored := ComputeTFIDF(sentences)
|
|
185
|
+
|
|
186
|
+
// Sort by score descending
|
|
187
|
+
sort.Slice(scored, func(i, j int) bool {
|
|
188
|
+
return scored[i].Score > scored[j].Score
|
|
189
|
+
})
|
|
190
|
+
|
|
191
|
+
// Greedily select top sentences until budget is reached
|
|
192
|
+
var selected []ScoredSentence
|
|
193
|
+
currentTokens := 0
|
|
194
|
+
for _, s := range scored {
|
|
195
|
+
sentTokens := EstimateTokens(s.Text)
|
|
196
|
+
if currentTokens+sentTokens > targetTokens {
|
|
197
|
+
continue
|
|
198
|
+
}
|
|
199
|
+
selected = append(selected, s)
|
|
200
|
+
currentTokens += sentTokens
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
if len(selected) == 0 {
|
|
204
|
+
// Budget too small for even one sentence - truncate the highest-scored
|
|
205
|
+
if len(scored) > 0 {
|
|
206
|
+
maxChars := targetTokens * 3 // Conservative chars/token
|
|
207
|
+
if maxChars > len(scored[0].Text) {
|
|
208
|
+
maxChars = len(scored[0].Text)
|
|
209
|
+
}
|
|
210
|
+
return scored[0].Text[:maxChars]
|
|
211
|
+
}
|
|
212
|
+
return text
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
// Re-sort by original index to preserve document order
|
|
216
|
+
sort.Slice(selected, func(i, j int) bool {
|
|
217
|
+
return selected[i].Index < selected[j].Index
|
|
218
|
+
})
|
|
219
|
+
|
|
220
|
+
parts := make([]string, len(selected))
|
|
221
|
+
for i, s := range selected {
|
|
222
|
+
parts[i] = s.Text
|
|
223
|
+
}
|
|
224
|
+
return strings.Join(parts, " ")
|
|
225
|
+
}
|