lynkr 7.2.5 → 8.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -3
- package/config/model-tiers.json +89 -0
- package/install.sh +6 -1
- package/package.json +4 -2
- package/scripts/setup.js +0 -1
- package/src/agents/executor.js +14 -6
- package/src/api/middleware/session.js +15 -2
- package/src/api/openai-router.js +162 -37
- package/src/api/providers-handler.js +15 -1
- package/src/api/router.js +107 -2
- package/src/budget/index.js +4 -3
- package/src/clients/databricks.js +431 -234
- package/src/clients/gpt-utils.js +181 -0
- package/src/clients/ollama-utils.js +66 -140
- package/src/clients/routing.js +0 -1
- package/src/clients/standard-tools.js +99 -3
- package/src/config/index.js +133 -35
- package/src/context/toon.js +173 -0
- package/src/logger/index.js +23 -0
- package/src/orchestrator/index.js +688 -213
- package/src/routing/agentic-detector.js +320 -0
- package/src/routing/complexity-analyzer.js +202 -2
- package/src/routing/cost-optimizer.js +305 -0
- package/src/routing/index.js +168 -159
- package/src/routing/model-tiers.js +365 -0
- package/src/server.js +4 -14
- package/src/sessions/cleanup.js +3 -3
- package/src/sessions/record.js +10 -1
- package/src/sessions/store.js +7 -2
- package/src/tools/agent-task.js +48 -1
- package/src/tools/index.js +19 -2
- package/src/tools/lazy-loader.js +7 -0
- package/src/tools/tinyfish.js +358 -0
- package/src/tools/truncate.js +1 -0
- package/.github/FUNDING.yml +0 -15
- package/.github/workflows/README.md +0 -215
- package/.github/workflows/ci.yml +0 -69
- package/.github/workflows/index.yml +0 -62
- package/.github/workflows/web-tools-tests.yml +0 -56
- package/CITATIONS.bib +0 -6
- package/CLAWROUTER_ROUTING_PLAN.md +0 -910
- package/DEPLOYMENT.md +0 -1001
- package/LYNKR-TUI-PLAN.md +0 -984
- package/PERFORMANCE-REPORT.md +0 -866
- package/PLAN-per-client-model-routing.md +0 -252
- package/ROUTER_COMPARISON.md +0 -173
- package/TIER_ROUTING_PLAN.md +0 -771
- package/docs/42642f749da6234f41b6b425c3bb07c9.txt +0 -1
- package/docs/BingSiteAuth.xml +0 -4
- package/docs/docs-style.css +0 -478
- package/docs/docs.html +0 -197
- package/docs/google5be250e608e6da39.html +0 -1
- package/docs/index.html +0 -577
- package/docs/index.md +0 -577
- package/docs/robots.txt +0 -4
- package/docs/sitemap.xml +0 -44
- package/docs/style.css +0 -1223
- package/documentation/README.md +0 -100
- package/documentation/api.md +0 -806
- package/documentation/claude-code-cli.md +0 -672
- package/documentation/codex-cli.md +0 -397
- package/documentation/contributing.md +0 -571
- package/documentation/cursor-integration.md +0 -731
- package/documentation/docker.md +0 -867
- package/documentation/embeddings.md +0 -760
- package/documentation/faq.md +0 -659
- package/documentation/features.md +0 -396
- package/documentation/headroom.md +0 -519
- package/documentation/installation.md +0 -706
- package/documentation/memory-system.md +0 -476
- package/documentation/production.md +0 -601
- package/documentation/providers.md +0 -906
- package/documentation/testing.md +0 -629
- package/documentation/token-optimization.md +0 -323
- package/documentation/tools.md +0 -697
- package/documentation/troubleshooting.md +0 -893
- package/final-test.js +0 -33
- package/headroom-sidecar/config.py +0 -93
- package/headroom-sidecar/requirements.txt +0 -14
- package/headroom-sidecar/server.py +0 -451
- package/monitor-agents.sh +0 -31
- package/scripts/audit-log-reader.js +0 -399
- package/scripts/compact-dictionary.js +0 -204
- package/scripts/test-deduplication.js +0 -448
- package/src/db/database.sqlite +0 -0
- package/test/README.md +0 -212
- package/test/azure-openai-config.test.js +0 -204
- package/test/azure-openai-error-resilience.test.js +0 -238
- package/test/azure-openai-format-conversion.test.js +0 -354
- package/test/azure-openai-integration.test.js +0 -281
- package/test/azure-openai-routing.test.js +0 -177
- package/test/azure-openai-streaming.test.js +0 -171
- package/test/bedrock-integration.test.js +0 -471
- package/test/comprehensive-test-suite.js +0 -928
- package/test/config-validation.test.js +0 -207
- package/test/cursor-integration.test.js +0 -484
- package/test/format-conversion.test.js +0 -578
- package/test/hybrid-routing-integration.test.js +0 -254
- package/test/hybrid-routing-performance.test.js +0 -418
- package/test/llamacpp-integration.test.js +0 -863
- package/test/lmstudio-integration.test.js +0 -335
- package/test/memory/extractor.test.js +0 -398
- package/test/memory/retriever.test.js +0 -613
- package/test/memory/retriever.test.js.bak +0 -585
- package/test/memory/search.test.js +0 -537
- package/test/memory/search.test.js.bak +0 -389
- package/test/memory/store.test.js +0 -344
- package/test/memory/store.test.js.bak +0 -312
- package/test/memory/surprise.test.js +0 -300
- package/test/memory-performance.test.js +0 -472
- package/test/openai-integration.test.js +0 -686
- package/test/openrouter-error-resilience.test.js +0 -418
- package/test/passthrough-mode.test.js +0 -385
- package/test/performance-benchmark.js +0 -351
- package/test/performance-tests.js +0 -528
- package/test/routing.test.js +0 -219
- package/test/web-tools.test.js +0 -329
- package/test-agents-simple.js +0 -43
- package/test-cli-connection.sh +0 -33
- package/test-learning-unit.js +0 -126
- package/test-learning.js +0 -112
- package/test-parallel-agents.sh +0 -124
- package/test-parallel-direct.js +0 -155
- package/test-subagents.sh +0 -117
package/final-test.js
DELETED
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
const http = require('http');
|
|
2
|
-
|
|
3
|
-
const data = JSON.stringify({
|
|
4
|
-
model: "claude-sonnet-4-5",
|
|
5
|
-
max_tokens: 100,
|
|
6
|
-
messages: [{ role: "user", content: "Say hello" }]
|
|
7
|
-
});
|
|
8
|
-
|
|
9
|
-
const req = http.request({
|
|
10
|
-
hostname: 'localhost',
|
|
11
|
-
port: 8081,
|
|
12
|
-
path: '/v1/messages',
|
|
13
|
-
method: 'POST',
|
|
14
|
-
headers: { 'Content-Type': 'application/json', 'Content-Length': data.length }
|
|
15
|
-
}, (res) => {
|
|
16
|
-
let body = '';
|
|
17
|
-
res.on('data', chunk => body += chunk);
|
|
18
|
-
res.on('end', () => {
|
|
19
|
-
console.log('Status:', res.statusCode);
|
|
20
|
-
if (res.statusCode === 200) {
|
|
21
|
-
const json = JSON.parse(body);
|
|
22
|
-
console.log('✅ SUCCESS!');
|
|
23
|
-
console.log('Model:', json.model);
|
|
24
|
-
console.log('Response:', json.content[0].text.substring(0, 150));
|
|
25
|
-
} else {
|
|
26
|
-
console.log('❌ Error:', body.substring(0, 300));
|
|
27
|
-
}
|
|
28
|
-
});
|
|
29
|
-
});
|
|
30
|
-
|
|
31
|
-
req.on('error', e => console.error('Request failed:', e.message));
|
|
32
|
-
req.write(data);
|
|
33
|
-
req.end();
|
|
@@ -1,93 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Headroom Sidecar Configuration
|
|
3
|
-
Loads settings from environment variables
|
|
4
|
-
"""
|
|
5
|
-
|
|
6
|
-
import os
|
|
7
|
-
from typing import Optional
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
def str_to_bool(value: str) -> bool:
|
|
11
|
-
"""Convert string to boolean"""
|
|
12
|
-
return value.lower() in ("true", "1", "yes", "on")
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
class HeadroomConfig:
|
|
16
|
-
"""Configuration for Headroom sidecar"""
|
|
17
|
-
|
|
18
|
-
def __init__(self):
|
|
19
|
-
# Server settings
|
|
20
|
-
self.host = os.environ.get("HEADROOM_HOST", "0.0.0.0")
|
|
21
|
-
self.port = int(os.environ.get("HEADROOM_PORT", "8787"))
|
|
22
|
-
self.log_level = os.environ.get("HEADROOM_LOG_LEVEL", "info")
|
|
23
|
-
|
|
24
|
-
# Operating mode
|
|
25
|
-
self.mode = os.environ.get("HEADROOM_MODE", "optimize")
|
|
26
|
-
self.provider = os.environ.get("HEADROOM_PROVIDER", "anthropic")
|
|
27
|
-
|
|
28
|
-
# Smart Crusher settings
|
|
29
|
-
self.smart_crusher_enabled = str_to_bool(
|
|
30
|
-
os.environ.get("HEADROOM_SMART_CRUSHER", "true")
|
|
31
|
-
)
|
|
32
|
-
self.smart_crusher_min_tokens = int(
|
|
33
|
-
os.environ.get("HEADROOM_SMART_CRUSHER_MIN_TOKENS", "200")
|
|
34
|
-
)
|
|
35
|
-
self.smart_crusher_max_items = int(
|
|
36
|
-
os.environ.get("HEADROOM_SMART_CRUSHER_MAX_ITEMS", "15")
|
|
37
|
-
)
|
|
38
|
-
|
|
39
|
-
# Tool Crusher settings
|
|
40
|
-
self.tool_crusher_enabled = str_to_bool(
|
|
41
|
-
os.environ.get("HEADROOM_TOOL_CRUSHER", "true")
|
|
42
|
-
)
|
|
43
|
-
|
|
44
|
-
# Cache Aligner settings
|
|
45
|
-
self.cache_aligner_enabled = str_to_bool(
|
|
46
|
-
os.environ.get("HEADROOM_CACHE_ALIGNER", "true")
|
|
47
|
-
)
|
|
48
|
-
|
|
49
|
-
# Rolling Window settings
|
|
50
|
-
self.rolling_window_enabled = str_to_bool(
|
|
51
|
-
os.environ.get("HEADROOM_ROLLING_WINDOW", "true")
|
|
52
|
-
)
|
|
53
|
-
self.keep_turns = int(os.environ.get("HEADROOM_KEEP_TURNS", "3"))
|
|
54
|
-
|
|
55
|
-
# CCR settings
|
|
56
|
-
self.ccr_enabled = str_to_bool(os.environ.get("HEADROOM_CCR", "true"))
|
|
57
|
-
self.ccr_ttl = int(os.environ.get("HEADROOM_CCR_TTL", "300"))
|
|
58
|
-
|
|
59
|
-
# LLMLingua settings
|
|
60
|
-
self.llmlingua_enabled = str_to_bool(
|
|
61
|
-
os.environ.get("HEADROOM_LLMLINGUA", "false")
|
|
62
|
-
)
|
|
63
|
-
self.llmlingua_device = os.environ.get("HEADROOM_LLMLINGUA_DEVICE", "auto")
|
|
64
|
-
|
|
65
|
-
def to_dict(self) -> dict:
|
|
66
|
-
"""Return configuration as dictionary"""
|
|
67
|
-
return {
|
|
68
|
-
"host": self.host,
|
|
69
|
-
"port": self.port,
|
|
70
|
-
"log_level": self.log_level,
|
|
71
|
-
"mode": self.mode,
|
|
72
|
-
"provider": self.provider,
|
|
73
|
-
"smart_crusher": {
|
|
74
|
-
"enabled": self.smart_crusher_enabled,
|
|
75
|
-
"min_tokens": self.smart_crusher_min_tokens,
|
|
76
|
-
"max_items": self.smart_crusher_max_items,
|
|
77
|
-
},
|
|
78
|
-
"tool_crusher": {"enabled": self.tool_crusher_enabled},
|
|
79
|
-
"cache_aligner": {"enabled": self.cache_aligner_enabled},
|
|
80
|
-
"rolling_window": {
|
|
81
|
-
"enabled": self.rolling_window_enabled,
|
|
82
|
-
"keep_turns": self.keep_turns,
|
|
83
|
-
},
|
|
84
|
-
"ccr": {"enabled": self.ccr_enabled, "ttl": self.ccr_ttl},
|
|
85
|
-
"llmlingua": {
|
|
86
|
-
"enabled": self.llmlingua_enabled,
|
|
87
|
-
"device": self.llmlingua_device,
|
|
88
|
-
},
|
|
89
|
-
}
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
# Global config instance
|
|
93
|
-
config = HeadroomConfig()
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
# Headroom Sidecar Dependencies
|
|
2
|
-
|
|
3
|
-
# Core framework
|
|
4
|
-
fastapi>=0.109.0
|
|
5
|
-
uvicorn[standard]>=0.27.0
|
|
6
|
-
pydantic>=2.5.0
|
|
7
|
-
|
|
8
|
-
# Headroom SDK
|
|
9
|
-
headroom-ai>=0.1.0
|
|
10
|
-
|
|
11
|
-
# Optional: LLMLingua support (uncomment for ML compression)
|
|
12
|
-
# llmlingua>=0.2.0
|
|
13
|
-
# torch>=2.0.0
|
|
14
|
-
# transformers>=4.36.0
|
|
@@ -1,451 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Headroom Sidecar Server
|
|
3
|
-
FastAPI application providing context compression via HTTP API
|
|
4
|
-
"""
|
|
5
|
-
|
|
6
|
-
import logging
|
|
7
|
-
import time
|
|
8
|
-
import hashlib
|
|
9
|
-
import json
|
|
10
|
-
from typing import Any, Dict, List, Optional
|
|
11
|
-
from datetime import datetime
|
|
12
|
-
|
|
13
|
-
from fastapi import FastAPI, HTTPException
|
|
14
|
-
from fastapi.responses import JSONResponse
|
|
15
|
-
from pydantic import BaseModel
|
|
16
|
-
import uvicorn
|
|
17
|
-
|
|
18
|
-
from config import config
|
|
19
|
-
|
|
20
|
-
# Setup logging
|
|
21
|
-
logging.basicConfig(
|
|
22
|
-
level=getattr(logging, config.log_level.upper()),
|
|
23
|
-
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
|
24
|
-
)
|
|
25
|
-
logger = logging.getLogger("headroom-sidecar")
|
|
26
|
-
|
|
27
|
-
# Initialize FastAPI app
|
|
28
|
-
app = FastAPI(
|
|
29
|
-
title="Headroom Sidecar",
|
|
30
|
-
description="Context compression service for LLM requests",
|
|
31
|
-
version="1.0.0",
|
|
32
|
-
)
|
|
33
|
-
|
|
34
|
-
# Try to import headroom, fallback to basic compression if not available
|
|
35
|
-
try:
|
|
36
|
-
from headroom import (
|
|
37
|
-
TransformPipeline,
|
|
38
|
-
SmartCrusher,
|
|
39
|
-
SmartCrusherConfig,
|
|
40
|
-
ToolCrusher,
|
|
41
|
-
ToolCrusherConfig,
|
|
42
|
-
RollingWindow,
|
|
43
|
-
RollingWindowConfig,
|
|
44
|
-
AnthropicProvider,
|
|
45
|
-
OpenAIProvider,
|
|
46
|
-
)
|
|
47
|
-
import warnings
|
|
48
|
-
warnings.filterwarnings("ignore", message=".*tiktoken approximation.*")
|
|
49
|
-
|
|
50
|
-
# Create transforms based on config
|
|
51
|
-
transforms = []
|
|
52
|
-
|
|
53
|
-
if config.smart_crusher_enabled:
|
|
54
|
-
transforms.append(SmartCrusher(SmartCrusherConfig(
|
|
55
|
-
enabled=True,
|
|
56
|
-
min_tokens_to_crush=config.smart_crusher_min_tokens,
|
|
57
|
-
max_items_after_crush=config.smart_crusher_max_items,
|
|
58
|
-
)))
|
|
59
|
-
logger.info("SmartCrusher enabled")
|
|
60
|
-
|
|
61
|
-
if config.tool_crusher_enabled:
|
|
62
|
-
transforms.append(ToolCrusher(ToolCrusherConfig(
|
|
63
|
-
enabled=True,
|
|
64
|
-
)))
|
|
65
|
-
logger.info("ToolCrusher enabled")
|
|
66
|
-
|
|
67
|
-
if config.rolling_window_enabled:
|
|
68
|
-
transforms.append(RollingWindow(RollingWindowConfig(
|
|
69
|
-
enabled=True,
|
|
70
|
-
keep_last_turns=config.keep_turns,
|
|
71
|
-
)))
|
|
72
|
-
logger.info("RollingWindow enabled")
|
|
73
|
-
|
|
74
|
-
# Create provider based on config
|
|
75
|
-
if config.provider == "openai":
|
|
76
|
-
headroom_provider = OpenAIProvider()
|
|
77
|
-
else:
|
|
78
|
-
headroom_provider = AnthropicProvider()
|
|
79
|
-
|
|
80
|
-
headroom_pipeline = TransformPipeline(transforms=transforms, provider=headroom_provider) if transforms else None
|
|
81
|
-
HEADROOM_AVAILABLE = headroom_pipeline is not None
|
|
82
|
-
logger.info(f"Headroom SDK loaded successfully with {len(transforms)} transforms (provider: {config.provider})")
|
|
83
|
-
except ImportError as e:
|
|
84
|
-
logger.warning(f"Headroom SDK not available: {e}. Using basic compression.")
|
|
85
|
-
headroom_pipeline = None
|
|
86
|
-
HEADROOM_AVAILABLE = False
|
|
87
|
-
|
|
88
|
-
# CCR Store (in-memory with TTL)
|
|
89
|
-
ccr_store: Dict[str, Dict[str, Any]] = {}
|
|
90
|
-
|
|
91
|
-
# Metrics
|
|
92
|
-
metrics = {
|
|
93
|
-
"requests_total": 0,
|
|
94
|
-
"compressions_applied": 0,
|
|
95
|
-
"compressions_skipped": 0,
|
|
96
|
-
"errors": 0,
|
|
97
|
-
"ccr_stores": 0,
|
|
98
|
-
"ccr_retrievals": 0,
|
|
99
|
-
"total_tokens_before": 0,
|
|
100
|
-
"total_tokens_after": 0,
|
|
101
|
-
"start_time": datetime.utcnow().isoformat(),
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
# Request/Response models
|
|
106
|
-
class CompressRequest(BaseModel):
|
|
107
|
-
messages: List[Dict[str, Any]]
|
|
108
|
-
tools: Optional[List[Dict[str, Any]]] = None
|
|
109
|
-
model: Optional[str] = "claude-3-5-sonnet-20241022"
|
|
110
|
-
model_limit: Optional[int] = 200000
|
|
111
|
-
mode: Optional[str] = None
|
|
112
|
-
token_budget: Optional[int] = None
|
|
113
|
-
query_context: Optional[str] = None
|
|
114
|
-
preserve_recent_turns: Optional[int] = None
|
|
115
|
-
target_ratio: Optional[float] = None
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
class CompressResponse(BaseModel):
|
|
119
|
-
messages: List[Dict[str, Any]]
|
|
120
|
-
tools: Optional[List[Dict[str, Any]]] = None
|
|
121
|
-
compressed: bool
|
|
122
|
-
stats: Dict[str, Any]
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
class CCRRetrieveRequest(BaseModel):
|
|
126
|
-
hash: str
|
|
127
|
-
query: Optional[str] = None
|
|
128
|
-
max_results: Optional[int] = 20
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
class CCRRetrieveResponse(BaseModel):
|
|
132
|
-
success: bool
|
|
133
|
-
content: Optional[Any] = None
|
|
134
|
-
items_retrieved: int = 0
|
|
135
|
-
was_search: bool = False
|
|
136
|
-
error: Optional[str] = None
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
def estimate_tokens(data: Any) -> int:
|
|
140
|
-
"""Estimate token count (rough approximation: ~4 chars per token)"""
|
|
141
|
-
text = json.dumps(data) if not isinstance(data, str) else data
|
|
142
|
-
return len(text) // 4
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
def generate_hash(content: Any) -> str:
|
|
146
|
-
"""Generate hash for CCR storage"""
|
|
147
|
-
text = json.dumps(content, sort_keys=True)
|
|
148
|
-
return hashlib.sha256(text.encode()).hexdigest()[:12]
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
def cleanup_expired_ccr():
|
|
152
|
-
"""Remove expired CCR entries"""
|
|
153
|
-
now = time.time()
|
|
154
|
-
expired = [k for k, v in ccr_store.items() if now - v["timestamp"] > config.ccr_ttl]
|
|
155
|
-
for key in expired:
|
|
156
|
-
del ccr_store[key]
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
def basic_compress(messages: List[Dict], tools: Optional[List] = None) -> Dict:
|
|
160
|
-
"""Basic compression when Headroom SDK is not available"""
|
|
161
|
-
tokens_before = estimate_tokens(messages)
|
|
162
|
-
compressed_messages = []
|
|
163
|
-
|
|
164
|
-
for msg in messages:
|
|
165
|
-
compressed_msg = msg.copy()
|
|
166
|
-
|
|
167
|
-
# Compress large tool results
|
|
168
|
-
if msg.get("role") == "user" and isinstance(msg.get("content"), list):
|
|
169
|
-
new_content = []
|
|
170
|
-
for block in msg["content"]:
|
|
171
|
-
if block.get("type") == "tool_result":
|
|
172
|
-
content = block.get("content", "")
|
|
173
|
-
if isinstance(content, str) and len(content) > 2000:
|
|
174
|
-
# Store in CCR and replace with reference
|
|
175
|
-
hash_key = generate_hash(content)
|
|
176
|
-
ccr_store[hash_key] = {
|
|
177
|
-
"content": content,
|
|
178
|
-
"timestamp": time.time(),
|
|
179
|
-
"tool_name": block.get("tool_use_id", "unknown"),
|
|
180
|
-
}
|
|
181
|
-
metrics["ccr_stores"] += 1
|
|
182
|
-
block = block.copy()
|
|
183
|
-
block["content"] = (
|
|
184
|
-
f"[CCR:{hash_key}] Content compressed ({len(content)} chars). "
|
|
185
|
-
f"Use ccr_retrieve to access full content."
|
|
186
|
-
)
|
|
187
|
-
new_content.append(block)
|
|
188
|
-
compressed_msg["content"] = new_content
|
|
189
|
-
compressed_messages.append(compressed_msg)
|
|
190
|
-
|
|
191
|
-
tokens_after = estimate_tokens(compressed_messages)
|
|
192
|
-
|
|
193
|
-
return {
|
|
194
|
-
"messages": compressed_messages,
|
|
195
|
-
"tools": tools,
|
|
196
|
-
"compressed": tokens_after < tokens_before,
|
|
197
|
-
"stats": {
|
|
198
|
-
"tokens_before": tokens_before,
|
|
199
|
-
"tokens_after": tokens_after,
|
|
200
|
-
"tokens_saved": tokens_before - tokens_after,
|
|
201
|
-
"savings_percent": round(
|
|
202
|
-
(1 - tokens_after / tokens_before) * 100, 1
|
|
203
|
-
) if tokens_before > 0 else 0,
|
|
204
|
-
"transforms_applied": ["basic_ccr"] if tokens_after < tokens_before else [],
|
|
205
|
-
"latency_ms": 0,
|
|
206
|
-
},
|
|
207
|
-
}
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
@app.get("/health")
|
|
211
|
-
async def health_check():
|
|
212
|
-
"""Health check endpoint"""
|
|
213
|
-
cleanup_expired_ccr()
|
|
214
|
-
return {
|
|
215
|
-
"status": "healthy",
|
|
216
|
-
"headroom_loaded": HEADROOM_AVAILABLE,
|
|
217
|
-
"ccr_enabled": config.ccr_enabled,
|
|
218
|
-
"llmlingua_enabled": config.llmlingua_enabled,
|
|
219
|
-
"entries_cached": len(ccr_store),
|
|
220
|
-
"config": config.to_dict(),
|
|
221
|
-
}
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
@app.get("/metrics")
|
|
225
|
-
async def get_metrics():
|
|
226
|
-
"""Get compression metrics"""
|
|
227
|
-
return {
|
|
228
|
-
**metrics,
|
|
229
|
-
"average_compression_ratio": (
|
|
230
|
-
round(metrics["total_tokens_after"] / metrics["total_tokens_before"], 3)
|
|
231
|
-
if metrics["total_tokens_before"] > 0
|
|
232
|
-
else 1.0
|
|
233
|
-
),
|
|
234
|
-
"ccr_entries": len(ccr_store),
|
|
235
|
-
"uptime_seconds": (
|
|
236
|
-
datetime.utcnow() - datetime.fromisoformat(metrics["start_time"])
|
|
237
|
-
).total_seconds(),
|
|
238
|
-
}
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
@app.post("/compress", response_model=CompressResponse)
|
|
242
|
-
async def compress_messages(request: CompressRequest):
|
|
243
|
-
"""Compress messages and tools"""
|
|
244
|
-
start_time = time.time()
|
|
245
|
-
metrics["requests_total"] += 1
|
|
246
|
-
|
|
247
|
-
try:
|
|
248
|
-
tokens_before = estimate_tokens(request.messages)
|
|
249
|
-
metrics["total_tokens_before"] += tokens_before
|
|
250
|
-
|
|
251
|
-
# Skip if below minimum tokens
|
|
252
|
-
if tokens_before < config.smart_crusher_min_tokens:
|
|
253
|
-
metrics["compressions_skipped"] += 1
|
|
254
|
-
return CompressResponse(
|
|
255
|
-
messages=request.messages,
|
|
256
|
-
tools=request.tools,
|
|
257
|
-
compressed=False,
|
|
258
|
-
stats={
|
|
259
|
-
"skipped": True,
|
|
260
|
-
"reason": f"Below threshold ({tokens_before} < {config.smart_crusher_min_tokens})",
|
|
261
|
-
},
|
|
262
|
-
)
|
|
263
|
-
|
|
264
|
-
# Use Headroom SDK if available
|
|
265
|
-
if HEADROOM_AVAILABLE and headroom_pipeline:
|
|
266
|
-
try:
|
|
267
|
-
result = headroom_pipeline.apply(
|
|
268
|
-
request.messages,
|
|
269
|
-
model=request.model,
|
|
270
|
-
model_limit=request.model_limit,
|
|
271
|
-
)
|
|
272
|
-
|
|
273
|
-
# Extract messages from TransformResult
|
|
274
|
-
if hasattr(result, 'messages'):
|
|
275
|
-
compressed_messages = result.messages
|
|
276
|
-
# transforms_applied may be strings or objects with .name
|
|
277
|
-
if hasattr(result, 'transforms_applied'):
|
|
278
|
-
transforms_applied = [t if isinstance(t, str) else getattr(t, 'name', str(t)) for t in result.transforms_applied]
|
|
279
|
-
else:
|
|
280
|
-
transforms_applied = []
|
|
281
|
-
elif isinstance(result, dict):
|
|
282
|
-
compressed_messages = result.get("messages", request.messages)
|
|
283
|
-
transforms_applied = result.get("transforms", [])
|
|
284
|
-
else:
|
|
285
|
-
compressed_messages = result if isinstance(result, list) else request.messages
|
|
286
|
-
transforms_applied = []
|
|
287
|
-
|
|
288
|
-
tokens_after = estimate_tokens(compressed_messages)
|
|
289
|
-
metrics["total_tokens_after"] += tokens_after
|
|
290
|
-
metrics["compressions_applied"] += 1
|
|
291
|
-
|
|
292
|
-
return CompressResponse(
|
|
293
|
-
messages=compressed_messages,
|
|
294
|
-
tools=request.tools, # Tools not modified by current transforms
|
|
295
|
-
compressed=tokens_after < tokens_before,
|
|
296
|
-
stats={
|
|
297
|
-
"tokens_before": tokens_before,
|
|
298
|
-
"tokens_after": tokens_after,
|
|
299
|
-
"tokens_saved": tokens_before - tokens_after,
|
|
300
|
-
"savings_percent": round(
|
|
301
|
-
(1 - tokens_after / tokens_before) * 100, 1
|
|
302
|
-
) if tokens_before > 0 else 0,
|
|
303
|
-
"transforms_applied": transforms_applied,
|
|
304
|
-
"latency_ms": round((time.time() - start_time) * 1000, 1),
|
|
305
|
-
},
|
|
306
|
-
)
|
|
307
|
-
except Exception as e:
|
|
308
|
-
logger.warning(f"Headroom SDK error, falling back to basic: {e}")
|
|
309
|
-
|
|
310
|
-
# Fallback to basic compression
|
|
311
|
-
result = basic_compress(request.messages, request.tools)
|
|
312
|
-
metrics["total_tokens_after"] += result["stats"]["tokens_after"]
|
|
313
|
-
if result["compressed"]:
|
|
314
|
-
metrics["compressions_applied"] += 1
|
|
315
|
-
else:
|
|
316
|
-
metrics["compressions_skipped"] += 1
|
|
317
|
-
|
|
318
|
-
result["stats"]["latency_ms"] = round((time.time() - start_time) * 1000, 1)
|
|
319
|
-
return CompressResponse(**result)
|
|
320
|
-
|
|
321
|
-
except Exception as e:
|
|
322
|
-
metrics["errors"] += 1
|
|
323
|
-
logger.error(f"Compression error: {e}")
|
|
324
|
-
raise HTTPException(status_code=500, detail=str(e))
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
@app.post("/ccr/retrieve", response_model=CCRRetrieveResponse)
|
|
328
|
-
async def ccr_retrieve(request: CCRRetrieveRequest):
|
|
329
|
-
"""Retrieve content from CCR store"""
|
|
330
|
-
cleanup_expired_ccr()
|
|
331
|
-
|
|
332
|
-
if request.hash not in ccr_store:
|
|
333
|
-
return CCRRetrieveResponse(
|
|
334
|
-
success=False,
|
|
335
|
-
error=f"Hash {request.hash} not found or expired",
|
|
336
|
-
)
|
|
337
|
-
|
|
338
|
-
entry = ccr_store[request.hash]
|
|
339
|
-
content = entry["content"]
|
|
340
|
-
metrics["ccr_retrievals"] += 1
|
|
341
|
-
|
|
342
|
-
# If query provided, search within content
|
|
343
|
-
if request.query:
|
|
344
|
-
if isinstance(content, list):
|
|
345
|
-
# Filter list items by query
|
|
346
|
-
filtered = [
|
|
347
|
-
item
|
|
348
|
-
for item in content
|
|
349
|
-
if request.query.lower() in json.dumps(item).lower()
|
|
350
|
-
][: request.max_results]
|
|
351
|
-
return CCRRetrieveResponse(
|
|
352
|
-
success=True,
|
|
353
|
-
content=filtered,
|
|
354
|
-
items_retrieved=len(filtered),
|
|
355
|
-
was_search=True,
|
|
356
|
-
)
|
|
357
|
-
elif isinstance(content, str):
|
|
358
|
-
# Return content if query matches
|
|
359
|
-
if request.query.lower() in content.lower():
|
|
360
|
-
return CCRRetrieveResponse(
|
|
361
|
-
success=True,
|
|
362
|
-
content=content,
|
|
363
|
-
items_retrieved=1,
|
|
364
|
-
was_search=True,
|
|
365
|
-
)
|
|
366
|
-
return CCRRetrieveResponse(
|
|
367
|
-
success=False,
|
|
368
|
-
error="Query not found in content",
|
|
369
|
-
)
|
|
370
|
-
|
|
371
|
-
# Return full content
|
|
372
|
-
return CCRRetrieveResponse(
|
|
373
|
-
success=True,
|
|
374
|
-
content=content,
|
|
375
|
-
items_retrieved=1 if not isinstance(content, list) else len(content),
|
|
376
|
-
was_search=False,
|
|
377
|
-
)
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
@app.post("/ccr/track")
|
|
381
|
-
async def ccr_track(
|
|
382
|
-
hash_key: str,
|
|
383
|
-
turn_number: int,
|
|
384
|
-
tool_name: str,
|
|
385
|
-
sample: str,
|
|
386
|
-
):
|
|
387
|
-
"""Track compression for proactive expansion"""
|
|
388
|
-
return {"tracked": True, "hash_key": hash_key}
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
@app.post("/ccr/analyze")
|
|
392
|
-
async def ccr_analyze(query: str, turn_number: int):
|
|
393
|
-
"""Analyze query for proactive CCR expansion"""
|
|
394
|
-
# Simple keyword matching for expansion suggestions
|
|
395
|
-
expansions = []
|
|
396
|
-
for hash_key, entry in ccr_store.items():
|
|
397
|
-
if query.lower() in json.dumps(entry["content"]).lower():
|
|
398
|
-
expansions.append(
|
|
399
|
-
{
|
|
400
|
-
"hash": hash_key,
|
|
401
|
-
"tool_name": entry.get("tool_name", "unknown"),
|
|
402
|
-
"relevance": 0.8,
|
|
403
|
-
}
|
|
404
|
-
)
|
|
405
|
-
return {"expansions": expansions[:5]}
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
@app.post("/compress/llmlingua")
|
|
409
|
-
async def llmlingua_compress(
|
|
410
|
-
text: str,
|
|
411
|
-
target_ratio: float = 0.5,
|
|
412
|
-
force_tokens: Optional[str] = None,
|
|
413
|
-
):
|
|
414
|
-
"""Compress text using LLMLingua (if available)"""
|
|
415
|
-
if not config.llmlingua_enabled:
|
|
416
|
-
raise HTTPException(status_code=400, detail="LLMLingua is not enabled")
|
|
417
|
-
|
|
418
|
-
try:
|
|
419
|
-
# Try to import and use llmlingua
|
|
420
|
-
from llmlingua import PromptCompressor
|
|
421
|
-
|
|
422
|
-
compressor = PromptCompressor(device_map=config.llmlingua_device)
|
|
423
|
-
result = compressor.compress_prompt(
|
|
424
|
-
text,
|
|
425
|
-
rate=target_ratio,
|
|
426
|
-
force_tokens=json.loads(force_tokens) if force_tokens else None,
|
|
427
|
-
)
|
|
428
|
-
return {
|
|
429
|
-
"compressed": result["compressed_prompt"],
|
|
430
|
-
"original_tokens": result.get("origin_tokens", len(text) // 4),
|
|
431
|
-
"compressed_tokens": result.get("compressed_tokens", len(result["compressed_prompt"]) // 4),
|
|
432
|
-
"ratio": result.get("rate", target_ratio),
|
|
433
|
-
}
|
|
434
|
-
except ImportError:
|
|
435
|
-
raise HTTPException(
|
|
436
|
-
status_code=501,
|
|
437
|
-
detail="LLMLingua not installed. Add llmlingua to requirements.txt",
|
|
438
|
-
)
|
|
439
|
-
except Exception as e:
|
|
440
|
-
raise HTTPException(status_code=500, detail=str(e))
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
if __name__ == "__main__":
|
|
444
|
-
logger.info(f"Starting Headroom sidecar on {config.host}:{config.port}")
|
|
445
|
-
logger.info(f"Configuration: {json.dumps(config.to_dict(), indent=2)}")
|
|
446
|
-
uvicorn.run(
|
|
447
|
-
app,
|
|
448
|
-
host=config.host,
|
|
449
|
-
port=config.port,
|
|
450
|
-
log_level=config.log_level,
|
|
451
|
-
)
|
package/monitor-agents.sh
DELETED
|
@@ -1,31 +0,0 @@
|
|
|
1
|
-
#!/bin/bash
|
|
2
|
-
|
|
3
|
-
# Monitor agent activity in real-time
|
|
4
|
-
|
|
5
|
-
echo "🔍 Monitoring Agent Activity"
|
|
6
|
-
echo "=============================="
|
|
7
|
-
echo ""
|
|
8
|
-
|
|
9
|
-
while true; do
|
|
10
|
-
clear
|
|
11
|
-
echo "🔍 Agent Statistics (refreshing every 3s)"
|
|
12
|
-
echo "=========================================="
|
|
13
|
-
echo ""
|
|
14
|
-
|
|
15
|
-
# Get stats
|
|
16
|
-
curl -s http://localhost:8080/v1/agents/stats | jq -r '.stats[] |
|
|
17
|
-
"Agent: \(.agent_type)
|
|
18
|
-
Executions: \(.total_executions) (\(.completed) completed, \(.failed) failed)
|
|
19
|
-
Avg Duration: \(.avg_duration_ms)ms
|
|
20
|
-
Tokens: \(.total_input_tokens) in / \(.total_output_tokens) out
|
|
21
|
-
"' || echo "Proxy not responding..."
|
|
22
|
-
|
|
23
|
-
echo ""
|
|
24
|
-
echo "Latest transcripts:"
|
|
25
|
-
ls -lt data/agent-transcripts/*.jsonl 2>/dev/null | head -3 || echo "No transcripts yet"
|
|
26
|
-
|
|
27
|
-
echo ""
|
|
28
|
-
echo "Press Ctrl+C to stop monitoring"
|
|
29
|
-
|
|
30
|
-
sleep 3
|
|
31
|
-
done
|