claude-evolve 1.8.49 → 1.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/claude-evolve-ideate +7 -26
- package/bin/claude-evolve-ideate-py +15 -0
- package/bin/claude-evolve-run-py +15 -0
- package/bin/claude-evolve-worker-py +15 -0
- package/lib/__pycache__/ai_cli.cpython-314.pyc +0 -0
- package/lib/__pycache__/embedding.cpython-314.pyc +0 -0
- package/lib/__pycache__/evolution_csv.cpython-314.pyc +0 -0
- package/lib/__pycache__/evolve_ideate.cpython-314.pyc +0 -0
- package/lib/__pycache__/evolve_run.cpython-314.pyc +0 -0
- package/lib/__pycache__/evolve_worker.cpython-314.pyc +0 -0
- package/lib/ai-cli.sh +2 -2
- package/lib/ai_cli.py +196 -0
- package/lib/embedding.py +200 -0
- package/lib/evolution_csv.py +325 -0
- package/lib/evolve_ideate.py +509 -0
- package/lib/evolve_run.py +402 -0
- package/lib/evolve_worker.py +518 -0
- package/package.json +10 -10
package/bin/claude-evolve-ideate
CHANGED
|
@@ -1094,16 +1094,12 @@ CRITICAL: You must use your file editing tools (Edit/MultiEdit) to modify the CS
|
|
|
1094
1094
|
|
|
1095
1095
|
# Get AI to directly edit the CSV file
|
|
1096
1096
|
local ai_response
|
|
1097
|
-
local stderr_file="stderr-$$.txt"
|
|
1098
|
-
# Temporarily show stderr for debugging
|
|
1099
1097
|
if ! ai_response=$(call_ai_for_ideation "$prompt" "$CURRENT_GENERATION" "$count" "$temp_csv_basename"); then
|
|
1100
1098
|
echo "[ERROR] AI model failed to generate novel ideas" >&2
|
|
1101
|
-
cat "$stderr_file" >&2
|
|
1102
1099
|
safe_popd
|
|
1103
|
-
rm -f "$temp_csv"
|
|
1100
|
+
rm -f "$temp_csv"
|
|
1104
1101
|
return 1
|
|
1105
1102
|
fi
|
|
1106
|
-
rm -f "$stderr_file"
|
|
1107
1103
|
|
|
1108
1104
|
# Restore working directory
|
|
1109
1105
|
safe_popd
|
|
@@ -1213,7 +1209,7 @@ If you must read source files:
|
|
|
1213
1209
|
- Focus only on finding parameter definitions at the top of the file
|
|
1214
1210
|
- Do NOT read the entire implementation
|
|
1215
1211
|
|
|
1216
|
-
Most of the time, you can infer parameters from descriptions like
|
|
1212
|
+
Most of the time, you can infer parameters from descriptions like 'RSI with threshold 30' or 'MA period 20'.
|
|
1217
1213
|
|
|
1218
1214
|
CRITICAL TASK:
|
|
1219
1215
|
The CSV file already contains $count stub rows with these IDs: $required_ids_str
|
|
@@ -1247,16 +1243,12 @@ CRITICAL INSTRUCTIONS:
|
|
|
1247
1243
|
|
|
1248
1244
|
# Get AI to directly edit the CSV file
|
|
1249
1245
|
local ai_response
|
|
1250
|
-
local stderr_file="stderr-$$.txt"
|
|
1251
|
-
# Temporarily show stderr for debugging
|
|
1252
1246
|
if ! ai_response=$(call_ai_for_ideation "$prompt" "$CURRENT_GENERATION" "$count" "$temp_csv_basename"); then
|
|
1253
1247
|
echo "[ERROR] AI model failed to generate hill climbing ideas" >&2
|
|
1254
|
-
cat "$stderr_file" >&2
|
|
1255
1248
|
safe_popd
|
|
1256
|
-
rm -f "$temp_csv"
|
|
1249
|
+
rm -f "$temp_csv"
|
|
1257
1250
|
return 1
|
|
1258
1251
|
fi
|
|
1259
|
-
rm -f "$stderr_file"
|
|
1260
1252
|
|
|
1261
1253
|
# Restore working directory
|
|
1262
1254
|
safe_popd
|
|
@@ -1390,16 +1382,12 @@ CRITICAL INSTRUCTIONS:
|
|
|
1390
1382
|
|
|
1391
1383
|
# Get AI to directly edit the CSV file
|
|
1392
1384
|
local ai_response
|
|
1393
|
-
local stderr_file="stderr-$$.txt"
|
|
1394
|
-
# Temporarily show stderr for debugging
|
|
1395
1385
|
if ! ai_response=$(call_ai_for_ideation "$prompt" "$CURRENT_GENERATION" "$count" "$temp_csv_basename"); then
|
|
1396
1386
|
echo "[ERROR] AI model failed to generate structural mutation ideas" >&2
|
|
1397
|
-
cat "$stderr_file" >&2
|
|
1398
1387
|
safe_popd
|
|
1399
|
-
rm -f "$temp_csv"
|
|
1388
|
+
rm -f "$temp_csv"
|
|
1400
1389
|
return 1
|
|
1401
1390
|
fi
|
|
1402
|
-
rm -f "$stderr_file"
|
|
1403
1391
|
|
|
1404
1392
|
# Restore working directory
|
|
1405
1393
|
safe_popd
|
|
@@ -1533,16 +1521,12 @@ CRITICAL INSTRUCTIONS:
|
|
|
1533
1521
|
|
|
1534
1522
|
# Get AI to directly edit the CSV file
|
|
1535
1523
|
local ai_response
|
|
1536
|
-
local stderr_file="stderr-$$.txt"
|
|
1537
|
-
# Temporarily show stderr for debugging
|
|
1538
1524
|
if ! ai_response=$(call_ai_for_ideation "$prompt" "$CURRENT_GENERATION" "$count" "$temp_csv_basename"); then
|
|
1539
1525
|
echo "[ERROR] AI model failed to generate crossover hybrid ideas" >&2
|
|
1540
|
-
cat "$stderr_file" >&2
|
|
1541
1526
|
safe_popd
|
|
1542
|
-
rm -f "$temp_csv"
|
|
1527
|
+
rm -f "$temp_csv"
|
|
1543
1528
|
return 1
|
|
1544
1529
|
fi
|
|
1545
|
-
rm -f "$stderr_file"
|
|
1546
1530
|
|
|
1547
1531
|
# Restore working directory
|
|
1548
1532
|
safe_popd
|
|
@@ -1654,15 +1638,12 @@ CRITICAL: You must use your file editing tools (Edit/MultiEdit) to modify the CS
|
|
|
1654
1638
|
|
|
1655
1639
|
# Get AI to directly edit the CSV file
|
|
1656
1640
|
local ai_response
|
|
1657
|
-
|
|
1658
|
-
if ! ai_response=$(call_ai_for_ideation "$prompt" "$CURRENT_GENERATION" "$TOTAL_IDEAS" "$temp_csv_basename" 2>"$stderr_file"); then
|
|
1641
|
+
if ! ai_response=$(call_ai_for_ideation "$prompt" "$CURRENT_GENERATION" "$TOTAL_IDEAS" "$temp_csv_basename"); then
|
|
1659
1642
|
echo "[ERROR] AI model failed to generate ideas" >&2
|
|
1660
|
-
cat "$stderr_file" >&2
|
|
1661
1643
|
safe_popd
|
|
1662
|
-
rm -f "$temp_csv"
|
|
1644
|
+
rm -f "$temp_csv"
|
|
1663
1645
|
return 1
|
|
1664
1646
|
fi
|
|
1665
|
-
rm -f "$stderr_file"
|
|
1666
1647
|
|
|
1667
1648
|
# Restore working directory
|
|
1668
1649
|
safe_popd
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Python implementation of claude-evolve-ideate.
|
|
4
|
+
Test this alongside the shell version before switching.
|
|
5
|
+
"""
|
|
6
|
+
import sys
|
|
7
|
+
import os
|
|
8
|
+
|
|
9
|
+
# Add lib to path
|
|
10
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'lib'))
|
|
11
|
+
|
|
12
|
+
from evolve_ideate import main
|
|
13
|
+
|
|
14
|
+
if __name__ == '__main__':
|
|
15
|
+
sys.exit(main())
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Python implementation of claude-evolve-run.
|
|
4
|
+
Test this alongside the shell version before switching.
|
|
5
|
+
"""
|
|
6
|
+
import sys
|
|
7
|
+
import os
|
|
8
|
+
|
|
9
|
+
# Add lib to path
|
|
10
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'lib'))
|
|
11
|
+
|
|
12
|
+
from evolve_run import main
|
|
13
|
+
|
|
14
|
+
if __name__ == '__main__':
|
|
15
|
+
sys.exit(main())
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Python implementation of claude-evolve-worker.
|
|
4
|
+
Test this alongside the shell version before switching.
|
|
5
|
+
"""
|
|
6
|
+
import sys
|
|
7
|
+
import os
|
|
8
|
+
|
|
9
|
+
# Add lib to path
|
|
10
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'lib'))
|
|
11
|
+
|
|
12
|
+
from evolve_worker import main
|
|
13
|
+
|
|
14
|
+
if __name__ == '__main__':
|
|
15
|
+
sys.exit(main())
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
package/lib/ai-cli.sh
CHANGED
|
@@ -96,12 +96,12 @@ $prompt"
|
|
|
96
96
|
;;
|
|
97
97
|
gpt5high)
|
|
98
98
|
local ai_output
|
|
99
|
-
ai_output=$(timeout -k 30 600 codex exec -m gpt-5.
|
|
99
|
+
ai_output=$(timeout -k 30 600 codex exec -m gpt-5.2 -c model_reasoning_effort="high" --dangerously-bypass-approvals-and-sandbox "$prompt" 2>&1)
|
|
100
100
|
local ai_exit_code=$?
|
|
101
101
|
;;
|
|
102
102
|
gpt5)
|
|
103
103
|
local ai_output
|
|
104
|
-
ai_output=$(timeout -k 30 600 codex exec -m gpt-5.
|
|
104
|
+
ai_output=$(timeout -k 30 600 codex exec -m gpt-5.2 --dangerously-bypass-approvals-and-sandbox "$prompt" 2>&1)
|
|
105
105
|
local ai_exit_code=$?
|
|
106
106
|
;;
|
|
107
107
|
o3high)
|
package/lib/ai_cli.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Python wrapper around ai-cli.sh for AI model invocation.
|
|
4
|
+
AIDEV-NOTE: This keeps ai-cli.sh as the source of truth for model configs and timeouts.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import os
|
|
8
|
+
import subprocess
|
|
9
|
+
import tempfile
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Optional, Tuple
|
|
12
|
+
|
|
13
|
+
# Path to ai-cli.sh relative to this file
|
|
14
|
+
SCRIPT_DIR = Path(__file__).parent
|
|
15
|
+
AI_CLI_PATH = SCRIPT_DIR / "ai-cli.sh"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class AIError(Exception):
    """Common base class for every error raised by the AI CLI wrapper."""
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class RateLimitError(AIError):
    """The AI backend reported a rate limit; the caller should retry later."""
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class APIExhaustedError(AIError):
    """The API quota is used up; the caller should stop processing."""
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class TimeoutError(AIError):
    """The AI call ran past its time limit.

    Within this module the name shadows the builtin ``TimeoutError``; this
    class derives from ``AIError`` and is unrelated to the builtin.
    """
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def get_git_protection_warning() -> str:
    """Return the fixed banner forbidding git usage that prefixes every AI prompt."""
    banner = '''!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!!!
!!! ⛔ ABSOLUTE PROHIBITION - READ THIS FIRST ⛔
!!!
!!! YOU ARE STRICTLY FORBIDDEN FROM USING ANY GIT COMMANDS WHATSOEVER
!!!
!!! ❌ FORBIDDEN: git commit, git add, git reset, git checkout, git revert,
!!! git branch, git merge, git stash, git clean, git push, git pull
!!! OR ANY OTHER COMMAND STARTING WITH 'git'
!!!
!!! ⚠️ WHY: This runs in production. Git operations have caused DATA LOSS.
!!! Multiple times AIs have corrupted evolution runs with git commands.
!!! Version control is ONLY managed by the human operator.
!!!
!!! ✅ WHAT YOU CAN DO: Edit files directly using file editing tools ONLY.
!!! Never touch version control. Ever.
!!!
!!! 💀 IF YOU USE GIT: You will corrupt the entire evolution run and lose data.
!!! This is an automated system. No git operations allowed.
!!!
!!! 🚨 CONSEQUENCES: If you execute ANY git command, the human operator will be
!!! forced to SHUT DOWN ALL AI-BASED EVOLUTION WORK and switch
!!! to manual-only mode. You will cause the termination of this
!!! entire automated evolution system. DO NOT BE THAT AI.
!!!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
'''
    return banner
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def call_ai(
    prompt: str,
    command: str = "run",
    working_dir: Optional[str] = None,
    env_vars: Optional[dict] = None
) -> Tuple[str, str]:
    """
    Call AI using the configured models via ai-cli.sh.

    A bash child process sources config.sh and ai-cli.sh and then runs
    ``call_ai_random`` with the prompt and command as positional arguments.

    Args:
        prompt: The prompt to send to the AI
        command: Either "run" or "ideate" - determines which model pool to use
        working_dir: Directory to run the command in (for file editing)
        env_vars: Additional environment variables to pass

    Returns:
        Tuple of (output, model_name). ``output`` is the child's stdout;
        ``model_name`` is "unknown" when no model file could be read.

    Raises:
        TimeoutError: If the child exits with status 124
        RateLimitError: If the child exits with status 2
        APIExhaustedError: If the child exits with status 3
        AIError: For any other non-zero status, or if bash cannot be started
    """
    # Sourcing config.sh first provides the LLM_RUN/LLM_IDEATE variables.
    # The prompt and command travel as "$1"/"$2", never interpolated into the script.
    bash_script = f'''
source "{SCRIPT_DIR}/config.sh"
source "{AI_CLI_PATH}"
call_ai_random "$1" "$2"
'''

    # Setup environment
    env = os.environ.copy()
    if env_vars:
        env.update(env_vars)

    try:
        with subprocess.Popen(
            ["bash", "-c", bash_script, "bash", prompt, command],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            cwd=working_dir,
            env=env,
        ) as proc:
            try:
                output, stderr = proc.communicate()
            except BaseException:
                # Same as subprocess.run: never leave the child running behind us.
                proc.kill()
                raise
        exit_code = proc.returncode
        shell_pid = proc.pid
    except (subprocess.SubprocessError, OSError) as e:
        # OSError covers a missing bash binary or an invalid working_dir.
        raise AIError(f"Failed to call AI: {e}") from e

    # ai-cli.sh writes the chosen model to /tmp/.claude-evolve-model-$$.
    # In the child, $$ is the bash PID (proc.pid), not this Python process's PID,
    # so look there first. The Python PID is kept as a fallback because it is the
    # location the previous implementation read.
    model_name = "unknown"
    for owner_pid in (shell_pid, os.getpid()):
        model_file = f"/tmp/.claude-evolve-model-{owner_pid}"
        try:
            with open(model_file) as f:
                model_name = f.read().strip()
        except OSError:
            continue
        try:
            os.remove(model_file)
        except OSError:
            pass  # best-effort cleanup; a leftover temp file is harmless
        break

    # Handle exit codes
    if exit_code == 124:
        raise TimeoutError(f"AI call timed out (model: {model_name})")
    if exit_code == 2:
        raise RateLimitError(f"Rate limit hit (model: {model_name})")
    if exit_code == 3:
        raise APIExhaustedError(f"API quota exhausted (model: {model_name})")
    if exit_code != 0:
        raise AIError(f"AI call failed with exit code {exit_code}: {stderr}")

    return output, model_name
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _file_signature(path: str) -> Optional[Tuple[int, int]]:
    """Return (mtime in ns, size) for ``path``, or None if it cannot be stat'ed."""
    try:
        info = os.stat(path)
    except OSError:
        return None
    return info.st_mtime_ns, info.st_size


def call_ai_for_file_edit(
    prompt: str,
    file_path: str,
    command: str = "run",
    working_dir: Optional[str] = None
) -> Tuple[bool, str]:
    """
    Call AI to edit a specific file and report whether the file changed.

    This is used when the AI needs to modify files directly (like CSV editing
    during ideation). The file path is passed in the prompt context; this
    function only uses it to detect a change.

    Args:
        prompt: The prompt including file editing instructions
        file_path: Path to the file being edited (for verification)
        command: Either "run" or "ideate"
        working_dir: Directory to run in

    Returns:
        Tuple of (success: bool, model_name: str). ``success`` is True when the
        file exists after the call and was either created by it or has a
        different modification time or size than before.

    Raises:
        AIError: Propagated unchanged from ``call_ai``.
    """
    # NOTE(review): a relative file_path is resolved against this process's
    # cwd, not working_dir - confirm callers pass a path valid from here.
    before = _file_signature(file_path)

    _, model_name = call_ai(prompt, command, working_dir)

    after = _file_signature(file_path)
    if after is None:
        # File missing after the call - nothing was produced.
        return False, model_name

    # A newly created file counts as modified. Nanosecond mtime plus size
    # catches edits that a float-mtime ">" comparison can miss.
    return (before is None or after != before), model_name
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
if __name__ == "__main__":
    # Manual smoke check: show where ai-cli.sh is expected and preview the banner.
    print("Testing AI CLI wrapper...")
    print(f"AI CLI path: {AI_CLI_PATH}")
    print(f"AI CLI exists: {AI_CLI_PATH.exists()}")
    print("\nGit protection warning:")
    banner_preview = get_git_protection_warning()[:200]
    print(banner_preview + "...")
|
package/lib/embedding.py
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Embedding helper using Ollama's nomic-embed-text model.
|
|
4
|
+
AIDEV-NOTE: Requires ollama with nomic-embed-text model pulled.
|
|
5
|
+
AIDEV-NOTE: Embeddings are cached to disk for efficiency.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import hashlib
|
|
9
|
+
import json
|
|
10
|
+
import math
|
|
11
|
+
import os
|
|
12
|
+
import urllib.request
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import List, Optional, Tuple, Dict
|
|
15
|
+
|
|
16
|
+
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "nomic-embed-text")
|
|
17
|
+
OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434")
|
|
18
|
+
|
|
19
|
+
# Global cache - maps text hash to embedding
|
|
20
|
+
_embedding_cache: Dict[str, List[float]] = {}
|
|
21
|
+
_cache_file: Optional[Path] = None
|
|
22
|
+
_cache_dirty = False
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _text_hash(text: str) -> str:
|
|
26
|
+
"""Create a short hash of text for cache key."""
|
|
27
|
+
return hashlib.sha256(text.encode('utf-8')).hexdigest()[:16]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def set_cache_file(path: str) -> None:
    """Use ``path`` as the persistent embedding cache and load it right away."""
    global _cache_file
    _cache_file = Path(path)
    _load_cache()
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _load_cache() -> None:
    """
    Load the embedding cache from ``_cache_file`` into memory.

    Does nothing when no cache file is configured or the file does not exist.
    On any read or parse failure, or when the file does not hold a JSON
    object, the in-memory cache is reset to empty and the problem is
    reported on stderr.
    """
    global _embedding_cache
    import sys

    if not (_cache_file and _cache_file.exists()):
        return

    try:
        with open(_cache_file, 'r', encoding='utf-8') as f:
            loaded = json.load(f)
        # The cache is used as a dict keyed by hash. Any other JSON type would
        # break later lookups and inserts, so treat it as corruption.
        if not isinstance(loaded, dict):
            raise ValueError(f"expected a JSON object, got {type(loaded).__name__}")
        _embedding_cache = loaded
        print(f"[EMBED] Loaded {len(_embedding_cache)} cached embeddings", file=sys.stderr)
    except Exception as e:
        # Deliberately broad: a bad cache must never stop the caller.
        print(f"[EMBED] Cache load error: {e}", file=sys.stderr)
        _embedding_cache = {}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def save_cache() -> None:
    """
    Write the in-memory embedding cache to ``_cache_file`` if it has changed.

    Does nothing when no cache file is configured or nothing is dirty.
    The data is written to a sibling temporary file and then moved over the
    target with ``os.replace``, so an interrupted write cannot leave a
    truncated cache file behind. Failures are reported on stderr and leave
    the cache marked dirty so a later call retries.
    """
    global _cache_dirty
    import sys

    if not (_cache_file and _cache_dirty):
        return

    try:
        _cache_file.parent.mkdir(parents=True, exist_ok=True)
        # The PID in the name keeps processes from sharing one temp file.
        tmp_path = _cache_file.with_name(f"{_cache_file.name}.{os.getpid()}.tmp")
        try:
            with open(tmp_path, 'w') as f:
                json.dump(_embedding_cache, f)
            os.replace(tmp_path, _cache_file)
        finally:
            # After a successful replace the temp file is already gone.
            try:
                os.unlink(tmp_path)
            except FileNotFoundError:
                pass
        _cache_dirty = False
    except Exception as e:
        # Best-effort persistence: never let a cache write fail the caller.
        print(f"[EMBED] Cache save error: {e}", file=sys.stderr)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def get_embedding(text: str, use_cache: bool = True) -> Optional[List[float]]:
    """
    Get embedding vector for text using Ollama. Uses cache if available.

    Args:
        text: The text to embed
        use_cache: When True, look the text up in the in-memory cache first
            and store a newly fetched non-empty embedding there

    Returns:
        The first entry of the response's "embeddings" list (an empty list if
        the response has no such key), or None if the request or parsing fails.
    """
    global _cache_dirty
    import sys

    # Hash once; the same key serves the lookup and the later store.
    # NOTE(review): the key does not include EMBEDDING_MODEL, so a persisted
    # cache may return vectors from a previously configured model - confirm.
    key = _text_hash(text) if use_cache else None
    if key is not None and key in _embedding_cache:
        return _embedding_cache[key]

    try:
        req_data = json.dumps({"model": EMBEDDING_MODEL, "input": text}).encode('utf-8')
        req = urllib.request.Request(
            f"{OLLAMA_URL}/api/embed",
            data=req_data,
            headers={"Content-Type": "application/json"}
        )
        with urllib.request.urlopen(req, timeout=30) as response:
            data = json.loads(response.read().decode('utf-8'))
            embedding = data.get("embeddings", [[]])[0]

        # Store in cache (empty results are not cached so they can be retried)
        if key is not None and embedding:
            _embedding_cache[key] = embedding
            _cache_dirty = True

        return embedding
    except Exception as e:
        # Diagnostics go to stderr like the module's other messages, keeping
        # stdout clean for callers that capture it.
        print(f"Embedding error: {e}", file=sys.stderr)
        return None
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def get_file_embedding(file_path: str) -> Optional[List[float]]:
    """
    Get embedding for a file's contents.

    Args:
        file_path: Path of the text file to read and embed

    Returns:
        Whatever ``get_embedding`` returns for the file's text, or None if
        the file cannot be read.
    """
    import sys

    try:
        # Read as UTF-8 explicitly so the result does not depend on the locale;
        # the cache key is computed from the UTF-8 bytes as well.
        with open(file_path, 'r', encoding='utf-8') as f:
            return get_embedding(f.read())
    except Exception as e:
        # stderr, consistent with the module's other diagnostics
        print(f"File read error: {e}", file=sys.stderr)
        return None
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def cosine_similarity(a: List[float], b: List[float]) -> float:
    """
    Cosine of the angle between vectors ``a`` and ``b``.

    Returns 0.0 when either vector is empty or falsy, when the lengths
    differ, or when either vector has zero magnitude.
    """
    comparable = bool(a) and bool(b) and len(a) == len(b)
    if not comparable:
        return 0.0

    magnitude_a = math.sqrt(sum(v * v for v in a))
    magnitude_b = math.sqrt(sum(v * v for v in b))
    if magnitude_a == 0 or magnitude_b == 0:
        return 0.0

    inner = sum(p * q for p, q in zip(a, b))
    return inner / (magnitude_a * magnitude_b)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def is_similar(text1: str, text2: str, threshold: float = 0.9) -> bool:
    """
    Tell whether the embeddings of two texts reach ``threshold`` cosine similarity.

    Returns False when either embedding is unavailable or empty.
    """
    first_vec = get_embedding(text1)
    second_vec = get_embedding(text2)

    if first_vec and second_vec:
        return cosine_similarity(first_vec, second_vec) >= threshold
    return False
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def find_most_similar(
    query: str,
    candidates: List[str],
    top_k: int = 5
) -> List[Tuple[int, float, str]]:
    """
    Rank ``candidates`` by embedding similarity to ``query``.

    Returns up to ``top_k`` (index, similarity, text) tuples, highest
    similarity first. Candidates without an embedding are left out, and an
    empty list is returned when the query itself cannot be embedded.
    """
    query_vec = get_embedding(query)
    if not query_vec:
        return []

    scored = []
    for index, text in enumerate(candidates):
        vec = get_embedding(text)
        if not vec:
            continue
        scored.append((index, cosine_similarity(query_vec, vec), text))

    ranked = sorted(scored, key=lambda item: item[1], reverse=True)
    return ranked[:top_k]
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def check_novelty(
    new_code: str,
    existing_codes: List[str],
    threshold: float = 0.95
) -> Tuple[bool, float]:
    """
    Decide whether ``new_code`` differs enough from every entry in ``existing_codes``.

    Returns (is_novel, max_similarity), where ``is_novel`` is True when the
    highest similarity found stays below ``threshold``. If ``new_code``
    cannot be embedded the result is (True, 0.0). The embedding cache is
    saved once after all comparisons.
    """
    candidate_vec = get_embedding(new_code)
    if not candidate_vec:
        # No embedding means no comparison is possible; treat as novel.
        return True, 0.0

    closest = 0.0
    for prior in existing_codes:
        prior_vec = get_embedding(prior)
        if not prior_vec:
            continue
        score = cosine_similarity(candidate_vec, prior_vec)
        if score > closest:
            closest = score

    # One save for the whole batch of lookups.
    save_cache()

    return closest < threshold, closest
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
if __name__ == "__main__":
    # Manual self-test; get_embedding contacts the Ollama endpoint at OLLAMA_URL.
    # NOTE(review): the nesting of the similarity section under `if emb:` is
    # inferred - confirm against the packaged file.
    print("Testing embedding...")
    emb = get_embedding("def hello(): print('hello world')")
    if emb:
        print(f"Embedding (first 5 dims): {emb[:5]}")
        print(f"Full dimensions: {len(emb)}")

        # Compare one snippet against a near-duplicate and a different one
        code1 = "def add(a, b): return a + b"
        code2 = "def sum(x, y): return x + y"
        code3 = "def multiply(a, b): return a * b"

        sim12 = cosine_similarity(get_embedding(code1), get_embedding(code2))
        sim13 = cosine_similarity(get_embedding(code1), get_embedding(code3))

        print(f"\nSimilarity (add vs sum): {sim12:.4f}")
        print(f"Similarity (add vs multiply): {sim13:.4f}")
|