reprompt-cli 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- reprompt/__init__.py +5 -0
- reprompt/adapters/__init__.py +3 -0
- reprompt/adapters/base.py +25 -0
- reprompt/adapters/claude_code.py +140 -0
- reprompt/adapters/openclaw.py +79 -0
- reprompt/cli.py +177 -0
- reprompt/config.py +44 -0
- reprompt/core/__init__.py +1 -0
- reprompt/core/analyzer.py +68 -0
- reprompt/core/dedup.py +89 -0
- reprompt/core/library.py +91 -0
- reprompt/core/models.py +24 -0
- reprompt/core/pipeline.py +172 -0
- reprompt/embeddings/__init__.py +3 -0
- reprompt/embeddings/base.py +21 -0
- reprompt/embeddings/ollama.py +54 -0
- reprompt/embeddings/tfidf.py +22 -0
- reprompt/output/__init__.py +1 -0
- reprompt/output/json_out.py +11 -0
- reprompt/output/markdown.py +46 -0
- reprompt/output/terminal.py +65 -0
- reprompt/py.typed +0 -0
- reprompt/storage/__init__.py +1 -0
- reprompt/storage/db.py +314 -0
- reprompt_cli-0.1.1.dist-info/METADATA +198 -0
- reprompt_cli-0.1.1.dist-info/RECORD +29 -0
- reprompt_cli-0.1.1.dist-info/WHEEL +4 -0
- reprompt_cli-0.1.1.dist-info/entry_points.txt +2 -0
- reprompt_cli-0.1.1.dist-info/licenses/LICENSE +21 -0
reprompt/adapters/base.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Base adapter interface for AI coding session parsers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from reprompt.core.models import Prompt
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class BaseAdapter(ABC):
    """Abstract interface implemented by every session-log adapter.

    Concrete adapters (e.g. Claude Code, OpenClaw) declare where their
    tool stores session files and how to turn one session file into
    ``Prompt`` objects.
    """

    # Short identifier for the adapter, e.g. "claude-code".
    name: str
    # Default directory (with '~') where the tool keeps session files.
    default_session_path: str

    @abstractmethod
    def parse_session(self, path: Path) -> list[Prompt]:
        """Parse a single session file and return the prompts it contains."""

    @abstractmethod
    def detect_installed(self) -> bool:
        """Return True when the tool's session directory is present."""
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
"""Claude Code session adapter."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
import re
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from reprompt.adapters.base import BaseAdapter
|
|
11
|
+
from reprompt.core.models import Prompt
|
|
12
|
+
|
|
13
|
+
# Exact messages that carry no reusable prompt content: short Chinese and
# English acknowledgements, tool noise, and bare menu answers.
SKIP_EXACT = {
    # Chinese acknowledgements
    "\u597d\u7684",
    "\u662f\u7684",
    "\u53ef\u4ee5",
    "\u597d",
    "\u5bf9",
    "\u884c",
    "\u55ef",
    # English acknowledgements
    "OK", "ok", "Ok",
    "sure", "Sure",
    "yes", "Yes",
    "Done", "done",
    "Sent", "sent",
    # Tool/system noise and bare single-choice answers
    "Tool loaded.",
    "1", "2", "3",
    "A", "B", "C", "D",
}

# Prompts starting with any of these are tool/system output, not user intent.
SKIP_PREFIXES = (
    "<",
    "Tool loaded",
    "Base directory for this skill",
)


def should_keep_prompt(text: str) -> bool:
    """Return True when *text* looks like a real, reusable user prompt.

    Rejects very short messages, known acknowledgements, tool/system
    prefixes, and strings containing no Latin letter or CJK ideograph.
    """
    candidate = text.strip()
    if len(candidate) < 10 or candidate in SKIP_EXACT:
        return False
    # str.startswith accepts a tuple of prefixes -- one C-level call.
    if candidate.startswith(SKIP_PREFIXES):
        return False
    # Must contain at least one Latin letter or CJK ideograph.
    return re.search(r"[a-zA-Z\u4e00-\u9fff]", candidate) is not None
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _extract_text(message: dict[str, object]) -> str:
|
|
64
|
+
"""Extract text from a message, handling both string and list content."""
|
|
65
|
+
content = message.get("content", "")
|
|
66
|
+
if isinstance(content, list):
|
|
67
|
+
parts = [
|
|
68
|
+
p.get("text", "") for p in content if isinstance(p, dict) and p.get("type") == "text"
|
|
69
|
+
]
|
|
70
|
+
return " ".join(parts).strip()
|
|
71
|
+
return str(content).strip()
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class ClaudeCodeAdapter(BaseAdapter):
    """Adapter for Claude Code JSONL session files.

    Each line of a session file is a JSON object; user messages carry a
    nested ``message`` payload whose text is extracted and noise-filtered.
    """

    name = "claude-code"
    default_session_path = "~/.claude/projects"

    def __init__(self, session_path: Path | None = None) -> None:
        # Custom path lets tests and config point at another directory.
        self._session_path = session_path or Path(os.path.expanduser(self.default_session_path))

    def detect_installed(self) -> bool:
        """Check if Claude Code session directory exists."""
        return self._session_path.is_dir()

    def parse_session(self, path: Path) -> list[Prompt]:
        """Parse a Claude Code JSONL session file into Prompt objects.

        Malformed lines (invalid JSON, or JSON that is not an object) are
        skipped so a single bad line cannot abort the whole session.
        """
        prompts: list[Prompt] = []
        session_id = path.stem

        with open(path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    entry = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # Fix: a line may hold valid JSON that is not an object
                # (bare string/number/list); .get() on it would raise
                # AttributeError and crash the scan.
                if not isinstance(entry, dict):
                    continue

                # Only process user messages
                if entry.get("type") != "user":
                    continue

                message = entry.get("message", {})
                # Fix: guard against a non-dict "message" payload as well.
                if not isinstance(message, dict) or message.get("role") != "user":
                    continue

                text = _extract_text(message)
                if not should_keep_prompt(text):
                    continue

                project = self._project_from_path(str(path))
                timestamp = entry.get("timestamp", "")

                prompts.append(
                    Prompt(
                        text=text,
                        source=self.name,
                        session_id=session_id,
                        project=project,
                        timestamp=timestamp,
                    )
                )

        return prompts

    def _project_from_path(self, file_path: str) -> str:
        """Extract project name from Claude Code session path.

        Path format: ~/.claude/projects/-Users-chris-projects-myproject/session.jsonl
        The parent directory name has dashes replacing path separators, so
        everything after the first "projects" segment is the project name.
        """
        parent = os.path.basename(os.path.dirname(file_path))
        parts = parent.split("-")
        for i, p in enumerate(parts):
            if p == "projects" and i + 1 < len(parts):
                return "-".join(parts[i + 1 :])
        # No "projects" marker: fall back to the raw directory name.
        return parent
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""OpenClaw/OpenCode session adapter."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from reprompt.adapters.base import BaseAdapter
|
|
10
|
+
from reprompt.adapters.claude_code import should_keep_prompt
|
|
11
|
+
from reprompt.core.models import Prompt
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class OpenClawAdapter(BaseAdapter):
    """Adapter for OpenClaw/OpenCode JSONL session files.

    OpenClaw sessions use a simpler format than Claude Code:
    - No 'type' wrapper -- directly has 'role' field
    - 'content' is always a string (not list)
    - Session path: ~/.opencode/sessions/
    """

    name = "openclaw"
    default_session_path = "~/.opencode/sessions"

    def __init__(self, session_path: Path | None = None) -> None:
        # Custom path lets tests and config point at another directory.
        self._session_path = session_path or Path(os.path.expanduser(self.default_session_path))

    def detect_installed(self) -> bool:
        """Check if OpenClaw session directory exists."""
        return self._session_path.is_dir()

    def parse_session(self, path: Path) -> list[Prompt]:
        """Parse an OpenClaw JSONL session file into Prompt objects.

        Malformed lines (invalid JSON, or JSON that is not an object) are
        skipped so a single bad line cannot abort the whole session.
        """
        prompts: list[Prompt] = []

        with open(path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    entry = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # Fix: a line may hold valid JSON that is not an object
                # (bare string/number/list); .get() on it would raise
                # AttributeError and crash the scan.
                if not isinstance(entry, dict):
                    continue

                # Only process user messages
                if entry.get("role") != "user":
                    continue

                text = str(entry.get("content", "")).strip()
                if not should_keep_prompt(text):
                    continue

                session_id = entry.get("session_id", path.stem)
                timestamp = entry.get("timestamp", "")
                project = self._project_from_path(str(path))

                prompts.append(
                    Prompt(
                        text=text,
                        source=self.name,
                        session_id=session_id,
                        project=project,
                        timestamp=timestamp,
                    )
                )

        return prompts

    def _project_from_path(self, file_path: str) -> str:
        """Extract project name from OpenClaw session path.

        Path format: ~/.opencode/sessions/<project-name>/session.jsonl
        Files placed directly under sessions/ have no project -> "".
        """
        parent = os.path.basename(os.path.dirname(file_path))
        if parent == "sessions":
            return ""
        return parent
|
reprompt/cli.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
"""CLI entry point."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import typer
|
|
8
|
+
from rich.console import Console
|
|
9
|
+
|
|
10
|
+
from reprompt import __version__
|
|
11
|
+
|
|
12
|
+
# Top-level Typer application; with no arguments it prints help and exits.
app = typer.Typer(
    name="reprompt",
    help="Discover, analyze, and evolve your best prompts from AI coding sessions.",
    no_args_is_help=True,
)
# Shared Rich console used by every command for styled terminal output.
console = Console()
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _version_callback(value: bool) -> None:
|
|
21
|
+
if value:
|
|
22
|
+
typer.echo(f"reprompt {__version__}")
|
|
23
|
+
raise typer.Exit()
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@app.callback()
def main(
    # is_eager makes --version run (and exit) before any subcommand parsing.
    version: bool = typer.Option(
        False, "--version", "-V", callback=_version_callback, is_eager=True
    ),
) -> None:
    """reprompt -- Discover, analyze, and evolve your best prompts from AI coding sessions."""
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@app.command()
def scan(
    source: str | None = typer.Option(None, help="Source adapter (claude-code, openclaw)"),
    path: str | None = typer.Option(None, help="Custom session path"),
) -> None:
    """Scan AI tool sessions for prompts."""
    # Imported lazily so plain `reprompt --help` stays fast.
    from reprompt.config import Settings
    from reprompt.core.pipeline import run_scan

    result = run_scan(source=source, path=path, settings=Settings())

    console.print("[bold]Scan complete[/bold]")
    console.print(f" Sessions scanned: {result.sessions_scanned}")
    console.print(f" Prompts found: {result.total_parsed}")
    console.print(f" Unique: {result.unique_after_dedup}")
    console.print(f" Duplicates: {result.duplicates}")
    console.print(f" New stored: {result.new_stored}")
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@app.command()
def report(
    format: str = typer.Option("terminal", help="Output format: terminal, json"),
    top: int = typer.Option(20, help="Number of top terms to show"),
) -> None:
    """Generate analytics report."""
    # Imported lazily so plain `reprompt --help` stays fast.
    from reprompt.config import Settings
    from reprompt.core.pipeline import build_report_data
    from reprompt.output.json_out import format_json_report
    from reprompt.output.terminal import render_report

    # NOTE(review): `top` is accepted but never passed on -- confirm whether
    # build_report_data should receive it.
    data = build_report_data(settings=Settings())

    renderer = format_json_report if format == "json" else render_report
    console.print(renderer(data))
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
@app.command()
def library(
    category: str | None = typer.Option(None, help="Filter by category"),
    export: str | None = typer.Argument(None, help="Export to file path (Markdown)"),
) -> None:
    """Show or export your prompt library."""
    # Imported lazily so plain `reprompt --help` stays fast.
    from reprompt.config import Settings
    from reprompt.output.markdown import export_library_markdown
    from reprompt.storage.db import PromptDB

    patterns = PromptDB(Settings().db_path).get_patterns(category=category)

    if export:
        Path(export).write_text(export_library_markdown(patterns))
        console.print(f"Library exported to {export}")
        return

    if not patterns:
        console.print("No patterns yet. Run [bold]reprompt scan[/bold] first.")
        return

    # Table import is deferred until we actually render one.
    from rich.table import Table

    table = Table(title="Prompt Library")
    table.add_column("#", style="dim", width=4)
    table.add_column("Pattern", max_width=50)
    table.add_column("Uses", justify="right")
    table.add_column("Category")
    for index, pattern in enumerate(patterns, 1):
        table.add_row(
            str(index),
            str(pattern.get("pattern_text", ""))[:50],
            str(pattern.get("frequency", 0)),
            str(pattern.get("category", "")),
        )
    console.print(table)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
@app.command()
def status() -> None:
    """Show database statistics."""
    # Imported lazily so plain `reprompt --help` stays fast.
    from reprompt.config import Settings
    from reprompt.storage.db import PromptDB

    settings = Settings()
    stats = PromptDB(settings.db_path).get_stats()

    console.print("[bold]reprompt status[/bold]")
    console.print(f" Total prompts: {stats.get('total_prompts', 0)}")
    console.print(f" Unique prompts: {stats.get('unique_prompts', 0)}")
    console.print(f" Sessions: {stats.get('sessions_processed', 0)}")
    console.print(f" Patterns: {stats.get('patterns', 0)}")
    console.print(f" DB path: {settings.db_path}")
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
@app.command()
def purge(
    older_than: str = typer.Option("90d", help="Delete prompts older than (e.g. 90d)"),
) -> None:
    """Clean up old data."""
    import re

    from reprompt.config import Settings
    from reprompt.storage.db import PromptDB

    # Accept "90d", "90", or "90D" -- a bare day count with optional suffix.
    match = re.fullmatch(r"(\d+)d?", older_than.strip(), re.IGNORECASE)
    if match is None:
        raise typer.BadParameter("Use format like '90d' or '30'")

    days = int(match.group(1))
    deleted = PromptDB(Settings().db_path).purge_old_prompts(days)
    console.print(f"Purged {deleted} prompts older than {days} days")
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
@app.command("install-hook")
|
|
153
|
+
def install_hook(
|
|
154
|
+
source: str = typer.Option("claude-code", help="AI tool to install hook for"),
|
|
155
|
+
) -> None:
|
|
156
|
+
"""Install post-session hook for automatic scanning."""
|
|
157
|
+
home = Path.home()
|
|
158
|
+
|
|
159
|
+
if source == "claude-code":
|
|
160
|
+
hooks_dir = home / ".claude" / "hooks"
|
|
161
|
+
hook_path = hooks_dir / "reprompt-scan.sh"
|
|
162
|
+
|
|
163
|
+
if hook_path.exists():
|
|
164
|
+
console.print(f"Hook already exists at {hook_path}")
|
|
165
|
+
return
|
|
166
|
+
|
|
167
|
+
if not (home / ".claude").exists():
|
|
168
|
+
console.print("[yellow]Claude Code not detected (~/.claude/ not found)[/yellow]")
|
|
169
|
+
return
|
|
170
|
+
|
|
171
|
+
hooks_dir.mkdir(parents=True, exist_ok=True)
|
|
172
|
+
hook_path.write_text("#!/bin/sh\nreprompt scan --source claude-code\n")
|
|
173
|
+
hook_path.chmod(0o755)
|
|
174
|
+
console.print(f"[green]Hook installed at {hook_path}[/green]")
|
|
175
|
+
console.print("reprompt will automatically scan after Claude Code sessions.")
|
|
176
|
+
else:
|
|
177
|
+
console.print(f"[yellow]Hook installation for '{source}' not yet supported[/yellow]")
|
reprompt/config.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Configuration with env var override support."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from pydantic_settings import BaseSettings
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _default_db_path() -> str:
|
|
12
|
+
if os.name == "nt":
|
|
13
|
+
base = Path(os.environ.get("LOCALAPPDATA", "~"))
|
|
14
|
+
elif hasattr(os, "uname") and os.uname().sysname == "Darwin":
|
|
15
|
+
base = Path("~/Library/Application Support")
|
|
16
|
+
else:
|
|
17
|
+
base = Path(os.environ.get("XDG_DATA_HOME", "~/.local/share"))
|
|
18
|
+
return str(base / "reprompt" / "reprompt.db")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class Settings(BaseSettings):
    """Application configuration.

    Every field can be overridden through an environment variable with the
    ``REPROMPT_`` prefix, e.g. ``REPROMPT_DB_PATH`` or ``REPROMPT_EMBEDDING_BACKEND``.
    """

    model_config = {"env_prefix": "REPROMPT_"}

    # Embedding
    embedding_backend: str = "tfidf"  # "tfidf" or "ollama" (see dedup's embedder factory)
    ollama_url: str = "http://localhost:11434"  # local Ollama server endpoint

    # Storage
    # Default is computed once at import time; '~' is already expanded here.
    db_path: Path = Path(os.path.expanduser(_default_db_path()))

    # Dedup
    dedup_threshold: float = 0.85  # cosine similarity at/above this counts as a duplicate

    # Library
    # NOTE(review): presumably the minimum occurrences before a pattern enters
    # the library -- confirm against core/library.py.
    library_min_frequency: int = 3
    library_categories: list[str] = [
        "debug",
        "implement",
        "review",
        "test",
        "refactor",
        "explain",
        "config",
    ]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Core domain models and logic."""
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""TF-IDF analysis and K-means clustering."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
from sklearn.cluster import KMeans
|
|
9
|
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def compute_tfidf_stats(texts: list[str], top_n: int = 20) -> list[dict[str, Any]]:
    """Compute TF-IDF stats, return top N terms with scores.

    Returns list of dicts: [{"term": str, "count": int, "df": int, "tfidf_avg": float}],
    sorted by average TF-IDF score, highest first.
    """
    # Empty corpus: nothing to vectorize (TfidfVectorizer would raise).
    if not texts:
        return []

    vectorizer = TfidfVectorizer(max_features=5000)
    tfidf_matrix = vectorizer.fit_transform(texts)
    feature_names = vectorizer.get_feature_names_out()

    # Average TF-IDF score per term across all documents
    avg_scores = np.asarray(tfidf_matrix.mean(axis=0)).flatten()

    # Document frequency (number of docs containing each term)
    df = np.asarray((tfidf_matrix > 0).sum(axis=0)).flatten()

    # Sum of TF-IDF weights (approximate count)
    count = np.asarray(tfidf_matrix.sum(axis=0)).flatten()

    results = []
    for i, term in enumerate(feature_names):
        results.append(
            {
                "term": term,
                # NOTE(review): count[i] is already the summed weight over all
                # documents; multiplying by len(texts) scales it again (it is
                # NOT a mean being un-averaged) -- confirm the intended value.
                "count": int(count[i] * len(texts)),
                "df": int(df[i]),
                "tfidf_avg": float(avg_scores[i]),
            }
        )

    results.sort(key=lambda x: x["tfidf_avg"], reverse=True)
    return results[:top_n]
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def cluster_prompts(texts: list[str], n_clusters: int = 5) -> dict[int, list[str]]:
    """Cluster prompts using K-means on TF-IDF vectors.

    Returns {cluster_id: [texts]}
    """
    if not texts:
        return {}

    # K-means cannot have more clusters than samples.
    k = min(n_clusters, len(texts))

    matrix = TfidfVectorizer(max_features=5000).fit_transform(texts)
    labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(matrix)

    grouped: dict[int, list[str]] = {}
    for text, label in zip(texts, labels):
        grouped.setdefault(int(label), []).append(text)

    return grouped
|
reprompt/core/dedup.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""Two-layer deduplication engine.
|
|
2
|
+
|
|
3
|
+
L0: SHA-256 exact hash dedup (always runs)
|
|
4
|
+
L1: TF-IDF cosine similarity dedup (runs on hash-unique prompts)
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from reprompt.core.models import Prompt
|
|
10
|
+
from reprompt.embeddings.base import BaseEmbedder
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _get_embedder(backend: str) -> BaseEmbedder:
|
|
14
|
+
"""Factory function to create an embedder by name."""
|
|
15
|
+
if backend == "tfidf":
|
|
16
|
+
from reprompt.embeddings.tfidf import TfidfEmbedder
|
|
17
|
+
|
|
18
|
+
return TfidfEmbedder()
|
|
19
|
+
elif backend == "ollama":
|
|
20
|
+
from reprompt.embeddings.ollama import OllamaEmbedder
|
|
21
|
+
|
|
22
|
+
return OllamaEmbedder()
|
|
23
|
+
else:
|
|
24
|
+
raise ValueError(f"Unknown embedding backend: {backend}")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class DedupEngine:
    """Two-layer deduplication: exact hash then semantic similarity."""

    def __init__(self, backend: str = "tfidf", threshold: float = 0.85) -> None:
        # backend: embedding backend name, resolved via _get_embedder().
        # threshold: cosine similarity at/above which two prompts match.
        self._backend = backend
        self._threshold = threshold

    def _hash_pass(self, prompts: list[Prompt]) -> tuple[list[Prompt], list[Prompt]]:
        """L0: keep only the first prompt seen for each exact content hash."""
        seen: set[str] = set()
        unique: list[Prompt] = []
        dupes: list[Prompt] = []
        for prompt in prompts:
            if prompt.hash in seen:
                dupes.append(prompt)
            else:
                seen.add(prompt.hash)
                unique.append(prompt)
        return unique, dupes

    def deduplicate(self, prompts: list[Prompt]) -> tuple[list[Prompt], list[Prompt]]:
        """Deduplicate prompts using hash then semantic similarity.

        Returns:
            (unique_prompts, duplicate_prompts)
        """
        if not prompts:
            return [], []

        survivors, dupes = self._hash_pass(prompts)

        # L1 semantic pass needs at least two candidates to compare.
        if len(survivors) < 2:
            return survivors, dupes

        embedder = _get_embedder(self._backend)
        embeddings = embedder.embed([p.text for p in survivors])
        if embeddings.size == 0:
            # Embedding failed/empty: fall back to hash-only result.
            return survivors, dupes

        # Greedy pairwise scan: a later prompt is a duplicate of the first
        # earlier non-duplicate prompt it is similar enough to.
        flagged = [False] * len(survivors)
        keep: list[Prompt] = []
        for i, prompt in enumerate(survivors):
            if flagged[i]:
                dupes.append(prompt)
                continue
            keep.append(prompt)
            for j in range(i + 1, len(survivors)):
                if flagged[j]:
                    continue
                if embedder.cosine_similarity(embeddings[i], embeddings[j]) >= self._threshold:
                    flagged[j] = True

        return keep, dupes
|