kc-cli 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kc/__init__.py +5 -0
- kc/__main__.py +11 -0
- kc/artifacts/__init__.py +1 -0
- kc/artifacts/diff.py +76 -0
- kc/artifacts/frontmatter.py +26 -0
- kc/artifacts/markdown.py +116 -0
- kc/atomic_write.py +33 -0
- kc/cli.py +284 -0
- kc/commands/__init__.py +1 -0
- kc/commands/artifact.py +1190 -0
- kc/commands/citation.py +231 -0
- kc/commands/common.py +346 -0
- kc/commands/conformance.py +293 -0
- kc/commands/context.py +190 -0
- kc/commands/doctor.py +81 -0
- kc/commands/eval.py +133 -0
- kc/commands/export.py +97 -0
- kc/commands/guide.py +571 -0
- kc/commands/index.py +54 -0
- kc/commands/init.py +207 -0
- kc/commands/lint.py +238 -0
- kc/commands/source.py +464 -0
- kc/commands/status.py +52 -0
- kc/commands/task.py +260 -0
- kc/config.py +127 -0
- kc/embedding_models/potion-base-8M/README.md +97 -0
- kc/embedding_models/potion-base-8M/config.json +13 -0
- kc/embedding_models/potion-base-8M/model.safetensors +0 -0
- kc/embedding_models/potion-base-8M/modules.json +14 -0
- kc/embedding_models/potion-base-8M/tokenizer.json +1 -0
- kc/errors.py +141 -0
- kc/fingerprints.py +35 -0
- kc/ids.py +23 -0
- kc/locks.py +65 -0
- kc/models/__init__.py +17 -0
- kc/models/artifact.py +34 -0
- kc/models/citation.py +60 -0
- kc/models/context.py +23 -0
- kc/models/eval.py +21 -0
- kc/models/plan.py +37 -0
- kc/models/source.py +37 -0
- kc/models/source_range.py +29 -0
- kc/models/source_revision.py +19 -0
- kc/models/task.py +35 -0
- kc/output.py +838 -0
- kc/paths.py +126 -0
- kc/provenance/__init__.py +1 -0
- kc/provenance/citations.py +296 -0
- kc/search/__init__.py +1 -0
- kc/search/extract.py +268 -0
- kc/search/fts.py +284 -0
- kc/search/semantic.py +346 -0
- kc/store/__init__.py +1 -0
- kc/store/jsonl.py +55 -0
- kc/store/sqlite.py +444 -0
- kc/store/transaction.py +67 -0
- kc/templates/agents/skills/kc/SKILL.md +282 -0
- kc/templates/agents/skills/kc/agents/openai.yaml +5 -0
- kc/templates/agents/skills/kc/scripts/resolve_query_citations.py +134 -0
- kc/workspace.py +98 -0
- kc_cli-0.4.0.dist-info/METADATA +522 -0
- kc_cli-0.4.0.dist-info/RECORD +65 -0
- kc_cli-0.4.0.dist-info/WHEEL +4 -0
- kc_cli-0.4.0.dist-info/entry_points.txt +2 -0
- kc_cli-0.4.0.dist-info/licenses/LICENSE +21 -0
kc/commands/task.py
ADDED
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import UTC, datetime
|
|
4
|
+
from typing import Annotated
|
|
5
|
+
|
|
6
|
+
import typer
|
|
7
|
+
|
|
8
|
+
from kc.atomic_write import atomic_write_text
|
|
9
|
+
from kc.commands.common import json_dumps, parse_input_json, run, validate_payload_schema
|
|
10
|
+
from kc.config import load_config
|
|
11
|
+
from kc.errors import KcError
|
|
12
|
+
from kc.ids import new_id
|
|
13
|
+
from kc.models.task import TaskRecord
|
|
14
|
+
from kc.output import emit_success
|
|
15
|
+
from kc.paths import current_paths
|
|
16
|
+
from kc.search.fts import ensure_index, search_ranges
|
|
17
|
+
from kc.store.sqlite import load_task, save_task
|
|
18
|
+
from kc.store.transaction import mutation_transaction
|
|
19
|
+
|
|
20
|
+
# Typer sub-application; the task commands in this module register on it
# through ``@app.command``.
app = typer.Typer(help="Manage durable task records for external-agent workflows.")

# Maps each waiting task status to the name of the event expected next.
# Statuses that are not listed have no expected event.
STATE_EVENTS = {
    "awaiting_agent": "artifact_created",
    "awaiting_validation": "artifact_validated",
    "awaiting_apply": "artifact_applied",
}
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _event_schema(event: str | None) -> dict | None:
    """Return the payload schema description for ``event``.

    Artifact lifecycle events (``artifact_created``, ``artifact_validated``,
    ``artifact_apply_dry_run``, ``artifact_applied``) require a string
    ``path`` and also describe a boolean ``valid`` property. Any event whose
    name starts with ``blocked_`` requires a string ``reason`` and also
    describes a string ``path`` property.

    Returns ``None`` when ``event`` is ``None``, empty, or not recognised.
    """
    if not event:
        return None

    if event.startswith("blocked_"):
        required_key = "reason"
        properties = {"reason": {"type": "string"}, "path": {"type": "string"}}
    elif event in (
        "artifact_created",
        "artifact_validated",
        "artifact_apply_dry_run",
        "artifact_applied",
    ):
        required_key = "path"
        properties = {"path": {"type": "string"}, "valid": {"type": "boolean"}}
    else:
        return None

    return {"type": "object", "required": [required_key], "properties": properties}
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _next_commands_for_status(task: TaskRecord) -> list[str]:
    """Return the shell command hints that move ``task`` out of its status.

    The first entry of ``task.target_artifacts`` is used as the artifact
    path. It is quoted with ``shlex.quote`` (POSIX shell rules) so that a
    path containing spaces or shell metacharacters still produces a command
    that can be pasted verbatim; paths made only of safe characters are
    emitted unchanged. When the task has no targets, the literal placeholder
    ``<artifact>`` is emitted instead.

    Returns:
        The commands for the ``awaiting_agent``, ``awaiting_validation`` or
        ``awaiting_apply`` status, or an empty list for any other status.
    """
    import shlex  # local import: only needed to quote the artifact path

    if task.target_artifacts:
        target = shlex.quote(str(task.target_artifacts[0]))
    else:
        # Placeholder for the reader to fill in; deliberately left unquoted.
        target = "<artifact>"

    if task.status == "awaiting_agent":
        return [f"kc task resume --task-id {task.task_id} --event artifact_created --input @event.json"]
    if task.status == "awaiting_validation":
        return [
            f"kc artifact validate --file {target}",
            f"kc task resume --task-id {task.task_id} --event artifact_validated --input @event.json",
        ]
    if task.status == "awaiting_apply":
        return [
            f"kc artifact diff --file {target}",
            f"kc artifact apply --file {target} --dry-run",
            f"kc artifact apply --file {target} --yes",
            f"kc task resume --task-id {task.task_id} --event artifact_applied --input @event.json",
        ]
    return []
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _set_expected_event(task: TaskRecord) -> None:
    """Refresh the status-derived fields of ``task`` in place.

    Sets ``expected_event_name`` from ``STATE_EVENTS`` (``None`` when the
    status has no entry), ``expected_event_schema`` from that event name,
    and ``next_commands`` from the current status.
    """
    event_name = STATE_EVENTS.get(task.status)
    task.expected_event_name = event_name
    task.expected_event_schema = _event_schema(event_name)
    task.next_commands = _next_commands_for_status(task)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _now() -> str:
|
|
71
|
+
return datetime.now(UTC).isoformat()
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@app.command("start", help="Create a task, gather candidate ranges, and optionally enter awaiting-agent state.")
def start(
    goal: Annotated[str, typer.Option("--goal", help="Knowledge-work goal.")],
    shape: Annotated[
        str, typer.Option("--shape", help="Expected artifact/answer shape.")
    ] = "knowledge_page",
    domain: Annotated[list[str] | None, typer.Option("--domain", help="Domain tag.")] = None,
    target: Annotated[
        list[str] | None, typer.Option("--target", help="Target artifact path.")
    ] = None,
    await_agent: Annotated[bool, typer.Option("--await-agent/--no-await-agent")] = True,
) -> None:
    """Create a task record seeded with candidate ranges found for ``goal``.

    The record is saved with ``save_task`` and mirrored to
    ``<tasks_dir>/<task_id>.json`` inside one ``mutation_transaction``. With
    ``--await-agent`` the task starts in ``awaiting_agent``; otherwise it is
    created as ``completed``.
    """

    def _run() -> None:
        paths = current_paths()
        timestamp = _now()
        ensure_index(paths.sqlite_path, paths.sources_jsonl, paths.ranges_jsonl)

        # Only the first --domain value is passed to the search.
        search_domain = domain[0] if domain else None
        ranges = search_ranges(paths.sqlite_path, goal, domain=search_domain, limit=20)

        artifact = target[0] if target else "<artifact>"
        if await_agent:
            initial_status = "awaiting_agent"
            initial_event = "artifact_created"
            initial_schema = {
                "type": "object",
                "required": ["path"],
                "properties": {"path": {"type": "string"}},
            }
        else:
            initial_status = "completed"
            initial_event = None
            initial_schema = None

        task = TaskRecord(
            task_id=new_id("task"),
            goal=goal,
            status=initial_status,
            created_at=timestamp,
            updated_at=timestamp,
            shape=shape,
            domain=list(domain or []),
            candidate_sources=sorted({item["source_id"] for item in ranges}),
            candidate_ranges=[item["range_id"] for item in ranges],
            target_artifacts=list(target or []),
            agent_instructions=[
                "Read the candidate source ranges.",
                "Create or update the target artifact yourself.",
                "Do not add material claims without kc citation tokens.",
                "Run kc artifact validate before apply.",
            ],
            next_commands=[
                f"kc artifact validate --file {artifact}",
                f"kc artifact diff --file {artifact}",
                f"kc artifact apply --file {artifact} --dry-run",
            ],
            expected_event_name=initial_event,
            expected_event_schema=initial_schema,
        )
        # Reassigns expected_event_name, expected_event_schema and
        # next_commands from the task status, replacing the values above.
        _set_expected_event(task)

        task_file = paths.tasks_dir / f"{task.task_id}.json"
        with mutation_transaction(paths, "task.start", [task_file]) as tx:
            save_task(paths.sqlite_path, task)
            paths.tasks_dir.mkdir(parents=True, exist_ok=True)
            atomic_write_text(task_file, json_dumps(task.model_dump(mode="json")))
            tx.commit({"task_id": task.task_id})

        # The configured waiting exit code applies only when the task is left
        # waiting and the config enables it; otherwise the exit code is 0.
        config = load_config(paths.root)
        exit_code = 0
        if await_agent and config.enable_wait_exit_code:
            exit_code = config.waiting_exit_code

        resume_command = None
        if await_agent:
            resume_command = (
                f"kc task resume --task-id {task.task_id} --event artifact_created --input @event.json"
            )

        emit_success(
            "task.start",
            {
                "task": task.model_dump(mode="json"),
                "candidate_ranges": ranges,
                "resume_command": resume_command,
            },
            exit_code=exit_code,
        )

    run("task.start", _run)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
@app.command("status", help="Show a compact task status and next commands.")
def status(task_id: Annotated[str, typer.Option("--task-id", help="Task ID.")]) -> None:
    """Emit the id, status, last update time and stored next commands of a task.

    The ``next_commands`` reported here are the ones stored on the record;
    they are not recomputed. Raises ``KcError`` with code
    ``KC_TASK_NOT_FOUND`` when no task has the given id.
    """

    def _run() -> None:
        record = load_task(current_paths().sqlite_path, task_id)
        if record is None:
            raise KcError(code="KC_TASK_NOT_FOUND", message=f"Task not found: {task_id}")

        summary = {
            "task_id": record.task_id,
            "status": record.status,
            "updated_at": record.updated_at,
            "next_commands": record.next_commands,
        }
        emit_success("task.status", summary, target={"task_id": task_id})

    run("task.status", _run)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
@app.command("next", help="Return state-specific next commands for a task.")
def next_command(task_id: Annotated[str, typer.Option("--task-id", help="Task ID.")]) -> None:
    """Emit the expected event, its schema and the next commands for a task.

    The three fields are recomputed from the task's current status on the
    loaded record; the record is not saved back. Raises ``KcError`` with
    code ``KC_TASK_NOT_FOUND`` when no task has the given id.
    """

    def _run() -> None:
        record = load_task(current_paths().sqlite_path, task_id)
        if record is None:
            raise KcError(code="KC_TASK_NOT_FOUND", message=f"Task not found: {task_id}")

        # Refresh the derived fields in memory only.
        _set_expected_event(record)
        guidance = {
            "task_id": record.task_id,
            "status": record.status,
            "expected_event_name": record.expected_event_name,
            "expected_event_schema": record.expected_event_schema,
            "next_commands": record.next_commands,
        }
        emit_success("task.next", guidance, target={"task_id": task_id})

    run("task.next", _run)
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
@app.command("inspect", help="Show the full stored task record.")
def inspect(task_id: Annotated[str, typer.Option("--task-id", help="Task ID.")]) -> None:
    """Emit the complete stored record of a task as JSON-compatible data.

    Raises ``KcError`` with code ``KC_TASK_NOT_FOUND`` when no task has the
    given id.
    """

    def _run() -> None:
        record = load_task(current_paths().sqlite_path, task_id)
        if record is None:
            raise KcError(code="KC_TASK_NOT_FOUND", message=f"Task not found: {task_id}")

        dumped = {"task": record.model_dump(mode="json")}
        emit_success("task.inspect", dumped, target={"task_id": task_id})

    run("task.inspect", _run)
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
@app.command("resume", help="Resume an awaiting task with a structured event payload.")
def resume(
    task_id: Annotated[str, typer.Option("--task-id", help="Task ID.")],
    event: Annotated[str, typer.Option("--event", help="Event name.")],
    input_data: Annotated[str, typer.Option("--input", help="Inline JSON or @file.")],
) -> None:
    """Record an event on a waiting task and advance its status.

    The event must be the task's stored ``expected_event_name`` or one of
    the two ``blocked_*`` events. The payload is validated against the
    ``blocked_*`` schema for blocked events and against the task's stored
    ``expected_event_schema`` otherwise. The updated record is saved with
    ``save_task`` and mirrored to ``<tasks_dir>/<task_id>.json`` inside one
    ``mutation_transaction``.

    Raises:
        KcError: ``KC_TASK_NOT_FOUND`` when no task has the given id,
            ``KC_TASK_NOT_WAITING`` when the task is not in an
            ``awaiting_*`` status, ``KC_EVENT_INVALID`` when the event is
            neither the expected one nor a blocked event.
    """

    def _run() -> None:
        paths = current_paths()
        task = load_task(paths.sqlite_path, task_id)
        if task is None:
            raise KcError(code="KC_TASK_NOT_FOUND", message=f"Task not found: {task_id}")
        if task.status not in {"awaiting_agent", "awaiting_validation", "awaiting_apply"}:
            raise KcError(
                code="KC_TASK_NOT_WAITING",
                message=f"Task is not awaiting agent input: {task_id}",
                details={"status": task.status},
            )

        blocked_events = {"blocked_missing_source", "blocked_validation_failed"}
        if event != task.expected_event_name and event not in blocked_events:
            raise KcError(
                code="KC_EVENT_INVALID",
                message=f"Expected event {task.expected_event_name}, got {event}",
            )

        payload = parse_input_json(input_data)
        if event.startswith("blocked_"):
            schema = _event_schema(event)
        else:
            schema = task.expected_event_schema
        validate_payload_schema(payload, schema)
        task.events.append({"event": event, "input": payload, "received_at": _now()})

        # Status reached after each event; any other accepted event
        # completes the task.
        next_status = {
            "artifact_created": "awaiting_validation",
            "artifact_validated": "awaiting_apply",
            "artifact_apply_dry_run": "awaiting_apply",
            "artifact_applied": "completed",
            "blocked_missing_source": "blocked",
            "blocked_validation_failed": "blocked",
        }
        task.status = next_status.get(event, "completed")
        task.updated_at = _now()
        _set_expected_event(task)

        task_file = paths.tasks_dir / f"{task.task_id}.json"
        with mutation_transaction(paths, "task.resume", [task_file]) as tx:
            save_task(paths.sqlite_path, task)
            # Same as `start`: make sure the mirror directory exists, so a
            # removed tasks directory does not make the resume fail.
            paths.tasks_dir.mkdir(parents=True, exist_ok=True)
            atomic_write_text(task_file, json_dumps(task.model_dump(mode="json")))
            tx.commit({"task_id": task.task_id, "event": event, "status": task.status})

        emit_success(
            "task.resume", {"task": task.model_dump(mode="json")}, target={"task_id": task_id}
        )

    run("task.resume", _run)
|
kc/config.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
"""kc.toml defaults and parsing."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import tomllib
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from kc.errors import KcError
|
|
10
|
+
|
|
11
|
+
DEFAULT_CONFIG = """schema_version = "kc.config.v1"
|
|
12
|
+
project_id = "kc-project"
|
|
13
|
+
data_dir = "knowledge"
|
|
14
|
+
state_dir = ".kc"
|
|
15
|
+
|
|
16
|
+
[output]
|
|
17
|
+
default_format = "json"
|
|
18
|
+
human_format = "table"
|
|
19
|
+
llm_env_var = "LLM"
|
|
20
|
+
|
|
21
|
+
[source_policy]
|
|
22
|
+
copy_sources = false
|
|
23
|
+
require_fingerprint = true
|
|
24
|
+
require_locator = true
|
|
25
|
+
allow_unregistered_citations = false
|
|
26
|
+
|
|
27
|
+
[citation_policy]
|
|
28
|
+
required_for_material_claims = true
|
|
29
|
+
citation_token_pattern = "kc_v1"
|
|
30
|
+
fail_on_stale_source_fingerprint = true
|
|
31
|
+
|
|
32
|
+
[index]
|
|
33
|
+
fts_enabled = true
|
|
34
|
+
rrf_k = 60
|
|
35
|
+
|
|
36
|
+
[index.semantic]
|
|
37
|
+
provider = "model2vec"
|
|
38
|
+
model = "potion-base-8M"
|
|
39
|
+
dimension = 256
|
|
40
|
+
checksum = "sha256:aef1c5e1fd70060804f5295ec8e9ab3ed62e50e79b208435fb77e15c5bf94bb8"
|
|
41
|
+
purpose = "ranking_only"
|
|
42
|
+
|
|
43
|
+
[mutation]
|
|
44
|
+
default_dry_run = true
|
|
45
|
+
require_yes_for_apply = true
|
|
46
|
+
atomic_writes = true
|
|
47
|
+
create_snapshots = true
|
|
48
|
+
require_idempotency_key_for_apply = false
|
|
49
|
+
update_log = true
|
|
50
|
+
allow_skip_validate_in_llm = false
|
|
51
|
+
|
|
52
|
+
[task]
|
|
53
|
+
enable_wait_exit_code = false
|
|
54
|
+
waiting_exit_code = 40
|
|
55
|
+
"""
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@dataclass(frozen=True)
class KcConfig:
    """Immutable view over the settings of a ``kc.toml`` document.

    The four top-level settings are stored as fields. Every other setting is
    read on demand from ``raw`` through the properties below, each with its
    own default for a missing section or key.
    """

    schema_version: str = "kc.config.v1"
    project_id: str = "kc-project"
    data_dir: str = "knowledge"
    state_dir: str = ".kc"
    # Parsed TOML document; ``None`` is treated like an empty document.
    raw: dict | None = None

    def _section(self, name: str) -> dict:
        """Return the table called ``name`` from ``raw``, or ``{}`` if absent."""
        document = self.raw or {}
        return document.get(name, {})

    @property
    def fail_on_stale_source_fingerprint(self) -> bool:
        """``[citation_policy].fail_on_stale_source_fingerprint``; default ``True``."""
        policy = self._section("citation_policy")
        return bool(policy.get("fail_on_stale_source_fingerprint", True))

    @property
    def update_log(self) -> bool:
        """``[mutation].update_log``; default ``True``."""
        return bool(self._section("mutation").get("update_log", True))

    @property
    def allow_skip_validate_in_llm(self) -> bool:
        """``[mutation].allow_skip_validate_in_llm``; default ``False``."""
        return bool(self._section("mutation").get("allow_skip_validate_in_llm", False))

    @property
    def enable_wait_exit_code(self) -> bool:
        """``[task].enable_wait_exit_code``; default ``False``."""
        return bool(self._section("task").get("enable_wait_exit_code", False))

    @property
    def waiting_exit_code(self) -> int:
        """``[task].waiting_exit_code`` converted with ``int``; default ``40``."""
        return int(self._section("task").get("waiting_exit_code", 40))

    @property
    def rrf_k(self) -> int:
        """``[index].rrf_k`` converted with ``int``; default ``60``."""
        return int(self._section("index").get("rrf_k", 60))
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def load_config(root: Path | None = None, *, required: bool = False) -> KcConfig:
    """Load ``kc.toml`` from ``root`` and return it as a ``KcConfig``.

    Args:
        root: Directory that contains ``kc.toml``. Defaults to the current
            working directory.
        required: When true, a missing ``kc.toml`` is an error. When false,
            a missing file yields a ``KcConfig`` with an empty ``raw``
            document, so every setting takes its default.

    Returns:
        The parsed configuration, with the full TOML document kept in ``raw``.

    Raises:
        KcError: ``KC_CONFIG_NOT_FOUND`` when the file is missing and
            ``required`` is true; ``KC_CONFIG_INVALID`` when the file is not
            valid UTF-8, is not valid TOML, or declares a ``schema_version``
            other than ``kc.config.v1``.
    """
    root = root or Path.cwd()
    path = root / "kc.toml"
    if not path.exists():
        if required:
            raise KcError(
                code="KC_CONFIG_NOT_FOUND",
                message="kc.toml not found. Run kc init --yes first.",
                details={"path": str(path)},
            )
        return KcConfig(raw={})
    try:
        data = tomllib.loads(path.read_text(encoding="utf-8"))
    except (tomllib.TOMLDecodeError, UnicodeDecodeError) as exc:
        # Undecodable bytes are reported as an invalid config as well,
        # instead of escaping as a raw UnicodeDecodeError.
        raise KcError(
            code="KC_CONFIG_INVALID",
            message=f"Invalid kc.toml: {exc}",
            details={"path": str(path)},
        ) from exc
    schema_version = str(data.get("schema_version", "kc.config.v1"))
    if schema_version != "kc.config.v1":
        raise KcError(
            code="KC_CONFIG_INVALID",
            message=f"Unsupported kc.toml schema_version: {schema_version}",
            details={"path": str(path), "schema_version": schema_version, "supported": ["kc.config.v1"]},
        )
    return KcConfig(
        schema_version=schema_version,
        project_id=str(data.get("project_id", "kc-project")),
        data_dir=str(data.get("data_dir", "knowledge")),
        state_dir=str(data.get("state_dir", ".kc")),
        raw=data,
    )
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
---
|
|
2
|
+
library_name: model2vec
|
|
3
|
+
license: mit
|
|
4
|
+
model_name: potion-base-8M
|
|
5
|
+
tags:
|
|
6
|
+
- embeddings
|
|
7
|
+
- static-embeddings
|
|
8
|
+
- sentence-transformers
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
# potion-base-8M Model Card
|
|
12
|
+
|
|
13
|
+
This [Model2Vec](https://github.com/MinishLab/model2vec) model is a distilled version of a Sentence Transformer. It uses static embeddings, allowing text embeddings to be computed orders of magnitude faster on both GPU and CPU. It is designed for applications where computational resources are limited or where real-time performance is critical. Model2Vec models are the smallest, fastest, and most performant static embedders available. The distilled models are up to 50 times smaller and 500 times faster than traditional Sentence Transformers.
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
Install model2vec using pip:
|
|
19
|
+
```
|
|
20
|
+
pip install model2vec
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Usage
|
|
24
|
+
|
|
25
|
+
### Using Model2Vec
|
|
26
|
+
|
|
27
|
+
The [Model2Vec library](https://github.com/MinishLab/model2vec) is the fastest and most lightweight way to run Model2Vec models.
|
|
28
|
+
|
|
29
|
+
Load this model using the `from_pretrained` method:
|
|
30
|
+
```python
|
|
31
|
+
from model2vec import StaticModel
|
|
32
|
+
|
|
33
|
+
# Load a pretrained Model2Vec model
|
|
34
|
+
model = StaticModel.from_pretrained("potion-base-8M")
|
|
35
|
+
|
|
36
|
+
# Compute text embeddings
|
|
37
|
+
embeddings = model.encode(["Example sentence"])
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
### Using Sentence Transformers
|
|
41
|
+
|
|
42
|
+
You can also use the [Sentence Transformers library](https://github.com/UKPLab/sentence-transformers) to load and use the model:
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
from sentence_transformers import SentenceTransformer
|
|
46
|
+
|
|
47
|
+
# Load a pretrained Sentence Transformer model
|
|
48
|
+
model = SentenceTransformer("potion-base-8M")
|
|
49
|
+
|
|
50
|
+
# Compute text embeddings
|
|
51
|
+
embeddings = model.encode(["Example sentence"])
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### Distilling a Model2Vec model
|
|
55
|
+
|
|
56
|
+
You can distill a Model2Vec model from a Sentence Transformer model using the `distill` method. First, install the `distill` extra with `pip install model2vec[distill]`. Then, run the following code:
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from model2vec.distill import distill
|
|
60
|
+
|
|
61
|
+
# Distill a Sentence Transformer model, in this case the BAAI/bge-base-en-v1.5 model
|
|
62
|
+
m2v_model = distill(model_name="BAAI/bge-base-en-v1.5", pca_dims=256)
|
|
63
|
+
|
|
64
|
+
# Save the model
|
|
65
|
+
m2v_model.save_pretrained("m2v_model")
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## How it works
|
|
69
|
+
|
|
70
|
+
Model2Vec creates a small, fast, and powerful model that outperforms other static embedding models by a large margin on all tasks we could find, while being much faster to create than traditional static embedding models such as GloVe. Best of all, you don't need any data to distill a model using Model2Vec.
|
|
71
|
+
|
|
72
|
+
It works by passing a vocabulary through a sentence transformer model, then reducing the dimensionality of the resulting embeddings using PCA, and finally weighting the embeddings using [SIF weighting](https://openreview.net/pdf?id=SyK00v5xx). During inference, we simply take the mean of all token embeddings occurring in a sentence.
|
|
73
|
+
|
|
74
|
+
## Additional Resources
|
|
75
|
+
|
|
76
|
+
- [Model2Vec Repo](https://github.com/MinishLab/model2vec)
|
|
77
|
+
- [Model2Vec Base Models](https://huggingface.co/collections/minishlab/model2vec-base-models-66fd9dd9b7c3b3c0f25ca90e)
|
|
78
|
+
- [Model2Vec Results](https://github.com/MinishLab/model2vec/tree/main/results)
|
|
79
|
+
- [Model2Vec Tutorials](https://github.com/MinishLab/model2vec/tree/main/tutorials)
|
|
80
|
+
- [Website](https://minishlab.github.io/)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
## Library Authors
|
|
84
|
+
|
|
85
|
+
Model2Vec was developed by the [Minish Lab](https://github.com/MinishLab) team consisting of [Stephan Tulkens](https://github.com/stephantul) and [Thomas van Dongen](https://github.com/Pringled).
|
|
86
|
+
|
|
87
|
+
## Citation
|
|
88
|
+
|
|
89
|
+
Please cite the [Model2Vec repository](https://github.com/MinishLab/model2vec) if you use this model in your work.
|
|
90
|
+
```
|
|
91
|
+
@article{minishlab2024model2vec,
|
|
92
|
+
author = {Tulkens, Stephan and {van Dongen}, Thomas},
|
|
93
|
+
title = {Model2Vec: Fast State-of-the-Art Static Embeddings},
|
|
94
|
+
year = {2024},
|
|
95
|
+
url = {https://github.com/MinishLab/model2vec}
|
|
96
|
+
}
|
|
97
|
+
```
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
{
|
|
2
|
+
"model_type": "model2vec",
|
|
3
|
+
"architectures": [
|
|
4
|
+
"StaticModel"
|
|
5
|
+
],
|
|
6
|
+
"tokenizer_name": "baai/bge-base-en-v1.5",
|
|
7
|
+
"apply_pca": 256,
|
|
8
|
+
"apply_zipf": true,
|
|
9
|
+
"hidden_dim": 256,
|
|
10
|
+
"seq_length": 1000000,
|
|
11
|
+
"normalize": true,
|
|
12
|
+
"embedding_dtype": "float32"
|
|
13
|
+
}
|
|
Binary file
|