deeprefine-cli 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deeprefine_cli-0.1.3.dist-info/METADATA +223 -0
- deeprefine_cli-0.1.3.dist-info/RECORD +14 -0
- deeprefine_cli-0.1.3.dist-info/WHEEL +5 -0
- deeprefine_cli-0.1.3.dist-info/entry_points.txt +2 -0
- deeprefine_cli-0.1.3.dist-info/licenses/LICENSE +21 -0
- deeprefine_cli-0.1.3.dist-info/top_level.txt +1 -0
- deeprefine_skill/SKILL.md +52 -0
- deeprefine_skill/__init__.py +3 -0
- deeprefine_skill/adapter_graphify.py +207 -0
- deeprefine_skill/cli.py +192 -0
- deeprefine_skill/history.py +81 -0
- deeprefine_skill/installers.py +47 -0
- deeprefine_skill/paths.py +92 -0
- deeprefine_skill/refine_runner.py +197 -0
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: deeprefine-cli
|
|
3
|
+
Version: 0.1.3
|
|
4
|
+
Summary: CLI and Cursor skill to refine graphify knowledge graphs with DeepRefine
|
|
5
|
+
Author: HKUST-KnowComp
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/HKUST-KnowComp/DeepRefine
|
|
8
|
+
Project-URL: Documentation, https://github.com/HKUST-KnowComp/DeepRefine
|
|
9
|
+
Project-URL: Repository, https://github.com/HKUST-KnowComp/DeepRefine
|
|
10
|
+
Project-URL: Issues, https://github.com/HKUST-KnowComp/DeepRefine/issues
|
|
11
|
+
Keywords: deeprefine,graphify,knowledge-graph,rag,cursor,agent-skill
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Environment :: Console
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: Science/Research
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: openai>=1.0
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
|
|
27
|
+
# DeepRefine-Skill
|
|
28
|
+
|
|
29
|
+
Cursor skill and CLI to refine **[graphify](https://github.com/safishamsi/graphify)** knowledge graphs with **[DeepRefine](https://github.com/HKUST-KnowComp/DeepRefine)** (`graphify-out/graph.json` + session query history).
|
|
30
|
+
|
|
31
|
+
This repository is **standalone**. The DeepRefine model code (`autorefiner`, `atlas_rag`) lives in a separate **DeepRefine** checkout.
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
## Repository layout
|
|
36
|
+
|
|
37
|
+
```
|
|
38
|
+
DeepRefine-Skill/ ← this repo (pip install -e .)
|
|
39
|
+
├── README.md
|
|
40
|
+
├── SKILL.md
|
|
41
|
+
├── pyproject.toml
|
|
42
|
+
├── deeprefine_skill/
|
|
43
|
+
└── scripts/deeprefine.py
|
|
44
|
+
|
|
45
|
+
DeepRefine/ ← separate clone (training + Reafiner)
|
|
46
|
+
├── autorefiner/
|
|
47
|
+
├── AutoSchemaKG/
|
|
48
|
+
└── ...
|
|
49
|
+
|
|
50
|
+
your-kb-project/ ← your data (graphify-out/)
|
|
51
|
+
└── graphify-out/graph.json
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
Recommended clone layout:
|
|
55
|
+
|
|
56
|
+
```text
|
|
57
|
+
www/code/
|
|
58
|
+
├── DeepRefine/
|
|
59
|
+
└── DeepRefine-Skill/ # sibling → auto-detected if DEEPREFINE_REPO unset
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
## How graphify and DeepRefine fit together
|
|
65
|
+
|
|
66
|
+
| Stage | Tool | Input | Output |
|
|
67
|
+
|-------|------|-------|--------|
|
|
68
|
+
| Build | **graphify** | Project files | `graphify-out/graph.json`, report, HTML |
|
|
69
|
+
| Query | **graphify** | Questions | `graphify query "..."` |
|
|
70
|
+
| Refine | **DeepRefine** (this repo) | Graph + **query history** | Updated `graph.json`, logs |
|
|
71
|
+
|
|
72
|
+
DeepRefine does not build the graph; it edits `graph.json` incrementally so later `graphify query` works better.
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## Setup
|
|
77
|
+
|
|
78
|
+
### 1. DeepRefine environment (`atlastune`)
|
|
79
|
+
|
|
80
|
+
Follow [DeepRefine/README](https://github.com/HKUST-KnowComp/DeepRefine) **Environment** to create `atlastune` and install the main repo:
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
conda activate atlastune
|
|
84
|
+
cd /path/to/DeepRefine
|
|
85
|
+
pip install -e .
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### 2. Install this CLI
|
|
89
|
+
|
|
90
|
+
**From PyPI:**
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
conda activate atlastune
|
|
94
|
+
pip install deeprefine-cli
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
**From source (development):**
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
pip install -e /path/to/DeepRefine-Skill
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Verify:
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
deeprefine --help
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
> `deeprefine refine` still requires a local [DeepRefine](https://github.com/HKUST-KnowComp/DeepRefine) checkout (`autorefiner`, `atlas_rag`) and running vLLM services — see below.
|
|
110
|
+
|
|
111
|
+
### 3. Point to DeepRefine (if not cloned as sibling)
|
|
112
|
+
|
|
113
|
+
Only needed when `DeepRefine` is neither at `../DeepRefine` relative to this repo nor found by walking up from your current working directory:
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
export DEEPREFINE_REPO=/path/to/DeepRefine
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
Add to `~/.bashrc` if you use a fixed path.
|
|
120
|
+
|
|
121
|
+
### 4. Start vLLM (before `deeprefine refine`)
|
|
122
|
+
|
|
123
|
+
From the **DeepRefine** repo:
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
conda activate atlastune
|
|
127
|
+
bash /path/to/DeepRefine/scripts/vllm_serve/qwen3-0.6b-emb.sh
|
|
128
|
+
bash /path/to/DeepRefine/scripts/vllm_serve/qwen3-8b-vllm-reafiner.sh
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
| Variable | Default |
|
|
132
|
+
|----------|---------|
|
|
133
|
+
| `DEEPREFINE_LLM_URL` | `http://127.0.0.1:8134/v1` |
|
|
134
|
+
| `DEEPREFINE_EMBED_URL` | `http://127.0.0.1:8128/v1` |
|
|
135
|
+
| `DEEPREFINE_MODEL` | `HaoyuHuang2/DeepRefine-v1-8B` |
|
|
136
|
+
| `DEEPREFINE_EMBED_MODEL` | `Qwen/Qwen3-Embedding-0.6B` |
|
|
137
|
+
|
|
138
|
+
### 5. Install Cursor skill (KB project root)
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
cd /path/to/your-kb-project # must contain or will contain graphify-out/
|
|
142
|
+
deeprefine cursor install
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
Same pattern as `graphify cursor install`. Use `--user` for all projects.
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
## Workflow with graphify
|
|
150
|
+
|
|
151
|
+
**One-time**
|
|
152
|
+
|
|
153
|
+
```bash
|
|
154
|
+
pip install graphifyy
|
|
155
|
+
graphify cursor install # in KB project
|
|
156
|
+
pip install -e /path/to/DeepRefine-Skill # in atlastune
|
|
157
|
+
deeprefine cursor install # in KB project
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
**Per session** (KB project root)
|
|
161
|
+
|
|
162
|
+
1. `/graphify .` or `graphify .` → `graphify-out/graph.json`
|
|
163
|
+
2. `graphify query "..."` or `/graphify query "..."`
|
|
164
|
+
3. `deeprefine history add --query "..."`
|
|
165
|
+
4. `/deeprefine` or `deeprefine refine`
|
|
166
|
+
5. Optional: `graphify query "..."` again to verify
|
|
167
|
+
|
|
168
|
+
```
|
|
169
|
+
project files ──graphify──► graph.json
|
|
170
|
+
│
|
|
171
|
+
graphify query
|
|
172
|
+
│
|
|
173
|
+
deeprefine history add
|
|
174
|
+
│
|
|
175
|
+
deeprefine refine
|
|
176
|
+
│
|
|
177
|
+
graphify query
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
---
|
|
181
|
+
|
|
182
|
+
## Command cheat sheet (KB project root)
|
|
183
|
+
|
|
184
|
+
```bash
|
|
185
|
+
deeprefine history add --query "..."
|
|
186
|
+
deeprefine history list --pending
|
|
187
|
+
deeprefine refine
|
|
188
|
+
deeprefine refine --query "..."
|
|
189
|
+
deeprefine index --rebuild
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
---
|
|
193
|
+
|
|
194
|
+
## Where to run commands
|
|
195
|
+
|
|
196
|
+
| Command | Directory |
|
|
197
|
+
|---------|-----------|
|
|
198
|
+
| `pip install -e .../DeepRefine-Skill` | Any |
|
|
199
|
+
| `pip install -e .../DeepRefine` | DeepRefine repo |
|
|
200
|
+
| `graphify` / `deeprefine cursor install` | **KB project root** |
|
|
201
|
+
| `deeprefine refine` | **KB project root** |
|
|
202
|
+
| vLLM serve scripts | **DeepRefine repo** |
|
|
203
|
+
|
|
204
|
+
Agent instructions: [SKILL.md](./SKILL.md).
|
|
205
|
+
|
|
206
|
+
---
|
|
207
|
+
|
|
208
|
+
## Publish to PyPI (maintainers)
|
|
209
|
+
|
|
210
|
+
```bash
|
|
211
|
+
cd /path/to/DeepRefine-Skill
|
|
212
|
+
python -m pip install --upgrade build twine
|
|
213
|
+
python -m build
|
|
214
|
+
twine check dist/*
|
|
215
|
+
twine upload dist/* # needs PyPI token: TWINE_USERNAME=__token__ TWINE_PASSWORD=pypi-...
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
Optionally, upload to TestPyPI and test the install there before publishing to PyPI:
|
|
219
|
+
|
|
220
|
+
```bash
|
|
221
|
+
twine upload --repository testpypi dist/*
|
|
222
|
+
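pip install -i https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ deeprefine-cli  # extra index so dependencies (openai) resolve from PyPI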
|
|
223
|
+
```
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
deeprefine_cli-0.1.3.dist-info/licenses/LICENSE,sha256=K5NjmGZ2iGTxhVTWH1wiFs9uUL6S5RMyj8yoRQlTIbc,1071
|
|
2
|
+
deeprefine_skill/SKILL.md,sha256=BiMmg-xIq2GLkqywmJBJ6-LION5CdClqQjqA6o9vGBQ,1489
|
|
3
|
+
deeprefine_skill/__init__.py,sha256=XIDBdRneslC9k1-ybuVHnpS07dgsTBj8VZAiZ_leAbs,94
|
|
4
|
+
deeprefine_skill/adapter_graphify.py,sha256=I9yudnpUdH_BBs4L090SBYTSY49fnfh0opghryd9-qo,6838
|
|
5
|
+
deeprefine_skill/cli.py,sha256=Ulfq-oyWQoPyiJ2qPr-oZgXlhX9KQs5ByvTfBxkTdlc,6561
|
|
6
|
+
deeprefine_skill/history.py,sha256=ufMUanwfEcJIf22ZkdrAlXj24IW9EIJFZfF2Z0MBBHc,2321
|
|
7
|
+
deeprefine_skill/installers.py,sha256=6PGoFg0z7kdxXEevccBcgrV44CPZ5d1DMNm2BmMi9a0,1455
|
|
8
|
+
deeprefine_skill/paths.py,sha256=jF3QUp53wcwaVBY_4K9Ly1QE622okgrLY80LsGXD5JM,3019
|
|
9
|
+
deeprefine_skill/refine_runner.py,sha256=Pz2z2jw9FSwXPdgtbHkWqUaE6kLeA6sUTiFEvPEAPJg,6435
|
|
10
|
+
deeprefine_cli-0.1.3.dist-info/METADATA,sha256=s_St2jZqmIM2awlDO7imk9I9hWUJ96tMWXldts2icpw,6194
|
|
11
|
+
deeprefine_cli-0.1.3.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
12
|
+
deeprefine_cli-0.1.3.dist-info/entry_points.txt,sha256=zLtVs6vRaVIzaKUGppMuE9cfJXOjksBAbYmRmhKllFE,57
|
|
13
|
+
deeprefine_cli-0.1.3.dist-info/top_level.txt,sha256=owOn5r-Gn0sQbil4xRMfZKOXybukl_jxjPjUu3_IzeM,17
|
|
14
|
+
deeprefine_cli-0.1.3.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 HKUST-KnowComp
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
deeprefine_skill
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: deeprefine
|
|
3
|
+
description: >-
|
|
4
|
+
Refines a graphify knowledge graph (graphify-out/graph.json) using DeepRefine
|
|
5
|
+
Reafiner based on session query history. Use when the user runs /deeprefine,
|
|
6
|
+
asks to improve the graphify KB after Q&A, or wants to patch graph.json from
|
|
7
|
+
failed retrieval queries.
|
|
8
|
+
disable-model-invocation: true
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
# DeepRefine (graphify)
|
|
12
|
+
|
|
13
|
+
Refine a **[graphify](https://github.com/safishamsi/graphify)** `graph.json` with the DeepRefine agent loop.
|
|
14
|
+
|
|
15
|
+
## Setup
|
|
16
|
+
|
|
17
|
+
**DeepRefine-Skill** (CLI + this file) is separate from the **DeepRefine** model repo.
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
# 1) DeepRefine env (atlastune) + main package
|
|
21
|
+
conda activate atlastune
|
|
22
|
+
cd /path/to/DeepRefine && pip install -e .
|
|
23
|
+
|
|
24
|
+
# 2) This skill CLI
|
|
25
|
+
pip install -e /path/to/DeepRefine-Skill
|
|
26
|
+
|
|
27
|
+
# 3) Optional if DeepRefine is not ../DeepRefine
|
|
28
|
+
export DEEPREFINE_REPO=/path/to/DeepRefine
|
|
29
|
+
|
|
30
|
+
# 4) Cursor skill in KB project root
|
|
31
|
+
cd /path/to/your-kb-project
|
|
32
|
+
deeprefine cursor install
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
Start vLLM from DeepRefine: embedding `8128`, refine model `8134` (see DeepRefine `scripts/vllm_serve/`).
|
|
36
|
+
|
|
37
|
+
## `/deeprefine`
|
|
38
|
+
|
|
39
|
+
From the **KB project root** (with `graphify-out/graph.json`):
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
deeprefine history add --query "..." # after graph Q&A
|
|
43
|
+
deeprefine refine # all pending queries
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Do not hand-edit `graph.json` for refinement.
|
|
47
|
+
|
|
48
|
+
## Paths
|
|
49
|
+
|
|
50
|
+
- History: `graphify-out/.deeprefine/history.jsonl`
|
|
51
|
+
- Log: `graphify-out/.deeprefine/refinement_results_*.jsonl`
|
|
52
|
+
- Backup: `graphify-out/.deeprefine/graph.json.bak`
|
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import copy
|
|
4
|
+
import json
|
|
5
|
+
import pickle
|
|
6
|
+
import shutil
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import networkx as nx
|
|
11
|
+
from networkx.readwrite import json_graph
|
|
12
|
+
|
|
13
|
+
from atlas_rag.vectorstore.create_graph_index import (
|
|
14
|
+
build_faiss_index_flat,
|
|
15
|
+
compute_graph_embeddings,
|
|
16
|
+
compute_text_embeddings,
|
|
17
|
+
)
|
|
18
|
+
from atlas_rag.vectorstore.embedding_model import BaseEmbeddingModel
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def load_graphify_json(path: Path) -> tuple[dict[str, Any], nx.DiGraph]:
    """Load a graphify node-link JSON file as a directed networkx graph.

    Args:
        path: Location of the graphify ``graph.json`` file (read as UTF-8).

    Returns:
        ``(raw, kg)``: ``raw`` is the parsed JSON document and ``kg`` the graph
        built from it. Every node of ``kg`` gets ``id`` (the node's label,
        falling back to its id), ``type`` (``"entity"`` when missing or falsy),
        ``file_id`` (the node's ``source_file``) and, when present,
        ``community``. Every edge gets ``relation``, ``type`` and ``confidence``.
    """
    raw = json.loads(path.read_text(encoding="utf-8"))
    links_key = "links" if "links" in raw else "edges"
    # node_link_graph returns Graph / DiGraph depending on the document's
    # attributes; force a DiGraph.
    base = json_graph.node_link_graph(raw, edges=links_key, directed=True)
    kg = base if isinstance(base, nx.DiGraph) else nx.DiGraph(base)

    # Index the original node records by id. Only a missing id is skipped, so
    # nodes with a falsy id such as 0 keep their label/source_file/community.
    id_to_meta = {
        n["id"]: n for n in raw.get("nodes", []) if n.get("id") is not None
    }

    for nid in list(kg.nodes):
        meta = id_to_meta.get(nid, {})
        label = meta.get("label") or meta.get("id") or str(nid)
        attrs = kg.nodes[nid]
        # The "id" attribute holds the human-readable label, not the node key.
        attrs["id"] = label
        attrs["type"] = attrs.get("type") or "entity"
        attrs["file_id"] = meta.get("source_file")
        if "community" in meta:
            attrs["community"] = meta["community"]

    for _u, _v, data in kg.edges(data=True):
        if "relation" not in data:
            # Fall back to the edge's "label", then to a generic relation.
            data["relation"] = data.pop("label", "related_to")
        data.setdefault("type", "Relation")
        data.setdefault("confidence", "INFERRED")

    return raw, kg
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _entity_nodes(kg: nx.DiGraph) -> list[str]:
    """Return the node keys of ``kg`` whose ``type`` attribute is not ``"passage"``."""
    entities: list[str] = []
    for node in kg.nodes:
        if kg.nodes[node].get("type") == "passage":
            continue
        entities.append(node)
    return entities
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def build_reafiner_data(
    kg: nx.DiGraph,
    sentence_encoder: BaseEmbeddingModel,
    *,
    normalize_embeddings: bool = False,
    batch_size: int = 64,
) -> dict[str, Any]:
    """Build the graph, embeddings and FAISS indexes bundle for ``kg``.

    Non-passage nodes and the edges between them are rendered as text, embedded
    with ``sentence_encoder`` and indexed. A single placeholder passage node
    (``"__deeprefine_passage__"``) is added to ``kg`` if absent, so ``kg`` is
    modified in place.

    Args:
        kg: Graph whose nodes carry ``id`` and whose edges carry ``relation``.
        sentence_encoder: Encoder forwarded to the embedding helpers.
        normalize_embeddings: Forwarded to the embedding helpers.
        batch_size: Batch size for node/edge embeddings (the passage text uses 8).

    Returns:
        Dict with the graph (``"KG"``), the node/edge/text FAISS indexes and
        embeddings, ``node_list``, ``edge_list``, ``text_dict`` and identity
        ``*_faiss_id_to_list_idx`` mappings.
    """
    entity_ids = _entity_nodes(kg)
    entity_set = set(entity_ids)
    entity_edges = [
        (src, dst)
        for src, dst in kg.edges
        if src in entity_set and dst in entity_set
    ]

    # Text rendered for embedding: node label, and "<src> <relation> <dst>".
    node_texts = [kg.nodes[node]["id"] for node in entity_ids]
    edge_texts = []
    for src, dst in entity_edges:
        edge_texts.append(
            f"{kg.nodes[src]['id']} {kg.edges[src, dst]['relation']} {kg.nodes[dst]['id']}"
        )

    node_embeddings, edge_embeddings = compute_graph_embeddings(
        node_texts,
        edge_texts,
        sentence_encoder,
        batch_size=batch_size,
        normalize_embeddings=normalize_embeddings,
    )
    node_index = build_faiss_index_flat(node_embeddings)
    edge_index = build_faiss_index_flat(edge_embeddings)

    # One placeholder passage stands in for the whole graph.
    passage_id = "__deeprefine_passage__"
    if passage_id not in kg.nodes:
        kg.add_node(
            passage_id,
            id="graphify knowledge graph",
            type="passage",
            file_id=None,
        )
    text_dict = {passage_id: "graphify knowledge graph"}
    text_embeddings = compute_text_embeddings(
        list(text_dict.values()),
        sentence_encoder,
        batch_size=8,
        normalize_embeddings=normalize_embeddings,
    )
    text_index = build_faiss_index_flat(text_embeddings)

    bundle: dict[str, Any] = {
        "KG": kg,
        "node_faiss_index": node_index,
        "edge_faiss_index": edge_index,
        "text_faiss_index": text_index,
        "node_embeddings": node_embeddings,
        "edge_embeddings": edge_embeddings,
        "text_embeddings": text_embeddings,
        "node_list": entity_ids,
        "edge_list": entity_edges,
        "text_dict": text_dict,
    }
    # FAISS row i corresponds to list position i.
    bundle["edge_faiss_id_to_list_idx"] = {
        idx: idx for idx in range(len(entity_edges))
    }
    bundle["node_faiss_id_to_list_idx"] = {
        idx: idx for idx in range(len(entity_ids))
    }
    bundle["text_faiss_id_to_list_idx"] = {0: 0}
    return bundle
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def load_or_build_data(
    graph_path: Path,
    cache_pkl: Path,
    sentence_encoder: BaseEmbeddingModel,
    *,
    rebuild: bool = False,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """Return the graphify document and refiner data, using a pickle cache.

    The cache is used when ``rebuild`` is false, the cache file exists and its
    mtime is not older than the graph file's. A cache that cannot be unpickled
    or lacks the expected keys is ignored and rebuilt rather than aborting.

    Args:
        graph_path: Path to the graphify ``graph.json``; must exist.
        cache_pkl: Path of the pickle cache (created or overwritten on rebuild).
        sentence_encoder: Encoder passed to ``build_reafiner_data`` on rebuild.
        rebuild: Force rebuilding even when the cache looks fresh.

    Returns:
        ``(raw, data)``: the raw graphify document and the refiner data dict.
    """
    graph_mtime = graph_path.stat().st_mtime
    cache_is_fresh = (
        not rebuild
        and cache_pkl.is_file()
        and cache_pkl.stat().st_mtime >= graph_mtime
    )
    if cache_is_fresh:
        try:
            # SECURITY: unpickling can execute arbitrary code. The cache must
            # only ever be a file this tool wrote itself.
            with cache_pkl.open("rb") as f:
                bundle = pickle.load(f)
            return bundle["graphify_raw"], bundle["reafiner_data"]
        except (
            pickle.UnpicklingError,
            EOFError,
            AttributeError,
            ImportError,
            IndexError,
            KeyError,
            TypeError,
        ):
            # Corrupt, truncated or incompatible cache: rebuild it below.
            pass

    raw, kg = load_graphify_json(graph_path)
    data = build_reafiner_data(kg, sentence_encoder)
    cache_pkl.parent.mkdir(parents=True, exist_ok=True)
    with cache_pkl.open("wb") as f:
        pickle.dump({"graphify_raw": raw, "reafiner_data": data}, f)
    return raw, data
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def sync_kg_to_graphify(raw: dict[str, Any], kg: nx.DiGraph) -> dict[str, Any]:
    """Merge refined nx graph back into graphify node-link JSON.

    Args:
        raw: Original graphify document; it is deep-copied, not modified.
        kg: Refined graph. Nodes and edges touching a ``"passage"`` node are
            left out of the result.

    Returns:
        A copy of ``raw`` whose ``nodes`` and links (``"links"`` if that key
        exists in ``raw``, otherwise ``"edges"``) are regenerated from ``kg``.
    """
    merged = copy.deepcopy(raw)
    links_key = "edges" if "links" not in merged else "links"
    previous = {node["id"]: node for node in merged.get("nodes", [])}

    def _is_passage(node_id: Any) -> bool:
        return kg.nodes[node_id].get("type") == "passage"

    nodes_out: list[dict[str, Any]] = []
    for node_id in sorted(kg.nodes, key=str):
        if _is_passage(node_id):
            continue
        # Start from the original record so unrelated fields survive.
        record = copy.deepcopy(previous.get(node_id, {}))
        record["id"] = node_id
        record["label"] = kg.nodes[node_id].get("id", node_id)
        if kg.nodes[node_id].get("file_id"):
            record["source_file"] = kg.nodes[node_id]["file_id"]
        if "community" in kg.nodes[node_id]:
            record["community"] = kg.nodes[node_id]["community"]
        nodes_out.append(record)

    # NOTE(review): links are rebuilt with these five fields only; any other
    # attribute on the edge is not written back. Confirm this is intended.
    links_out: list[dict[str, Any]] = []
    for src, dst, attrs in kg.edges(data=True):
        if _is_passage(src) or _is_passage(dst):
            continue
        confidence = attrs.get("confidence", "INFERRED")
        if confidence == "EXTRACTED":
            score = 1.0
        elif confidence == "INFERRED":
            score = 0.7
        else:
            score = 0.4
        links_out.append(
            {
                "source": src,
                "target": dst,
                "relation": attrs.get("relation", "related_to"),
                "confidence": confidence,
                "confidence_score": score,
            }
        )

    merged["nodes"] = nodes_out
    merged[links_key] = links_out
    return merged
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def save_graphify_json(
    path: Path,
    graph_data: dict[str, Any],
    *,
    backup_path: Path | None = None,
) -> None:
    """Write ``graph_data`` to ``path`` as indented UTF-8 JSON.

    The document is written to a sibling temporary file and then renamed over
    ``path``, so an interrupted write cannot leave a truncated graph file.

    Args:
        path: Destination graph file.
        graph_data: JSON-serializable graphify document.
        backup_path: When given and ``path`` already exists, the current file
            is copied there (parent directories are created) before writing.
    """
    # Serialize first so an unserializable graph fails before any file changes.
    payload = json.dumps(graph_data, indent=2, ensure_ascii=False)

    if backup_path and path.is_file():
        backup_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(path, backup_path)

    tmp_path = path.with_name(path.name + ".tmp")
    try:
        tmp_path.write_text(payload, encoding="utf-8")
        # Same directory as ``path``, so the rename stays on one filesystem.
        tmp_path.replace(path)
    finally:
        # Only present if the write or rename failed.
        tmp_path.unlink(missing_ok=True)
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def save_bundle(cache_pkl: Path, raw: dict[str, Any], data: dict[str, Any]) -> None:
    """Pickle ``raw`` and ``data`` to ``cache_pkl``, creating parent directories.

    The pickled object is a dict with keys ``"graphify_raw"`` and
    ``"reafiner_data"``.
    """
    bundle = {"graphify_raw": raw, "reafiner_data": data}
    cache_pkl.parent.mkdir(parents=True, exist_ok=True)
    with cache_pkl.open("wb") as handle:
        pickle.dump(bundle, handle)
|
deeprefine_skill/cli.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
"""DeepRefine CLI: `deeprefine cursor install` (graphify-style)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from deeprefine_skill.history import append_history, iter_history, pending_queries
|
|
9
|
+
from deeprefine_skill.installers import install_cursor_skill, uninstall_cursor_skill
|
|
10
|
+
from deeprefine_skill.paths import (
|
|
11
|
+
env_defaults,
|
|
12
|
+
find_deeprefine_repo,
|
|
13
|
+
find_project_root,
|
|
14
|
+
graphify_paths,
|
|
15
|
+
setup_import_paths,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _setup_repo_imports() -> None:
    """Locate the DeepRefine checkout and hand it to ``setup_import_paths``."""
    repo = find_deeprefine_repo()
    setup_import_paths(repo)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def cmd_cursor_install(args: argparse.Namespace) -> int:
    """Install the Cursor skill at the scope given by ``args.project``.

    Prints the install destination, plus a usage hint for project installs.

    Returns:
        Always ``0``.
    """
    project_scope = args.project
    dest = install_cursor_skill(project=project_scope)
    scope = "user" if not project_scope else "project"
    print(f"Installed DeepRefine Cursor skill ({scope}) → {dest}")
    if not project_scope:
        return 0
    print("Open this folder in Cursor, then use /deeprefine in chat.")
    return 0
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def cmd_cursor_uninstall(args: argparse.Namespace) -> int:
    """Remove the Cursor skill at the scope given by ``args.project``.

    Prints whether a skill was removed or none was installed at that scope.

    Returns:
        Always ``0``.
    """
    if not uninstall_cursor_skill(project=args.project):
        print("Skill not installed at the selected scope.")
        return 0
    scope = "user" if not args.project else "project"
    print(f"Removed DeepRefine Cursor skill ({scope}).")
    return 0
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def cmd_install(args: argparse.Namespace) -> int:
    """Alias for `deeprefine cursor install` (graphify-compatible naming).

    Returns:
        The exit code of ``cmd_cursor_install``.
    """
    exit_code = cmd_cursor_install(args)
    return exit_code
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def cmd_history_add(args: argparse.Namespace) -> int:
    """Append ``args.query`` (with ``args.source``) to the project's history file.

    The entry is recorded as not yet refined; its id and the history path are
    printed.

    Returns:
        Always ``0``.
    """
    history_path = graphify_paths(find_project_root())["history"]
    entry = append_history(
        history_path,
        args.query,
        source=args.source,
        refined=False,
    )
    print(f"Recorded: {entry['id']} → {history_path}")
    return 0
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def cmd_history_list(args: argparse.Namespace) -> int:
    """Print history entries, or only pending ones when ``args.pending`` is set.

    Each row is printed as ``[refined|pending] <id>: <query>``. When there is
    nothing to list, an explicit message is printed instead of no output.

    Returns:
        Always ``0``.
    """
    project = find_project_root()
    paths = graphify_paths(project)
    if args.pending:
        rows = pending_queries(paths["history"])
    else:
        rows = list(iter_history(paths["history"]))

    if not rows:
        # Report the empty case in both modes so the command is never silent.
        print("No pending queries." if args.pending else "No history entries.")
        return 0

    for row in rows:
        flag = "refined" if row.get("refined") else "pending"
        print(f"[{flag}] {row.get('id', '?')}: {row.get('query', '')}")
    return 0
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def cmd_index(args: argparse.Namespace) -> int:
    """Rebuild the index cache for the current project's graph and print its path.

    The rebuild is unconditional: ``rebuild=True`` is always passed and ``args``
    is not consulted.

    Returns:
        Always ``0``.
    """
    _setup_repo_imports()
    # Imported after the DeepRefine paths are set up.
    from deeprefine_skill.adapter_graphify import load_or_build_data
    from deeprefine_skill.refine_runner import make_clients

    paths = graphify_paths(find_project_root())
    llm, encoder = make_clients(env_defaults())
    del llm  # only the encoder is needed for indexing
    graph_json = paths["graph_json"]
    cache_pkl = paths["reafiner_pkl"]
    load_or_build_data(graph_json, cache_pkl, encoder, rebuild=True)
    print(f"Index cache: {paths['reafiner_pkl']}")
    return 0
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def cmd_refine(args: argparse.Namespace) -> int:
    """Run refinement through ``refine_from_history`` and print a summary.

    Uses ``args.project_root`` (when set) as the start for project discovery,
    and forwards ``args.query`` and ``args.rebuild_index``.

    Returns:
        Always ``0``.
    """
    _setup_repo_imports()
    # Imported after the DeepRefine paths are set up.
    from deeprefine_skill.refine_runner import refine_from_history

    start = Path(args.project_root) if args.project_root else None
    paths = graphify_paths(find_project_root(start))
    cfg = env_defaults()
    summary = refine_from_history(
        paths,
        cfg,
        query=args.query,
        rebuild_index=args.rebuild_index,
    )

    print("\n--- DeepRefine summary ---")
    print(f"Queries processed: {summary['queries_processed']}")
    print(f"Graph: {summary['graph_path']} ({summary['nodes']} nodes, {summary['edges']} edges)")
    print(f"Log: {summary['log_path']}")
    return 0
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _add_project_flag(parser: argparse.ArgumentParser) -> None:
    """Add mutually exclusive ``--project`` / ``--user`` scope switches.

    ``--project`` defaults to ``None`` (not ``False``) so callers can tell
    "not given" apart from an explicit choice.
    """
    scope = parser.add_mutually_exclusive_group()
    project_help = (
        "Install to .cursor/skills in the current directory (default for cursor install)"
    )
    scope.add_argument("--project", action="store_true", default=None, help=project_help)
    scope.add_argument(
        "--user", action="store_true", help="Install to ~/.cursor/skills (all projects)"
    )
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _resolve_project(args: argparse.Namespace, *, default_project: bool) -> None:
    """Collapse the ``--user`` / ``--project`` switches into ``args.project``.

    ``--user`` wins and yields ``False``; an explicit ``--project`` yields
    ``True``; otherwise ``default_project`` is used. ``args`` is updated in place.
    """
    if getattr(args, "user", False):
        chosen = False
    elif getattr(args, "project", None) is True:
        chosen = True
    else:
        chosen = default_project
    args.project = chosen
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def main(argv: list[str] | None = None) -> int:
    """Parse the command line and dispatch to the selected sub-command.

    Args:
        argv: Argument list passed to ``parse_args``; ``None`` lets argparse
            read the process arguments.

    Returns:
        The integer returned by the selected ``cmd_*`` handler.
    """
    root = argparse.ArgumentParser(
        prog="deeprefine",
        description="DeepRefine: refine graphify-out/graph.json using query history",
    )
    commands = root.add_subparsers(dest="command", required=True)

    # deeprefine cursor install | uninstall
    cursor = commands.add_parser("cursor", help="Cursor IDE integration")
    cursor_commands = cursor.add_subparsers(dest="cursor_cmd", required=True)
    for name, help_text, handler in (
        ("install", "Install /deeprefine skill for Cursor", cmd_cursor_install),
        ("uninstall", "Remove Cursor skill", cmd_cursor_uninstall),
    ):
        scoped = cursor_commands.add_parser(name, help=help_text)
        _add_project_flag(scoped)
        scoped.set_defaults(func=handler, _default_project=True)

    # deeprefine install (alias)
    install = commands.add_parser(
        "install",
        help="Install Cursor skill (alias: deeprefine cursor install)",
    )
    _add_project_flag(install)
    install.set_defaults(func=cmd_install, _default_project=True)

    # deeprefine history add | list
    history = commands.add_parser("history", help="Manage query history")
    history_commands = history.add_subparsers(dest="history_cmd", required=True)

    history_add = history_commands.add_parser("add", help="Append a query to history")
    history_add.add_argument("--query", required=True)
    history_add.add_argument("--source", default="user")
    history_add.set_defaults(func=cmd_history_add)

    history_list = history_commands.add_parser("list", help="List history entries")
    history_list.add_argument("--pending", action="store_true")
    history_list.set_defaults(func=cmd_history_list)

    # deeprefine index
    index = commands.add_parser("index", help="Rebuild FAISS cache from graph.json")
    index.add_argument("--rebuild", action="store_true", default=True)
    index.set_defaults(func=cmd_index)

    # deeprefine refine
    refine = commands.add_parser(
        "refine", help="Run refinement on pending or given query"
    )
    refine.add_argument("--query", default=None, help="Single query (also recorded)")
    refine.add_argument("--project-root", default=None)
    refine.add_argument("--rebuild-index", action="store_true")
    refine.set_defaults(func=cmd_refine)

    args = root.parse_args(argv)
    # Only the install/uninstall parsers set _default_project.
    if hasattr(args, "_default_project"):
        _resolve_project(args, default_project=args._default_project)
    return args.func(args)
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
if __name__ == "__main__":
|
|
192
|
+
raise SystemExit(main())
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import json
|
|
5
|
+
import time
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Iterator
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def query_id(query: str, entry_id: str | None = None) -> str:
    """Stable id for a history row (matches append_history id field).

    Returns ``entry_id`` when it is truthy; otherwise the first 16 hex digits of
    the SHA-256 of the stripped, UTF-8 encoded ``query``.
    """
    if not entry_id:
        digest = hashlib.sha256(query.strip().encode("utf-8"))
        return digest.hexdigest()[:16]
    return entry_id
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _line_id(query: str) -> str:
    """Return the hash-based id of ``query`` (``query_id`` with no explicit id)."""
    return query_id(query, None)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def append_history(
    path: Path,
    query: str,
    *,
    source: str = "user",
    refined: bool = False,
) -> dict[str, Any]:
    """Append one query record to the JSONL history file and return it.

    Parent directories of ``path`` are created as needed. The record holds
    ``id``, ``ts`` (UTC timestamp), the stripped ``query``, ``source`` and
    ``refined``.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    entry: dict[str, Any] = {}
    entry["id"] = _line_id(query)
    entry["ts"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    entry["query"] = query.strip()
    entry["source"] = source
    entry["refined"] = refined

    serialized = json.dumps(entry, ensure_ascii=False)
    with path.open("a", encoding="utf-8") as handle:
        handle.write(serialized + "\n")
    return entry
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def iter_history(path: Path) -> Iterator[dict[str, Any]]:
    """Yield the parsed JSON value of every non-blank line in ``path``.

    Yields nothing when ``path`` is not an existing file.
    """
    if path.is_file():
        with path.open(encoding="utf-8") as handle:
            for raw_line in handle:
                text = raw_line.strip()
                if text:
                    yield json.loads(text)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def pending_queries(path: Path) -> list[dict[str, Any]]:
    """Return the history rows that still need refinement, de-duplicated by id.

    Rows are read in file order through ``iter_history``. A row is skipped
    when it is not a JSON object, when its ``query`` is missing, not a string
    or blank, or when its ``refined`` field is exactly ``True``. Among pending
    rows sharing an id (the stored ``id``, or the id derived from the query
    text when none is stored) only the first one is kept.
    """
    seen: set[str] = set()
    pending: list[dict[str, Any]] = []
    for row in iter_history(path):
        if not isinstance(row, dict):
            # A hand-edited line may hold a bare list/number; it has no query.
            continue
        query = row.get("query")
        # ``"query": null`` (or any non-string) bypasses a ``.get`` default,
        # so check the type instead of calling ``.strip()`` blindly.
        if not isinstance(query, str):
            continue
        query = query.strip()
        if not query or row.get("refined") is True:
            continue
        qid = row.get("id") or _line_id(query)
        if qid in seen:
            continue
        seen.add(qid)
        pending.append(row)
    return pending
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def mark_refined(path: Path, query_ids: set[str]) -> None:
    """Flag every history row whose id is in ``query_ids`` as refined.

    Does nothing when ``path`` is not an existing file or ``query_ids`` is
    empty. A row's id is its stored ``id`` or, when that is missing, the id
    derived from its query text. Matching rows whose ``refined`` field is not
    already exactly ``True`` get ``refined = True`` and a UTC ``refined_ts``.
    The file is rewritten only if at least one row changed.
    """
    if not path.is_file() or not query_ids:
        return
    rows: list[dict[str, Any]] = list(iter_history(path))
    stamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    changed = False
    for row in rows:
        if not isinstance(row, dict):
            # Non-object lines cannot be matched; they are written back as-is.
            continue
        query = row.get("query")
        # A null/non-string query must not crash the id fallback.
        qid = row.get("id") or _line_id(query if isinstance(query, str) else "")
        # ``is not True`` mirrors the strict ``is True`` test used when
        # collecting pending rows, so a row holding some other truthy value
        # is normalised here instead of being re-queued on every run.
        if qid in query_ids and row.get("refined") is not True:
            row["refined"] = True
            row["refined_ts"] = stamp
            changed = True
    if not changed:
        return

    # Write to a sibling temp file and swap it in, so an interruption during
    # the rewrite cannot leave a truncated history behind.
    tmp_path = path.with_name(f"{path.name}.tmp")
    try:
        with tmp_path.open("w", encoding="utf-8") as f:
            f.writelines(
                json.dumps(row, ensure_ascii=False) + "\n" for row in rows
            )
        tmp_path.replace(path)
    except BaseException:
        # Best-effort cleanup of the partial temp file; the error propagates.
        tmp_path.unlink(missing_ok=True)
        raise
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import shutil
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
_SKILL_MD_NAME = "SKILL.md"
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def skill_md_path() -> Path:
    """Locate the SKILL.md file shipped with this package.

    The directory containing this module is checked first, then its parent
    directory. Raises ``FileNotFoundError`` when neither holds the file.
    """
    module_file = Path(__file__).resolve()
    for depth in (0, 1):
        candidate = module_file.parents[depth] / _SKILL_MD_NAME
        if candidate.is_file():
            return candidate
    raise FileNotFoundError(
        f"Missing {_SKILL_MD_NAME} (expected next to deeprefine_skill/ or repo root)."
    )
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def install_cursor_skill(*, project: bool) -> Path:
    """Copy SKILL.md into a Cursor skills directory and return the new path.

    With ``project`` true the target lives under the current working
    directory, otherwise under the user's home directory; in both cases the
    file goes to ``.cursor/skills/deeprefine/``, which is created if missing.
    """
    source_file = skill_md_path()
    base_dir = Path.cwd() if project else Path.home()
    target = base_dir / ".cursor" / "skills" / "deeprefine" / _SKILL_MD_NAME
    target.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(source_file, target)
    return target
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def uninstall_cursor_skill(*, project: bool) -> bool:
    """Delete the installed SKILL.md and report whether it existed.

    Looks under the current working directory when ``project`` is true and
    under the home directory otherwise. After removing the file, the
    ``deeprefine`` directory and then its parent are removed if ``rmdir``
    succeeds; failures there are ignored.
    """
    base_dir = Path.cwd() if project else Path.home()
    skill_dir = base_dir / ".cursor" / "skills" / "deeprefine"
    target = skill_dir / _SKILL_MD_NAME
    if not target.is_file():
        return False

    target.unlink()
    for directory in (skill_dir, skill_dir.parent):
        try:
            directory.rmdir()
        except OSError:
            # Best-effort cleanup: leave the directory if it cannot be removed.
            pass
    return True
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def find_skill_root() -> Path:
    """Return the parent of the directory that contains this module.

    In a source checkout this is the DeepRefine-Skill repository root.
    """
    module_file = Path(__file__).resolve()
    return module_file.parents[1]
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def find_deeprefine_repo(start: Path | None = None) -> Path:
    """
    Locate the DeepRefine source checkout (autorefiner/ + AutoSchemaKG/).

    Candidates are tried in this order:
    1. The DEEPREFINE_REPO environment variable; when it is set to a
       non-blank value it must point at a valid checkout, otherwise
       FileNotFoundError is raised without trying the other options.
    2. ``start`` (default: the current directory) and each of its ancestors.
    3. A ``DeepRefine`` directory next to the skill root.

    Raises FileNotFoundError when no candidate qualifies.
    """

    def _is_checkout(candidate: Path) -> bool:
        # A checkout is recognised by these two sub-directories.
        return (candidate / "autorefiner").is_dir() and (
            candidate / "AutoSchemaKG"
        ).is_dir()

    configured = os.environ.get("DEEPREFINE_REPO", "").strip()
    if configured:
        root = Path(configured).expanduser().resolve()
        if not _is_checkout(root):
            raise FileNotFoundError(
                f"DEEPREFINE_REPO={configured} is not a valid DeepRefine checkout "
                "(expected autorefiner/ and AutoSchemaKG/)."
            )
        return root

    origin = (start or Path.cwd()).resolve()
    for candidate in (origin, *origin.parents):
        if _is_checkout(candidate):
            return candidate

    sibling = find_skill_root().parent / "DeepRefine"
    if _is_checkout(sibling):
        return sibling.resolve()

    raise FileNotFoundError(
        "Could not locate the DeepRefine repository (need autorefiner/ and AutoSchemaKG/).\n"
        "Clone DeepRefine alongside this repo, or set:\n"
        " export DEEPREFINE_REPO=/path/to/DeepRefine"
    )
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def find_project_root(start: Path | None = None) -> Path:
    """Return the nearest directory holding ``graphify-out/graph.json``.

    The search begins at ``start`` (default: the current directory) and moves
    up through its ancestors. When no directory qualifies, the resolved
    starting directory itself is returned.
    """
    origin = (start or Path.cwd()).resolve()
    marker = Path("graphify-out") / "graph.json"
    candidates = (origin, *origin.parents)
    return next(
        (folder for folder in candidates if (folder / marker).is_file()),
        origin,
    )
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def graphify_paths(project_root: Path) -> dict[str, Path]:
    """Build the map of well-known file locations under ``project_root``.

    Everything lives below ``graphify-out``; DeepRefine's own files (history,
    cache, graph backup) sit in its ``.deeprefine`` sub-directory. No path is
    created or checked for existence.
    """
    graphify_out = project_root / "graphify-out"
    state_dir = graphify_out / ".deeprefine"
    cache_dir = state_dir / "cache"
    return dict(
        graphify_out=graphify_out,
        graph_json=graphify_out / "graph.json",
        history=state_dir / "history.jsonl",
        cache_dir=cache_dir,
        reafiner_pkl=cache_dir / "reafiner_data.pkl",
        graph_backup=state_dir / "graph.json.bak",
    )
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def setup_import_paths(deeprefine_repo: Path) -> None:
    """Put the DeepRefine checkout and its AutoSchemaKG folder on ``sys.path``.

    Each entry is inserted at the front only if it is not already present.
    AutoSchemaKG is handled first, so when both are new the repository root
    ends up ahead of it.
    """
    import sys

    entries = [str(deeprefine_repo / "AutoSchemaKG"), str(deeprefine_repo)]
    for entry in entries:
        if entry in sys.path:
            continue
        sys.path.insert(0, entry)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def env_defaults() -> dict[str, str]:
    """Return the DeepRefine endpoint/model settings from the environment.

    Each of ``DEEPREFINE_LLM_URL``, ``DEEPREFINE_EMBED_URL``,
    ``DEEPREFINE_MODEL`` and ``DEEPREFINE_EMBED_MODEL`` is read from
    ``os.environ`` with surrounding whitespace removed. A variable that is
    unset, empty or blank falls back to its built-in default.
    """
    defaults = {
        "DEEPREFINE_LLM_URL": "http://127.0.0.1:8134/v1",
        "DEEPREFINE_EMBED_URL": "http://127.0.0.1:8128/v1",
        "DEEPREFINE_MODEL": "HaoyuHuang2/DeepRefine-v1-8B",
        "DEEPREFINE_EMBED_MODEL": "Qwen/Qwen3-Embedding-0.6B",
    }
    # An exported-but-empty variable (``DEEPREFINE_LLM_URL=``) would otherwise
    # be passed on as an empty URL/model name; treat it like an unset one, the
    # same way DEEPREFINE_REPO is handled.
    return {
        name: os.environ.get(name, "").strip() or fallback
        for name, fallback in defaults.items()
    }
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import time
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from openai import OpenAI
|
|
9
|
+
|
|
10
|
+
from atlas_rag.llm_generator import GenerationConfig, LLMGenerator
|
|
11
|
+
from atlas_rag.vectorstore.embedding_model import Qwen3Emb
|
|
12
|
+
from autorefiner.src.reafiner import Reafiner, RetrievalStepResult
|
|
13
|
+
|
|
14
|
+
from .adapter_graphify import (
|
|
15
|
+
load_or_build_data,
|
|
16
|
+
save_bundle,
|
|
17
|
+
save_graphify_json,
|
|
18
|
+
sync_kg_to_graphify,
|
|
19
|
+
)
|
|
20
|
+
from .history import append_history, mark_refined, query_id
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def refinement_to_jsonable(
    sample: dict[str, Any],
    final_answer: Any,
    refinement_result: Any,
) -> dict[str, Any]:
    """Flatten one refinement outcome into a plain dict for the JSONL log.

    The result always holds ``sample`` and ``final_answer``. When
    ``refinement_result`` is ``None`` the ``refinement_result`` entry is
    ``None`` too; otherwise it is a dict of selected attributes, in which each
    ``RetrievalStepResult`` of the interaction history becomes a dict of its
    fields and any other step becomes ``str(step)``. Only the length of the
    action list is recorded, not the actions themselves.
    """
    payload: dict[str, Any] = {"sample": sample, "final_answer": final_answer}
    if refinement_result is None:
        payload["refinement_result"] = None
        return payload

    step_fields = (
        "num_hops",
        "base_top_k",
        "query",
        "retrieved_subgraph",
        "raw_response",
        "answerable",
        "answer",
    )
    steps = [
        {field: getattr(step, field) for field in step_fields}
        if isinstance(step, RetrievalStepResult)
        else str(step)
        for step in refinement_result.interaction_history
    ]

    outcome = refinement_result
    payload["refinement_result"] = {
        "query": outcome.query,
        "history_horizon_size": outcome.history_horizon_size,
        "interaction_history": steps,
        "error_abduction_reason": outcome.error_abduction_reason,
        "original_subgraph": outcome.original_subgraph,
        "refined_subgraph": outcome.refined_subgraph,
        "refinement_action_raw": outcome.refinement_action_raw,
        "refinement_action_count": len(outcome.refinement_action_list),
    }
    return payload
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def make_clients(cfg: dict[str, str]) -> tuple[LLMGenerator, Qwen3Emb]:
    """Create the LLM generator and the embedding encoder described by ``cfg``.

    Two OpenAI clients are built from ``DEEPREFINE_LLM_URL`` and
    ``DEEPREFINE_EMBED_URL``, both with the placeholder API key ``"EMPTY"``.
    The generator uses the model named by ``DEEPREFINE_MODEL`` and a default
    generation config whose chat-template kwargs set ``enable_thinking`` to
    ``False``.
    """
    chat_client = OpenAI(base_url=cfg["DEEPREFINE_LLM_URL"], api_key="EMPTY")
    embedding_client = OpenAI(base_url=cfg["DEEPREFINE_EMBED_URL"], api_key="EMPTY")

    model_name = cfg["DEEPREFINE_MODEL"]
    generation_defaults = GenerationConfig(
        chat_template_kwargs={"enable_thinking": False}
    )
    generator = LLMGenerator(
        client=chat_client,
        model_name=model_name,
        default_config=generation_defaults,
    )
    return generator, Qwen3Emb(embedding_client)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def run_refine(
    *,
    graph_path: Path,
    cache_pkl: Path,
    backup_path: Path,
    history_path: Path,
    log_dir: Path,
    cfg: dict[str, str],
    queries: list[dict[str, Any]],
    rebuild_index: bool = False,
    base_top_k: int = 5,
    max_hops: int = 4,
) -> dict[str, Any]:
    """Refine the graphify graph against ``queries`` and persist the outcome.

    Every query is passed to ``Reafiner.refine``; its result is appended to a
    fresh ``refinement_results_<unix time>.jsonl`` file in ``log_dir``. Once
    the loop ends, or is aborted by an exception, and at least one query has
    completed, the updated graph, the cache bundle and the history's refined
    flags are saved.

    Returns a summary dict with the log path, graph path, node and edge
    counts, the number of queries and one summary row per query.

    Raises:
        FileNotFoundError: if ``graph_path`` is not an existing file.
    """
    if not graph_path.is_file():
        raise FileNotFoundError(f"graphify graph not found: {graph_path}")

    generator, embedder = make_clients(cfg)
    raw, data = load_or_build_data(
        graph_path, cache_pkl, embedder, rebuild=rebuild_index
    )

    reafiner = Reafiner(
        data=data,
        sentence_encoder=embedder,
        llm_generator=generator,
        base_top_k=base_top_k,
        max_hops=max_hops,
        max_triple_num=20,
        max_triple_num_by_step=[5, 10, 15, 20],
        history_horizon_size=4,
        if_gen_answer=False,
    )

    log_dir.mkdir(parents=True, exist_ok=True)
    log_path = log_dir / f"refinement_results_{int(time.time())}.jsonl"
    refined_ids: set[str] = set()
    summary_rows: list[dict[str, Any]] = []
    completed = 0

    def _flush_state() -> None:
        """Save graph, cache bundle and history flags once work was done."""
        nonlocal raw
        if completed == 0:
            return
        data["KG"] = reafiner.kg
        raw = sync_kg_to_graphify(raw, reafiner.kg)
        save_graphify_json(graph_path, raw, backup_path=backup_path)
        save_bundle(cache_pkl, raw, data)
        mark_refined(history_path, refined_ids)

    try:
        with log_path.open("w", encoding="utf-8") as log_file:
            for sample in queries:
                question = sample["query"]
                qid = query_id(question, sample.get("id"))
                print(f"\n=== [{qid}] {question}")
                final_answer, _, outcome = reafiner.refine(query=question)
                record = refinement_to_jsonable(sample, final_answer, outcome)
                # Flush per query so the log survives an aborted run.
                log_file.write(json.dumps(record, ensure_ascii=False) + "\n")
                log_file.flush()

                if outcome is None:
                    step_count = 0
                else:
                    step_count = len(outcome.interaction_history)
                details = record.get("refinement_result") or {}
                summary_rows.append(
                    {
                        "id": qid,
                        "query": question,
                        "steps": step_count,
                        "action_count": details.get("refinement_action_count", 0),
                    }
                )
                refined_ids.add(qid)
                completed += 1
                print(
                    f" steps={step_count}, nodes={reafiner.kg.number_of_nodes()}, "
                    f"edges={reafiner.kg.number_of_edges()}"
                )
    finally:
        # Runs on success and on failure, keeping already-finished queries.
        _flush_state()

    return {
        "log_path": str(log_path),
        "graph_path": str(graph_path),
        "nodes": reafiner.kg.number_of_nodes(),
        "edges": reafiner.kg.number_of_edges(),
        "queries_processed": len(queries),
        "summary": summary_rows,
    }
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def refine_from_history(
    paths: dict[str, Path],
    cfg: dict[str, str],
    *,
    query: str | None = None,
    rebuild_index: bool = False,
) -> dict[str, Any]:
    """Refine either one ad-hoc query or every pending query in the history.

    A truthy ``query`` is first appended to the history with source
    ``"deeprefine"`` and becomes the only query processed. Otherwise all
    pending history rows are used, and ``SystemExit`` is raised with a usage
    hint when there are none. The selected queries are handed to
    ``run_refine`` together with the locations taken from ``paths``.
    """
    if not query:
        from .history import pending_queries

        queries = pending_queries(paths["history"])
        if not queries:
            raise SystemExit(
                "No pending queries in history. Use:\n"
                " deeprefine history add --query '...'\n"
                " deeprefine refine --query '...'"
            )
    else:
        queries = [append_history(paths["history"], query, source="deeprefine")]

    run_kwargs: dict[str, Any] = dict(
        graph_path=paths["graph_json"],
        cache_pkl=paths["reafiner_pkl"],
        backup_path=paths["graph_backup"],
        history_path=paths["history"],
        log_dir=paths["graphify_out"] / ".deeprefine",
        cfg=cfg,
        queries=queries,
        rebuild_index=rebuild_index,
    )
    return run_refine(**run_kwargs)
|