xindex 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.xindex.json +1 -1
- package/README.md +67 -0
- package/apps/mcpApp.ts +2 -2
- package/apps/run.search.ts +0 -3
- package/apps/searchApp.ts +1 -1
- package/componets/index/formatSearchResults.ts +2 -2
- package/package.json +1 -1
- package/rnd/hf.ts +0 -14
- package/rnd/keywords-compromise.ts +0 -18
- package/rnd/keywords-pipeline.ts +0 -79
- package/rnd/keywords.ts +0 -38
- package/rnd/test-vectra-memory.ts +0 -63
- package/rnd/vectra-keywords.ts +0 -95
- package/rnd/vectra.ts +0 -50
package/.xindex.json
CHANGED
package/README.md
CHANGED
|
@@ -39,6 +39,73 @@ Drop this into `.mcp.json` at your project root:
|
|
|
39
39
|
|
|
40
40
|
Open the project in Claude Code — it picks up the xindex MCP server and can call `xindex_search`, `xindex_index`, and `xindex_reset` directly. Fewer hallucinations, fewer round-trips.
|
|
41
41
|
|
|
42
|
+
## Claude Code skills (`@xi`)
|
|
43
|
+
|
|
44
|
+
Two optional [Claude Code skills](https://docs.claude.com/en/docs/claude-code/skills) wrap the MCP tools so you don't have to think about them:
|
|
45
|
+
|
|
46
|
+
- **`ask-xi`** — read-only discovery. `@xi where is auth handled` drafts several focused queries, runs `xindex_search` for each, and returns ranked file paths with matched keywords. Use it as a cheap first step before grepping or asking a heavier model.
|
|
47
|
+
- **`xindex`** — index management (`xindex_index`, `xindex_reset`). Reset requires explicit confirmation every time.
|
|
48
|
+
|
|
49
|
+
Keeping them separate keeps `@xi` safe to fire casually while destructive ops stay behind the `xindex` skill.
|
|
50
|
+
|
|
51
|
+
### Install
|
|
52
|
+
|
|
53
|
+
Pick one — project-scoped or user-global:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
# Project (checked in, shared with the repo)
|
|
57
|
+
mkdir -p .claude/skills/ask-xi .claude/skills/xindex
|
|
58
|
+
|
|
59
|
+
# Or user-global (available in every project)
|
|
60
|
+
mkdir -p ~/.claude/skills/ask-xi ~/.claude/skills/xindex
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
Then drop these two files in.
|
|
64
|
+
|
|
65
|
+
`ask-xi/SKILL.md`:
|
|
66
|
+
|
|
67
|
+
````md
|
|
68
|
+
---
|
|
69
|
+
name: ask-xi
|
|
70
|
+
description: Discovers relevant files via xindex semantic search — preps queries, auto-indexes on empty, returns file links with keywords. Triggered by @xi.
|
|
71
|
+
argument-hint: "[question]"
|
|
72
|
+
---
|
|
73
|
+
Surface-level codebase discovery via xindex. Tool: `xindex_search` (natural-language, meaning-based).
|
|
74
|
+
|
|
75
|
+
**Steps:**
|
|
76
|
+
1. Draft 5–10 focused queries from $ARGUMENTS (entry points, routing, config, integrations, tests, related patterns).
|
|
77
|
+
2. Run `xindex_search` for each.
|
|
78
|
+
3. If results are empty/sparse/stale → scoped-index the most relevant content-heavy root folders (one path per `xindex_index` call, e.g. `src`, `apps`, `features`, `componets`), then re-search. Prefer scoped over full-repo.
|
|
79
|
+
4. Refine with 2–3 narrower follow-ups.
|
|
80
|
+
5. Return file paths + brief keywords showing why each matched.
|
|
81
|
+
|
|
82
|
+
Output = file links + keywords, not analysis. For reset or full re-index, delegate to `/xindex` (owns safety rules).
|
|
83
|
+
````
|
|
84
|
+
|
|
85
|
+
`xindex/SKILL.md`:
|
|
86
|
+
|
|
87
|
+
````md
|
|
88
|
+
---
|
|
89
|
+
name: xindex
|
|
90
|
+
description: Manages xindex semantic search — index, search, reset via MCP tools. For research questions, use /ask-xi.
|
|
91
|
+
argument-hint: "[search query | index | reset]"
|
|
92
|
+
---
|
|
93
|
+
Full xindex tool management. For research, use `/ask-xi`. Install: `npm i -g xindex`.
|
|
94
|
+
|
|
95
|
+
**Tools:**
|
|
96
|
+
- `xindex_search` — find files by meaning (synonyms, semantics). Try before grepping blindly.
|
|
97
|
+
- `xindex_index` — index a path (recursive, respects .gitignore). **MUST** run one path per call.
|
|
98
|
+
- `xindex_reset` — destructive wipe+rebuild. **MUST** get explicit user confirmation every time; if ambiguous, don't run.
|
|
99
|
+
|
|
100
|
+
**Workflow:** stale/corrupt → confirm → `xindex_reset` → `xindex_index(["."])` → `xindex_search`. Incremental → one-path `xindex_index(["changed/path"])` calls.
|
|
101
|
+
|
|
102
|
+
**Scoped indexing (preferred):** index only task-relevant content-heavy folders, sequentially. Full-repo `xindex_index(["."])` only for cross-cutting discovery.
|
|
103
|
+
|
|
104
|
+
$ARGUMENTS
|
|
105
|
+
````
|
|
106
|
+
|
|
107
|
+
Both skills assume the `xindex` MCP server is registered (see the section above). Restart Claude Code after adding skills.
|
|
108
|
+
|
|
42
109
|
## Features
|
|
43
110
|
|
|
44
111
|
- **Local** — everything runs on your machine; embeddings cached on disk
|
package/apps/mcpApp.ts
CHANGED
|
@@ -44,8 +44,8 @@ export function McpApp({
|
|
|
44
44
|
inputSchema: z.object({
|
|
45
45
|
query: z.string()
|
|
46
46
|
.describe("Natural language search query"),
|
|
47
|
-
limit: z.number().int().min(1).max(
|
|
48
|
-
.describe("Max results to return,
|
|
47
|
+
limit: z.number().int().min(1).max(50).default(7)
|
|
48
|
+
.describe("Max results to return, 7 by default, 50 max"),
|
|
49
49
|
}),
|
|
50
50
|
annotations: {readOnlyHint: true},
|
|
51
51
|
}, async ({query, limit}) => {
|
package/apps/run.search.ts
CHANGED
|
@@ -1,10 +1,8 @@
|
|
|
1
1
|
import {BuildComponents} from "../componets/buildComponents.js";
|
|
2
2
|
import {BufferedLoggerToStdOut} from "../componets/logger.js";
|
|
3
3
|
import {SearchApp} from "./searchApp.js";
|
|
4
|
-
import {AppId} from "../componets/appId.js";
|
|
5
4
|
import {FormatSearchResults} from "../componets/index/formatSearchResults.js";
|
|
6
5
|
|
|
7
|
-
const appId = AppId();
|
|
8
6
|
const log = BufferedLoggerToStdOut();
|
|
9
7
|
const {searchContentIndex, config} = await BuildComponents({log});
|
|
10
8
|
const search = SearchApp({searchContentIndex});
|
|
@@ -15,7 +13,6 @@ if (!query) {
|
|
|
15
13
|
process.exit(1);
|
|
16
14
|
}
|
|
17
15
|
|
|
18
|
-
log(`[${appId}] searching: "${query}"`);
|
|
19
16
|
const results = await search(query);
|
|
20
17
|
const format = FormatSearchResults();
|
|
21
18
|
log(await format(query, results));
|
package/apps/searchApp.ts
CHANGED
|
@@ -3,7 +3,7 @@ import {ISearchIndex, IIndexRecord} from "../features/searchIndex.js";
|
|
|
3
3
|
export type ISearchApp = (query: string, limit?: number) => Promise<IIndexRecord[]>;
|
|
4
4
|
|
|
5
5
|
export function SearchApp({searchContentIndex}: {searchContentIndex: ISearchIndex}): ISearchApp {
|
|
6
|
-
return async function search(query, limit =
|
|
6
|
+
return async function search(query, limit = 7) {
|
|
7
7
|
return await searchContentIndex(query, limit);
|
|
8
8
|
}
|
|
9
9
|
}
|
package/componets/index/formatSearchResults.ts
CHANGED
|
@@ -10,9 +10,9 @@ export function FormatSearchResults(): IFormatSearchResults {
|
|
|
10
10
|
for (let i = 0; i < results.length; i++) {
|
|
11
11
|
const r = results[i];
|
|
12
12
|
const kw = r.keywords ? ` — ${r.keywords}` : "";
|
|
13
|
-
lines.push(
|
|
13
|
+
lines.push(`${i + 1}. ${r.id}${kw}`);
|
|
14
14
|
}
|
|
15
15
|
|
|
16
|
-
return
|
|
16
|
+
return `\n# Search: "${query}" — ${results.length} result(s)\n\n${lines.join(";\n\n")}\n`;
|
|
17
17
|
};
|
|
18
18
|
}
|
package/package.json
CHANGED
package/rnd/hf.ts
DELETED
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
import { pipeline } from "@huggingface/transformers";
|
|
2
|
-
|
|
3
|
-
const generator = await pipeline(
|
|
4
|
-
"text-generation",
|
|
5
|
-
"HuggingFaceTB/SmolLM2-135M-Instruct"
|
|
6
|
-
);
|
|
7
|
-
|
|
8
|
-
const messages = [
|
|
9
|
-
{ role: "system", content: "You are a helpful assistant." },
|
|
10
|
-
{ role: "user", content: "Who is Microsoft?" },
|
|
11
|
-
];
|
|
12
|
-
|
|
13
|
-
const output = await generator(messages, { max_new_tokens: 64 });
|
|
14
|
-
console.log(output[0].generated_text.at(-1).content);
|
package/rnd/keywords-compromise.ts
DELETED
|
@@ -1,18 +0,0 @@
|
|
|
1
|
-
import nlp from "compromise";
|
|
2
|
-
import { readFile } from "fs/promises";
|
|
3
|
-
|
|
4
|
-
const filePath = process.argv[2];
|
|
5
|
-
if (!filePath) {
|
|
6
|
-
console.error("Usage: npx tsx keywords-compromise.ts <file>");
|
|
7
|
-
process.exit(1);
|
|
8
|
-
}
|
|
9
|
-
|
|
10
|
-
const text = await readFile(filePath, "utf8");
|
|
11
|
-
const doc = nlp(text);
|
|
12
|
-
|
|
13
|
-
console.log(`Keywords from: ${filePath}\n`);
|
|
14
|
-
console.log("Topics:", doc.topics().out("array"));
|
|
15
|
-
console.log("\nNouns:", doc.nouns().out("array"));
|
|
16
|
-
console.log("\nVerbs:", doc.verbs().out("array"));
|
|
17
|
-
console.log("\nPeople:", doc.people().out("array"));
|
|
18
|
-
console.log("\nOrganizations:", doc.organizations().out("array"));
|
package/rnd/keywords-pipeline.ts
DELETED
|
@@ -1,79 +0,0 @@
|
|
|
1
|
-
import nlp from "compromise";
|
|
2
|
-
import {createRequire} from "module";
|
|
3
|
-
import {readFile} from "fs/promises";
|
|
4
|
-
import {pipeline} from "@huggingface/transformers";
|
|
5
|
-
|
|
6
|
-
const require = createRequire(import.meta.url);
|
|
7
|
-
const keyword_extractor = require("keyword-extractor");
|
|
8
|
-
|
|
9
|
-
const generator = await pipeline(
|
|
10
|
-
"text-generation",
|
|
11
|
-
"HuggingFaceTB/SmolLM2-135M-Instruct"
|
|
12
|
-
);
|
|
13
|
-
|
|
14
|
-
export async function llm(input: string): Promise<string> {
|
|
15
|
-
const prompt = "Extract and list the most important keywords from the following text. Return only keywords separated by commas.";
|
|
16
|
-
|
|
17
|
-
const messages = [
|
|
18
|
-
{role: "system", content: prompt},
|
|
19
|
-
{role: "user", content: input},
|
|
20
|
-
];
|
|
21
|
-
|
|
22
|
-
const output = await generator(messages, {max_new_tokens: 128});
|
|
23
|
-
return output[0].generated_text.at(-1).content;
|
|
24
|
-
}
|
|
25
|
-
|
|
26
|
-
const filePath = process.argv[2];
|
|
27
|
-
if (!filePath) {
|
|
28
|
-
console.error("Usage: npx tsx keywords-pipeline.ts <file>");
|
|
29
|
-
process.exit(1);
|
|
30
|
-
}
|
|
31
|
-
|
|
32
|
-
// Step 1: Read file
|
|
33
|
-
const text = await readFile(filePath, "utf8");
|
|
34
|
-
console.log("=== Step 1: Read file ===");
|
|
35
|
-
console.log(`${text.length} chars\n`);
|
|
36
|
-
|
|
37
|
-
// Step 2: Compromise — extract nouns, verbs, topics
|
|
38
|
-
const doc = nlp(text);
|
|
39
|
-
const nouns = doc.nouns().out("array") as string[];
|
|
40
|
-
const verbs = doc.verbs().out("array") as string[];
|
|
41
|
-
const topics = doc.topics().out("array") as string[];
|
|
42
|
-
const combined = [...topics, ...nouns, ...verbs].join(" ");
|
|
43
|
-
console.log("=== Step 2: Compromise ===");
|
|
44
|
-
console.log(`${topics.length} topics, ${nouns.length} nouns, ${verbs.length} verbs\n`);
|
|
45
|
-
|
|
46
|
-
// Step 3: Regex — replace all non-word chars with space
|
|
47
|
-
const cleaned = combined.replace(/\W+/g, " ").trim();
|
|
48
|
-
console.log("=== Step 3: Regex cleanup ===");
|
|
49
|
-
console.log(cleaned.slice(0, 200), "\n");
|
|
50
|
-
|
|
51
|
-
// Step 4: LLM — pass cleaned text to local model
|
|
52
|
-
const llmResult = await llm(cleaned);
|
|
53
|
-
console.log("=== Step 4: LLM ===");
|
|
54
|
-
console.log(llmResult, "\n");
|
|
55
|
-
|
|
56
|
-
// Step 5: keyword-extractor
|
|
57
|
-
const keywords: string[] = keyword_extractor.extract(cleaned, {
|
|
58
|
-
language: "english",
|
|
59
|
-
remove_digits: false,
|
|
60
|
-
return_changed_case: true,
|
|
61
|
-
remove_duplicates: true,
|
|
62
|
-
return_max_ngrams: 3,
|
|
63
|
-
});
|
|
64
|
-
|
|
65
|
-
// Score by frequency
|
|
66
|
-
const lower = cleaned.toLowerCase();
|
|
67
|
-
const scored = keywords
|
|
68
|
-
.filter((kw) => kw.length > 2)
|
|
69
|
-
.map((kw) => {
|
|
70
|
-
const re = new RegExp(`\\b${kw.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}\\b`, "gi");
|
|
71
|
-
const count = (lower.match(re) || []).length;
|
|
72
|
-
return {keyword: kw, count};
|
|
73
|
-
});
|
|
74
|
-
scored.sort((a, b) => b.count - a.count);
|
|
75
|
-
|
|
76
|
-
console.log("=== Step 5: Keywords ===");
|
|
77
|
-
for (const {keyword, count} of scored) {
|
|
78
|
-
console.log(` ${keyword.padEnd(35)} (${count}x)`);
|
|
79
|
-
}
|
package/rnd/keywords.ts
DELETED
|
@@ -1,38 +0,0 @@
|
|
|
1
|
-
import { createRequire } from "module";
|
|
2
|
-
import { readFile } from "fs/promises";
|
|
3
|
-
|
|
4
|
-
const require = createRequire(import.meta.url);
|
|
5
|
-
const keyword_extractor = require("keyword-extractor");
|
|
6
|
-
|
|
7
|
-
const filePath = process.argv[2];
|
|
8
|
-
if (!filePath) {
|
|
9
|
-
console.error("Usage: npx tsx keywords.ts <file>");
|
|
10
|
-
process.exit(1);
|
|
11
|
-
}
|
|
12
|
-
|
|
13
|
-
const text = await readFile(filePath, "utf8");
|
|
14
|
-
|
|
15
|
-
const keywords: string[] = keyword_extractor.extract(text, {
|
|
16
|
-
language: "english",
|
|
17
|
-
remove_digits: false,
|
|
18
|
-
return_changed_case: true,
|
|
19
|
-
remove_duplicates: true,
|
|
20
|
-
return_max_ngrams: 3,
|
|
21
|
-
});
|
|
22
|
-
|
|
23
|
-
// Count frequency of each keyword in original text (case-insensitive)
|
|
24
|
-
const lower = text.toLowerCase();
|
|
25
|
-
const scored = keywords
|
|
26
|
-
.filter((kw) => kw.length > 2 && !/^[^a-z]*$/.test(kw))
|
|
27
|
-
.map((kw) => {
|
|
28
|
-
const re = new RegExp(`\\b${kw.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}\\b`, "gi");
|
|
29
|
-
const count = (lower.match(re) || []).length;
|
|
30
|
-
return { keyword: kw, count };
|
|
31
|
-
});
|
|
32
|
-
|
|
33
|
-
scored.sort((a, b) => b.count - a.count);
|
|
34
|
-
|
|
35
|
-
console.log(`Keywords from: ${filePath}\n`);
|
|
36
|
-
for (const { keyword, count } of scored) {
|
|
37
|
-
console.log(` ${keyword.padEnd(35)} (${count}x)`);
|
|
38
|
-
}
|
package/rnd/test-vectra-memory.ts
DELETED
|
@@ -1,63 +0,0 @@
|
|
|
1
|
-
import { LocalIndex, VirtualFileStorage } from 'vectra';
|
|
2
|
-
|
|
3
|
-
async function main() {
|
|
4
|
-
// 1. Create in-memory index
|
|
5
|
-
const storage = new VirtualFileStorage();
|
|
6
|
-
const index = new LocalIndex('mem://test', undefined, storage);
|
|
7
|
-
await index.createIndex({ version: 1 });
|
|
8
|
-
console.log('created index');
|
|
9
|
-
|
|
10
|
-
// 2. Insert items with fake 8-dim vectors
|
|
11
|
-
const dim = 8;
|
|
12
|
-
await index.insertItem({
|
|
13
|
-
id: 'doc-a',
|
|
14
|
-
vector: Array.from({ length: dim }, (_, i) => (i + 1) / dim),
|
|
15
|
-
metadata: { block: 'A', label: 'first' },
|
|
16
|
-
});
|
|
17
|
-
await index.insertItem({
|
|
18
|
-
id: 'doc-b',
|
|
19
|
-
vector: Array.from({ length: dim }, (_, i) => (dim - i) / dim),
|
|
20
|
-
metadata: { block: 'B', label: 'second' },
|
|
21
|
-
});
|
|
22
|
-
await index.insertItem({
|
|
23
|
-
id: 'doc-c',
|
|
24
|
-
vector: Array.from({ length: dim }, () => 0.5),
|
|
25
|
-
metadata: { block: 'C', label: 'third' },
|
|
26
|
-
});
|
|
27
|
-
console.log('inserted 3 items');
|
|
28
|
-
|
|
29
|
-
// 3. Query — vector close to doc-a
|
|
30
|
-
const queryVec = Array.from({ length: dim }, (_, i) => (i + 1) / dim);
|
|
31
|
-
const results = await index.queryItems(queryVec, '', 3);
|
|
32
|
-
console.log('query results:');
|
|
33
|
-
for (const r of results) {
|
|
34
|
-
console.log(` ${r.item.id} score=${r.score.toFixed(4)} block=${r.item.metadata.block}`);
|
|
35
|
-
}
|
|
36
|
-
|
|
37
|
-
// 4. Upsert — update doc-a's vector
|
|
38
|
-
await index.upsertItem({
|
|
39
|
-
id: 'doc-a',
|
|
40
|
-
vector: Array.from({ length: dim }, () => 0.5),
|
|
41
|
-
metadata: { block: 'A', label: 'updated' },
|
|
42
|
-
});
|
|
43
|
-
const updated = await index.getItem('doc-a');
|
|
44
|
-
console.log(`upserted doc-a → label=${updated?.metadata.label}`);
|
|
45
|
-
|
|
46
|
-
// 5. List all items
|
|
47
|
-
const all = await index.listItems();
|
|
48
|
-
console.log(`total items: ${all.length}`);
|
|
49
|
-
|
|
50
|
-
// 6. Delete
|
|
51
|
-
await index.deleteItem('doc-b');
|
|
52
|
-
const afterDelete = await index.listItems();
|
|
53
|
-
console.log(`after delete: ${afterDelete.length} items`);
|
|
54
|
-
|
|
55
|
-
// 7. Verify no disk artifacts
|
|
56
|
-
const { default: fs } = await import('fs');
|
|
57
|
-
const exists = fs.existsSync('mem://test');
|
|
58
|
-
console.log(`disk folder exists: ${exists}`);
|
|
59
|
-
|
|
60
|
-
console.log('\ndone — all in-memory, nothing on disk');
|
|
61
|
-
}
|
|
62
|
-
|
|
63
|
-
main().catch(console.error);
|
package/rnd/vectra-keywords.ts
DELETED
|
@@ -1,95 +0,0 @@
|
|
|
1
|
-
import { LocalIndex } from "vectra";
|
|
2
|
-
import { pipeline } from "@huggingface/transformers";
|
|
3
|
-
import nlp from "compromise";
|
|
4
|
-
import { createRequire } from "module";
|
|
5
|
-
import { readFile } from "fs/promises";
|
|
6
|
-
import { readdirSync } from "fs";
|
|
7
|
-
|
|
8
|
-
const require = createRequire(import.meta.url);
|
|
9
|
-
const keyword_extractor = require("keyword-extractor");
|
|
10
|
-
|
|
11
|
-
// Init embedder + index
|
|
12
|
-
const embedder = await pipeline(
|
|
13
|
-
"feature-extraction",
|
|
14
|
-
"sentence-transformers/all-MiniLM-L6-v2"
|
|
15
|
-
);
|
|
16
|
-
const index = new LocalIndex("./vectra-keyword-index");
|
|
17
|
-
if (!(await index.isIndexCreated())) {
|
|
18
|
-
await index.createIndex();
|
|
19
|
-
}
|
|
20
|
-
|
|
21
|
-
async function embed(text: string): Promise<number[]> {
|
|
22
|
-
const result = await embedder(text, { pooling: "mean", normalize: true });
|
|
23
|
-
return Array.from(result.data as Float32Array);
|
|
24
|
-
}
|
|
25
|
-
|
|
26
|
-
function extractKeywords(text: string): string[] {
|
|
27
|
-
// Compromise: pull nouns, verbs, topics
|
|
28
|
-
const doc = nlp(text);
|
|
29
|
-
const parts = [
|
|
30
|
-
...doc.topics().out("array"),
|
|
31
|
-
...doc.nouns().out("array"),
|
|
32
|
-
...doc.verbs().out("array"),
|
|
33
|
-
] as string[];
|
|
34
|
-
const cleaned = parts.join(" ").replace(/\W+/g, " ").trim();
|
|
35
|
-
|
|
36
|
-
// keyword-extractor
|
|
37
|
-
const keywords: string[] = keyword_extractor.extract(cleaned, {
|
|
38
|
-
language: "english",
|
|
39
|
-
remove_digits: false,
|
|
40
|
-
return_changed_case: true,
|
|
41
|
-
remove_duplicates: true,
|
|
42
|
-
return_max_ngrams: 2,
|
|
43
|
-
});
|
|
44
|
-
return keywords.filter((kw: string) => kw.length > 2);
|
|
45
|
-
}
|
|
46
|
-
|
|
47
|
-
// --- Index files ---
|
|
48
|
-
const files = process.argv.slice(2);
|
|
49
|
-
if (!files.length) {
|
|
50
|
-
console.error("Usage: npx tsx vectra-keywords.ts <file1> [file2] ...");
|
|
51
|
-
process.exit(1);
|
|
52
|
-
}
|
|
53
|
-
|
|
54
|
-
console.log("=== Indexing ===");
|
|
55
|
-
for (const filePath of files) {
|
|
56
|
-
const text = await readFile(filePath, "utf8");
|
|
57
|
-
const keywords = extractKeywords(text);
|
|
58
|
-
const keywordStr = keywords.join(", ");
|
|
59
|
-
const vector = await embed(keywordStr);
|
|
60
|
-
|
|
61
|
-
await index.upsertItem({
|
|
62
|
-
id: filePath,
|
|
63
|
-
vector,
|
|
64
|
-
metadata: { keywords: keywordStr, file: filePath },
|
|
65
|
-
});
|
|
66
|
-
console.log(` ${filePath} → ${keywords.slice(0, 8).join(", ")}...`);
|
|
67
|
-
}
|
|
68
|
-
|
|
69
|
-
// --- Search by keyword ---
|
|
70
|
-
async function search(query: string, topK = 5) {
|
|
71
|
-
const queryVector = await embed(query);
|
|
72
|
-
return index.queryItems(queryVector, query, topK);
|
|
73
|
-
}
|
|
74
|
-
|
|
75
|
-
console.log("\n=== Search: keyword 'fruit' ===");
|
|
76
|
-
const r1 = await search("fruit");
|
|
77
|
-
for (const r of r1) {
|
|
78
|
-
console.log(` ${r.score.toFixed(4)} | ${r.item.id}`);
|
|
79
|
-
console.log(` keywords: ${(r.item.metadata as any).keywords.slice(0, 100)}`);
|
|
80
|
-
}
|
|
81
|
-
|
|
82
|
-
// --- Search by synonym ---
|
|
83
|
-
console.log("\n=== Search: synonym 'automobile' (for 'cars/vehicles') ===");
|
|
84
|
-
const r2 = await search("automobile vehicle transportation");
|
|
85
|
-
for (const r of r2) {
|
|
86
|
-
console.log(` ${r.score.toFixed(4)} | ${r.item.id}`);
|
|
87
|
-
console.log(` keywords: ${(r.item.metadata as any).keywords.slice(0, 100)}`);
|
|
88
|
-
}
|
|
89
|
-
|
|
90
|
-
console.log("\n=== Search: synonym 'embedding model neural network' ===");
|
|
91
|
-
const r3 = await search("embedding model neural network");
|
|
92
|
-
for (const r of r3) {
|
|
93
|
-
console.log(` ${r.score.toFixed(4)} | ${r.item.id}`);
|
|
94
|
-
console.log(` keywords: ${(r.item.metadata as any).keywords.slice(0, 100)}`);
|
|
95
|
-
}
|
package/rnd/vectra.ts
DELETED
|
@@ -1,50 +0,0 @@
|
|
|
1
|
-
import { LocalIndex } from "vectra";
|
|
2
|
-
import { pipeline } from "@huggingface/transformers";
|
|
3
|
-
|
|
4
|
-
// Create/load index (folder-based)
|
|
5
|
-
const index = new LocalIndex("./vectra-index");
|
|
6
|
-
|
|
7
|
-
if (!(await index.isIndexCreated())) {
|
|
8
|
-
await index.createIndex();
|
|
9
|
-
}
|
|
10
|
-
|
|
11
|
-
// Embedding pipeline (fast local model)
|
|
12
|
-
const embedder = await pipeline(
|
|
13
|
-
"feature-extraction",
|
|
14
|
-
"sentence-transformers/all-MiniLM-L6-v2"
|
|
15
|
-
);
|
|
16
|
-
|
|
17
|
-
async function embed(text: string): Promise<number[]> {
|
|
18
|
-
const result = await embedder(text, { pooling: "mean", normalize: true });
|
|
19
|
-
return Array.from(result.data as Float32Array);
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
// Add items with vectors + metadata
|
|
23
|
-
const texts = [
|
|
24
|
-
{ text: "Apples are red fruit", metadata: { category: "fruit" } },
|
|
25
|
-
{ text: "Bananas are yellow", metadata: { category: "fruit" } },
|
|
26
|
-
{ text: "Cars are vehicles", metadata: { category: "transport" } },
|
|
27
|
-
];
|
|
28
|
-
|
|
29
|
-
for (const item of texts) {
|
|
30
|
-
const vector = await embed(item.text);
|
|
31
|
-
await index.upsertItem({
|
|
32
|
-
id: item.text.slice(0, 20),
|
|
33
|
-
vector,
|
|
34
|
-
metadata: item.metadata,
|
|
35
|
-
});
|
|
36
|
-
}
|
|
37
|
-
console.log("Items indexed");
|
|
38
|
-
|
|
39
|
-
// Query example
|
|
40
|
-
const queryText = "red fruit";
|
|
41
|
-
const queryVector = await embed(queryText);
|
|
42
|
-
const results = await index.queryItems(queryVector, queryText, 3, {
|
|
43
|
-
category: { $eq: "fruit" },
|
|
44
|
-
});
|
|
45
|
-
|
|
46
|
-
for (const result of results) {
|
|
47
|
-
console.log(
|
|
48
|
-
`Score: ${result.score.toFixed(4)} | ID: ${result.item.id} | Metadata: ${JSON.stringify(result.item.metadata)}`
|
|
49
|
-
);
|
|
50
|
-
}
|