@harperfast/skills 1.5.1 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -2,160 +2,96 @@
|
|
|
2
2
|
name: vector-indexing
|
|
3
3
|
description: How to enable and query vector indexes for similarity search in Harper.
|
|
4
4
|
metadata:
|
|
5
|
-
mode:
|
|
5
|
+
mode: generate
|
|
6
|
+
sources:
|
|
7
|
+
- reference/v5/database/schema.md#Vector Indexing
|
|
8
|
+
sourceCommit: e8fc9e51c7c04637b8ec02d073eed42d495034f1
|
|
9
|
+
inputHash: 9c47b18c8795e403
|
|
6
10
|
---
|
|
7
11
|
|
|
8
12
|
# Vector Indexing
|
|
9
13
|
|
|
10
|
-
Instructions for the agent to follow when
|
|
14
|
+
Instructions for the agent to follow when enabling and querying vector indexes for similarity search in Harper using the HNSW algorithm.
|
|
11
15
|
|
|
12
16
|
## When to Use
|
|
13
17
|
|
|
14
|
-
|
|
18
|
+
Apply this rule when adding a vector index to a Harper table schema to support approximate nearest-neighbor (similarity) search on high-dimensional float arrays. Use it whenever a query requires ranking results by vector similarity, optionally combined with filter conditions.
|
|
15
19
|
|
|
16
20
|
## How It Works
|
|
17
21
|
|
|
18
|
-
1. **
|
|
22
|
+
1. **Define the table schema with a vector index**: Add `@indexed(type: "HNSW")` to a `[Float]` attribute on a `@table` type. See [adding-tables-with-schemas](adding-tables-with-schemas.md) for general schema setup.
|
|
23
|
+
|
|
19
24
|
```graphql
|
|
20
|
-
type
|
|
21
|
-
id:
|
|
25
|
+
type Document @table {
|
|
26
|
+
id: Long @primaryKey
|
|
22
27
|
textEmbeddings: [Float] @indexed(type: "HNSW")
|
|
23
28
|
}
|
|
24
29
|
```
|
|
25
|
-
|
|
26
|
-
|
|
30
|
+
|
|
31
|
+
2. **Query by nearest neighbors**: Call `.search()` with a `sort` parameter specifying the indexed `attribute` and a `target` vector. The `target` is the query vector to compare against.
|
|
32
|
+
|
|
27
33
|
```javascript
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
attribute: 'textEmbeddings',
|
|
32
|
-
target: [0.1, 0.2, ...], // query vector
|
|
33
|
-
},
|
|
34
|
-
limit: 5,
|
|
34
|
+
let results = Document.search({
|
|
35
|
+
sort: { attribute: 'textEmbeddings', target: searchVector },
|
|
36
|
+
limit: 5,
|
|
35
37
|
});
|
|
36
38
|
```
|
|
37
|
-
|
|
39
|
+
|
|
40
|
+
3. **Combine with filter conditions**: Add a `conditions` array alongside `sort` to filter results before ranking by similarity.
|
|
41
|
+
|
|
38
42
|
```javascript
|
|
39
|
-
|
|
40
|
-
conditions: {
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
value: 0.1,
|
|
44
|
-
target: searchVector,
|
|
45
|
-
},
|
|
43
|
+
let results = Document.search({
|
|
44
|
+
conditions: [{ attribute: 'price', comparator: 'lt', value: 50 }],
|
|
45
|
+
sort: { attribute: 'textEmbeddings', target: searchVector },
|
|
46
|
+
limit: 5,
|
|
46
47
|
});
|
|
47
48
|
```
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
// the name of the Ollama embedding model
|
|
60
|
-
const OLLAMA_EMBEDDING_MODEL = 'llama3';
|
|
61
|
-
|
|
62
|
-
const SIMILARITY_THRESHOLD = 0.5;
|
|
63
|
-
|
|
64
|
-
export class ProductSearch extends Resource {
|
|
65
|
-
// based on env variable we choose the appropriate embedding generator
|
|
66
|
-
generateEmbedding =
|
|
67
|
-
process.env.EMBEDDING_GENERATOR === 'ollama'
|
|
68
|
-
? this._generateOllamaEmbedding
|
|
69
|
-
: this._generateOpenAIEmbedding;
|
|
70
|
-
|
|
71
|
-
/**
|
|
72
|
-
* Executes a search query using a generated text embedding and returns the matching products.
|
|
73
|
-
*
|
|
74
|
-
* @param {Object} data - The input data for the request.
|
|
75
|
-
* @param {string} data.prompt - The prompt to generate the text embedding from.
|
|
76
|
-
* @return {Promise<Array>} Returns a promise that resolves to an array of products matching the conditions,
|
|
77
|
-
* including fields: name, description, price, and $distance.
|
|
78
|
-
*/
|
|
79
|
-
async post(data) {
|
|
80
|
-
const embedding = await this.generateEmbedding(data.prompt);
|
|
81
|
-
|
|
82
|
-
return await Product.search({
|
|
83
|
-
select: ['name', 'description', 'price', '$distance'],
|
|
84
|
-
conditions: {
|
|
85
|
-
attribute: 'textEmbeddings',
|
|
86
|
-
comparator: 'lt',
|
|
87
|
-
value: SIMILARITY_THRESHOLD,
|
|
88
|
-
target: embedding[0],
|
|
89
|
-
},
|
|
90
|
-
limit: 5,
|
|
91
|
-
});
|
|
92
|
-
}
|
|
93
|
-
|
|
94
|
-
/**
|
|
95
|
-
* Generates an embedding using the Ollama API.
|
|
96
|
-
*
|
|
97
|
-
* @param {string} promptData - The input data for which the embedding is to be generated.
|
|
98
|
-
* @return {Promise<number[][]>} A promise that resolves to the generated embedding as an array of numbers.
|
|
99
|
-
*/
|
|
100
|
-
async _generateOllamaEmbedding(promptData) {
|
|
101
|
-
const embedding = await ollama.embed({
|
|
102
|
-
model: OLLAMA_EMBEDDING_MODEL,
|
|
103
|
-
input: promptData,
|
|
104
|
-
});
|
|
105
|
-
return embedding?.embeddings;
|
|
106
|
-
}
|
|
107
|
-
|
|
108
|
-
/**
|
|
109
|
-
* Generates OpenAI embeddings based on the given prompt data.
|
|
110
|
-
*
|
|
111
|
-
* @param {string} promptData - The input data used for generating the embedding.
|
|
112
|
-
* @return {Promise<number[][]>} A promise that resolves to an array of embeddings, where each embedding is an array of floats.
|
|
113
|
-
*/
|
|
114
|
-
async _generateOpenAIEmbedding(promptData) {
|
|
115
|
-
const embedding = await openai.embeddings.create({
|
|
116
|
-
model: OPENAI_EMBEDDING_MODEL,
|
|
117
|
-
input: promptData,
|
|
118
|
-
encoding_format: 'float',
|
|
119
|
-
});
|
|
120
|
-
|
|
121
|
-
let embeddings = [];
|
|
122
|
-
embedding.data.forEach((embeddingData) => {
|
|
123
|
-
embeddings.push(embeddingData.embedding);
|
|
124
|
-
});
|
|
125
|
-
|
|
126
|
-
return embeddings;
|
|
127
|
-
}
|
|
128
|
-
}
|
|
129
|
-
```
|
|
49
|
+
|
|
50
|
+
4. **Tune HNSW parameters**: Pass additional parameters directly in the `@indexed` directive to control index quality and performance.
|
|
51
|
+
|
|
52
|
+
| Parameter | Default | Description |
|
|
53
|
+
| ---------------------- | ----------------- | --------------------------------------------------------------------------------------------------- |
|
|
54
|
+
| `distance` | `"cosine"` | Distance function: `"euclidean"` or `"cosine"` (negative cosine similarity) |
|
|
55
|
+
| `efConstruction` | `100` | Max nodes explored during index construction. Higher = better recall, lower = better performance |
|
|
56
|
+
| `M` | `16` | Preferred connections per graph layer. Higher = more space, better recall for high-dimensional data |
|
|
57
|
+
| `optimizeRouting` | `0.5` | Heuristic aggressiveness for omitting redundant connections (0 = off, 1 = most aggressive) |
|
|
58
|
+
| `mL` | computed from `M` | Normalization factor for level generation |
|
|
59
|
+
| `efSearchConstruction` | `50` | Max nodes explored during search |
|
|
130
60
|
|
|
131
61
|
## Examples
|
|
132
62
|
|
|
133
|
-
|
|
63
|
+
Schema with default settings:
|
|
134
64
|
|
|
135
|
-
```
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
-d '{"prompt": "shorts for the gym"}'
|
|
65
|
+
```graphql
|
|
66
|
+
type Document @table {
|
|
67
|
+
id: Long @primaryKey
|
|
68
|
+
textEmbeddings: [Float] @indexed(type: "HNSW")
|
|
69
|
+
}
|
|
141
70
|
```
|
|
142
71
|
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
## When to Use Vector Indexing
|
|
72
|
+
Schema with custom parameters (Euclidean distance, routing optimization disabled, higher search recall):
|
|
146
73
|
|
|
147
|
-
|
|
74
|
+
```graphql
|
|
75
|
+
type Document @table {
|
|
76
|
+
id: Long @primaryKey
|
|
77
|
+
textEmbeddings: [Float]
|
|
78
|
+
@indexed(type: "HNSW", distance: "euclidean", optimizeRouting: 0, efSearchConstruction: 100)
|
|
79
|
+
}
|
|
80
|
+
```
|
|
148
81
|
|
|
149
|
-
-
|
|
150
|
-
- Performing semantic or similarity-based search
|
|
151
|
-
- Working with high-dimensional numeric data
|
|
152
|
-
- Exact-match indexes are insufficient
|
|
82
|
+
Filtered nearest-neighbor search:
|
|
153
83
|
|
|
154
|
-
|
|
84
|
+
```javascript
|
|
85
|
+
let results = Document.search({
|
|
86
|
+
conditions: [{ attribute: 'price', comparator: 'lt', value: 50 }],
|
|
87
|
+
sort: { attribute: 'textEmbeddings', target: searchVector },
|
|
88
|
+
limit: 5,
|
|
89
|
+
});
|
|
90
|
+
```
|
|
155
91
|
|
|
156
|
-
##
|
|
92
|
+
## Notes
|
|
157
93
|
|
|
158
|
-
-
|
|
159
|
-
-
|
|
160
|
-
-
|
|
161
|
-
-
|
|
94
|
+
- The default `distance` function is `cosine`. Use `"euclidean"` when your vectors are not normalized or when Euclidean geometry better fits your use case.
|
|
95
|
+
- Increasing `efConstruction` improves index recall at the cost of build performance.
|
|
96
|
+
- `mL` is computed automatically from `M` unless explicitly overridden.
|
|
97
|
+
- Always pair `sort` with a `limit` to bound the number of nearest-neighbor results returned.
|
|
@@ -41,7 +41,19 @@ rules:
|
|
|
41
41
|
category: schema
|
|
42
42
|
priority: 1
|
|
43
43
|
order: 4
|
|
44
|
-
mode:
|
|
44
|
+
mode: generate
|
|
45
|
+
sources:
|
|
46
|
+
- path: reference/v5/database/schema.md
|
|
47
|
+
section: 'Vector Indexing'
|
|
48
|
+
role: primary
|
|
49
|
+
must_cover:
|
|
50
|
+
- '@indexed(type: "HNSW")'
|
|
51
|
+
- 'sort'
|
|
52
|
+
- 'target'
|
|
53
|
+
- 'cosine'
|
|
54
|
+
- 'efConstruction'
|
|
55
|
+
cross_links:
|
|
56
|
+
- adding-tables-with-schemas
|
|
45
57
|
|
|
46
58
|
- rule: using-blob-datatype
|
|
47
59
|
description: How to use the Blob data type for efficient binary storage in Harper.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@harperfast/skills",
|
|
3
|
-
"version": "1.5.1",
|
|
3
|
+
"version": "1.6.0",
|
|
4
4
|
"description": "Best practices for making awesome Harper apps with your favorite Agent",
|
|
5
5
|
"keywords": [],
|
|
6
6
|
"homepage": "https://github.com/harperfast",
|
|
@@ -28,9 +28,11 @@
|
|
|
28
28
|
"build": "node scripts/build.mjs",
|
|
29
29
|
"format": "oxfmt",
|
|
30
30
|
"format:check": "oxfmt --check",
|
|
31
|
+
"generate": "node scripts/generation/generate-rules.mjs",
|
|
31
32
|
"validate": "npm run format:check && npm run build && node scripts/validate-skills.mjs && node scripts/generation/validate-generated.mjs"
|
|
32
33
|
},
|
|
33
34
|
"devDependencies": {
|
|
35
|
+
"@anthropic-ai/sdk": "^0.98.0",
|
|
34
36
|
"@commitlint/cli": "^20.4.1",
|
|
35
37
|
"@commitlint/config-conventional": "^20.4.1",
|
|
36
38
|
"@semantic-release/commit-analyzer": "^13.0.1",
|