@harperfast/skills 1.5.1 → 1.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -2,41 +2,55 @@
|
|
|
2
2
|
name: vector-indexing
|
|
3
3
|
description: How to enable and query vector indexes for similarity search in Harper.
|
|
4
4
|
metadata:
|
|
5
|
-
mode:
|
|
5
|
+
mode: generate
|
|
6
|
+
sources:
|
|
7
|
+
- reference/v5/database/schema.md#Vector Indexing
|
|
8
|
+
sourceCommit: 6d4a30ccd5b32528e0e9963565782dca9fff5ada
|
|
9
|
+
inputHash: 3732961c671aac00
|
|
6
10
|
---
|
|
7
11
|
|
|
8
12
|
# Vector Indexing
|
|
9
13
|
|
|
10
|
-
Instructions for the agent to follow when
|
|
14
|
+
Instructions for the agent to follow when enabling and querying vector indexes for similarity search in Harper using the HNSW algorithm.
|
|
11
15
|
|
|
12
16
|
## When to Use
|
|
13
17
|
|
|
14
|
-
|
|
18
|
+
Apply this rule when adding a vector index to a Harper table schema or writing similarity search queries against high-dimensional vector fields. Use it whenever you need approximate nearest-neighbor search, distance-threshold filtering, or distance-scored results.
|
|
15
19
|
|
|
16
20
|
## How It Works
|
|
17
21
|
|
|
18
|
-
1. **
|
|
22
|
+
1. **Declare a vector index on a `[Float]` field**: Add `@indexed(type: "HNSW")` to any `[Float]` attribute in a `@table` type. See [adding-tables-with-schemas.md](adding-tables-with-schemas.md) for general schema setup.
|
|
23
|
+
|
|
19
24
|
```graphql
|
|
20
|
-
type
|
|
21
|
-
id:
|
|
25
|
+
type Document @table {
|
|
26
|
+
id: Long @primaryKey
|
|
22
27
|
textEmbeddings: [Float] @indexed(type: "HNSW")
|
|
23
28
|
}
|
|
24
29
|
```
|
|
25
|
-
|
|
26
|
-
|
|
30
|
+
|
|
31
|
+
2. **Query by nearest neighbors using `sort`**: Call `Document.search()` with a `sort` object specifying `attribute` (the indexed field) and `target` (the query vector). Include `limit` to cap results.
|
|
32
|
+
|
|
27
33
|
```javascript
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
attribute: 'textEmbeddings',
|
|
32
|
-
target: [0.1, 0.2, ...], // query vector
|
|
33
|
-
},
|
|
34
|
-
limit: 5,
|
|
34
|
+
let results = Document.search({
|
|
35
|
+
sort: { attribute: 'textEmbeddings', target: searchVector },
|
|
36
|
+
limit: 5,
|
|
35
37
|
});
|
|
36
38
|
```
|
|
37
|
-
|
|
39
|
+
|
|
40
|
+
3. **Combine HNSW with filter conditions**: Add a `conditions` array alongside `sort` to pre-filter records before ranking by similarity.
|
|
41
|
+
|
|
38
42
|
```javascript
|
|
39
|
-
|
|
43
|
+
let results = Document.search({
|
|
44
|
+
conditions: [{ attribute: 'price', comparator: 'lt', value: 50 }],
|
|
45
|
+
sort: { attribute: 'textEmbeddings', target: searchVector },
|
|
46
|
+
limit: 5,
|
|
47
|
+
});
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
4. **Filter by distance threshold**: Place `target` directly on a condition (alongside `attribute`, `comparator`, and `value`) to return only records whose distance to the target vector is below a threshold. Use this form to bound result quality by a similarity cutoff rather than ranking.
|
|
51
|
+
|
|
52
|
+
```javascript
|
|
53
|
+
let results = Document.search({
|
|
40
54
|
conditions: {
|
|
41
55
|
attribute: 'textEmbeddings',
|
|
42
56
|
comparator: 'lt',
|
|
@@ -45,117 +59,66 @@ Use this skill when you need to perform similarity searches on high-dimensional
|
|
|
45
59
|
},
|
|
46
60
|
});
|
|
47
61
|
```
|
|
48
|
-
5. **Generate Embeddings**: Use external services (OpenAI, Ollama) to generate the numeric vectors before storing or searching them in Harper.
|
|
49
|
-
|
|
50
|
-
```typescript
|
|
51
|
-
import OpenAI from 'openai';
|
|
52
|
-
import ollama from 'ollama';
|
|
53
|
-
|
|
54
|
-
const { Product } = tables;
|
|
55
|
-
const openai = new OpenAI();
|
|
56
|
-
// the name of the OpenAI embedding model
|
|
57
|
-
const OPENAI_EMBEDDING_MODEL = 'text-embedding-3-small';
|
|
58
|
-
|
|
59
|
-
// the name of the Ollama embedding model
|
|
60
|
-
const OLLAMA_EMBEDDING_MODEL = 'llama3';
|
|
61
|
-
|
|
62
|
-
const SIMILARITY_THRESHOLD = 0.5;
|
|
63
|
-
|
|
64
|
-
export class ProductSearch extends Resource {
|
|
65
|
-
// based on env variable we choose the appropriate embedding generator
|
|
66
|
-
generateEmbedding =
|
|
67
|
-
process.env.EMBEDDING_GENERATOR === 'ollama'
|
|
68
|
-
? this._generateOllamaEmbedding
|
|
69
|
-
: this._generateOpenAIEmbedding;
|
|
70
|
-
|
|
71
|
-
/**
|
|
72
|
-
* Executes a search query using a generated text embedding and returns the matching products.
|
|
73
|
-
*
|
|
74
|
-
* @param {Object} data - The input data for the request.
|
|
75
|
-
* @param {string} data.prompt - The prompt to generate the text embedding from.
|
|
76
|
-
* @return {Promise<Array>} Returns a promise that resolves to an array of products matching the conditions,
|
|
77
|
-
* including fields: name, description, price, and $distance.
|
|
78
|
-
*/
|
|
79
|
-
async post(data) {
|
|
80
|
-
const embedding = await this.generateEmbedding(data.prompt);
|
|
81
|
-
|
|
82
|
-
return await Product.search({
|
|
83
|
-
select: ['name', 'description', 'price', '$distance'],
|
|
84
|
-
conditions: {
|
|
85
|
-
attribute: 'textEmbeddings',
|
|
86
|
-
comparator: 'lt',
|
|
87
|
-
value: SIMILARITY_THRESHOLD,
|
|
88
|
-
target: embedding[0],
|
|
89
|
-
},
|
|
90
|
-
limit: 5,
|
|
91
|
-
});
|
|
92
|
-
}
|
|
93
|
-
|
|
94
|
-
/**
|
|
95
|
-
* Generates an embedding using the Ollama API.
|
|
96
|
-
*
|
|
97
|
-
* @param {string} promptData - The input data for which the embedding is to be generated.
|
|
98
|
-
* @return {Promise<number[][]>} A promise that resolves to the generated embeddings: an array of vectors, one per input, where each vector is an array of numbers.
|
|
99
|
-
*/
|
|
100
|
-
async _generateOllamaEmbedding(promptData) {
|
|
101
|
-
const embedding = await ollama.embed({
|
|
102
|
-
model: OLLAMA_EMBEDDING_MODEL,
|
|
103
|
-
input: promptData,
|
|
104
|
-
});
|
|
105
|
-
return embedding?.embeddings;
|
|
106
|
-
}
|
|
107
|
-
|
|
108
|
-
/**
|
|
109
|
-
* Generates OpenAI embeddings based on the given prompt data.
|
|
110
|
-
*
|
|
111
|
-
* @param {string} promptData - The input data used for generating the embedding.
|
|
112
|
-
* @return {Promise<number[][]>} A promise that resolves to an array of embeddings, where each embedding is an array of floats.
|
|
113
|
-
*/
|
|
114
|
-
async _generateOpenAIEmbedding(promptData) {
|
|
115
|
-
const embedding = await openai.embeddings.create({
|
|
116
|
-
model: OPENAI_EMBEDDING_MODEL,
|
|
117
|
-
input: promptData,
|
|
118
|
-
encoding_format: 'float',
|
|
119
|
-
});
|
|
120
|
-
|
|
121
|
-
let embeddings = [];
|
|
122
|
-
embedding.data.forEach((embeddingData) => {
|
|
123
|
-
embeddings.push(embeddingData.embedding);
|
|
124
|
-
});
|
|
125
|
-
|
|
126
|
-
return embeddings;
|
|
127
|
-
}
|
|
128
|
-
}
|
|
129
|
-
```
|
|
130
62
|
|
|
131
|
-
|
|
63
|
+
5. **Include computed distance in results**: Add `'$distance'` to the `select` array to return the computed distance from the target vector alongside each record. `$distance` works in both `sort`-based and `conditions`-based queries.
|
|
132
64
|
|
|
133
|
-
|
|
65
|
+
```javascript
|
|
66
|
+
let results = Document.search({
|
|
67
|
+
select: ['name', '$distance'],
|
|
68
|
+
sort: { attribute: 'textEmbeddings', target: searchVector },
|
|
69
|
+
limit: 5,
|
|
70
|
+
});
|
|
71
|
+
```
|
|
134
72
|
|
|
135
|
-
|
|
136
|
-
curl -X POST "http://localhost:9926/ProductSearch/" \
|
|
137
|
-
-H "Accept: application/json" \
|
|
138
|
-
-H "Content-Type: application/json" \
|
|
139
|
-
-H "Authorization: Basic <YOUR_AUTH>" \
|
|
140
|
-
-d '{"prompt": "shorts for the gym"}'
|
|
141
|
-
```
|
|
73
|
+
6. **Tune HNSW parameters**: Pass additional parameters to `@indexed(type: "HNSW", ...)` to control index quality and performance:
|
|
142
74
|
|
|
143
|
-
|
|
75
|
+
| Parameter | Default | Description |
|
|
76
|
+
| ---------------------- | ----------------- | --------------------------------------------------------------------------------------------------- |
|
|
77
|
+
| `distance` | `"cosine"` | Distance function: `"euclidean"` or `"cosine"` (negative cosine similarity) |
|
|
78
|
+
| `efConstruction` | `100` | Max nodes explored during index construction. Higher = better recall, lower = better performance |
|
|
79
|
+
| `M` | `16` | Preferred connections per graph layer. Higher = more space, better recall for high-dimensional data |
|
|
80
|
+
| `optimizeRouting` | `0.5` | Heuristic aggressiveness for omitting redundant connections (0 = off, 1 = most aggressive) |
|
|
81
|
+
| `mL` | computed from `M` | Normalization factor for level generation |
|
|
82
|
+
| `efSearchConstruction` | `50` | Max nodes explored during search |
|
|
144
83
|
|
|
145
|
-
##
|
|
84
|
+
## Examples
|
|
146
85
|
|
|
147
|
-
|
|
86
|
+
**Schema with custom HNSW parameters:**
|
|
148
87
|
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
88
|
+
```graphql
|
|
89
|
+
type Document @table {
|
|
90
|
+
id: Long @primaryKey
|
|
91
|
+
textEmbeddings: [Float]
|
|
92
|
+
@indexed(type: "HNSW", distance: "euclidean", optimizeRouting: 0, efSearchConstruction: 100)
|
|
93
|
+
}
|
|
94
|
+
```
|
|
153
95
|
|
|
154
|
-
|
|
96
|
+
**Nearest-neighbor search with distance output:**
|
|
97
|
+
|
|
98
|
+
```javascript
|
|
99
|
+
let results = Document.search({
|
|
100
|
+
select: ['name', '$distance'],
|
|
101
|
+
sort: { attribute: 'textEmbeddings', target: searchVector },
|
|
102
|
+
limit: 5,
|
|
103
|
+
});
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
**Distance-threshold filter (no ranking):**
|
|
107
|
+
|
|
108
|
+
```javascript
|
|
109
|
+
let results = Document.search({
|
|
110
|
+
conditions: {
|
|
111
|
+
attribute: 'textEmbeddings',
|
|
112
|
+
comparator: 'lt',
|
|
113
|
+
value: 0.1,
|
|
114
|
+
target: searchVector,
|
|
115
|
+
},
|
|
116
|
+
});
|
|
117
|
+
```
|
|
155
118
|
|
|
156
|
-
##
|
|
119
|
+
## Notes
|
|
157
120
|
|
|
158
|
-
-
|
|
159
|
-
-
|
|
160
|
-
-
|
|
161
|
-
-
|
|
121
|
+
- The default `distance` function is `cosine`. To use Euclidean distance, set `distance: "euclidean"` in the `@indexed` directive.
|
|
122
|
+
- `efConstruction` controls index build quality; increase it to improve recall at the cost of slower indexing.
|
|
123
|
+
- `$distance` is a special field — prefix it with `$` exactly as shown; it is not a schema attribute.
|
|
124
|
+
- `target` is required in both `sort`-based and threshold-based condition queries to identify the reference vector for distance computation.
|
|
@@ -41,7 +41,19 @@ rules:
|
|
|
41
41
|
category: schema
|
|
42
42
|
priority: 1
|
|
43
43
|
order: 4
|
|
44
|
-
mode:
|
|
44
|
+
mode: generate
|
|
45
|
+
sources:
|
|
46
|
+
- path: reference/v5/database/schema.md
|
|
47
|
+
section: 'Vector Indexing'
|
|
48
|
+
role: primary
|
|
49
|
+
must_cover:
|
|
50
|
+
- '@indexed(type: "HNSW")'
|
|
51
|
+
- 'sort'
|
|
52
|
+
- 'target'
|
|
53
|
+
- 'cosine'
|
|
54
|
+
- 'efConstruction'
|
|
55
|
+
cross_links:
|
|
56
|
+
- adding-tables-with-schemas
|
|
45
57
|
|
|
46
58
|
- rule: using-blob-datatype
|
|
47
59
|
description: How to use the Blob data type for efficient binary storage in Harper.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@harperfast/skills",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.6.1",
|
|
4
4
|
"description": "Best practices for making awesome Harper apps with your favorite Agent",
|
|
5
5
|
"keywords": [],
|
|
6
6
|
"homepage": "https://github.com/harperfast",
|
|
@@ -28,9 +28,11 @@
|
|
|
28
28
|
"build": "node scripts/build.mjs",
|
|
29
29
|
"format": "oxfmt",
|
|
30
30
|
"format:check": "oxfmt --check",
|
|
31
|
+
"generate": "node scripts/generation/generate-rules.mjs",
|
|
31
32
|
"validate": "npm run format:check && npm run build && node scripts/validate-skills.mjs && node scripts/generation/validate-generated.mjs"
|
|
32
33
|
},
|
|
33
34
|
"devDependencies": {
|
|
35
|
+
"@anthropic-ai/sdk": "^0.98.0",
|
|
34
36
|
"@commitlint/cli": "^20.4.1",
|
|
35
37
|
"@commitlint/config-conventional": "^20.4.1",
|
|
36
38
|
"@semantic-release/commit-analyzer": "^13.0.1",
|