@nextsparkjs/plugin-ai 0.1.0-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +79 -0
- package/README.md +529 -0
- package/api/README.md +65 -0
- package/api/ai-history/[id]/route.ts +112 -0
- package/api/embeddings/route.ts +129 -0
- package/api/generate/route.ts +160 -0
- package/docs/01-getting-started/01-introduction.md +237 -0
- package/docs/01-getting-started/02-installation.md +447 -0
- package/docs/01-getting-started/03-configuration.md +416 -0
- package/docs/02-features/01-text-generation.md +523 -0
- package/docs/02-features/02-embeddings.md +241 -0
- package/docs/02-features/03-ai-history.md +549 -0
- package/docs/03-advanced-usage/01-core-utilities.md +500 -0
- package/docs/04-use-cases/01-content-generation.md +453 -0
- package/entities/ai-history/ai-history.config.ts +123 -0
- package/entities/ai-history/ai-history.fields.ts +330 -0
- package/entities/ai-history/messages/en.json +56 -0
- package/entities/ai-history/messages/es.json +56 -0
- package/entities/ai-history/migrations/001_ai_history_table.sql +167 -0
- package/entities/ai-history/migrations/002_ai_history_metas.sql +103 -0
- package/lib/ai-history-meta-service.ts +379 -0
- package/lib/ai-history-service.ts +391 -0
- package/lib/ai-sdk.ts +7 -0
- package/lib/core-utils.ts +217 -0
- package/lib/plugin-env.ts +252 -0
- package/lib/sanitize.ts +122 -0
- package/lib/save-example.ts +237 -0
- package/lib/server-env.ts +104 -0
- package/package.json +23 -0
- package/plugin.config.ts +55 -0
- package/public/docs/login-404-error.png +0 -0
- package/tsconfig.json +47 -0
- package/tsconfig.tsbuildinfo +1 -0
- package/types/ai.types.ts +51 -0
|
---
title: Text Embeddings
description: Generate and use text embeddings for semantic search
---

# Text Embeddings

Text embeddings convert text into high-dimensional vectors that capture semantic meaning, enabling powerful search and recommendation features.

## What are Embeddings?

Embeddings are numerical representations of text where similar concepts are close together in vector space. This enables:

- **Semantic Search**: Find content by meaning, not just keywords
- **Recommendations**: Suggest similar content based on semantic similarity
- **Clustering**: Group related documents automatically
- **Anomaly Detection**: Identify unusual or out-of-place content

## Generating Embeddings

### Basic Example

```typescript
import { openai } from '@/plugins/ai/lib/openai'

async function generateEmbedding(text: string) {
  const response = await openai.embeddings.create({
    model: 'text-embedding-3-small',
    input: text
  })

  return response.data[0].embedding
}

// Usage
const embedding = await generateEmbedding(
  'This is a sample text to embed'
)

console.log(embedding) // [0.123, -0.456, 0.789, ...]
```

### Batch Processing

Process multiple texts efficiently:

```typescript
async function batchEmbeddings(texts: string[]) {
  const response = await openai.embeddings.create({
    model: 'text-embedding-3-small',
    input: texts
  })

  return response.data.map(d => d.embedding)
}

// Generate embeddings for multiple documents
const embeddings = await batchEmbeddings([
  'First document text',
  'Second document text',
  'Third document text'
])
```

## Semantic Search

Implement semantic search with vector similarity:

```typescript
import { cosineSimilarity } from '@/plugins/ai/lib/utils'

async function semanticSearch(query: string, documents: Document[]) {
  // Generate query embedding
  const queryEmbedding = await generateEmbedding(query)

  // Calculate similarity scores
  const results = documents.map(doc => ({
    document: doc,
    similarity: cosineSimilarity(queryEmbedding, doc.embedding)
  }))

  // Sort by similarity (highest first)
  results.sort((a, b) => b.similarity - a.similarity)

  return results.slice(0, 10) // Top 10 results
}

// Usage
const results = await semanticSearch(
  'How do I configure authentication?',
  allDocuments
)
```

## Vector Database Integration

Store and query embeddings efficiently using a vector database:

### Pinecone Example

```typescript
import { Pinecone } from '@pinecone-database/pinecone'

const pinecone = new Pinecone({
  apiKey: process.env.PINECONE_API_KEY!
})

const index = pinecone.index('documentation')

// Upsert embeddings
await index.upsert([
  {
    id: 'doc-1',
    values: embedding,
    metadata: {
      title: 'Getting Started',
      content: 'How to get started...',
      url: '/docs/getting-started'
    }
  }
])

// Query similar vectors
const queryResults = await index.query({
  vector: queryEmbedding,
  topK: 10,
  includeMetadata: true
})
```

### Supabase pgvector Example

```typescript
import { createClient } from '@supabase/supabase-js'

const supabase = createClient(
  process.env.SUPABASE_URL!,
  process.env.SUPABASE_KEY!
)

// Store embedding
await supabase.from('documents').insert({
  content: 'Document content',
  embedding: embedding
})

// Semantic search with pgvector
const { data } = await supabase.rpc('match_documents', {
  query_embedding: queryEmbedding,
  match_threshold: 0.7,
  match_count: 10
})
```

## Use Cases

### 1. Documentation Search

```typescript
// Build searchable documentation
const docs = await getAllDocs()

for (const doc of docs) {
  const embedding = await generateEmbedding(doc.content)
  await saveEmbedding(doc.id, embedding)
}

// Search
const results = await semanticSearch('How to deploy?', docs)
```

### 2. Content Recommendations

```typescript
// Recommend similar articles
async function recommendSimilar(articleId: string, limit = 5) {
  const article = await getArticle(articleId)
  const similar = await semanticSearch(article.content, allArticles)

  return similar
    .filter(r => r.document.id !== articleId)
    .slice(0, limit)
}
```

### 3. Duplicate Detection

```typescript
// Find duplicate or very similar content
async function findDuplicates(threshold = 0.95) {
  const duplicates = []

  for (let i = 0; i < documents.length; i++) {
    for (let j = i + 1; j < documents.length; j++) {
      const similarity = cosineSimilarity(
        documents[i].embedding,
        documents[j].embedding
      )

      if (similarity > threshold) {
        duplicates.push([documents[i], documents[j], similarity])
      }
    }
  }

  return duplicates
}
```

## Best Practices

1. **Chunk long documents** into smaller pieces (max 8191 tokens)
2. **Cache embeddings** - they don't change for the same text
3. **Use batch processing** for multiple texts to reduce API calls
4. **Choose the right model**:
   - `text-embedding-3-small`: Faster, cheaper, good for most cases
   - `text-embedding-3-large`: Higher accuracy, more expensive
5. **Store embeddings efficiently** using a vector database
6. **Update embeddings** when content changes

## Performance Tips

```typescript
// Process in parallel with rate limiting
import pLimit from 'p-limit'

const limit = pLimit(5) // Max 5 concurrent requests

const embeddings = await Promise.all(
  documents.map(doc =>
    limit(() => generateEmbedding(doc.content))
  )
)
```

## Cost Optimization

- `text-embedding-3-small`: ~$0.02 per 1M tokens
- `text-embedding-3-large`: ~$0.13 per 1M tokens
- Cache embeddings to avoid regenerating
- Use smaller models when accuracy difference is minimal