@pyxmate/memory 0.20.4 → 0.21.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +63 -34
- package/dist/chunk-7P6ASYW6.mjs +9 -0
- package/dist/cli/pyx-mem.mjs +15705 -0
- package/dist/dashboard.mjs +1 -0
- package/dist/index.mjs +1 -0
- package/dist/react.mjs +1 -0
- package/package.json +7 -4
- package/bin/init.mjs +0 -63
- package/skills/pyx-memory/SKILL.md +0 -143
- package/skills/pyx-memory/examples/disabled-memory.ts +0 -53
- package/skills/pyx-memory/examples/minimal-embedded.ts +0 -37
- package/skills/pyx-memory/examples/minimal-sidecar.ts +0 -14
- package/skills/pyx-memory/patterns/access-control.md +0 -586
- package/skills/pyx-memory/patterns/consumer.md +0 -129
- package/skills/pyx-memory/patterns/embedded.md +0 -249
- package/skills/pyx-memory/patterns/file-uploads.md +0 -78
- package/skills/pyx-memory/reference/advanced.md +0 -274
- package/skills/pyx-memory/reference/http-api.md +0 -526
- package/skills/pyx-memory/reference/parity.md +0 -74
- package/skills/pyx-memory/reference/sdk-guide.md +0 -233
- package/skills/pyx-memory/reference/types.md +0 -344
|
@@ -1,129 +0,0 @@
|
|
|
1
|
-
# Consumer Integration Patterns
|
|
2
|
-
|
|
3
|
-
For downstream projects that **consume** pyx-memory (e.g., agent-forge, custom AI agents).
|
|
4
|
-
|
|
5
|
-
**Rule: Only depend on `@pyx-memory/client` + `@pyx-memory/shared`. Never import `@pyx-memory/core`.**
|
|
6
|
-
|
|
7
|
-
`@pyx-memory/core` is pyx-memory's internal implementation (SQLiteStore, LanceDB, embedding providers,
|
|
8
|
-
RAG engines, graph stores). Importing it creates tight coupling — any internal change can break your project.
|
|
9
|
-
|
|
10
|
-
---
|
|
11
|
-
|
|
12
|
-
## Pattern 8: Consumer (Sidecar-Only)
|
|
13
|
-
|
|
14
|
-
```typescript
|
|
15
|
-
import { MemoryClient, MemoryServerError } from '@pyx-memory/client';
|
|
16
|
-
import type { MemoryInterface, ExtendedMemoryInterface } from '@pyx-memory/client';
|
|
17
|
-
import type { MemoryEntry, MemorySearchResult } from '@pyx-memory/shared';
|
|
18
|
-
|
|
19
|
-
const MEMORY_URL = process.env.MEMORY_URL; // e.g., 'http://localhost:7822'
|
|
20
|
-
|
|
21
|
-
let memory: MemoryClient | null = null;
|
|
22
|
-
|
|
23
|
-
if (MEMORY_URL) {
|
|
24
|
-
// Simple: API key only
|
|
25
|
-
memory = new MemoryClient(MEMORY_URL, process.env.MEMORY_API_KEY);
|
|
26
|
-
|
|
27
|
-
// Multi-tenant: API key + tenant headers
|
|
28
|
-
// memory = new MemoryClient(MEMORY_URL, {
|
|
29
|
-
// apiKey: process.env.MEMORY_API_KEY,
|
|
30
|
-
// defaultHeaders: { 'X-Tenant-Id': 'tenant-abc', 'X-User-Id': 'user-123' },
|
|
31
|
-
// });
|
|
32
|
-
|
|
33
|
-
await memory.initialize(); // verifies connectivity
|
|
34
|
-
}
|
|
35
|
-
|
|
36
|
-
// Use memory if available
|
|
37
|
-
if (memory) {
|
|
38
|
-
await memory.store({ content: 'fact', type: 'long-term', metadata: {} });
|
|
39
|
-
const results = await memory.search({ query: 'fact', limit: 5 });
|
|
40
|
-
|
|
41
|
-
// Graph queries — available on MemoryClient directly (no @pyx-memory/core needed)
|
|
42
|
-
const nodes = await memory.graphNodes();
|
|
43
|
-
const traversal = await memory.graphQuery({ nodeId: 'node-1', depth: 2 });
|
|
44
|
-
|
|
45
|
-
// File ingestion — native NDJSON streaming via ingestFileEvents()
|
|
46
|
-
const file = new File([buffer], 'report.pdf', { type: 'application/pdf' });
|
|
47
|
-
for await (const event of memory.ingestFileEvents(file)) {
|
|
48
|
-
if (event.type === 'progress') console.log(`[${event.stage}] ${event.message ?? ''}`);
|
|
49
|
-
if (event.type === 'error') throw new Error(event.message ?? event.error);
|
|
50
|
-
// event.type === 'result' carries the terminal { filename, chunks, entryIds, ... }
|
|
51
|
-
}
|
|
52
|
-
|
|
53
|
-
// Image ingestion with AI description
|
|
54
|
-
// pyx-memory saves the original image to {DATA_DIR}/files/ and indexes the
|
|
55
|
-
// description in vector + SQLite for semantic search. Without a description,
|
|
56
|
-
// images get a minimal placeholder ("[Image] filename (size KB)").
|
|
57
|
-
const image = new File([imgBuffer], 'photo.png', { type: 'image/png' });
|
|
58
|
-
for await (const _ of memory.ingestFileEvents(image, {
|
|
59
|
-
description: 'A screenshot of the login page showing an error dialog',
|
|
60
|
-
})) {
|
|
61
|
-
/* drain to terminal */
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
// Lifecycle
|
|
65
|
-
await memory.consolidate();
|
|
66
|
-
await memory.runDecay();
|
|
67
|
-
}
|
|
68
|
-
```
|
|
69
|
-
|
|
70
|
-
---
|
|
71
|
-
|
|
72
|
-
## Pattern 9: Graceful Degradation (DisabledMemory)
|
|
73
|
-
|
|
74
|
-
When your project should work with or without pyx-memory, implement a no-op wrapper.
|
|
75
|
-
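See [examples/disabled-memory.ts](../examples/disabled-memory.ts) for a copy-paste-ready `DisabledMemory` implementation; the snippet below assumes `DisabledMemory` and the `MemoryInterface` type are already in scope.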
|
|
76
|
-
|
|
77
|
-
```typescript
|
|
78
|
-
import { MemoryClient } from '@pyx-memory/client';
|
|
79
|
-
|
|
80
|
-
const memory: MemoryInterface = process.env.MEMORY_URL
|
|
81
|
-
? new MemoryClient(process.env.MEMORY_URL, process.env.MEMORY_API_KEY)
|
|
82
|
-
: new DisabledMemory();
|
|
83
|
-
|
|
84
|
-
await memory.initialize();
|
|
85
|
-
```
|
|
86
|
-
|
|
87
|
-
---
|
|
88
|
-
|
|
89
|
-
## Health Endpoint Pattern
|
|
90
|
-
|
|
91
|
-
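Report memory status in your app's health check. Note that this snippet only reflects whether `MEMORY_URL` is configured; it does not perform a live connectivity check: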
|
|
92
|
-
|
|
93
|
-
```typescript
|
|
94
|
-
// In your health route
|
|
95
|
-
const memoryStatus = process.env.MEMORY_URL
|
|
96
|
-
? { status: 'connected', url: process.env.MEMORY_URL }
|
|
97
|
-
: { status: 'disabled' };
|
|
98
|
-
```
|
|
99
|
-
|
|
100
|
-
---
|
|
101
|
-
|
|
102
|
-
## Adding as Sidecar (Docker)
|
|
103
|
-
|
|
104
|
-
```yaml
|
|
105
|
-
# docker-compose.yaml
|
|
106
|
-
services:
|
|
107
|
-
memory:
|
|
108
|
-
build:
|
|
109
|
-
context: ./vendor/pyx-memory
|
|
110
|
-
dockerfile: docker/Dockerfile
|
|
111
|
-
ports:
|
|
112
|
-
- "7822:7822"
|
|
113
|
-
volumes:
|
|
114
|
-
- memory-data:/data
|
|
115
|
-
environment:
|
|
116
|
-
- DATA_DIR=/data
|
|
117
|
-
- API_KEY=${MEMORY_API_KEY} # auth for all requests
|
|
118
|
-
# - TENANT_MODE=multi # require X-Tenant-Id on all ops
|
|
119
|
-
# - SENSITIVITY_POLICY=encrypt # encrypt secret entries at rest
|
|
120
|
-
# - ENCRYPTION_KEY=${ENCRYPTION_KEY} # 32 bytes as 64 hex chars
|
|
121
|
-
# Embedding is internal (BGE-M3, 1024d) — no API keys needed
|
|
122
|
-
|
|
123
|
-
your-app:
|
|
124
|
-
environment:
|
|
125
|
-
- MEMORY_URL=http://memory:7822
|
|
126
|
-
|
|
127
|
-
volumes:
|
|
128
|
-
memory-data:
|
|
129
|
-
```
|
|
@@ -1,249 +0,0 @@
|
|
|
1
|
-
# Embedded Integration Patterns
|
|
2
|
-
|
|
3
|
-
For projects using pyx-memory directly in-process with full feature access.
|
|
4
|
-
|
|
5
|
-
## Contents
|
|
6
|
-
- [Pattern 1: Testing / Development](#pattern-1-testing--development)
|
|
7
|
-
- [Pattern 2: Production](#pattern-2-production)
|
|
8
|
-
- [Pattern 3: Production with Store Targets](#pattern-3-production-with-store-targets)
|
|
9
|
-
- [Pattern 4: With Knowledge Graph](#pattern-4-with-knowledge-graph)
|
|
10
|
-
- [Pattern 5: With LLM Lifecycle](#pattern-5-with-llm-lifecycle)
|
|
11
|
-
- [Pattern 6: File Ingestion](#pattern-6-file-ingestion)
|
|
12
|
-
- [Pattern 7: Factory with Auto-Mode Switching](#pattern-7-factory-with-auto-mode-switching)
|
|
13
|
-
- [Adding as Git Submodule](#adding-as-git-submodule-recommended-for-embedded-mode)
|
|
14
|
-
- [MemoryOptions Quick Reference](#memoryoptions-quick-reference)
|
|
15
|
-
|
|
16
|
-
---
|
|
17
|
-
|
|
18
|
-
## Pattern 1: Testing / Development
|
|
19
|
-
|
|
20
|
-
```typescript
|
|
21
|
-
import { Memory } from '@pyx-memory/core';
|
|
22
|
-
|
|
23
|
-
const memory = new Memory({ dataDir: ':memory:' });
|
|
24
|
-
await memory.initialize();
|
|
25
|
-
// Memory internally creates a LocalEmbeddingProvider (BGE-M3, 1024d)
|
|
26
|
-
// No embedder needed — embedding is fully managed
|
|
27
|
-
```
|
|
28
|
-
|
|
29
|
-
## Pattern 2: Production
|
|
30
|
-
|
|
31
|
-
```typescript
|
|
32
|
-
import { Memory } from '@pyx-memory/core';
|
|
33
|
-
|
|
34
|
-
const memory = new Memory({ dataDir: './data' });
|
|
35
|
-
await memory.initialize();
|
|
36
|
-
// Embedding is handled internally by LocalEmbeddingProvider (BGE-M3 via @huggingface/transformers, 1024d)
|
|
37
|
-
// No external embedding provider needed
|
|
38
|
-
```
|
|
39
|
-
|
|
40
|
-
## Pattern 2b: Production with Multi-Tenant + Encryption
|
|
41
|
-
|
|
42
|
-
```typescript
|
|
43
|
-
import { Memory } from '@pyx-memory/core';
|
|
44
|
-
|
|
45
|
-
const memory = new Memory({
|
|
46
|
-
dataDir: './data',
|
|
47
|
-
tenantId: 'tenant-abc', // Auto-scope all operations to this tenant
|
|
48
|
-
encryptionKey: Buffer.from(process.env.ENCRYPTION_KEY!, 'hex'), // 32 bytes for AES-256-GCM
|
|
49
|
-
});
|
|
50
|
-
await memory.initialize();
|
|
51
|
-
|
|
52
|
-
// All store/search/get/delete/list operations are automatically tenant-scoped
|
|
53
|
-
await memory.store({
|
|
54
|
-
content: 'Secret API key: sk-abc123', // auto-classified as 'secret', encrypted at rest
|
|
55
|
-
type: 'long-term',
|
|
56
|
-
metadata: { source: 'config' },
|
|
57
|
-
});
|
|
58
|
-
|
|
59
|
-
// Search respects tenant isolation + sensitivity filtering
|
|
60
|
-
const results = await memory.search({
|
|
61
|
-
query: 'API key',
|
|
62
|
-
maxSensitivity: 'internal', // 'secret' entries are redacted in results
|
|
63
|
-
});
|
|
64
|
-
```
|
|
65
|
-
|
|
66
|
-
## Pattern 3: Production with Store Targets
|
|
67
|
-
|
|
68
|
-
```typescript
|
|
69
|
-
import { Memory } from '@pyx-memory/core';
|
|
70
|
-
|
|
71
|
-
const memory = new Memory({ dataDir: './data' });
|
|
72
|
-
await memory.initialize();
|
|
73
|
-
|
|
74
|
-
// Default: stores to sqlite + vector
|
|
75
|
-
await memory.store({
|
|
76
|
-
content: 'User prefers dark mode',
|
|
77
|
-
type: 'long-term',
|
|
78
|
-
metadata: { source: 'settings' },
|
|
79
|
-
});
|
|
80
|
-
|
|
81
|
-
// Explicit targets: sqlite only (skip vector indexing)
|
|
82
|
-
await memory.store({
|
|
83
|
-
content: 'Temporary note',
|
|
84
|
-
type: 'working',
|
|
85
|
-
metadata: {},
|
|
86
|
-
targets: ['sqlite'],
|
|
87
|
-
});
|
|
88
|
-
```
|
|
89
|
-
|
|
90
|
-
## Pattern 4: With Knowledge Graph
|
|
91
|
-
|
|
92
|
-
```typescript
|
|
93
|
-
import { Memory, createGraphStore } from '@pyx-memory/core';
|
|
94
|
-
import type { StoreTarget, IngestEntity, IngestRelationship } from '@pyx-memory/shared';
|
|
95
|
-
|
|
96
|
-
// 1. Create and initialize graph store BEFORE Memory
|
|
97
|
-
const graphStore = createGraphStore({}); // returns SQLiteGraphStore (default)
|
|
98
|
-
await graphStore.initialize({}); // REQUIRED — Memory does NOT init this for you
|
|
99
|
-
|
|
100
|
-
// For Neo4j instead: createGraphStore({ neo4jUrl: 'bolt://localhost:7687' })
|
|
101
|
-
|
|
102
|
-
const memory = new Memory({
|
|
103
|
-
dataDir: './data',
|
|
104
|
-
graphStore, // enables graph RAG search
|
|
105
|
-
});
|
|
106
|
-
await memory.initialize();
|
|
107
|
-
|
|
108
|
-
// Graph storage is agent-driven — YOU provide entities and relationships explicitly
|
|
109
|
-
await memory.store({
|
|
110
|
-
content: 'Alice works at Acme Corp as a senior engineer',
|
|
111
|
-
type: 'long-term',
|
|
112
|
-
metadata: {},
|
|
113
|
-
targets: ['sqlite', 'vector', 'graph'],
|
|
114
|
-
entities: [
|
|
115
|
-
{ name: 'Alice', type: 'PERSON', properties: { role: 'senior engineer' } },
|
|
116
|
-
{ name: 'Acme Corp', type: 'ORGANIZATION' },
|
|
117
|
-
],
|
|
118
|
-
relationships: [
|
|
119
|
-
{ source: 'Alice', target: 'Acme Corp', type: 'WORKS_AT' },
|
|
120
|
-
],
|
|
121
|
-
});
|
|
122
|
-
|
|
123
|
-
// Graph-aware search
|
|
124
|
-
const results = await memory.search({ query: 'Alice employer', strategy: 'graph' });
|
|
125
|
-
|
|
126
|
-
// Cleanup both
|
|
127
|
-
await memory.shutdown();
|
|
128
|
-
await graphStore.shutdown();
|
|
129
|
-
```
|
|
130
|
-
|
|
131
|
-
## Pattern 5: With LLM Lifecycle
|
|
132
|
-
|
|
133
|
-
```typescript
|
|
134
|
-
import { Memory } from '@pyx-memory/core';
|
|
135
|
-
import type { LLMCallback } from '@pyx-memory/core';
|
|
136
|
-
|
|
137
|
-
// LLMCallback: any function that takes a prompt string and returns a completion string
|
|
138
|
-
const llm: LLMCallback = async (prompt) => {
|
|
139
|
-
const res = await fetch('https://api.anthropic.com/v1/messages', {
|
|
140
|
-
method: 'POST',
|
|
141
|
-
headers: {
|
|
142
|
-
'x-api-key': process.env.ANTHROPIC_API_KEY!,
|
|
143
|
-
'content-type': 'application/json',
|
|
144
|
-
'anthropic-version': '2023-06-01',
|
|
145
|
-
},
|
|
146
|
-
body: JSON.stringify({
|
|
147
|
-
model: 'claude-sonnet-4-20250514',
|
|
148
|
-
max_tokens: 1024,
|
|
149
|
-
messages: [{ role: 'user', content: prompt }],
|
|
150
|
-
}),
|
|
151
|
-
});
|
|
152
|
-
const data = await res.json() as any;
|
|
153
|
-
return data.content[0].text;
|
|
154
|
-
};
|
|
155
|
-
|
|
156
|
-
const memory = new Memory({
|
|
157
|
-
dataDir: './data',
|
|
158
|
-
llm, // enables LLM-powered lifecycle
|
|
159
|
-
});
|
|
160
|
-
await memory.initialize();
|
|
161
|
-
|
|
162
|
-
// Now lifecycle methods use LLM intelligence
|
|
163
|
-
await memory.consolidate(); // LLM scoring + dedup + conflict resolution
|
|
164
|
-
await memory.summarizeSession('session-123'); // LLM summarization
|
|
165
|
-
await memory.runDecay(); // importance-based archival
|
|
166
|
-
```
|
|
167
|
-
|
|
168
|
-
**Without LLM**: Lifecycle still works using heuristic fallbacks (regex extraction, embedding-distance dedup, formula-based scoring). An LLM makes the lifecycle smarter, but it is not mandatory.
|
|
169
|
-
|
|
170
|
-
## Pattern 6: File Ingestion
|
|
171
|
-
|
|
172
|
-
```typescript
|
|
173
|
-
import { IngestionAgent, Memory } from '@pyx-memory/core';
|
|
174
|
-
|
|
175
|
-
const memory = new Memory({ dataDir: './data' });
|
|
176
|
-
await memory.initialize();
|
|
177
|
-
|
|
178
|
-
const agent = new IngestionAgent({
|
|
179
|
-
llm: myLlmCallback, // optional: smart classification + enrichment
|
|
180
|
-
embedder: (texts) => myEmbedder.embed(texts), // optional: semantic chunking (separate from Memory's internal embedder)
|
|
181
|
-
useSemanticChunking: true,
|
|
182
|
-
useStructuralChunking: false,
|
|
183
|
-
enableEnrichment: true,
|
|
184
|
-
enableMetadata: true,
|
|
185
|
-
enableHierarchical: false, // requires LLM
|
|
186
|
-
});
|
|
187
|
-
|
|
188
|
-
// Supported: .txt, .md, .csv, .pdf, .docx, .json, .html
|
|
189
|
-
const buffer = Buffer.from(await Bun.file('report.pdf').arrayBuffer());
|
|
190
|
-
const result = await agent.ingest(buffer, 'report.pdf', memory);
|
|
191
|
-
// result: { filename, fileType, chunks, entryIds, totalCharacters }
|
|
192
|
-
```
|
|
193
|
-
|
|
194
|
-
**Note**: `IngestionAgent` accepts its own optional `embedder` for semantic chunking. This is separate from Memory's internal embedding — Memory handles its own embedding automatically.
|
|
195
|
-
|
|
196
|
-
## Pattern 7: Factory with Auto-Mode Switching
|
|
197
|
-
|
|
198
|
-
```typescript
|
|
199
|
-
import { createMemory } from '@pyx-memory/core';
|
|
200
|
-
|
|
201
|
-
// Embedded mode (default)
|
|
202
|
-
const memory = createMemory({
|
|
203
|
-
dataDir: './data',
|
|
204
|
-
});
|
|
205
|
-
|
|
206
|
-
// Sidecar mode (when MEMORY_URL is set)
|
|
207
|
-
const remote = createMemory({
|
|
208
|
-
memoryUrl: process.env.MEMORY_URL, // e.g., 'http://localhost:7822'
|
|
209
|
-
apiKey: process.env.MEMORY_API_KEY,
|
|
210
|
-
});
|
|
211
|
-
|
|
212
|
-
await memory.initialize();
|
|
213
|
-
|
|
214
|
-
// WARNING: createMemory() returns MemoryInterface, NOT ExtendedMemoryInterface.
|
|
215
|
-
// If you need lifecycle methods, cast:
|
|
216
|
-
// const extended = memory as ExtendedMemoryInterface;
|
|
217
|
-
// Or prefer `new Memory()` / `new MemoryClient()` directly.
|
|
218
|
-
```
|
|
219
|
-
|
|
220
|
-
---
|
|
221
|
-
|
|
222
|
-
## Adding as Git Submodule (Recommended for Embedded Mode)
|
|
223
|
-
|
|
224
|
-
```bash
|
|
225
|
-
git submodule add https://github.com/fysoul17/pyx-memory-v1.git vendor/pyx-memory
|
|
226
|
-
```
|
|
227
|
-
|
|
228
|
-
Add to your `package.json` workspaces:
|
|
229
|
-
|
|
230
|
-
```json
|
|
231
|
-
{
|
|
232
|
-
"workspaces": [
|
|
233
|
-
"packages/*",
|
|
234
|
-
"vendor/pyx-memory/packages/shared",
|
|
235
|
-
"vendor/pyx-memory/packages/client",
|
|
236
|
-
"vendor/pyx-memory/packages/core"
|
|
237
|
-
]
|
|
238
|
-
}
|
|
239
|
-
```
|
|
240
|
-
|
|
241
|
-
Then: `bun install`
|
|
242
|
-
|
|
243
|
-
---
|
|
244
|
-
|
|
245
|
-
## MemoryOptions Quick Reference
|
|
246
|
-
|
|
247
|
-
`embedder` has been removed from MemoryOptions. Memory internally creates a `LocalEmbeddingProvider` using BGE-M3 (1024 dimensions) via `@huggingface/transformers`. You never need to provide an embedding function.
|
|
248
|
-
|
|
249
|
-
See [reference/types.md](../reference/types.md#memoryoptions-reference) for the full MemoryOptions table.
|
|
@@ -1,78 +0,0 @@
|
|
|
1
|
-
# Pattern: File Uploads
|
|
2
|
-
|
|
3
|
-
This is consumer-side guidance: how to decide whether to forward a file
|
|
4
|
-
straight to `ingestFileEvents()` or to pre-extract its text upstream first.
|
|
5
|
-
|
|
6
|
-
## TL;DR
|
|
7
|
-
|
|
8
|
-
| Format | Default action |
|
|
9
|
-
|---|---|
|
|
10
|
-
| `txt`, `md`, `csv`, `tsv`, `log`, `json`, `jsonl`, `html`, `htm` | Forward raw — pyx-memory streams. |
|
|
11
|
-
| `pdf` | Forward raw — pyx-memory streams via poppler. Install `poppler-utils` on the server image. |
|
|
12
|
-
| Images (`png`, `jpg`, `jpeg`, `webp`, `gif`, `bmp`, `tiff`, `svg`) | Forward raw with a `description` (use vision capability). |
|
|
13
|
-
| `docx` ≤ 10 MB | Forward raw. |
|
|
14
|
-
| `docx` > 10 MB | **Pre-extract upstream** as `.txt` or `.md`. The server returns a `MemoryError` if you don't. |
|
|
15
|
-
| `xlsx` (large or shared-string-heavy) | **Pre-extract upstream** as `<original>.txt` (e.g. `report.xlsx.txt`) for deterministic UX. |
|
|
16
|
-
| `pptx` (production UX) | **Pre-extract upstream** as `<original>.txt` (e.g. `slides.pptx.txt`) for deterministic UX. |
|
|
17
|
-
|
|
18
|
-
"Production UX" here means: you can't afford a single hung upload to wedge
|
|
19
|
-
the user-facing layer for 30+ seconds, you need actionable error messages
|
|
20
|
-
on every failure, and you have your own copy of the original file
|
|
21
|
-
(separate from pyx-memory's internal storage).
|
|
22
|
-
|
|
23
|
-
## Why pre-extract pptx and large xlsx
|
|
24
|
-
|
|
25
|
-
The server's pptx parser decompresses the full ZIP in memory (~3× file
|
|
26
|
-
size peak). The xlsx parser streams rows but caches shared strings for
|
|
27
|
-
the entire workbook (`ExcelJS.WorkbookReader { sharedStrings: 'cache' }`).
|
|
28
|
-
Both are bounded by the 100 MB file / 200 MB decompressed caps, but
|
|
29
|
-
"bounded" is not "constant" — pathological files (huge shared-string
|
|
30
|
-
tables, dense cell formulas, embedded media) can push peak memory and
|
|
31
|
-
parse time well past what naive callers expect.
|
|
32
|
-
|
|
33
|
-
If you control the upload boundary (e.g. you operate a runtime/proxy
|
|
34
|
-
service that fronts pyx-memory), upstream pre-extraction lets you:
|
|
35
|
-
|
|
36
|
-
1. **Catch parse failures at your boundary**, where you can return an
|
|
37
|
-
actionable error to the user (`"Excel formula evaluation failed at
|
|
38
|
-
sheet 'Q3 Revenue', row 412"`) instead of a generic upstream 5xx.
|
|
39
|
-
2. **Bound the wire payload to pyx-memory** — text/plain only — so the
|
|
40
|
-
memory server's parser is never the bottleneck.
|
|
41
|
-
3. **Keep the original binary in your own storage**, so users can still
|
|
42
|
-
download the file. pyx-memory's catalog only holds the indexed text.
|
|
43
|
-
|
|
44
|
-
## Reference implementation
|
|
45
|
-
|
|
46
|
-
[ai-rag-hub](https://github.com/fysoul17/one-query-v1) (a consumer of
|
|
47
|
-
pyx-memory) implements this pattern in its runtime:
|
|
48
|
-
|
|
49
|
-
- `packages/server/src/text-extractors.ts` — local extractors for `pptx`
|
|
50
|
-
and `xlsx`, both wrapped in the same OOXML safety envelope (zip-bomb
|
|
51
|
-
defense, path-traversal check, macro reject, decompressed-size cap,
|
|
52
|
-
char limit).
|
|
53
|
-
- `packages/server/src/routes/memory.ts` — `prepareFileForIngest`
|
|
54
|
-
dispatches via `getTextExtractor(mimeType)`; matched formats are
|
|
55
|
-
re-uploaded as `<original>.txt` with `text/plain`. Catalog metadata
|
|
56
|
-
flags `downloadableFromMemory: false` so the consumer's own
|
|
57
|
-
`/api/team/documents/[id]/download` route serves the original
|
|
58
|
-
binary instead.
|
|
59
|
-
|
|
60
|
-
## When NOT to pre-extract
|
|
61
|
-
|
|
62
|
-
Single-tenant lab usage, internal tools, batch jobs where a 30-second
|
|
63
|
-
parse latency is acceptable, or any case where you don't have your own
|
|
64
|
-
copy of the file and need pyx-memory's `GET /api/memory/files/download/:filename`
|
|
65
|
-
to return the original binary. In those cases, the native pyx-memory
|
|
66
|
-
parsers are exactly what you want.
|
|
67
|
-
|
|
68
|
-
## What about other formats
|
|
69
|
-
|
|
70
|
-
- **HTML**: pyx-memory strips `<script>` and `<style>` during parse
|
|
71
|
-
(`parsers/html.ts`). No upstream sanitizer needed for indexing.
|
|
72
|
-
- **PDF with images**: use the SDK's two-phase enrichment via
|
|
73
|
-
`EnrichmentCallbacks` — see `reference/sdk-guide.md`. Don't pre-extract
|
|
74
|
-
the PDF as text and lose image enrichment.
|
|
75
|
-
- **SVG**: currently classified as an image but pyx-memory's image
|
|
76
|
-
parser only stores it as a placeholder; if you need SVG text indexed,
|
|
77
|
-
pre-extract the `<text>` and `<desc>` content yourself or convert to
|
|
78
|
-
raster + describeImage.
|