@pyxmate/memory 0.12.1 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -394,6 +394,17 @@ interface IngestRelationship {
394
394
  /** Optional properties to attach to the graph edge. */
395
395
  properties?: Record<string, unknown>;
396
396
  }
397
+ /**
398
+ * How `Memory.store()` should react when the graph write fails.
399
+ * - `throw` (default, since v0.13.0): propagate the failure so the caller
400
+ * knows the graph is incomplete. Honest contract: a graph write either
401
+ * commits or fails loudly. Silent partial-loss was the v0.12.2 bug that
402
+ * masked 91/92 dropped entities.
403
+ * - `best-effort`: swallow the error and log a warning; the entry is still
404
+ * considered ingested in SQLite/vector. Use only when you genuinely don't
405
+ * need the graph slice and a transient neo4j blip shouldn't fail ingest.
406
+ */
407
+ type GraphFailureMode = 'throw' | 'best-effort';
397
408
  /** Store input: what the agent sends to Memory.store(). */
398
409
  type StoreInput = Omit<MemoryEntry, 'id' | 'createdAt'> & {
399
410
  id?: string;
@@ -404,6 +415,8 @@ type StoreInput = Omit<MemoryEntry, 'id' | 'createdAt'> & {
404
415
  entities?: IngestEntity[];
405
416
  /** Agent-provided relationships for graph storage. */
406
417
  relationships?: IngestRelationship[];
418
+ /** Graph-failure handling. Default: "throw" (loud) — see GraphFailureMode. */
419
+ graphFailureMode?: GraphFailureMode;
407
420
  };
408
421
  interface MemoryIngestRequest {
409
422
  content?: string;
@@ -417,6 +430,8 @@ interface MemoryIngestRequest {
417
430
  entities?: IngestEntity[];
418
431
  /** Agent-provided relationships for graph storage. */
419
432
  relationships?: IngestRelationship[];
433
+ /** Graph-failure handling. Default: "throw" (loud) — see GraphFailureMode. */
434
+ graphFailureMode?: GraphFailureMode;
420
435
  /** Importance score (1-10). */
421
436
  importance?: number;
422
437
  /** Source identifier (e.g., filename, URL). */
@@ -477,4 +492,4 @@ interface GraphTraversalResult {
477
492
  }>;
478
493
  }
479
494
 
480
- export { type AgentId, type ApiResponse, type ConsolidationRunResult, DEFAULTS, EmbeddingProviderName, type EnrichmentCallbacks, type ExtendedMemoryInterface, type GraphNode, type GraphRelationship, type GraphTraversalResult, type IngestEntity, type IngestRelationship, type IngestionResult, MemoryClient, type MemoryClientOptions, type MemoryEntry, type MemoryIngestRequest, type MemoryInterface, type MemoryListParams, type MemoryListResult, type MemorySearchParams, type MemorySearchResult, MemoryServerError, type MemoryStats, MemoryType, RAGStrategy, SensitivityLevel, type StoreInput, StoreTarget, type TemporalQueryFilters, type TenantScopeOptions, type Timestamp, VectorProvider };
495
+ export { type AgentId, type ApiResponse, type ConsolidationRunResult, DEFAULTS, EmbeddingProviderName, type EnrichmentCallbacks, type ExtendedMemoryInterface, type GraphFailureMode, type GraphNode, type GraphRelationship, type GraphTraversalResult, type IngestEntity, type IngestRelationship, type IngestionResult, MemoryClient, type MemoryClientOptions, type MemoryEntry, type MemoryIngestRequest, type MemoryInterface, type MemoryListParams, type MemoryListResult, type MemorySearchParams, type MemorySearchResult, MemoryServerError, type MemoryStats, MemoryType, RAGStrategy, SensitivityLevel, type StoreInput, StoreTarget, type TemporalQueryFilters, type TenantScopeOptions, type Timestamp, VectorProvider };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pyxmate/memory",
3
- "version": "0.12.1",
3
+ "version": "0.13.0",
4
4
  "type": "module",
5
5
  "description": "SDK for pyx-memory — Memory as a Service for AI agents",
6
6
  "license": "MIT",
@@ -93,7 +93,16 @@ curl -s -X POST {{ENDPOINT}}/api/memory/ingest/file \
93
93
 
94
94
  Supported formats: txt, md, csv, tsv, log, pdf, docx, xlsx, pptx, json, jsonl, html, htm, png, jpg, jpeg, webp, gif, bmp, tiff, svg (100MB limit).
95
95
 
96
- Ingestion streams from disk — parsers read row-by-row (csv/xlsx), line-by-line (txt/jsonl), page-by-page (pdf), or slide-by-slide (pptx) so peak memory is bounded regardless of file size. The only exception is `.docx`, which is hard-capped at 10MB because the underlying library (mammoth) loads the whole document into memory; above 10MB the server returns an error asking you to pre-extract the text upstream and re-upload as `.txt` or `.md`.
96
+ Ingestion memory profile, by format:
97
+
98
+ - **Plain-text + structured-text** (`csv`, `tsv`, `txt`, `log`, `json`, `jsonl`, `html`, `htm`, `md`): truly streaming — peak memory ≈ a few MB regardless of file size, up to the 100 MB file cap.
99
+ - **`pdf`**: streaming via the poppler `pdftotext` first-class path — peak memory ≈ a few MB. The `pdf-parse` fallback (when poppler is absent) loads the whole buffer; install poppler-utils for streaming.
100
+ - **`xlsx`**: row-by-row streaming via `ExcelJS.stream.xlsx.WorkbookReader`, but **shared strings are cached for the whole workbook** (`sharedStrings: 'cache'`). Peak memory grows with shared-string count, not just file size — workbooks with dense, repeated cell strings can use significantly more memory than the file cap suggests.
101
+ - **`pptx`**: the full ZIP is decompressed in memory (same shape as docx); only one slide's XML is decoded at a time. For a typical 30 MB presentation, peak memory is ~90 MB. Capped at 100 MB file / 200 MB decompressed.
102
+ - **`docx`**: hard-capped at 10 MB. The underlying library (mammoth) has no streaming API and loads the whole document into memory; above 10 MB the server returns a `MemoryError` asking you to pre-extract the text upstream and re-upload as `.txt` or `.md`.
103
+ - **Images** (`png`, `jpg`, `jpeg`, `webp`, `gif`, `bmp`, `tiff`, `svg`): the file is held in memory once for embedding/storage; peak memory scales with file size, not with content complexity.
104
+
105
+ If you need deterministic memory or timing for production UX (e.g. RAG over large user-supplied workbooks), prefer pre-extracting `.pptx` and large `.xlsx` upstream and uploading the text as `.txt`. See `patterns/file-uploads.md` for the consumer-side pattern.
97
106
 
98
107
  **Images require a `description`** — this is how the content gets embedded and becomes searchable. Without it, the image is stored but not findable. Use your vision capabilities to generate the description when the user doesn't provide one.
99
108
 
@@ -0,0 +1,78 @@
1
+ # Pattern: File Uploads
2
+
3
+ This is consumer-side guidance: how to decide whether to forward a file
4
+ straight to `ingestFile()` or to pre-extract its text upstream first.
5
+
6
+ ## TL;DR
7
+
8
+ | Format | Default action |
9
+ |---|---|
10
+ | `txt`, `md`, `csv`, `tsv`, `log`, `json`, `jsonl`, `html`, `htm` | Forward raw — pyx-memory streams. |
11
+ | `pdf` | Forward raw — pyx-memory streams via poppler. Install `poppler-utils` on the server image. |
12
+ | Images (`png`, `jpg`, `jpeg`, `webp`, `gif`, `bmp`, `tiff`, `svg`) | Forward raw with a `description` (use vision capability). |
13
+ | `docx` ≤ 10 MB | Forward raw. |
14
+ | `docx` > 10 MB | **Pre-extract upstream** as `.txt` or `.md`. The server returns a `MemoryError` if you don't. |
15
+ | `xlsx` (large or shared-string-heavy) | **Pre-extract upstream** and upload as `<name>.xlsx.txt` for deterministic UX. |
16
+ | `pptx` (production UX) | **Pre-extract upstream** and upload as `<name>.pptx.txt` for deterministic UX. |
17
+
18
+ "Production UX" here means: you can't afford a single hung upload to wedge
19
+ the user-facing layer for 30+ seconds, you need actionable error messages
20
+ on every failure, and you have your own copy of the original file
21
+ (separate from pyx-memory's internal storage).
22
+
23
+ ## Why pre-extract pptx and large xlsx
24
+
25
+ The server's pptx parser decompresses the full ZIP in memory (~3× file
26
+ size peak). The xlsx parser streams rows but caches shared strings for
27
+ the entire workbook (`ExcelJS.stream.xlsx.WorkbookReader` with `sharedStrings: 'cache'`).
28
+ Both are bounded by the 100 MB file / 200 MB decompressed caps, but
29
+ "bounded" is not "constant" — pathological files (huge shared-string
30
+ tables, dense cell formulas, embedded media) can push peak memory and
31
+ parse time well past what naive callers expect.
32
+
33
+ If you control the upload boundary (e.g. you operate a runtime/proxy
34
+ service that fronts pyx-memory), upstream pre-extraction lets you:
35
+
36
+ 1. **Catch parse failures at your boundary**, where you can return an
37
+ actionable error to the user (`"Excel formula evaluation failed at
38
+ sheet 'Q3 Revenue', row 412"`) instead of a generic upstream 5xx.
39
+ 2. **Bound the wire payload to pyx-memory** — text/plain only — so the
40
+ memory server's parser is never the bottleneck.
41
+ 3. **Keep the original binary in your own storage**, so users can still
42
+ download the file. pyx-memory's catalog only holds the indexed text.
43
+
44
+ ## Reference implementation
45
+
46
+ [ai-rag-hub](https://github.com/fysoul17/one-query-v1) (a consumer of
47
+ pyx-memory) implements this pattern in its runtime:
48
+
49
+ - `packages/server/src/text-extractors.ts` — local extractors for `pptx`
50
+ and `xlsx`, both wrapped in the same OOXML safety envelope (zip-bomb
51
+ defense, path-traversal check, macro reject, decompressed-size cap,
52
+ char limit).
53
+ - `packages/server/src/routes/memory.ts` — `prepareFileForIngest`
54
+ dispatches via `getTextExtractor(mimeType)`; matched formats are
55
+ re-uploaded as `<original>.txt` with `text/plain`. Catalog metadata
56
+ flags `downloadableFromMemory: false` so the consumer's own
57
+ `/api/team/documents/[id]/download` route serves the original
58
+ binary instead.
59
+
60
+ ## When NOT to pre-extract
61
+
62
+ Single-tenant lab usage, internal tools, batch jobs where a 30-second
63
+ parse latency is acceptable, or any case where you don't have your own
64
+ copy of the file and need pyx-memory's `GET /api/memory/files/download/:filename`
65
+ to return the original binary. In those cases, the native pyx-memory
66
+ parsers are exactly what you want.
67
+
68
+ ## What about other formats
69
+
70
+ - **HTML**: pyx-memory strips `<script>` and `<style>` during parse
71
+ (`parsers/html.ts`). No upstream sanitizer needed for indexing.
72
+ - **PDF with images**: use the SDK's two-phase enrichment via
73
+ `EnrichmentCallbacks` — see `reference/sdk-guide.md`. Don't pre-extract
74
+ the PDF as text and lose image enrichment.
75
+ - **SVG**: currently classified as an image but pyx-memory's image
76
+ parser only stores it as a placeholder; if you need SVG text indexed,
77
+ pre-extract the `<text>` and `<desc>` content yourself or convert to
78
+ raster + describeImage.
@@ -71,7 +71,15 @@ const client = new MemoryClient('http://localhost:7822', process.env.MEMORY_API_
71
71
 
72
72
  **Supported formats**: `.txt`, `.md`, `.csv`, `.tsv`, `.log`, `.pdf`, `.docx`, `.xlsx`, `.pptx`, `.json`, `.jsonl`, `.html`, `.htm`, `.png`, `.jpg`, `.jpeg`, `.webp`, `.gif`, `.bmp`, `.tiff`, `.svg`
73
73
 
74
- **Memory behavior**: All text parsers (csv, tsv, txt, log, pdf, json, jsonl, html, xlsx, pptx) stream from disk during ingestion — peak server memory is bounded to a few MB regardless of file size, up to the 100MB file limit. `.docx` is the exception: mammoth (the parser) has no streaming API, so `.docx` is hard-capped at 10MB on the server. Above 10MB the server returns a `MemoryError` asking you to pre-extract the text upstream and re-upload as `.txt` or `.md`.
74
+ **Memory behavior, by format**:
75
+
76
+ - **Plain-text + structured-text** (`csv`, `tsv`, `txt`, `log`, `json`, `jsonl`, `html`, `htm`, `md`) — fully streaming; peak server memory ≈ a few MB regardless of file size up to the 100 MB cap.
77
+ - **`pdf`** — streaming via `pdftotext` (poppler-utils); the fallback path (`pdf-parse`) buffers the whole file, so install poppler-utils on the server for streaming behavior.
78
+ - **`xlsx`** — `ExcelJS.stream.xlsx.WorkbookReader` yields rows as it parses, but the shared-string table is cached for the whole workbook. Peak memory grows with shared-string count; dense workbooks can exceed "a few MB" significantly.
79
+ - **`pptx`** — the full ZIP is decompressed in memory (~3× file size peak for typical decks). One slide's XML is decoded at a time. Bounded by the 100 MB file cap and a 200 MB decompressed cap.
80
+ - **`docx`** — hard-capped at 10 MB. mammoth has no streaming API; above 10 MB the server returns a `MemoryError` asking you to pre-extract the text upstream and re-upload as `.txt` or `.md`.
81
+
82
+ For deterministic peak memory or timing (production UX), consumers should pre-extract `.pptx` and large `.xlsx` upstream and upload the text as `.txt` — see `patterns/file-uploads.md`.
75
83
 
76
84
  **What happens on upload**:
77
85
  1. Original file is saved to `{DATA_DIR}/files/{filename}` (persistent across restarts)