albex 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +141 -0
- package/README.md +242 -112
- package/dist/albex-worker.d.ts +70 -0
- package/dist/albex-worker.d.ts.map +1 -0
- package/dist/albex-worker.js +153 -0
- package/dist/albex-worker.js.map +1 -0
- package/dist/albex.d.ts +368 -6
- package/dist/albex.d.ts.map +1 -1
- package/dist/albex.js +1692 -95
- package/dist/albex.js.map +1 -1
- package/dist/errors.d.ts +38 -0
- package/dist/errors.d.ts.map +1 -0
- package/dist/errors.js +63 -0
- package/dist/errors.js.map +1 -0
- package/dist/gpu/bloom-runtime.d.ts +60 -0
- package/dist/gpu/bloom-runtime.d.ts.map +1 -0
- package/dist/gpu/bloom-runtime.js +176 -0
- package/dist/gpu/bloom-runtime.js.map +1 -0
- package/dist/gpu/bloom-shader.wgsl.d.ts +19 -0
- package/dist/gpu/bloom-shader.wgsl.d.ts.map +1 -0
- package/dist/gpu/bloom-shader.wgsl.js +49 -0
- package/dist/gpu/bloom-shader.wgsl.js.map +1 -0
- package/dist/persistence.d.ts +21 -0
- package/dist/persistence.d.ts.map +1 -0
- package/dist/persistence.js +174 -0
- package/dist/persistence.js.map +1 -0
- package/dist/pool/coordinator.d.ts +98 -0
- package/dist/pool/coordinator.d.ts.map +1 -0
- package/dist/pool/coordinator.js +247 -0
- package/dist/pool/coordinator.js.map +1 -0
- package/dist/profile.d.ts +95 -0
- package/dist/profile.d.ts.map +1 -0
- package/dist/profile.js +207 -0
- package/dist/profile.js.map +1 -0
- package/dist/resource-manager.d.ts +56 -0
- package/dist/resource-manager.d.ts.map +1 -0
- package/dist/resource-manager.js +138 -0
- package/dist/resource-manager.js.map +1 -0
- package/dist/tiered-store.d.ts +98 -0
- package/dist/tiered-store.d.ts.map +1 -0
- package/dist/tiered-store.js +238 -0
- package/dist/tiered-store.js.map +1 -0
- package/dist/wasm-bindings.d.ts +139 -0
- package/dist/wasm-bindings.d.ts.map +1 -0
- package/dist/wasm-bindings.js +33 -0
- package/dist/wasm-bindings.js.map +1 -0
- package/dist/worker-protocol.d.ts +86 -0
- package/dist/worker-protocol.d.ts.map +1 -0
- package/dist/worker-protocol.js +20 -0
- package/dist/worker-protocol.js.map +1 -0
- package/dist/worker-runtime.d.ts +14 -0
- package/dist/worker-runtime.d.ts.map +1 -0
- package/dist/worker-runtime.js +100 -0
- package/dist/worker-runtime.js.map +1 -0
- package/package.json +56 -13
- package/src/albex-worker.ts +187 -0
- package/src/albex.ts +1845 -130
- package/src/errors.ts +60 -0
- package/src/gpu/bloom-runtime.ts +229 -0
- package/src/gpu/bloom-shader.wgsl.ts +48 -0
- package/src/persistence.ts +175 -0
- package/src/pool/coordinator.ts +324 -0
- package/src/profile.ts +279 -0
- package/src/resource-manager.ts +167 -0
- package/src/tiered-store.ts +259 -0
- package/src/wasm-bindings.ts +200 -0
- package/src/worker-protocol.ts +48 -0
- package/src/worker-runtime.ts +96 -0
- package/wasm/pkg/albex_pdf.wasm +0 -0
- package/wasm/pkg/albex_wasm_bg.wasm +0 -0
- package/wasm/pkg/albex_wasm_mini.wasm +0 -0
- package/wasm/pkg/albex_wasm_mini_simd.wasm +0 -0
- package/wasm/pkg/albex_wasm_pro.wasm +0 -0
- package/wasm/pkg/albex_wasm_pro_simd.wasm +0 -0
- package/wasm/pkg/albex_wasm_std.wasm +0 -0
- package/wasm/pkg/albex_wasm_std_simd.wasm +0 -0
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Resource manager — listens to environmental signals and exposes them as
|
|
3
|
+
* a small event API consumed by `AlbexEngine` (and `AlbexEngineWorker`).
|
|
4
|
+
*
|
|
5
|
+
* The signals tracked:
|
|
6
|
+
*
|
|
7
|
+
* - **Visibility** — `document.visibilitychange`. When the tab is hidden
|
|
8
|
+
* the engine should pause speculative work (background indexing,
|
|
9
|
+
* prefetch of optional binaries) but must still answer in-flight queries.
|
|
10
|
+
*
|
|
11
|
+
* - **Battery** — `navigator.getBattery()`. When level <20% AND not
|
|
12
|
+
* charging, switch to low-power mode (smaller worker pool, longer
|
|
13
|
+
* frame budget yields, no GPU acceleration).
|
|
14
|
+
*
|
|
15
|
+
* - **Connection** — `navigator.connection.effectiveType` + `saveData`.
|
|
16
|
+
* On `'slow-2g'/'2g'` or `saveData === true`, defer optional downloads
|
|
17
|
+
* (PDF wasm, embedding model) until the user explicitly needs them.
|
|
18
|
+
*
|
|
19
|
+
* The manager is *passive*: it does not call into the engine. Instead it
|
|
20
|
+
* exposes a `state` snapshot and an `on(event, callback)` subscription so
|
|
21
|
+
* the engine can react with its own policy. This keeps the dependency
|
|
22
|
+
* direction one-way and lets the engine be tested without the DOM.
|
|
23
|
+
*/
|
|
24
|
+
|
|
25
|
+
/** Coarse operating mode published to subscribers; exactly one applies at a time. */
export type ResourceMode = 'normal' | 'low-power' | 'background' | 'constrained-network';

/** Snapshot of the three tracked signals plus the mode derived from them. */
export interface ResourceState {
  /** `document.visibilityState === 'visible'`; reported as `true` when no `document` exists. */
  visible: boolean;
  /** `true` when a battery is known, its level is below 0.2 and it is not charging. */
  lowPower: boolean;
  /** `true` when the connection reports `saveData` or an effective type of `'slow-2g'` / `'2g'`. */
  constrainedNetwork: boolean;
  /** Composite mode derived from the three signals above. */
  mode: ResourceMode;
}

/** Subscriber callback; receives the new snapshot each time the state changes. */
type Listener = (state: ResourceState) => void;

/**
 * Minimal structural view of the object resolved by `navigator.getBattery()`.
 * The event methods are optional so that plain test doubles can be used.
 */
interface BatteryLike {
  // Compared against 0.2 by the manager — presumably a 0..1 fraction; confirm against the Battery Status API.
  level: number;
  charging: boolean;
  addEventListener?: (type: string, cb: () => void) => void;
  removeEventListener?: (type: string, cb: () => void) => void;
}

/**
 * Minimal structural view of `navigator.connection`. Every member is optional
 * because the API is not available in all environments.
 */
interface ConnectionLike {
  // The manager only distinguishes 'slow-2g' and '2g' from everything else.
  effectiveType?: string;
  saveData?: boolean;
  addEventListener?: (type: string, cb: () => void) => void;
  removeEventListener?: (type: string, cb: () => void) => void;
}
|
|
50
|
+
|
|
51
|
+
/**
 * Watches page visibility, battery and network conditions and publishes a
 * combined `ResourceState` snapshot to subscribers.
 *
 * The manager never calls into the engine; consumers read `state` or
 * subscribe with `on()` and apply their own policy.
 */
export class ResourceManager {
  private _state: ResourceState = {
    visible: true,
    lowPower: false,
    constrainedNetwork: false,
    mode: 'normal',
  };
  private _listeners = new Set<Listener>();
  private _battery: BatteryLike | null = null;
  private _connection: ConnectionLike | null = null;
  // Stable function references so stop() detaches exactly what start() attached.
  private _onVisibility = (): void => this._refresh();
  private _onBatteryChange = (): void => this._refresh();
  private _onConnChange = (): void => this._refresh();
  private _started = false;
  /**
   * Incremented by every effective start()/stop(). A start() that was
   * suspended on `getBattery()` compares its own value against this one to
   * find out whether it has been stopped (or superseded) in the meantime.
   */
  private _epoch = 0;

  /** Latest snapshot. A new object is installed on every change. */
  get state(): ResourceState { return this._state; }

  /** Subscribe to changes. Returns an unsubscribe function. */
  on(cb: Listener): () => void {
    this._listeners.add(cb);
    return () => this._listeners.delete(cb);
  }

  /**
   * Start listening. Idempotent. Safe to call from non-browser environments
   * (Node tests, Workers without DOM access) — missing APIs are tolerated.
   *
   * The returned promise resolves once the battery lookup (if any) has
   * settled. If `stop()` is called before that, this call finishes without
   * attaching battery listeners and without touching the state.
   */
  async start(): Promise<void> {
    if (this._started) return;
    this._started = true;
    const epoch = ++this._epoch;

    if (typeof document !== 'undefined' && document.addEventListener) {
      document.addEventListener('visibilitychange', this._onVisibility);
    }

    // Read through `globalThis`: a bare `navigator` identifier throws a
    // ReferenceError in realms that do not declare it, and optional chaining
    // cannot guard against an undeclared name.
    const nav = (globalThis as unknown as {
      navigator?: { getBattery?: () => Promise<BatteryLike>; connection?: ConnectionLike };
    }).navigator;

    // The connection hook is synchronous, so attach it before the battery
    // await; a stop() issued during the await can then detach it normally.
    const conn = nav?.connection;
    if (conn) {
      this._connection = conn;
      conn.addEventListener?.('change', this._onConnChange);
    }

    try {
      const getBattery = nav?.getBattery;
      // Only await when the API exists so the no-battery path stays synchronous.
      if (typeof getBattery === 'function') {
        const battery = await getBattery.call(nav);
        if (epoch !== this._epoch) return; // stopped while waiting: attach nothing
        this._battery = battery;
        battery.addEventListener?.('levelchange', this._onBatteryChange);
        battery.addEventListener?.('chargingchange', this._onBatteryChange);
      }
    } catch { /* unavailable; tolerate */ }

    // Covers the case where getBattery() rejected after a stop().
    if (epoch !== this._epoch) return;

    this._refresh();
  }

  /** Tear down listeners. Also invalidates a start() that is still awaiting the battery. */
  stop(): void {
    if (!this._started) return;
    this._started = false;
    this._epoch++;

    if (typeof document !== 'undefined' && document.removeEventListener) {
      document.removeEventListener('visibilitychange', this._onVisibility);
    }
    this._battery?.removeEventListener?.('levelchange', this._onBatteryChange);
    this._battery?.removeEventListener?.('chargingchange', this._onBatteryChange);
    this._connection?.removeEventListener?.('change', this._onConnChange);
    this._battery = null;
    this._connection = null;
  }

  /**
   * Recompute the snapshot from the current signals and notify subscribers,
   * but only when at least one field actually changed.
   */
  private _refresh(): void {
    const visible = typeof document !== 'undefined'
      ? document.visibilityState === 'visible'
      : true;

    const lowPower = !!(this._battery
      && this._battery.level < 0.2
      && this._battery.charging === false);

    const conn = this._connection;
    const constrainedNetwork = !!conn && (
      conn.saveData === true ||
      conn.effectiveType === 'slow-2g' ||
      conn.effectiveType === '2g'
    );

    // Priority order: hidden tab wins over low battery, which wins over a slow network.
    let mode: ResourceMode = 'normal';
    if (!visible) mode = 'background';
    else if (lowPower) mode = 'low-power';
    else if (constrainedNetwork) mode = 'constrained-network';

    const next: ResourceState = { visible, lowPower, constrainedNetwork, mode };
    if (
      next.visible === this._state.visible &&
      next.lowPower === this._state.lowPower &&
      next.constrainedNetwork === this._state.constrainedNetwork &&
      next.mode === this._state.mode
    ) return;

    this._state = next;
    for (const cb of this._listeners) {
      // One failing subscriber must not prevent the others from being notified.
      try { cb(next); } catch { /* swallow listener errors */ }
    }
  }
}
|
|
156
|
+
|
|
157
|
+
/**
 * Module-level shared instance. Every engine in the same realm receives the
 * same manager: the underlying signals are global to the page, so running
 * the listeners more than once would add nothing.
 */
let _instance: ResourceManager | null = null;

/** Return the shared `ResourceManager`, constructing it on first use. */
export function getResourceManager(): ResourceManager {
  if (_instance === null) {
    _instance = new ResourceManager();
  }
  return _instance;
}
|
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tiered storage layer for Albex.
|
|
3
|
+
*
|
|
4
|
+
* The base engine keeps every indexed document in memory inside the BSS
|
|
5
|
+
* arrays. That works beautifully up to the tier's capacity (4 MB to 128 MB
|
|
6
|
+
* of indexed text) but breaks when the user wants to search across more.
|
|
7
|
+
*
|
|
8
|
+
* `TieredStore` adds two memory tiers behind the engine:
|
|
9
|
+
*
|
|
10
|
+
* HOT — already-indexed documents living in the WASM arrays.
|
|
11
|
+
* WARM — original file blobs serialised in OPFS, NOT in the engine.
|
|
12
|
+
*
|
|
13
|
+
* Eviction happens when text capacity climbs above a configurable
|
|
14
|
+
* threshold (default 85 %). The least-recently-accessed HOT document is
|
|
15
|
+
* removed from the engine; its blob stays in OPFS so we can promote it
|
|
16
|
+
* back later without asking the user to re-pick the file.
|
|
17
|
+
*
|
|
18
|
+
* Promotion happens explicitly: callers tell the store "I want these
|
|
19
|
+
* names searchable again" and we re-feed them through `engine.indexFile`.
|
|
20
|
+
*
|
|
21
|
+
* **Trade-off vs storing the internal representation in OPFS:** promoting
|
|
22
|
+
* a doc means re-parsing it (DOCX XML decode, etc.). For typical document
|
|
23
|
+
* sizes this is 20-200 ms — negligible for an explicit "search across the
|
|
24
|
+
* archive" action. The win is huge: the persistence format is just the
|
|
25
|
+
* source files, so it survives engine version bumps without any migration.
|
|
26
|
+
*/
|
|
27
|
+
|
|
28
|
+
import type { AlbexEngine, IndexedDocument } from './albex.js';
|
|
29
|
+
|
|
30
|
+
/** Name of the directory, created under the OPFS root, that holds the persisted source blobs. */
const OPFS_DIR = 'albex-tiered';

/** In-memory bookkeeping record for one known document (hot or warm). */
interface TieredEntry {
  /** Document name; also the key of the store's entry map. */
  name: string;
  /** File extension as reported by the engine, or derived from the file name for rehydrated entries. */
  ext: string;
  /** `Date.now()` of the last index/touch/promote; 0 for entries rehydrated from OPFS and not yet used. */
  lastAccessedMs: number;
  /** Whether the doc is currently indexed in the engine. */
  hot: boolean;
  /** Size of the original blob in bytes, for capacity estimation. */
  byteSize: number;
}

/** Tuning knobs for `TieredStore`; every field is optional and has a default. */
export interface TieredStoreOptions {
  /**
   * Evict when textUsed exceeds `evictThreshold * textCapacity`.
   * Default 0.85. Set 1.0 to disable.
   */
  evictThreshold?: number;
  /**
   * Keep this many documents in the hot tier at minimum. Default 1.
   * Useful when you want to ensure the most recently added file is
   * always available without an explicit promote step.
   */
  hotFloor?: number;
}
|
|
55
|
+
|
|
56
|
+
/**
 * Two-tier document store layered over an `AlbexEngine`.
 *
 * HOT documents are indexed in the engine; WARM documents exist only as
 * their original file blob in OPFS and can be re-indexed with `promote()`.
 * A document is only ever evicted from the engine when its blob is known to
 * be stored in OPFS, so eviction never loses data.
 */
export class TieredStore {
  private readonly _engine: AlbexEngine;
  private _entries = new Map<string, TieredEntry>();
  private _dir: FileSystemDirectoryHandle | null = null;
  private readonly _opts: Required<TieredStoreOptions>;
  /**
   * Names indexed in this session whose blob could NOT be written to OPFS
   * (OPFS unavailable, or the write failed). Evicting one of these would make
   * it unrecoverable, so they are pinned to the hot tier.
   */
  private _unpersisted = new Set<string>();

  /**
   * @param engine Engine that holds the hot tier.
   * @param opts   Optional tuning; see `TieredStoreOptions` for defaults.
   */
  constructor(engine: AlbexEngine, opts: TieredStoreOptions = {}) {
    this._engine = engine;
    this._opts = {
      evictThreshold: opts.evictThreshold ?? 0.85,
      hotFloor: opts.hotFloor ?? 1,
    };
  }

  /** Ensure the OPFS directory exists. Idempotent. Tolerated when OPFS is unavailable. */
  async init(): Promise<void> {
    try {
      const root = await navigator.storage.getDirectory();
      this._dir = await root.getDirectoryHandle(OPFS_DIR, { create: true });
    } catch {
      // No OPFS — warm tier disabled; everything stays hot.
      this._dir = null;
    }
    await this._rehydrateIndex();
  }

  /**
   * Index a file AND register it in the warm tier so it survives across
   * sessions. Equivalent to `engine.indexFile(file)` but with the extra
   * persistence guarantee (best effort: a failed OPFS write is logged and
   * the document is then kept hot instead of being evictable).
   */
  async indexFile(file: File): Promise<IndexedDocument> {
    const doc = await this._engine.indexFile(file);

    // Persist under the engine's document name: promote()/forget() look the
    // blob up by that same key. (Presumably identical to file.name.)
    const persisted = await this._writeBlob(file, doc.name);
    if (persisted) this._unpersisted.delete(doc.name);
    else this._unpersisted.add(doc.name);

    this._entries.set(doc.name, {
      name: doc.name,
      ext: doc.ext,
      lastAccessedMs: Date.now(),
      hot: true,
      byteSize: file.size,
    });

    await this._enforceCapacity();
    return doc;
  }

  /** Touch an entry to mark it recently used (for LRU). */
  touch(name: string): void {
    const e = this._entries.get(name);
    if (e) e.lastAccessedMs = Date.now();
  }

  /**
   * Evict the least-recently-used HOT documents until `textUsed` falls
   * to or below the configured threshold. Respects `hotFloor` (never leaves
   * fewer than N hot docs) and skips documents without an OPFS blob, since
   * those could not be promoted back.
   */
  async _enforceCapacity(): Promise<void> {
    const stats = this._engine.getStats();
    const threshold = stats.textCapacity * this._opts.evictThreshold;
    if (stats.textUsed <= threshold) return;

    const hot = [...this._entries.values()]
      .filter(e => e.hot)
      .sort((a, b) => a.lastAccessedMs - b.lastAccessedMs);

    let hotCount = hot.length;
    for (const victim of hot) {
      if (hotCount <= this._opts.hotFloor) break;
      if (this._unpersisted.has(victim.name)) continue; // no backup: keep it hot

      this._engine.removeDocument(victim.name);
      victim.hot = false;
      hotCount--;
      // Reclaim storage now so subsequent index calls see real headroom.
      this._engine.compact();
      if (this._engine.getStats().textUsed <= threshold) break;
    }
  }

  /**
   * Bring a warm document back into the engine. If it is already hot it is
   * only marked as recently used.
   * Returns the resulting `IndexedDocument`, or `null` if the doc isn't known
   * or its blob cannot be read from OPFS.
   */
  async promote(name: string): Promise<IndexedDocument | null> {
    const e = this._entries.get(name);
    if (!e) return null;
    if (e.hot) {
      this.touch(name);
      return this._engine.documents.find(d => d.name === name) ?? null;
    }

    const blob = await this._readBlob(name);
    if (!blob) return null;

    // Re-create the File under the entry's name so the engine indexes it
    // under the same key it had before eviction.
    const file = new File([blob], name);
    const doc = await this._engine.indexFile(file);
    e.hot = true;
    e.lastAccessedMs = Date.now();
    return doc;
  }

  /**
   * Forget a document entirely: remove from engine and delete its OPFS blob.
   * Returns whether the entry existed.
   */
  async forget(name: string): Promise<boolean> {
    const had = this._entries.has(name);
    if (this._entries.get(name)?.hot) this._engine.removeDocument(name);
    this._entries.delete(name);
    this._unpersisted.delete(name);
    await this._deleteBlob(name);
    return had;
  }

  /** Names of all known documents, hot or warm. */
  list(): { name: string; hot: boolean; byteSize: number }[] {
    return [...this._entries.values()].map(e => ({
      name: e.name, hot: e.hot, byteSize: e.byteSize,
    }));
  }

  /** Aggregate storage stats: entry counts per tier and the summed blob sizes. */
  getTierStats(): { hot: number; warm: number; totalBytes: number } {
    let hot = 0, warm = 0, totalBytes = 0;
    for (const e of this._entries.values()) {
      if (e.hot) hot++; else warm++;
      totalBytes += e.byteSize;
    }
    return { hot, warm, totalBytes };
  }

  // ── OPFS plumbing ─────────────────────────────────────────────────────

  private _safeName(name: string): string {
    // Strip path separators just in case; OPFS requires plain file names.
    return name.replace(/[/\\]/g, '_');
  }

  /** Extension of `name` in lower case, or '' when the name contains no dot. */
  private _extOf(name: string): string {
    const dot = name.lastIndexOf('.');
    return dot < 0 ? '' : name.slice(dot + 1).toLowerCase();
  }

  /**
   * Write `file` to OPFS under `name`. Never throws; failures are logged.
   * @returns `true` only when the blob was fully written and committed.
   */
  private async _writeBlob(file: File, name: string): Promise<boolean> {
    if (!this._dir) return false;
    try {
      const handle = await this._dir.getFileHandle(this._safeName(name), { create: true });
      const w = await handle.createWritable();
      try {
        // The stream accepts a Blob directly; no need to buffer the whole
        // file in memory (twice) first.
        await w.write(file);
        await w.close();
      } catch (err) {
        // Discard the partial write instead of leaving the stream open.
        await w.abort().catch(() => { /* already errored */ });
        throw err;
      }
      return true;
    } catch (e) {
      console.warn(`[albex] failed to persist blob for ${file.name}:`, e);
      return false;
    }
  }

  /** Read the stored blob for `name`, or `null` when OPFS is off or the file is missing. */
  private async _readBlob(name: string): Promise<Blob | null> {
    if (!this._dir) return null;
    try {
      const handle = await this._dir.getFileHandle(this._safeName(name));
      return await handle.getFile();
    } catch {
      return null;
    }
  }

  /** Delete the stored blob for `name`; a missing file is not an error. */
  private async _deleteBlob(name: string): Promise<void> {
    if (!this._dir) return;
    try { await this._dir.removeEntry(this._safeName(name)); } catch { /* not found */ }
  }

  /**
   * TC39 explicit-resource-management hook. Drops the in-memory index and
   * the OPFS directory handle. The underlying OPFS blobs are NOT deleted —
   * disposal only frees JS-side state. Use `forget()` per-doc to wipe
   * persisted data (a `deleteOpfsAll()` helper is mentioned by the original
   * docs but is not defined on this class — NOTE(review): confirm where it lives).
   */
  [Symbol.dispose](): void {
    this._entries.clear();
    this._unpersisted.clear();
    this._dir = null;
  }

  /**
   * Register every file found in the OPFS directory as a WARM entry, unless
   * an entry with that name already exists. Failures are logged, not thrown.
   */
  private async _rehydrateIndex(): Promise<void> {
    if (!this._dir) return;
    try {
      // @ts-expect-error async-iterable on FileSystemDirectoryHandle
      for await (const [name, handle] of this._dir.entries()) {
        if (handle.kind !== 'file') continue;
        // Check first: avoids opening files that are already tracked.
        if (this._entries.has(name)) continue;
        const file = await handle.getFile();
        this._entries.set(name, {
          name,
          ext: this._extOf(name),
          lastAccessedMs: 0, // never accessed in this session yet
          hot: false,
          byteSize: file.size,
        });
      }
    } catch (e) {
      console.warn('[albex] could not rehydrate tiered index:', e);
    }
  }
}
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Typed interfaces for the two WASM modules Albex ships with.
|
|
3
|
+
*
|
|
4
|
+
* These types replace ad-hoc `as Function` casts and give us:
|
|
5
|
+
* - argument/return type checking at call sites,
|
|
6
|
+
* - autocompletion in IDEs,
|
|
7
|
+
* - safe refactors when an export name or signature changes.
|
|
8
|
+
*
|
|
9
|
+
* The interfaces mirror the `#[no_mangle] pub extern "C" fn` exports
|
|
10
|
+
* in `wasm/src/lib.rs` and `pdf-wasm/src/lib.rs`.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
14
|
+
// Main WASM module (albex_wasm_bg.wasm)
|
|
15
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
16
|
+
|
|
17
|
+
/**
 * Typed view of the exports of the main Albex WASM module.
 *
 * Every function takes and returns plain numbers. Several take only a `len`
 * argument — presumably the payload is first written into the scratch buffer
 * obtained from `getBuffer`; confirm against `wasm/src/lib.rs`.
 */
export interface AlbexWasmExports {
  /** Linear memory of the module instance. */
  readonly memory: WebAssembly.Memory;

  // Scratchpad / lifecycle
  getBuffer(size: number): number;
  init(): void;

  // Document ingestion
  setDocumentName(len: number): void;
  beginDocument(): number;
  feedXmlBytes(len: number): void;
  endDocument(): number;

  // XLSX
  beginXlsx(): number;
  feedXlsxBytes(len: number): void;

  // Generic text path (PDF, TXT, XML, future formats)
  feedText(len: number): void;
  flushParagraph(): void;

  // Search configuration
  setMaxErrors(errors: number): void;
  setThreshold(threshold: number): void;
  setMaxResults(max: number): void;

  // Search execution
  setPattern(len: number): number;
  search(): number;
  // Resumable search (used by frame-budgeted searches and worker shards)
  searchBegin(): number;
  searchSlice(maxChunks: number): number;
  getSearchCursor(): number;
  getSearchTotal(): number;

  // Result accessors — `i` indexes a result; `k` indexes a match within that result.
  getResultCount(): number;
  getResultDocId(i: number): number;
  getResultLocation(i: number): number;
  getResultScore(i: number): number;
  getResultStart(i: number): number;
  getResultEnd(i: number): number;
  getResultChunkIdx(i: number): number;
  getResultDocName(i: number): number;
  getResultMatchCount(i: number): number;
  getResultMatchStartAt(i: number, k: number): number;
  getResultMatchEndAt(i: number, k: number): number;
  getSnippet(i: number): number;
  getSnippetWindow(i: number, before: number, after: number): number;
  getSnippetWindowOffset(): number;

  // Stats
  getStatBloomTested(): number;
  getStatBloomPassed(): number;
  getStatBitapMatched(): number;
  getChunkCount(): number;
  getDocCount(): number;
  getTextUsed(): number;
  getTextCapacity(): number;

  // Snapshot / restore
  snapshotSize(): number;
  snapshotChunk(offset: number, maxLen: number): number;
  restoreBegin(): number;
  restoreFeed(len: number): number;

  // Incremental / per-doc
  getDocId(index: number): number;
  getDocChunkCount(index: number): number;
  getDocName(index: number): number;
  isDocDeleted(index: number): number;
  removeDocument(docId: number): number;
  compact(): void;

  /**
   * Per-document content hash (snapshot v2). Returns a pointer to 8 bytes
   * holding the FNV-1a 64-bit hash of the original file bytes, or 0 if the
   * doc index is out of range. All-zero bytes mean "hash not available"
   * — either the host never called setDocumentContentHash for this doc
   * (legacy code path) or it was restored from a v1 snapshot.
   */
  getDocContentHashPtr(index: number): number;
  /** Always returns 8. Useful as a runtime feature-detect: older binaries
   *  without snapshot v2 will not export this function at all. */
  getDocContentHashLen(): number;
  /** Copy a content hash (up to 8 bytes from the scratchpad) into the
   *  pending slot. endDocument() then writes it into doc_hashes[]. */
  setDocumentContentHash(len: number): void;

  // Stemming
  setLanguage(lang: number): void;

  // Tier identification
  getTier(): number; // 1=mini, 2=std, 3=pro
  getMaxChunks(): number;
  getMaxDocs(): number;
  getNameCapacity(): number;

  // GPU bridge (CD1): zero-copy access to chunk array + candidate mask
  getChunksPtr(): number;
  getChunkStructSize(): number;
  setCandidateMask(byteLen: number): void;
  clearCandidateMask(): void;
}
|
|
121
|
+
|
|
122
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
123
|
+
// PDF WASM module (albex_pdf.wasm)
|
|
124
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
125
|
+
|
|
126
|
+
/**
 * Typed view of the exports of the PDF WASM module.
 *
 * All members are declared as required, but the scanned-PDF group near the
 * end may be absent on older binaries; check for the function at runtime
 * before calling it.
 */
export interface AlbexPdfExports {
  /** Linear memory of the module instance. */
  readonly memory: WebAssembly.Memory;

  /** Reserve `len` bytes inside the PDF module and return a pointer. */
  allocInput(len: number): number;

  /**
   * Parse the PDF. Returns:
   *   N ≥ 0  — page count,
   *   -1     — parse error (read with getErrorPtr/getErrorLen),
   *   -2     — image-only / no extractable text.
   *
   * When `-2` is returned, the host can fall through to the scanned-PDF
   * path via `getPageCount` + `extractPageImages`.
   */
  extractPdf(len: number): number;

  // Extracted page text and the last error message, each exposed as a
  // pointer/length pair.
  getPageLen(page: number): number;
  getPagePtr(page: number): number;
  getErrorLen(): number;
  getErrorPtr(): number;

  // ── Scanned-PDF path (extracts embedded image XObjects) ──────────────────
  //
  // Available since pdf-wasm 0.2. Older binaries built before this addition
  // will not expose these exports — `asAlbexPdfExports` keeps them typed,
  // but the engine should feature-detect at runtime before relying on them.

  /** Total page count of the loaded PDF; 0 if the input cannot be parsed. */
  getPageCount(): number;

  /**
   * Extract every supported image XObject on page `page` (0-based) into the
   * module's internal buffer.
   *
   * Returns:
   *   N ≥ 0  — number of images extracted on this page,
   *   -1     — parse error or page index out of range.
   *
   * Each extracted image is one of:
   *   * JPEG       (kind = 1, from a `/DCTDecode` filter), or
   *   * JPEG2000   (kind = 2, from a `/JPXDecode` filter).
   *
   * Filters that require Rust-side reconstruction (`FlateDecode`,
   * `CCITTFaxDecode`, `JBIG2Decode`) are intentionally skipped — they
   * would roughly double the binary size for ~5 % more coverage.
   */
  extractPageImages(page: number): number;

  /** Byte length of extracted image `i` (from the last `extractPageImages`). */
  getPageImageLen(i: number): number;

  /** Pointer to extracted image `i`'s raw bytes. */
  getPageImagePtr(i: number): number;

  /** Format tag for extracted image `i`: 1 = JPEG, 2 = JPEG2000, 0 = none. */
  getPageImageKind(i: number): number;
}
|
|
184
|
+
|
|
185
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
186
|
+
// Narrowing helpers for instantiation results
|
|
187
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
188
|
+
|
|
189
|
+
/**
 * View a raw `WebAssembly.Exports` bag through the typed main-module
 * interface.
 *
 * This is a compile-time cast only: the same object is returned and nothing
 * is validated. A binary that lacks an expected export will fail at the
 * first call site that touches the missing function.
 */
export function asAlbexExports(exports: WebAssembly.Exports): AlbexWasmExports {
  const typed = exports as unknown as AlbexWasmExports;
  return typed;
}
|
|
197
|
+
|
|
198
|
+
/**
 * View a raw `WebAssembly.Exports` bag through the typed PDF-module
 * interface. Compile-time cast only: the same object is returned unchecked.
 */
export function asAlbexPdfExports(exports: WebAssembly.Exports): AlbexPdfExports {
  const typed = exports as unknown as AlbexPdfExports;
  return typed;
}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Wire protocol between the main thread and the AlbexEngineWorker runtime.
|
|
3
|
+
*
|
|
4
|
+
* One request/response pair per call, identified by `id`. The runtime is
|
|
5
|
+
* single-threaded so we serialise requests on the main side (one in-flight
|
|
6
|
+
* call at a time per worker) — keeps the protocol trivial and matches the
|
|
7
|
+
* actual constraint of `static mut` WASM state.
|
|
8
|
+
*
|
|
9
|
+
* `Transferable` is opt-in per op; we use it for `indexFile` to avoid
|
|
10
|
+
* copying the file bytes into the worker.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import type { AlbexOptions, IndexedDocument, SearchOptions, SearchResult, EngineStats, SearchStats } from './albex.js';
|
|
14
|
+
|
|
15
|
+
/**
 * Every operation the main thread can ask the worker runtime to perform,
 * discriminated by `kind`.
 */
export type WorkerOp =
  | { kind: 'init'; opts: AlbexOptions }
  // File contents travel as a raw ArrayBuffer (the op that uses `Transferable`).
  | { kind: 'indexFile'; name: string; buffer: ArrayBuffer }
  | { kind: 'search'; query: string; options: SearchOptions }
  | { kind: 'removeDocument'; id: string }
  | { kind: 'compact' }
  | { kind: 'reset' }
  | { kind: 'getStats' }
  | { kind: 'getLastSearchStats' }
  | { kind: 'getDocuments' }
  | { kind: 'setMaxErrors'; n: 0 | 1 | 2 | 3 }
  | { kind: 'setThreshold'; n: number }
  | { kind: 'setMaxResults'; n: number }
  | { kind: 'setLanguage'; lang: 'off' | 'es' }
  // Snapshot operations; `name` presumably identifies the snapshot — confirm in the worker runtime.
  | { kind: 'save'; name: string }
  | { kind: 'load'; name: string }
  | { kind: 'loadOrInit'; name: string }
  | { kind: 'deleteSnapshot'; name: string }
  | { kind: 'listSnapshots' };
|
|
34
|
+
|
|
35
|
+
/** Envelope sent to the worker: one operation tagged with a correlation id. */
export interface WorkerRequest {
  /** Identifies the request/response pair. */
  id: number;
  op: WorkerOp;
}
|
|
39
|
+
|
|
40
|
+
/**
 * Envelope sent back from the worker, discriminated by `ok`. `id` is the id
 * of the request being answered. On success `result` is untyped and must be
 * narrowed by the caller; on failure the error is carried as plain fields.
 */
export type WorkerResponse =
  | { id: number; ok: true; result: unknown }
  | { id: number; ok: false; error: { name: string; kind?: string; message: string } };
|
|
43
|
+
|
|
44
|
+
// Named result payload types. NOTE(review): which op each one answers is
// inferred from the names only — confirm against the worker runtime.
export type IndexFileResult = IndexedDocument;
export type SearchResultArr = SearchResult[];
export type StatsResult = EngineStats;
export type SearchStatsRes = SearchStats | null;
export type DocsResult = readonly IndexedDocument[];
|