albex 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 RafaCalRob
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,220 @@
1
+ # Albex
2
+
3
+ Local full-text search for documents. Runs entirely in the browser — no server, no upload, and no network requests other than fetching the `.wasm` binaries.
4
+
5
+ Drop a DOCX, PDF, XLSX, TXT or XML file, start typing, get results in milliseconds.
6
+
7
+ ---
8
+
9
+ ## Features
10
+
11
+ - **Zero server** — all text stays on the user's machine.
12
+ - **Fuzzy matching** — finds "contrato" even if you type "conttrato" (adaptive edit distance).
13
+ - **Accent-insensitive** — "accion" matches "acción", "espana" matches "España".
14
+ - **Multi-format** — DOCX, XLSX, PDF (text-based), TXT, XML.
15
+ - **Phrase search** — `"contrato marco"` requires the words to appear together.
16
+ - **OR search** — `contrato | acuerdo` unions two independent searches.
17
+ - **No dependencies** — one TypeScript file, two WASM binaries, nothing else.
18
+ - **Tiny footprint** — main WASM is ~14 KB on disk; PDF module (~1 MB) loads on demand.
19
+
20
+ ---
21
+
22
+ ## Installation
23
+
24
+ ```bash
25
+ npm install albex
26
+ ```
27
+
28
+ Or copy `dist/albex.js` and `wasm/pkg/albex_wasm_bg.wasm` (and, optionally, `wasm/pkg/albex_pdf.wasm` for PDF support) to your project.
29
+
30
+ ---
31
+
32
+ ## Quick start
33
+
34
+ ```ts
35
+ import { AlbexEngine } from 'albex';
36
+
37
+ const engine = new AlbexEngine({
38
+ wasmUrl: '/assets/albex_wasm_bg.wasm',
39
+ pdfWasmUrl: '/assets/albex_pdf.wasm', // only needed for PDFs
40
+ });
41
+
42
+ await engine.init();
43
+
44
+ // Index a file from a <input type="file"> or drag-and-drop
45
+ const file = inputElement.files[0];
46
+ const doc = await engine.indexFile(file);
47
+ console.log(`Indexed ${doc.chunks} chunks in ${doc.indexTimeMs.toFixed(0)} ms`);
48
+
49
+ // Search
50
+ const results = engine.search('contrato marco');
51
+ for (const r of results) {
52
+ console.log(`[${r.score}] ${r.documentName} — ${r.snippet}`);
53
+ }
54
+ ```
55
+
56
+ ---
57
+
58
+ ## Supported formats
59
+
60
+ | Extension | How text is extracted |
61
+ |-----------|----------------------|
62
+ | `.docx` | Native Rust/WASM XML parser — reads `word/document.xml` directly |
63
+ | `.xlsx` | Native Rust/WASM XML parser — reads shared strings + inline strings |
64
+ | `.pdf` | Separate `albex_pdf.wasm` (pure Rust, loaded on demand) |
65
+ | `.txt` | Plain text split on double newlines |
66
+ | `.xml` | Tag-stripped, entity-decoded |
67
+
68
+ ---
69
+
70
+ ## Query syntax
71
+
72
+ | Input | Behaviour |
73
+ |-------|-----------|
74
+ | `contrato` | Fuzzy match, accent-insensitive |
75
+ | `contrato marco` | Both words must appear in the same chunk |
76
+ | `"contrato marco"` | Both words AND they must be adjacent (phrase) |
77
+ | `contrato \| acuerdo` | OR: returns results matching either term |
78
+
79
+ Up to 4 space-separated tokens per simple/phrase query. OR branches are unlimited.
80
+
81
+ ---
82
+
83
+ ## API reference
84
+
85
+ ### `new AlbexEngine(opts)`
86
+
87
+ ```ts
88
+ interface AlbexOptions {
89
+ wasmUrl: string; // required
90
+ pdfWasmUrl?: string; // required only for PDF indexing
91
+ }
92
+ ```
93
+
94
+ ### `engine.init(): Promise<void>`
95
+
96
+ Fetches and initialises the main WASM module. Must be called before anything else.
97
+
98
+ ### `engine.indexFile(file: File): Promise<IndexedDocument>`
99
+
100
+ Detects the file format by extension, extracts text, and adds it to the search index. Throws for unsupported extensions or parse errors.
101
+
102
+ ```ts
103
+ interface IndexedDocument {
104
+ name: string;
105
+ ext: string;
106
+ chunks: number; // number of indexed text chunks
107
+ indexTimeMs: number;
108
+ textBytes: number; // raw UTF-8 text indexed
109
+ }
110
+ ```
111
+
112
+ ### `engine.search(query: string): SearchResult[]`
113
+
114
+ Returns results sorted by score (0–1000, descending).
115
+
116
+ ```ts
117
+ interface SearchResult {
118
+ documentName: string;
119
+ location: number; // paragraph (DOCX/TXT) or page (PDF, 1-based)
120
+ score: number; // 0–1000
121
+ snippet: string; // full chunk text (original, with accents)
122
+ matchStart: number; // byte offset of match in snippet
123
+ matchEnd: number; // exclusive
124
+ }
125
+ ```
126
+
127
+ ### `engine.getStats(): EngineStats`
128
+
129
+ ```ts
130
+ interface EngineStats {
131
+ documents: number;
132
+ chunks: number;
133
+ textUsed: number; // bytes
134
+ textCapacity: number; // 16 MB hard cap
135
+ wasmMemoryBytes: number;
136
+ }
137
+ ```
138
+
139
+ ### `engine.getLastSearchStats(): SearchStats | null`
140
+
141
+ Bloom/Bitap pipeline counters from the most recent search — useful for debugging and UI dashboards.
142
+
143
+ ```ts
144
+ interface SearchStats {
145
+ query: string;
146
+ timeMs: number;
147
+ results: number;
148
+ bloomTested: number; // chunks tested
149
+ bloomPassed: number; // passed bloom pre-filter
150
+ bitapMatched: number; // confirmed by Bitap
151
+ }
152
+ ```
153
+
154
+ ### Tuning
155
+
156
+ ```ts
157
+ engine.setMaxErrors(n); // 0–3 (default 2, auto-scaled by query length)
158
+ engine.setThreshold(n); // 0–1000 minimum score (default 250)
159
+ engine.setMaxResults(n); // 1–200 (default 50)
160
+ ```
161
+
162
+ ### `engine.reset()`
163
+
164
+ Clears all indexed documents. The engine is ready to index new files immediately after.
165
+
166
+ ---
167
+
168
+ ## Capacity
169
+
170
+ | Resource | Limit |
171
+ |----------|-------|
172
+ | Documents | 128 |
173
+ | Chunks | 100 000 |
174
+ | Total text | 16 MB |
175
+ | Query length | 64 characters (longer queries are truncated) |
176
+ | Results | 200 (configurable, default 50) |
177
+
178
+ These are hard-coded, statically sized (BSS) limits in the WASM module. Exceeding them fails silently: the engine stops indexing additional content and raises no error.
179
+
180
+ ---
181
+
182
+ ## Browser requirements
183
+
184
+ - WebAssembly (all modern browsers since 2017)
185
+ - `DecompressionStream` for DOCX/XLSX (Chrome 80+, Firefox 113+, Safari 16.4+)
186
+ - `String.prototype.normalize` for phrase search (all modern browsers)
187
+
188
+ PDF support additionally requires the `albex_pdf.wasm` module to be served with the correct MIME type (`application/wasm`).
189
+
190
+ ---
191
+
192
+ ## Building from source
193
+
194
+ ```bash
195
+ # Prerequisites: Rust (via rustup) and wasm-pack; then add the wasm32 target
196
+ rustup target add wasm32-unknown-unknown
197
+
198
+ # Build main WASM
199
+ cd wasm && cargo build --target wasm32-unknown-unknown --release
200
+ cp ../target/wasm32-unknown-unknown/release/albex_wasm.wasm pkg/albex_wasm_bg.wasm
201
+
202
+ # Build PDF WASM
203
+ cd ../pdf-wasm && cargo build --target wasm32-unknown-unknown --release
204
+ cp ../target/wasm32-unknown-unknown/release/albex_pdf.wasm ../wasm/pkg/albex_pdf.wasm
205
+
206
+ # Build TypeScript
207
+ cd .. && npm install && npm run build
208
+ ```
209
+
210
+ ---
211
+
212
+ ## Privacy
213
+
214
+ Albex does not transmit any document content. Text extraction, indexing, and search all happen inside the browser's WASM sandbox. The only network requests are the fetches of the `.wasm` binary files: the main module, plus the PDF module, which is loaded on demand.
215
+
216
+ ---
217
+
218
+ ## License
219
+
220
+ MIT
@@ -0,0 +1,110 @@
1
+ /**
2
+ * Albex — local full-text search engine.
3
+ *
4
+ * Zero-dependency TypeScript/ESM wrapper around albex_wasm_bg.wasm and
5
+ * (optionally) albex_pdf.wasm. All text stays in-browser; nothing is sent
6
+ * to any server.
7
+ *
8
+ * @example
9
+ * ```ts
10
+ * const engine = new AlbexEngine({ wasmUrl: './wasm/pkg/albex_wasm_bg.wasm' });
11
+ * await engine.init();
12
+ * await engine.indexFile(myFile);
13
+ * const results = engine.search('contrato marco');
14
+ * ```
15
+ */
16
+ export interface AlbexOptions { // Options accepted by the AlbexEngine constructor.
17
+ /** URL of the main albex_wasm_bg.wasm binary (required). */
18
+ wasmUrl: string;
19
+ /** URL of albex_pdf.wasm. Optional: needed only if you call indexFile() with PDFs. */
20
+ pdfWasmUrl?: string;
21
+ }
22
+ export interface IndexedDocument { // Per-file record returned by indexFile() and listed by the `documents` getter.
23
+ name: string; // presumably the indexed file's name — confirm in src/albex.ts
24
+ ext: string; // presumably the file extension used to pick the format — confirm in src/albex.ts
25
+ chunks: number; // number of indexed text chunks (per README)
26
+ indexTimeMs: number; // indexing time in milliseconds, going by the field name
27
+ textBytes: number; // raw UTF-8 text indexed (per README)
28
+ }
29
+ export interface SearchResult { // One entry of the array returned by search().
30
+ documentName: string; // presumably matches IndexedDocument.name — confirm in src/albex.ts
31
+ /** Paragraph index (DOCX/TXT) or page number (PDF, 1-based). The meaning for XLSX/XML is not documented. */
32
+ location: number;
33
+ /** Relevance score 0–1000; the README states results are sorted by score, descending. */
34
+ score: number;
35
+ /** Raw snippet text (original, with accents); the README describes it as the full chunk text. */
36
+ snippet: string;
37
+ /** Match start byte offset within snippet. NOTE(review): if this is a UTF-8 byte offset it will not equal a JS string index for non-ASCII text — confirm. */
38
+ matchStart: number;
39
+ /** Match end byte offset within snippet (exclusive). Same caveat as matchStart regarding byte offsets vs. string indices. */
40
+ matchEnd: number;
41
+ }
42
+ export interface EngineStats { // Snapshot returned by getStats().
43
+ documents: number; // presumably the count of indexed documents (README capacity table: limit 128)
44
+ chunks: number; // presumably the count of indexed chunks (README capacity table: limit 100 000)
45
+ textUsed: number; // bytes (per README)
46
+ textCapacity: number; // 16 MB hard cap (per README)
47
+ wasmMemoryBytes: number; // WASM memory size in bytes, going by the field name — confirm which module(s) it covers
48
+ }
49
+ export interface SearchStats { // Counters for the most recent search, returned by getLastSearchStats().
50
+ query: string; // presumably the query string that was searched — confirm in src/albex.ts
51
+ timeMs: number; // search time in milliseconds, going by the field name
52
+ results: number; // presumably the number of results returned — confirm in src/albex.ts
53
+ bloomTested: number; // chunks tested (per README)
54
+ bloomPassed: number; // passed bloom pre-filter (per README)
55
+ bitapMatched: number; // confirmed by Bitap (per README)
56
+ }
57
+ export declare class AlbexEngine { // Public API of the Albex search engine (declaration only; implementation is in src/albex.ts).
58
+ private _wasm;
59
+ private _mem;
60
+ private _pdfWasm;
61
+ private _pdfMem;
62
+ private _docs;
63
+ private _lastSearch;
64
+ private readonly _opts;
65
+ constructor(opts: AlbexOptions); // opts.wasmUrl is required; opts.pdfWasmUrl is optional (see AlbexOptions)
66
+ /** Load and initialise the main WASM module. Must be called before any other method. */
67
+ init(): Promise<void>;
68
+ private _u8;
69
+ private _writePad;
70
+ private _writeStr;
71
+ private _readPad;
72
+ private _feedText;
73
+ private _feedXmlBytes;
74
+ private _ensurePdfWasm;
75
+ private _indexDocx;
76
+ private _indexXlsx;
77
+ private _indexPdf;
78
+ private _indexTxt;
79
+ private _indexXml;
80
+ private static readonly _INDEXERS;
81
+ /**
82
+ * Index a file. Supported formats: DOCX, XLSX, PDF, TXT, XML (the README states the format is detected by extension).
83
+ * Throws for unsupported formats or parse errors.
84
+ */
85
+ indexFile(file: File): Promise<IndexedDocument>;
86
+ /**
87
+ * Search the index; the README states results are sorted by score, descending. Supports:
88
+ * - Simple queries: `contrato marco` (AND of tokens, accent-insensitive)
89
+ * - Phrase queries: `"contrato marco"` (must appear as phrase)
90
+ * - OR queries: `contrato | acuerdo` (union of the branch searches; NOTE(review): README says OR branches are unlimited — confirm)
91
+ */
92
+ search(query: string): SearchResult[];
93
+ private _searchOr;
94
+ private _runSearch;
95
+ /** Returns current engine statistics. */
96
+ getStats(): EngineStats;
97
+ /** Returns stats from the most recent search, or null. */
98
+ getLastSearchStats(): SearchStats | null;
99
+ /** Returns the list of indexed documents (typed as a read-only array). */
100
+ get documents(): readonly IndexedDocument[];
101
+ /** Supported file extensions. */
102
+ static get supportedExtensions(): string[];
103
+ /** Configure search sensitivity. The README documents the range as 0–3 (default 2, auto-scaled by query length). */
104
+ setMaxErrors(errors: 0 | 1 | 2 | 3): void;
105
+ setThreshold(threshold: number): void; // README: minimum score, 0–1000 (default 250)
106
+ setMaxResults(max: number): void; // README: 1–200 (default 50)
107
+ /** Full reset — clears all indexed documents and chunks. */
108
+ reset(): void;
109
+ }
110
+ //# sourceMappingURL=albex.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"albex.d.ts","sourceRoot":"","sources":["../src/albex.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;GAcG;AAMH,MAAM,WAAW,YAAY;IAC3B,4CAA4C;IAC5C,OAAO,EAAE,MAAM,CAAC;IAChB,8EAA8E;IAC9E,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAED,MAAM,WAAW,eAAe;IAC9B,IAAI,EAAE,MAAM,CAAC;IACb,GAAG,EAAE,MAAM,CAAC;IACZ,MAAM,EAAE,MAAM,CAAC;IACf,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,YAAY;IAC3B,YAAY,EAAE,MAAM,CAAC;IACrB,gEAAgE;IAChE,QAAQ,EAAE,MAAM,CAAC;IACjB,8BAA8B;IAC9B,KAAK,EAAE,MAAM,CAAC;IACd,iDAAiD;IACjD,OAAO,EAAE,MAAM,CAAC;IAChB,8CAA8C;IAC9C,UAAU,EAAE,MAAM,CAAC;IACnB,wDAAwD;IACxD,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,MAAM,WAAW,WAAW;IAC1B,SAAS,EAAE,MAAM,CAAC;IAClB,MAAM,EAAE,MAAM,CAAC;IACf,QAAQ,EAAE,MAAM,CAAC;IACjB,YAAY,EAAE,MAAM,CAAC;IACrB,eAAe,EAAE,MAAM,CAAC;CACzB;AAED,MAAM,WAAW,WAAW;IAC1B,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,EAAE,MAAM,CAAC;IAChB,WAAW,EAAE,MAAM,CAAC;IACpB,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;CACtB;AAsLD,qBAAa,WAAW;IAEtB,OAAO,CAAC,KAAK,CAAuB;IACpC,OAAO,CAAC,IAAI,CAAsB;IAGlC,OAAO,CAAC,QAAQ,CAAoC;IACpD,OAAO,CAAC,OAAO,CAAmC;IAElD,OAAO,CAAC,KAAK,CAAyB;IACtC,OAAO,CAAC,WAAW,CAA4B;IAC/C,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAe;gBAEzB,IAAI,EAAE,YAAY;IAI9B,wFAAwF;IAClF,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC;IAW3B,OAAO,CAAC,GAAG;IAIX,OAAO,CAAC,SAAS;IAOjB,OAAO,CAAC,SAAS;IAMjB,OAAO,CAAC,QAAQ;IAKhB,OAAO,CAAC,SAAS;IASjB,OAAO,CAAC,aAAa;YAUP,cAAc;YAad,UAAU;YASV,UAAU;YAqBV,SAAS;YAuCT,SAAS;YAWT,SAAS;IAevB,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC,SAAS,CAM/B;IAIF;;;OAGG;IACG,SAAS,CAAC,IAAI,EAAE,IAAI,GAAG,OAAO,CAAC,eAAe,CAAC;IAmBrD;;;;;OAKG;IACH,MAAM,CAAC,KAAK,EAAE,MAAM,GAAG,YAAY,EAAE;IAgBrC,OAAO,CAAC,SAAS;IAmBjB,OAAO,CAAC,UAAU;IAiClB,yCAAyC;IACzC,QAAQ,IAAI,WAAW;IAUvB,0DAA0D;IAC1D,kBAAkB,IAAI,WAAW,GAAG,IAAI;IAIxC,6CAA6C;IAC7C,IAAI,SAAS,IAAI,SAAS,eAAe,EAAE,CAE1C;IAED,iCAAiC;IACjC,MAAM,KAAK,mBAAmB,IAAI,MAAM,EAAE,CAEzC;IAED,oCAAoC;IACpC,YAAY,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,IAAI;IAIzC,YAAY,CAAC,SAAS,EAAE,MAAM,GAAG,IAAI;IAIrC,aAAa,CAAC,GAAG,E
AAE,MAAM,GAAG,IAAI;IAIhC,4DAA4D;IAC5D,KAAK,IAAI,IAAI;CAKd"}