@albex/ocr 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +180 -0
- package/dist/errors.d.ts +20 -0
- package/dist/errors.d.ts.map +1 -0
- package/dist/errors.js +34 -0
- package/dist/errors.js.map +1 -0
- package/dist/index.d.ts +28 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +26 -0
- package/dist/index.js.map +1 -0
- package/dist/language-detector.d.ts +39 -0
- package/dist/language-detector.d.ts.map +1 -0
- package/dist/language-detector.js +111 -0
- package/dist/language-detector.js.map +1 -0
- package/dist/ocr-worker.d.ts +71 -0
- package/dist/ocr-worker.d.ts.map +1 -0
- package/dist/ocr-worker.js +146 -0
- package/dist/ocr-worker.js.map +1 -0
- package/dist/orchestrator.d.ts +117 -0
- package/dist/orchestrator.d.ts.map +1 -0
- package/dist/orchestrator.js +120 -0
- package/dist/orchestrator.js.map +1 -0
- package/package.json +45 -0
- package/src/errors.ts +37 -0
- package/src/index.ts +48 -0
- package/src/language-detector.ts +129 -0
- package/src/ocr-worker.ts +223 -0
- package/src/orchestrator.ts +229 -0
package/README.md
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
# `@albex/ocr`
|
|
2
|
+
|
|
3
|
+
Optional OCR module for [Albex](https://www.npmjs.com/package/albex) powered by
|
|
4
|
+
[Tesseract.js](https://github.com/naptha/tesseract.js). Lazy at every level:
|
|
5
|
+
the package is opt-in, the Tesseract library is dynamic-imported on first use,
|
|
6
|
+
language models are downloaded on demand and cached forever in IndexedDB.
|
|
7
|
+
|
|
8
|
+
## Install
|
|
9
|
+
|
|
10
|
+
```bash
|
|
11
|
+
npm install @albex/ocr
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
This package has a peer dependency on `albex@^0.2.0`.
|
|
15
|
+
|
|
16
|
+
## Quick start
|
|
17
|
+
|
|
18
|
+
```ts
|
|
19
|
+
import { AlbexEngine } from 'albex';
|
|
20
|
+
import { enableOcr } from '@albex/ocr';
|
|
21
|
+
|
|
22
|
+
const engine = new AlbexEngine();
|
|
23
|
+
await engine.init();
|
|
24
|
+
|
|
25
|
+
const ocr = enableOcr(engine);
|
|
26
|
+
|
|
27
|
+
// engine now exposes ocrImage()
|
|
28
|
+
const blob: Blob = await fetch('/scan.png').then(r => r.blob());
|
|
29
|
+
const { text, confidence } = await engine.ocrImage(blob);
|
|
30
|
+
|
|
31
|
+
console.log(`OCR: ${text} (confidence ${confidence}%)`);
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Scanned PDFs
|
|
35
|
+
|
|
36
|
+
`@albex/ocr` also handles scanned (image-only) PDFs transparently. After
|
|
37
|
+
`enableOcr(engine)`, any PDF whose text layer is empty will be routed
|
|
38
|
+
through OCR automatically:
|
|
39
|
+
|
|
40
|
+
```ts
|
|
41
|
+
import { AlbexEngine } from 'albex';
|
|
42
|
+
import { enableOcr } from '@albex/ocr';
|
|
43
|
+
|
|
44
|
+
const engine = new AlbexEngine();
|
|
45
|
+
await engine.init();
|
|
46
|
+
enableOcr(engine);
|
|
47
|
+
|
|
48
|
+
// Digital PDF → text is extracted directly.
|
|
49
|
+
// Scanned PDF → embedded JPEG/JPEG2000 pages are OCR'd.
|
|
50
|
+
await engine.indexFile(pdfFile);
|
|
51
|
+
|
|
52
|
+
const hits = engine.search('contract');
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
The work is split between the PDF WebAssembly module and Tesseract.js:
|
|
56
|
+
|
|
57
|
+
1. `albex_pdf.wasm` parses the PDF and detects whether each page has
|
|
58
|
+
extractable text.
|
|
59
|
+
2. For pages that don't, the same module copies out the embedded image
|
|
60
|
+
stream — exactly as it lives inside the PDF, no decoding in Rust.
|
|
61
|
+
3. The browser decodes the image via `<img>` / `createImageBitmap` and
|
|
62
|
+
Tesseract.js runs OCR on the pixels.
|
|
63
|
+
|
|
64
|
+
**What's supported.** Pages where the embedded image is a JPEG
|
|
65
|
+
(`/DCTDecode`) or a JPEG2000 (`/JPXDecode`). Modern scanners — Adobe
|
|
66
|
+
Acrobat, ABBYY FineReader, mobile scan apps — ship JPEG by default, so
|
|
67
|
+
this covers the overwhelming majority of real-world scanned PDFs.
|
|
68
|
+
|
|
69
|
+
**What's not (yet).** Pages whose image uses `/FlateDecode`,
|
|
70
|
+
`/CCITTFaxDecode` or `/JBIG2Decode`. Extracting those would require a
|
|
71
|
+
PNG/TIFF encoder inside the Rust crate (adding roughly 2 MB to `albex_pdf.wasm`), and
|
|
72
|
+
in practice they show up in ~5 % of scanned PDFs. If you hit one, the page
|
|
73
|
+
contributes zero chunks and the document is still registered — search
|
|
74
|
+
won't match those pages, but everything else keeps working. A fallback
|
|
75
|
+
that uses `pdfjs-dist` to render the page is on the backlog.
|
|
76
|
+
|
|
77
|
+
## Languages
|
|
78
|
+
|
|
79
|
+
Six languages are pre-supported with automatic worker creation:
|
|
80
|
+
|
|
81
|
+
| Lang code | Language |
|
|
82
|
+
|-----------|------------|
|
|
83
|
+
| `eng` | English |
|
|
84
|
+
| `spa` | Spanish |
|
|
85
|
+
| `fra` | French |
|
|
86
|
+
| `deu` | German |
|
|
87
|
+
| `ita` | Italian |
|
|
88
|
+
| `por` | Portuguese |
|
|
89
|
+
|
|
90
|
+
Other Tesseract languages (~100 total) work but must be requested explicitly:
|
|
91
|
+
|
|
92
|
+
```ts
|
|
93
|
+
await engine.ocrImage(blob, { lang: 'rus' });
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Language auto-detection
|
|
97
|
+
|
|
98
|
+
`@albex/ocr` exports a lightweight detector for the six pre-supported
|
|
99
|
+
languages. Use it when you have a sample of known-good text (e.g. a previous
|
|
100
|
+
OCR pass) and want to pick the right language for subsequent images:
|
|
101
|
+
|
|
102
|
+
```ts
|
|
103
|
+
import { detectLanguageOr } from '@albex/ocr';
|
|
104
|
+
|
|
105
|
+
const sample = "Le contrat établit les clauses de l'accord";
|
|
106
|
+
const lang = detectLanguageOr(sample); // → 'fra'
|
|
107
|
+
|
|
108
|
+
await engine.ocrImage(image, { lang });
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
The detector is a ~4 KB blend of distinctive character checks and stop-word
|
|
112
|
+
matching. It is intentionally NOT a robust language ID model — when in doubt
|
|
113
|
+
it falls back to the supplied default (typically `'eng'`).
|
|
114
|
+
|
|
115
|
+
## Options
|
|
116
|
+
|
|
117
|
+
```ts
|
|
118
|
+
enableOcr(engine, {
|
|
119
|
+
languages: ['eng', 'spa', 'fra'], // allowlist (default: all 6)
|
|
120
|
+
defaultLanguage: 'eng', // when detection abstains
|
|
121
|
+
preload: ['eng'], // warm models eagerly
|
|
122
|
+
idleTimeoutMs: 5 * 60_000, // evict inactive workers
|
|
123
|
+
langPath: 'https://my.cdn/lang/', // override traineddata source
|
|
124
|
+
});
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
## Lifecycle
|
|
128
|
+
|
|
129
|
+
The handle returned by `enableOcr` exposes diagnostic and teardown methods:
|
|
130
|
+
|
|
131
|
+
```ts
|
|
132
|
+
const ocr = enableOcr(engine);
|
|
133
|
+
|
|
134
|
+
await ocr.preload(['eng', 'spa']); // wait for warming to finish
|
|
135
|
+
console.log(ocr.loadedLanguages()); // → ['eng', 'spa']
|
|
136
|
+
|
|
137
|
+
await ocr.dispose(); // terminate all workers
|
|
138
|
+
// engine.ocrImage is removed
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
The handle is also a TC39 disposable, so you can write `using ocr = enableOcr(engine)`
|
|
142
|
+
in environments that support it.
|
|
143
|
+
|
|
144
|
+
## Performance
|
|
145
|
+
|
|
146
|
+
Tesseract.js on a typical 1024×1024 scanned page in 2026:
|
|
147
|
+
|
|
148
|
+
| Variant | Per page CPU | Per page WebGPU* |
|
|
149
|
+
|------------------------|-------------:|-----------------:|
|
|
150
|
+
| `_fast` models (this) | 1–3 s | 0.5–1.5 s |
|
|
151
|
+
| `_best` models | 3–8 s | 1.5–3 s |
|
|
152
|
+
|
|
153
|
+
*WebGPU is experimental in Tesseract.js v5.1+. Enabled automatically when
|
|
154
|
+
available.
|
|
155
|
+
|
|
156
|
+
The "fast" models trade ~5 % char-level accuracy for 4–6× smaller size and
|
|
157
|
+
similar speedup. For Albex, that 5 % is invisible: the fuzzy Bitap matcher
|
|
158
|
+
already tolerates up to 3 character errors per token, and the accent-fold
|
|
159
|
+
plus lowercase normalisation absorb the other typical OCR mistakes.
|
|
160
|
+
|
|
161
|
+
## Bundle impact
|
|
162
|
+
|
|
163
|
+
Zero. Until `enableOcr()` is invoked at runtime, nothing in this package
|
|
164
|
+
ships in your initial bundle. The Tesseract.js library, its WASM core and
|
|
165
|
+
language models are all downloaded on demand and cached in IndexedDB.
|
|
166
|
+
|
|
167
|
+
Typical download path for a user that drags one Spanish scanned PDF:
|
|
168
|
+
|
|
169
|
+
| Step | Cost | When |
|
|
170
|
+
|-------------------------------------|-------------|--------------------|
|
|
171
|
+
| Initial app load | 0 bytes | (no OCR yet) |
|
|
172
|
+
| First call to `ocrImage` | ~3.5 MB | Tesseract runtime |
|
|
173
|
+
| ↳ First Spanish recognise | ~1.8 MB | spa_fast model |
|
|
174
|
+
| Subsequent Spanish recognises | 0 bytes | (cached) |
|
|
175
|
+
| Subsequent visits to the same origin| 0 bytes | (IndexedDB cache) |
|
|
176
|
+
|
|
177
|
+
## License
|
|
178
|
+
|
|
179
|
+
MIT. Same as [Tesseract.js](https://github.com/naptha/tesseract.js) and
|
|
180
|
+
[albex](https://www.npmjs.com/package/albex).
|
package/dist/errors.d.ts
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Errors thrown by `@albex/ocr`. Mirrors the AlbexError hierarchy of the
|
|
3
|
+
* main package: subclass of `Error`, exposes a `kind` discriminator that
|
|
4
|
+
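* identifies the failure category. NOTE(review): `structuredClone` keeps only the standard `Error` fields, so `kind` has to be re-attached after crossing a worker boundary.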
|
|
5
|
+
*/
|
|
6
|
+
export declare class AlbexOcrError extends Error {
|
|
7
|
+
readonly kind: string;
|
|
8
|
+
constructor(kind: string, message: string);
|
|
9
|
+
}
|
|
10
|
+
export declare class AlbexOcrInitError extends AlbexOcrError {
|
|
11
|
+
constructor(message: string);
|
|
12
|
+
}
|
|
13
|
+
export declare class AlbexOcrLanguageError extends AlbexOcrError {
|
|
14
|
+
readonly lang: string;
|
|
15
|
+
constructor(lang: string, message: string);
|
|
16
|
+
}
|
|
17
|
+
export declare class AlbexOcrRecognitionError extends AlbexOcrError {
|
|
18
|
+
constructor(message: string);
|
|
19
|
+
}
|
|
20
|
+
//# sourceMappingURL=errors.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"errors.d.ts","sourceRoot":"","sources":["../src/errors.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,qBAAa,aAAc,SAAQ,KAAK;IACtC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;gBACV,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM;CAK1C;AAED,qBAAa,iBAAkB,SAAQ,aAAa;gBACtC,OAAO,EAAE,MAAM;CAI5B;AAED,qBAAa,qBAAsB,SAAQ,aAAa;IACtD,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;gBACV,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM;CAK1C;AAED,qBAAa,wBAAyB,SAAQ,aAAa;gBAC7C,OAAO,EAAE,MAAM;CAI5B"}
|
package/dist/errors.js
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Errors thrown by `@albex/ocr`. Mirrors the AlbexError hierarchy of the
|
|
3
|
+
* main package: subclass of `Error`, exposes a `kind` discriminator that
|
|
4
|
+
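* identifies the failure category. NOTE(review): `structuredClone` keeps only the standard `Error` fields, so `kind` has to be re-attached after crossing a worker boundary.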
|
|
5
|
+
*/
|
|
6
|
+
export class AlbexOcrError extends Error {
|
|
7
|
+
kind;
|
|
8
|
+
constructor(kind, message) {
|
|
9
|
+
super(message);
|
|
10
|
+
this.name = 'AlbexOcrError';
|
|
11
|
+
this.kind = kind;
|
|
12
|
+
}
|
|
13
|
+
}
|
|
14
|
+
export class AlbexOcrInitError extends AlbexOcrError {
|
|
15
|
+
constructor(message) {
|
|
16
|
+
super('ocr_init', message);
|
|
17
|
+
this.name = 'AlbexOcrInitError';
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
export class AlbexOcrLanguageError extends AlbexOcrError {
|
|
21
|
+
lang;
|
|
22
|
+
constructor(lang, message) {
|
|
23
|
+
super('ocr_language', message);
|
|
24
|
+
this.name = 'AlbexOcrLanguageError';
|
|
25
|
+
this.lang = lang;
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
export class AlbexOcrRecognitionError extends AlbexOcrError {
|
|
29
|
+
constructor(message) {
|
|
30
|
+
super('ocr_recognition', message);
|
|
31
|
+
this.name = 'AlbexOcrRecognitionError';
|
|
32
|
+
}
|
|
33
|
+
}
|
|
34
|
+
//# sourceMappingURL=errors.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"errors.js","sourceRoot":"","sources":["../src/errors.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,MAAM,OAAO,aAAc,SAAQ,KAAK;IAC7B,IAAI,CAAS;IACtB,YAAY,IAAY,EAAE,OAAe;QACvC,KAAK,CAAC,OAAO,CAAC,CAAC;QACf,IAAI,CAAC,IAAI,GAAG,eAAe,CAAC;QAC5B,IAAI,CAAC,IAAI,GAAG,IAAI,CAAC;IACnB,CAAC;CACF;AAED,MAAM,OAAO,iBAAkB,SAAQ,aAAa;IAClD,YAAY,OAAe;QACzB,KAAK,CAAC,UAAU,EAAE,OAAO,CAAC,CAAC;QAC3B,IAAI,CAAC,IAAI,GAAG,mBAAmB,CAAC;IAClC,CAAC;CACF;AAED,MAAM,OAAO,qBAAsB,SAAQ,aAAa;IAC7C,IAAI,CAAS;IACtB,YAAY,IAAY,EAAE,OAAe;QACvC,KAAK,CAAC,cAAc,EAAE,OAAO,CAAC,CAAC;QAC/B,IAAI,CAAC,IAAI,GAAG,uBAAuB,CAAC;QACpC,IAAI,CAAC,IAAI,GAAG,IAAI,CAAC;IACnB,CAAC;CACF;AAED,MAAM,OAAO,wBAAyB,SAAQ,aAAa;IACzD,YAAY,OAAe;QACzB,KAAK,CAAC,iBAAiB,EAAE,OAAO,CAAC,CAAC;QAClC,IAAI,CAAC,IAAI,GAAG,0BAA0B,CAAC;IACzC,CAAC;CACF"}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `@albex/ocr` — OCR module for Albex.
|
|
3
|
+
*
|
|
4
|
+
* Drop-in OCR for Albex powered by Tesseract.js. Lazy loaded at every level:
|
|
5
|
+
* the package itself is opt-in, the Tesseract.js library is dynamic-imported
|
|
6
|
+
* on first use, language models are downloaded on demand and cached forever
|
|
7
|
+
* in IndexedDB.
|
|
8
|
+
*
|
|
9
|
+
* Quick start:
|
|
10
|
+
*
|
|
11
|
+
* ```ts
|
|
12
|
+
* import { AlbexEngine } from 'albex';
|
|
13
|
+
* import { enableOcr } from '@albex/ocr';
|
|
14
|
+
*
|
|
15
|
+
* const engine = new AlbexEngine();
|
|
16
|
+
* await engine.init();
|
|
17
|
+
*
|
|
18
|
+
* const ocr = enableOcr(engine);
|
|
19
|
+
*
|
|
20
|
+
* const { text } = await engine.ocrImage(myImageBlob);
|
|
21
|
+
* console.log(text);
|
|
22
|
+
* ```
|
|
23
|
+
*/
|
|
24
|
+
export { enableOcr, detectLanguage, detectLanguageOr, scoreLanguages, SUPPORTED_LANGS, AlbexOcrError, AlbexOcrInitError, AlbexOcrLanguageError, AlbexOcrRecognitionError, } from './orchestrator.js';
|
|
25
|
+
export type { OcrHandle, OcrOptions, OcrRecognizeOptions, OcrCapableEngine, Lang, RecognitionResult, ImageLike, } from './orchestrator.js';
|
|
26
|
+
export { OcrWorkerPool } from './ocr-worker.js';
|
|
27
|
+
export type { OcrWorkerOptions } from './ocr-worker.js';
|
|
28
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAEH,OAAO,EACL,SAAS,EACT,cAAc,EACd,gBAAgB,EAChB,cAAc,EACd,eAAe,EACf,aAAa,EACb,iBAAiB,EACjB,qBAAqB,EACrB,wBAAwB,GACzB,MAAM,mBAAmB,CAAC;AAE3B,YAAY,EACV,SAAS,EACT,UAAU,EACV,mBAAmB,EACnB,gBAAgB,EAChB,IAAI,EACJ,iBAAiB,EACjB,SAAS,GACV,MAAM,mBAAmB,CAAC;AAE3B,OAAO,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAChD,YAAY,EAAE,gBAAgB,EAAE,MAAM,iBAAiB,CAAC"}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `@albex/ocr` — OCR module for Albex.
|
|
3
|
+
*
|
|
4
|
+
* Drop-in OCR for Albex powered by Tesseract.js. Lazy loaded at every level:
|
|
5
|
+
* the package itself is opt-in, the Tesseract.js library is dynamic-imported
|
|
6
|
+
* on first use, language models are downloaded on demand and cached forever
|
|
7
|
+
* in IndexedDB.
|
|
8
|
+
*
|
|
9
|
+
* Quick start:
|
|
10
|
+
*
|
|
11
|
+
* ```ts
|
|
12
|
+
* import { AlbexEngine } from 'albex';
|
|
13
|
+
* import { enableOcr } from '@albex/ocr';
|
|
14
|
+
*
|
|
15
|
+
* const engine = new AlbexEngine();
|
|
16
|
+
* await engine.init();
|
|
17
|
+
*
|
|
18
|
+
* const ocr = enableOcr(engine);
|
|
19
|
+
*
|
|
20
|
+
* const { text } = await engine.ocrImage(myImageBlob);
|
|
21
|
+
* console.log(text);
|
|
22
|
+
* ```
|
|
23
|
+
*/
|
|
24
|
+
export { enableOcr, detectLanguage, detectLanguageOr, scoreLanguages, SUPPORTED_LANGS, AlbexOcrError, AlbexOcrInitError, AlbexOcrLanguageError, AlbexOcrRecognitionError, } from './orchestrator.js';
|
|
25
|
+
export { OcrWorkerPool } from './ocr-worker.js';
|
|
26
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAEH,OAAO,EACL,SAAS,EACT,cAAc,EACd,gBAAgB,EAChB,cAAc,EACd,eAAe,EACf,aAAa,EACb,iBAAiB,EACjB,qBAAqB,EACrB,wBAAwB,GACzB,MAAM,mBAAmB,CAAC;AAY3B,OAAO,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC"}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Lightweight language detection for the 6 pre-supported OCR languages.
|
|
3
|
+
*
|
|
4
|
+
* The signal is a combination of:
|
|
5
|
+
* 1. Distinctive single characters (`ñ`, `ç`, `ß`, …).
|
|
6
|
+
* 2. The most common stop words of each language, scored by frequency.
|
|
7
|
+
*
|
|
8
|
+
* The detector is meant to be called on a SHORT sample of source text — a
|
|
9
|
+
* paragraph or two from a known-good area of the document, or a previous
|
|
10
|
+
* OCR pass of the first page. It is NOT a robust language ID model; the
|
|
11
|
+
* threshold of confidence below which we fall back to `eng` is intentional.
|
|
12
|
+
*
|
|
13
|
+
* Total weight in the bundle: ~4 KB minified once the rest of the package
|
|
14
|
+
* has been tree-shaken away.
|
|
15
|
+
*/
|
|
16
|
+
export type Lang = 'eng' | 'spa' | 'fra' | 'deu' | 'ita' | 'por';
|
|
17
|
+
export declare const SUPPORTED_LANGS: readonly Lang[];
|
|
18
|
+
/**
|
|
19
|
+
* Score how likely a sample of text is in each candidate language.
|
|
20
|
+
*
|
|
21
|
+
* Returns raw, unnormalised scores; every entry is 0 when nothing matches. Use
|
|
22
|
+
* `detectLanguage` for the simple "which one?" answer.
|
|
23
|
+
*/
|
|
24
|
+
export declare function scoreLanguages(text: string): Record<Lang, number>;
|
|
25
|
+
/**
|
|
26
|
+
* Pick the most likely language. Returns `null` when no signal is strong
|
|
27
|
+
* enough — the caller should fall back to a configured default (usually
|
|
28
|
+
* `eng`).
|
|
29
|
+
*
|
|
30
|
+
* `minScore` is the threshold below which we abstain. Default 6 — empirical
|
|
31
|
+
* from testing on short samples (<200 chars) of each language.
|
|
32
|
+
*/
|
|
33
|
+
export declare function detectLanguage(text: string, minScore?: number): Lang | null;
|
|
34
|
+
/**
|
|
35
|
+
* Convenience: detect with a fallback. The default fallback is `eng`,
|
|
36
|
+
* which is also the safest universal choice for technical / mixed corpora.
|
|
37
|
+
*/
|
|
38
|
+
export declare function detectLanguageOr(text: string, fallback?: Lang, minScore?: number): Lang;
|
|
39
|
+
//# sourceMappingURL=language-detector.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"language-detector.d.ts","sourceRoot":"","sources":["../src/language-detector.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;GAcG;AAEH,MAAM,MAAM,IAAI,GAAG,KAAK,GAAG,KAAK,GAAG,KAAK,GAAG,KAAK,GAAG,KAAK,GAAG,KAAK,CAAC;AAEjE,eAAO,MAAM,eAAe,EAAE,SAAS,IAAI,EAA+C,CAAC;AA6C3F;;;;;GAKG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAAC,IAAI,EAAE,MAAM,CAAC,CA4BjE;AAED;;;;;;;GAOG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,SAAI,GAAG,IAAI,GAAG,IAAI,CAatE;AAED;;;GAGG;AACH,wBAAgB,gBAAgB,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,GAAE,IAAY,EAAE,QAAQ,SAAI,GAAG,IAAI,CAEzF"}
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Lightweight language detection for the 6 pre-supported OCR languages.
|
|
3
|
+
*
|
|
4
|
+
* The signal is a combination of:
|
|
5
|
+
* 1. Distinctive single characters (`ñ`, `ç`, `ß`, …).
|
|
6
|
+
* 2. The most common stop words of each language, scored by frequency.
|
|
7
|
+
*
|
|
8
|
+
* The detector is meant to be called on a SHORT sample of source text — a
|
|
9
|
+
* paragraph or two from a known-good area of the document, or a previous
|
|
10
|
+
* OCR pass of the first page. It is NOT a robust language ID model; the
|
|
11
|
+
* threshold of confidence below which we fall back to `eng` is intentional.
|
|
12
|
+
*
|
|
13
|
+
* Total weight in the bundle: ~4 KB minified once the rest of the package
|
|
14
|
+
* has been tree-shaken away.
|
|
15
|
+
*/
|
|
16
|
+
export const SUPPORTED_LANGS = ['eng', 'spa', 'fra', 'deu', 'ita', 'por'];
|
|
17
|
+
/**
|
|
18
|
+
* One profile per language. Stop words come from the OPUS-100 / Wikipedia
|
|
19
|
+
* frequency tables; the top 12 of each language cover ~25-30 % of any
|
|
20
|
+
* substantial text. Distinctive characters are conservative — listed only
|
|
21
|
+
* when they appear in that language and almost nowhere else among our six.
|
|
22
|
+
*/
|
|
23
|
+
const PROFILES = {
|
|
24
|
+
eng: {
|
|
25
|
+
chars: [], // English shares its alphabet with everyone else in our set.
|
|
26
|
+
stopWords: ['the', 'of', 'and', 'to', 'in', 'a', 'is', 'that', 'for', 'it', 'with', 'as'],
|
|
27
|
+
},
|
|
28
|
+
spa: {
|
|
29
|
+
chars: ['ñ', '¿', '¡'],
|
|
30
|
+
stopWords: ['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'se', 'del', 'las', 'por'],
|
|
31
|
+
},
|
|
32
|
+
fra: {
|
|
33
|
+
chars: ['ç', 'œ', 'à', 'â', 'ê', 'ô', 'û'],
|
|
34
|
+
stopWords: ['de', 'la', 'le', 'et', 'les', 'des', 'un', 'une', 'que', 'il', 'au', 'aux'],
|
|
35
|
+
},
|
|
36
|
+
deu: {
|
|
37
|
+
chars: ['ß', 'ü', 'ö', 'ä'],
|
|
38
|
+
stopWords: ['die', 'der', 'und', 'in', 'den', 'von', 'zu', 'das', 'mit', 'sich', 'des', 'auf'],
|
|
39
|
+
},
|
|
40
|
+
ita: {
|
|
41
|
+
chars: [], // No unique distinctive chars vs spa/por.
|
|
42
|
+
stopWords: ['di', 'la', 'il', 'e', 'che', 'un', 'del', 'le', 'per', 'una', 'gli', 'sono'],
|
|
43
|
+
},
|
|
44
|
+
por: {
|
|
45
|
+
chars: ['ã', 'õ'],
|
|
46
|
+
stopWords: ['de', 'a', 'o', 'que', 'e', 'do', 'da', 'em', 'um', 'para', 'os', 'com'],
|
|
47
|
+
},
|
|
48
|
+
};
|
|
49
|
+
const CHAR_WEIGHT = 8;
|
|
50
|
+
const STOP_WEIGHT = 4;
|
|
51
|
+
/**
|
|
52
|
+
* Score how likely a sample of text is in each candidate language.
|
|
53
|
+
*
|
|
54
|
+
* Returns raw, unnormalised scores; every entry is 0 when nothing matches. Use
|
|
55
|
+
* `detectLanguage` for the simple "which one?" answer.
|
|
56
|
+
*/
|
|
57
|
+
export function scoreLanguages(text) {
|
|
58
|
+
const lower = text.toLowerCase();
|
|
59
|
+
// Tokenise into words once — used by stop word scoring.
|
|
60
|
+
// Accepts Unicode letters, digits and apostrophes; everything else is a separator.
|
|
61
|
+
const words = lower.match(/[\p{L}\p{N}']+/gu) ?? [];
|
|
62
|
+
const wordSet = new Set(words);
|
|
63
|
+
const scores = { eng: 0, spa: 0, fra: 0, deu: 0, ita: 0, por: 0 };
|
|
64
|
+
for (const lang of SUPPORTED_LANGS) {
|
|
65
|
+
const profile = PROFILES[lang];
|
|
66
|
+
// Distinctive char score.
|
|
67
|
+
for (const c of profile.chars) {
|
|
68
|
+
if (lower.includes(c))
|
|
69
|
+
scores[lang] += CHAR_WEIGHT;
|
|
70
|
+
}
|
|
71
|
+
// Stop word score — top words count more.
|
|
72
|
+
for (let i = 0; i < profile.stopWords.length; i++) {
|
|
73
|
+
const word = profile.stopWords[i];
|
|
74
|
+
if (wordSet.has(word)) {
|
|
75
|
+
// First word in the list = highest weight; decays linearly.
|
|
76
|
+
scores[lang] += STOP_WEIGHT * (1 - i / profile.stopWords.length);
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
return scores;
|
|
81
|
+
}
|
|
82
|
+
/**
|
|
83
|
+
* Pick the most likely language. Returns `null` when no signal is strong
|
|
84
|
+
* enough — the caller should fall back to a configured default (usually
|
|
85
|
+
* `eng`).
|
|
86
|
+
*
|
|
87
|
+
* `minScore` is the threshold below which we abstain. Default 6 — empirical
|
|
88
|
+
* from testing on short samples (<200 chars) of each language.
|
|
89
|
+
*/
|
|
90
|
+
export function detectLanguage(text, minScore = 6) {
|
|
91
|
+
if (text.length < 20)
|
|
92
|
+
return null; // Nothing to learn from.
|
|
93
|
+
const scores = scoreLanguages(text);
|
|
94
|
+
let best = null;
|
|
95
|
+
let bestScore = -1;
|
|
96
|
+
for (const lang of SUPPORTED_LANGS) {
|
|
97
|
+
if (scores[lang] > bestScore) {
|
|
98
|
+
bestScore = scores[lang];
|
|
99
|
+
best = lang;
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
return bestScore >= minScore ? best : null;
|
|
103
|
+
}
|
|
104
|
+
/**
|
|
105
|
+
* Convenience: detect with a fallback. The default fallback is `eng`,
|
|
106
|
+
* which is also the safest universal choice for technical / mixed corpora.
|
|
107
|
+
*/
|
|
108
|
+
export function detectLanguageOr(text, fallback = 'eng', minScore = 6) {
|
|
109
|
+
return detectLanguage(text, minScore) ?? fallback;
|
|
110
|
+
}
|
|
111
|
+
//# sourceMappingURL=language-detector.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"language-detector.js","sourceRoot":"","sources":["../src/language-detector.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;GAcG;AAIH,MAAM,CAAC,MAAM,eAAe,GAAoB,CAAC,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;AAS3F;;;;;GAKG;AACH,MAAM,QAAQ,GAA8B;IAC1C,GAAG,EAAE;QACH,KAAK,EAAM,EAAE,EAAE,6DAA6D;QAC5E,SAAS,EAAE,CAAC,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,GAAG,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC;KAC1F;IACD,GAAG,EAAE;QACH,KAAK,EAAM,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC;QAC1B,SAAS,EAAE,CAAC,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,GAAG,EAAE,GAAG,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,CAAC;KACvF;IACD,GAAG,EAAE;QACH,KAAK,EAAM,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC;QAC9C,SAAS,EAAE,CAAC,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,CAAC;KACzF;IACD,GAAG,EAAE;QACH,KAAK,EAAM,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC;QAC/B,SAAS,EAAE,CAAC,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,KAAK,EAAE,KAAK,CAAC;KAC/F;IACD,GAAG,EAAE;QACH,KAAK,EAAM,EAAE,EAAE,0CAA0C;QACzD,SAAS,EAAE,CAAC,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,GAAG,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,MAAM,CAAC;KAC1F;IACD,GAAG,EAAE;QACH,KAAK,EAAM,CAAC,GAAG,EAAE,GAAG,CAAC;QACrB,SAAS,EAAE,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,EAAE,KAAK,EAAE,GAAG,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,CAAC;KACrF;CACF,CAAC;AAEF,MAAM,WAAW,GAAI,CAAC,CAAC;AACvB,MAAM,WAAW,GAAI,CAAC,CAAC;AAEvB;;;;;GAKG;AACH,MAAM,UAAU,cAAc,CAAC,IAAY;IACzC,MAAM,KAAK,GAAG,IAAI,CAAC,WAAW,EAAE,CAAC;IACjC,wDAAwD;IACxD,oEAAoE;IACpE,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC,kBAAkB,CAAC,IAAI,EAAE,CAAC;IACpD,MAAM,OAAO,GAAG,IAAI,GAAG,CAAC,KAAK,CAAC,CAAC;IAE/B,MAAM,MAAM,GAAyB,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,
EAAE,CAAC,EAAE,CAAC;IAExF,KAAK,MAAM,IAAI,IAAI,eAAe,EAAE,CAAC;QACnC,MAAM,OAAO,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC;QAE/B,0BAA0B;QAC1B,KAAK,MAAM,CAAC,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;YAC9B,IAAI,KAAK,CAAC,QAAQ,CAAC,CAAC,CAAC;gBAAE,MAAM,CAAC,IAAI,CAAC,IAAI,WAAW,CAAC;QACrD,CAAC;QAED,0CAA0C;QAC1C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,SAAS,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAClD,MAAM,IAAI,GAAG,OAAO,CAAC,SAAS,CAAC,CAAC,CAAE,CAAC;YACnC,IAAI,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;gBACtB,4DAA4D;gBAC5D,MAAM,CAAC,IAAI,CAAC,IAAI,WAAW,GAAG,CAAC,CAAC,GAAG,CAAC,GAAG,OAAO,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;YACnE,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,cAAc,CAAC,IAAY,EAAE,QAAQ,GAAG,CAAC;IACvD,IAAI,IAAI,CAAC,MAAM,GAAG,EAAE;QAAE,OAAO,IAAI,CAAC,CAAC,yBAAyB;IAE5D,MAAM,MAAM,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;IACpC,IAAI,IAAI,GAAgB,IAAI,CAAC;IAC7B,IAAI,SAAS,GAAG,CAAC,CAAC,CAAC;IACnB,KAAK,MAAM,IAAI,IAAI,eAAe,EAAE,CAAC;QACnC,IAAI,MAAM,CAAC,IAAI,CAAC,GAAG,SAAS,EAAE,CAAC;YAC7B,SAAS,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC;YACzB,IAAI,GAAG,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IACD,OAAO,SAAS,IAAI,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC;AAC7C,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,gBAAgB,CAAC,IAAY,EAAE,WAAiB,KAAK,EAAE,QAAQ,GAAG,CAAC;IACjF,OAAO,cAAc,CAAC,IAAI,EAAE,QAAQ,CAAC,IAAI,QAAQ,CAAC;AACpD,CAAC"}
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Wrapper around Tesseract.js.
|
|
3
|
+
*
|
|
4
|
+
* Responsibilities:
|
|
5
|
+
* 1. Lazy-load the Tesseract.js library on first use (dynamic import).
|
|
6
|
+
* 2. Maintain a per-language `Worker` instance, created on demand.
|
|
7
|
+
* 3. Auto-terminate idle workers after a configurable timeout to release
|
|
8
|
+
* the LSTM model from memory (each Tesseract worker holds 2-5 MB).
|
|
9
|
+
* 4. Expose a Promise-based `recognize(image, lang)` that returns text +
|
|
10
|
+
* confidence in a stable shape.
|
|
11
|
+
*
|
|
12
|
+
* Why one worker per language: Tesseract.js workers are tied to the language
|
|
13
|
+
* model they were initialised with. Switching languages on the same worker
|
|
14
|
+
* triggers a slow reload of the LSTM. Maintaining N workers — one per
|
|
15
|
+
* language ever used — keeps each recognise call fast at the cost of slightly
|
|
16
|
+
* more memory, which the idle eviction then claws back.
|
|
17
|
+
*/
|
|
18
|
+
import type { Lang } from './language-detector.js';
|
|
19
|
+
/**
|
|
20
|
+
* Anything Tesseract.js can accept as input. We narrow to what an Albex
|
|
21
|
+
* consumer is likely to hand us (Blob, ArrayBuffer, Uint8Array, an HTML
|
|
22
|
+
* image or canvas element, an OffscreenCanvas, or a string). The Tesseract.js source itself does the discrimination.
|
|
23
|
+
*/
|
|
24
|
+
export type ImageLike = Blob | ArrayBuffer | Uint8Array | string | HTMLImageElement | HTMLCanvasElement | OffscreenCanvas;
|
|
25
|
+
export interface RecognitionResult {
|
|
26
|
+
/** Raw OCR output. May contain newlines for paragraphs / lines. */
|
|
27
|
+
text: string;
|
|
28
|
+
/** 0-100 confidence reported by Tesseract for the page. */
|
|
29
|
+
confidence: number;
|
|
30
|
+
/** Wall-clock time spent on this recognition. */
|
|
31
|
+
timeMs: number;
|
|
32
|
+
}
|
|
33
|
+
export interface OcrWorkerOptions {
|
|
34
|
+
/**
|
|
35
|
+
* Milliseconds of inactivity after which an idle Tesseract worker is
|
|
36
|
+
* terminated and its language model released. Default: 5 minutes.
|
|
37
|
+
* Set 0 to disable eviction (worker stays for the lifetime of the page).
|
|
38
|
+
*/
|
|
39
|
+
idleTimeoutMs?: number;
|
|
40
|
+
/**
|
|
41
|
+
* Override the `tessdata_fast` mirror. Defaults to the official Tesseract.js
|
|
42
|
+
* jsDelivr mirror, which is what `tesseract.js` ships with anyway.
|
|
43
|
+
*/
|
|
44
|
+
langPath?: string;
|
|
45
|
+
}
|
|
46
|
+
export declare class OcrWorkerPool {
|
|
47
|
+
private _workers;
|
|
48
|
+
private _evictionTimer;
|
|
49
|
+
private readonly _idleTimeoutMs;
|
|
50
|
+
private readonly _langPath;
|
|
51
|
+
constructor(opts?: OcrWorkerOptions);
|
|
52
|
+
/**
|
|
53
|
+
* Run OCR on a single image. Spawns the appropriate language worker on
|
|
54
|
+
* first use and caches it; subsequent calls for the same language reuse it.
|
|
55
|
+
*/
|
|
56
|
+
recognize(image: ImageLike, lang: Lang): Promise<RecognitionResult>;
|
|
57
|
+
/**
|
|
58
|
+
* Names of languages currently loaded in memory. Useful for diagnostics
|
|
59
|
+
* and for the demo's runtime panel.
|
|
60
|
+
*/
|
|
61
|
+
loadedLanguages(): Lang[];
|
|
62
|
+
/**
|
|
63
|
+
* Tear down all workers immediately. Called by orchestrator dispose.
|
|
64
|
+
*/
|
|
65
|
+
dispose(): Promise<void>;
|
|
66
|
+
/** TC39 explicit-resource-management alias. Fires `dispose()` async. */
|
|
67
|
+
[Symbol.dispose](): void;
|
|
68
|
+
private _getOrCreate;
|
|
69
|
+
private _sweepIdle;
|
|
70
|
+
}
|
|
71
|
+
//# sourceMappingURL=ocr-worker.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ocr-worker.d.ts","sourceRoot":"","sources":["../src/ocr-worker.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;GAgBG;AAEH,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,wBAAwB,CAAC;AAqBnD;;;;GAIG;AACH,MAAM,MAAM,SAAS,GACjB,IAAI,GACJ,WAAW,GACX,UAAU,GACV,MAAM,GACN,gBAAgB,GAChB,iBAAiB,GACjB,eAAe,CAAC;AAEpB,MAAM,WAAW,iBAAiB;IAChC,mEAAmE;IACnE,IAAI,EAAE,MAAM,CAAC;IACb,2DAA2D;IAC3D,UAAU,EAAE,MAAM,CAAC;IACnB,iDAAiD;IACjD,MAAM,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,gBAAgB;IAC/B;;;;OAIG;IACH,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB;;;OAGG;IACH,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAqCD,qBAAa,aAAa;IACxB,OAAO,CAAC,QAAQ,CAAgC;IAChD,OAAO,CAAC,cAAc,CAA+C;IACrE,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAS;IACxC,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAqB;gBAEnC,IAAI,GAAE,gBAAqB;IAiBvC;;;OAGG;IACG,SAAS,CAAC,KAAK,EAAE,SAAS,EAAE,IAAI,EAAE,IAAI,GAAG,OAAO,CAAC,iBAAiB,CAAC;IAkBzE;;;OAGG;IACH,eAAe,IAAI,IAAI,EAAE;IAIzB;;OAEG;IACG,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;IAU9B,wEAAwE;IACxE,CAAC,MAAM,CAAC,OAAO,CAAC,IAAI,IAAI;YAMV,YAAY;YA2BZ,UAAU;CAWzB"}
|