med-pdf-nmo 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +298 -0
- package/README.ru.md +298 -0
- package/dist/bm25.d.ts +47 -0
- package/dist/bm25.js +86 -0
- package/dist/browser-shims/buffer.d.ts +30 -0
- package/dist/browser-shims/buffer.js +31 -0
- package/dist/browser-shims/crypto.d.ts +33 -0
- package/dist/browser-shims/crypto.js +45 -0
- package/dist/browser-shims/fs-promises.d.ts +13 -0
- package/dist/browser-shims/fs-promises.js +25 -0
- package/dist/browser-shims/fs.d.ts +14 -0
- package/dist/browser-shims/fs.js +24 -0
- package/dist/browser-shims/globals.d.ts +9 -0
- package/dist/browser-shims/globals.js +23 -0
- package/dist/browser-shims/path.d.ts +57 -0
- package/dist/browser-shims/path.js +65 -0
- package/dist/browser-shims/process.d.ts +22 -0
- package/dist/browser-shims/process.js +27 -0
- package/dist/browser.d.ts +9 -0
- package/dist/browser.js +12 -0
- package/dist/chunk.d.ts +15 -0
- package/dist/chunk.js +76 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +87 -0
- package/dist/index.d.ts +82 -0
- package/dist/index.js +51 -0
- package/dist/med-pdf-nmo.browser.js +40413 -0
- package/dist/med-pdf-nmo.browser.mjs +40395 -0
- package/dist/normalize.d.ts +73 -0
- package/dist/normalize.js +477 -0
- package/dist/pdf.d.ts +35 -0
- package/dist/pdf.js +396 -0
- package/dist/predictor/config.d.ts +28 -0
- package/dist/predictor/config.js +26 -0
- package/dist/predictor/constants.d.ts +3 -0
- package/dist/predictor/constants.js +59 -0
- package/dist/predictor/runtime.d.ts +15 -0
- package/dist/predictor/runtime.js +59 -0
- package/dist/predictor/scorers/biomedical-symbols.d.ts +36 -0
- package/dist/predictor/scorers/biomedical-symbols.js +347 -0
- package/dist/predictor/scorers/coordinate-table.d.ts +82 -0
- package/dist/predictor/scorers/coordinate-table.js +1210 -0
- package/dist/predictor/scorers/direction.d.ts +71 -0
- package/dist/predictor/scorers/direction.js +345 -0
- package/dist/predictor/scorers/drug-dose.d.ts +6 -0
- package/dist/predictor/scorers/drug-dose.js +221 -0
- package/dist/predictor/scorers/exact-answer.d.ts +10 -0
- package/dist/predictor/scorers/exact-answer.js +75 -0
- package/dist/predictor/scorers/fibrosis-stage.d.ts +6 -0
- package/dist/predictor/scorers/fibrosis-stage.js +103 -0
- package/dist/predictor/scorers/focused.d.ts +40 -0
- package/dist/predictor/scorers/focused.js +204 -0
- package/dist/predictor/scorers/frequency.d.ts +10 -0
- package/dist/predictor/scorers/frequency.js +203 -0
- package/dist/predictor/scorers/numeric.d.ts +77 -0
- package/dist/predictor/scorers/numeric.js +1161 -0
- package/dist/predictor/scorers/recommendation-item.d.ts +27 -0
- package/dist/predictor/scorers/recommendation-item.js +469 -0
- package/dist/predictor/scorers/search.d.ts +41 -0
- package/dist/predictor/scorers/search.js +515 -0
- package/dist/predictor/selection.d.ts +30 -0
- package/dist/predictor/selection.js +370 -0
- package/dist/predictor/text-utils.d.ts +49 -0
- package/dist/predictor/text-utils.js +497 -0
- package/dist/predictor/types.d.ts +23 -0
- package/dist/predictor/types.js +1 -0
- package/dist/predictor.d.ts +52 -0
- package/dist/predictor.js +3834 -0
- package/package.json +82 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 med-pdf-nmo contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,298 @@
|
|
|
1
|
+
# med-pdf-nmo
|
|
2
|
+
|
|
3
|
+
[Русская версия](./README.ru.md)
|
|
4
|
+
|
|
5
|
+
`med-pdf-nmo` is a browser-first JavaScript/Node.js package that selects the most likely answer, or answer set, for NMO-style medical questions using a source PDF with clinical recommendations.
|
|
6
|
+
|
|
7
|
+
The runtime is fully local and non-LLM. It does not use ChatGPT, OpenAI, Anthropic, Gemini, HuggingFace inference, transformer models, or any external AI service. The predictor is based on PDF text extraction, normalization, lexical search, structural heuristics, scoring, and evidence snippets from the PDF.
|
|
8
|
+
|
|
9
|
+
## What It Does
|
|
10
|
+
|
|
11
|
+
- Accepts a medical recommendations PDF, a question, and answer variants.
|
|
12
|
+
- Extracts PDF text with `pdfjs-dist`.
|
|
13
|
+
- Normalizes Russian medical text, PDF artifacts, Greek letters, numeric references, dosage forms, and common OCR quirks.
|
|
14
|
+
- Scores every answer using local evidence from the PDF.
|
|
15
|
+
- Supports both `single` and `multi` questions.
|
|
16
|
+
- Returns selected answers, confidence, per-answer scores, raw scores, evidence snippets, and PDF metadata.
|
|
17
|
+
- Works in Node.js, browser bundles, and Chrome-extension style environments.
|
|
18
|
+
|
|
19
|
+
## Current Accuracy
|
|
20
|
+
|
|
21
|
+
These numbers come from the local validation corpus of cases that have an answer key ("keyed" cases). They do not guarantee the same accuracy on every new PDF; they reflect the reference quality as of the final test run.
|
|
22
|
+
|
|
23
|
+
| Dataset | Exact accuracy | Single-answer | Multi-answer exact set |
|
|
24
|
+
| --- | ---: | ---: | ---: |
|
|
25
|
+
| All keyed cases | `73.53%` (`2069/2814`) | `81.17%` (`1573/1938`) | `56.62%` (`496/876`) |
|
|
26
|
+
| Holdout split | `83.79%` (`486/580`) | `87.39%` | `72.92%` |
|
|
27
|
+
| Dev split | `77.14%` (`388/503`) | `83.09%` | `63.64%` |
|
|
28
|
+
|
|
29
|
+
For `single`, a case counts as correct only when the one selected answer is exactly the expected answer. For `multi`, the selected set must exactly match the full expected set, so the metric is naturally stricter.
|
|
30
|
+
|
|
31
|
+
## Installation
|
|
32
|
+
|
|
33
|
+
From npm, once published:
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
npm install med-pdf-nmo
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Directly from a Git HTTPS URL:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
npm install git+https://github.com/lKolabrodl/med-pdf-nmo.git#main
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Or in `package.json`:
|
|
46
|
+
|
|
47
|
+
```json
|
|
48
|
+
{
|
|
49
|
+
"dependencies": {
|
|
50
|
+
"med-pdf-nmo": "git+https://github.com/lKolabrodl/med-pdf-nmo.git#main"
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
When installed from Git, npm runs `prepare`, so the package builds `dist` during installation.
|
|
56
|
+
|
|
57
|
+
## Browser / React / Chrome Extension
|
|
58
|
+
|
|
59
|
+
Use the browser entrypoint when your app runs in a browser-like environment:
|
|
60
|
+
|
|
61
|
+
```ts
|
|
62
|
+
import { answerQuestion } from "med-pdf-nmo/browser";
|
|
63
|
+
|
|
64
|
+
const result = await answerQuestion(new Uint8Array(pdfData.slice(0)), {
|
|
65
|
+
question,
|
|
66
|
+
variants,
|
|
67
|
+
type: isSingle ? "single" : "multi",
|
|
68
|
+
});
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
The browser entrypoint bundles and registers PDF.js internally. In normal React, Vite, Webpack, and Chrome-extension usage you do not need to import `pdfjs-dist`, configure `GlobalWorkerOptions.workerSrc`, or pass `pdfjsLib` into every call.
|
|
72
|
+
|
|
73
|
+
## Browser Script Tag
|
|
74
|
+
|
|
75
|
+
For direct browser usage, load the IIFE bundle:
|
|
76
|
+
|
|
77
|
+
```html
|
|
78
|
+
<script src="./dist/med-pdf-nmo.browser.js"></script>
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Then call the global object:
|
|
82
|
+
|
|
83
|
+
```html
|
|
84
|
+
<input id="pdf" type="file" accept="application/pdf" />
|
|
85
|
+
|
|
86
|
+
<script>
|
|
87
|
+
document.querySelector("#pdf").addEventListener("change", async (event) => {
|
|
88
|
+
const file = event.target.files[0];
|
|
89
|
+
|
|
90
|
+
const result = await MedPdfNmo.answerQuestion(file, {
|
|
91
|
+
question: "Question text",
|
|
92
|
+
variants: ["Answer A", "Answer B", "Answer C"],
|
|
93
|
+
type: "single"
|
|
94
|
+
});
|
|
95
|
+
|
|
96
|
+
console.log(result.selectedIds, result.selected, result.confidence);
|
|
97
|
+
});
|
|
98
|
+
</script>
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
For public GitHub repositories, CDN usage is also possible:
|
|
102
|
+
|
|
103
|
+
```html
|
|
104
|
+
<script src="https://cdn.jsdelivr.net/gh/lKolabrodl/med-pdf-nmo@main/dist/med-pdf-nmo.browser.js"></script>
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## Node.js
|
|
108
|
+
|
|
109
|
+
```js
|
|
110
|
+
import fs from "node:fs/promises";
|
|
111
|
+
import { answerQuestion } from "med-pdf-nmo";
|
|
112
|
+
|
|
113
|
+
const pdfBuffer = await fs.readFile("./doc.pdf");
|
|
114
|
+
|
|
115
|
+
const result = await answerQuestion(pdfBuffer, {
|
|
116
|
+
question: "Which drug is recommended?",
|
|
117
|
+
variants: ["Answer A", "Answer B", "Answer C", "Answer D"],
|
|
118
|
+
type: "single"
|
|
119
|
+
});
|
|
120
|
+
|
|
121
|
+
console.log(result.selectedIds);
|
|
122
|
+
console.log(result.selected);
|
|
123
|
+
console.log(result.confidence);
|
|
124
|
+
console.log(result.evidence);
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
In Node.js, the PDF input can be a `Buffer`, `Uint8Array`, `ArrayBuffer`, or URL string.
|
|
128
|
+
|
|
129
|
+
## API
|
|
130
|
+
|
|
131
|
+
### `answerQuestion(pdf, options)`
|
|
132
|
+
|
|
133
|
+
```ts
|
|
134
|
+
const result = await answerQuestion(pdf, {
|
|
135
|
+
question: "Question text",
|
|
136
|
+
variants: ["Answer A", "Answer B", "Answer C"],
|
|
137
|
+
type: "single"
|
|
138
|
+
});
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
`pdf` can be:
|
|
142
|
+
|
|
143
|
+
- `File`
|
|
144
|
+
- `Blob`
|
|
145
|
+
- `Buffer`
|
|
146
|
+
- `ArrayBuffer`
|
|
147
|
+
- `Uint8Array`
|
|
148
|
+
- URL string
|
|
149
|
+
- any object with `arrayBuffer()`
|
|
150
|
+
|
|
151
|
+
`options`:
|
|
152
|
+
|
|
153
|
+
- `question`: question text.
|
|
154
|
+
- `variants`: answer variants.
|
|
155
|
+
- `answers`: alias for `variants`.
|
|
156
|
+
- `type`: `"single"` or `"multi"`.
|
|
157
|
+
- `mode`: alias for `type`.
|
|
158
|
+
- `cacheKey`: optional PDF text cache key.
|
|
159
|
+
- `pdfjsLib`: optional explicit PDF.js module override.
|
|
160
|
+
|
|
161
|
+
Variants can be plain strings:
|
|
162
|
+
|
|
163
|
+
```js
|
|
164
|
+
variants: ["Answer A", "Answer B", "Answer C"]
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
Or objects with stable IDs:
|
|
168
|
+
|
|
169
|
+
```js
|
|
170
|
+
variants: [
|
|
171
|
+
{ id: "A", text: "Answer A" },
|
|
172
|
+
{ id: "B", text: "Answer B" },
|
|
173
|
+
{ id: "C", text: "Answer C" }
|
|
174
|
+
]
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
### Result Shape
|
|
178
|
+
|
|
179
|
+
```js
|
|
180
|
+
{
|
|
181
|
+
selected: ["Answer B"],
|
|
182
|
+
selectedIds: ["B"],
|
|
183
|
+
mode: "single",
|
|
184
|
+
confidence: 0.73,
|
|
185
|
+
scores: [
|
|
186
|
+
{ id: "A", variant: "Answer A", score: 0.12, raw: 0.41 },
|
|
187
|
+
{ id: "B", variant: "Answer B", score: 0.73, raw: 1.92 }
|
|
188
|
+
],
|
|
189
|
+
evidence: [],
|
|
190
|
+
meta: {},
|
|
191
|
+
raw: {}
|
|
192
|
+
}
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
Important fields:
|
|
196
|
+
|
|
197
|
+
- `selected`: selected answer texts.
|
|
198
|
+
- `selectedIds`: selected answer IDs.
|
|
199
|
+
- `confidence`: relative confidence for the selected answer or set.
|
|
200
|
+
- `scores`: calibrated score (`score`) and raw score (`raw`) for each variant.
|
|
201
|
+
- `evidence`: PDF snippets used by the scorer.
|
|
202
|
+
- `raw`: low-level predictor output.
|
|
203
|
+
|
|
204
|
+
## Multi-Answer Questions
|
|
205
|
+
|
|
206
|
+
```js
|
|
207
|
+
const result = await answerQuestion(pdfBuffer, {
|
|
208
|
+
question: "Which statements are correct?",
|
|
209
|
+
variants: [
|
|
210
|
+
{ id: "A", text: "Statement A" },
|
|
211
|
+
{ id: "B", text: "Statement B" },
|
|
212
|
+
{ id: "C", text: "Statement C" },
|
|
213
|
+
{ id: "D", text: "Statement D" }
|
|
214
|
+
],
|
|
215
|
+
type: "multi"
|
|
216
|
+
});
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
`selectedIds` will contain all selected answer IDs.
|
|
220
|
+
|
|
221
|
+
## Low-Level Exports
|
|
222
|
+
|
|
223
|
+
```js
|
|
224
|
+
import {
|
|
225
|
+
predict,
|
|
226
|
+
answerQuestion,
|
|
227
|
+
setPdfJsLib,
|
|
228
|
+
clearPredictorCache
|
|
229
|
+
} from "med-pdf-nmo";
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
- `answerQuestion`: convenient high-level API.
|
|
233
|
+
- `predict`: low-level predictor API.
|
|
234
|
+
- `setPdfJsLib`: explicit PDF.js configuration hook.
|
|
235
|
+
- `clearPredictorCache`: clears the runtime predictor cache.
|
|
236
|
+
|
|
237
|
+
## CLI
|
|
238
|
+
|
|
239
|
+
After installation, the package provides:
|
|
240
|
+
|
|
241
|
+
```bash
|
|
242
|
+
med-pdf-nmo --help
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
Example:
|
|
246
|
+
|
|
247
|
+
```bash
|
|
248
|
+
med-pdf-nmo --pdf doc.pdf --question "Question text" --mode single --answer A="Answer A" --answer B="Answer B"
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
Local development:
|
|
252
|
+
|
|
253
|
+
```bash
|
|
254
|
+
npm run predict -- --pdf doc.pdf --question "Question text" --mode single --answer A="Answer A" --answer B="Answer B"
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
## Build
|
|
258
|
+
|
|
259
|
+
```bash
|
|
260
|
+
npm install
|
|
261
|
+
npm run build
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
Build outputs:
|
|
265
|
+
|
|
266
|
+
- `dist/index.js`: main ESM entrypoint.
|
|
267
|
+
- `dist/index.d.ts`: TypeScript declarations.
|
|
268
|
+
- `dist/med-pdf-nmo.browser.js`: browser global bundle with `MedPdfNmo`.
|
|
269
|
+
- `dist/med-pdf-nmo.browser.mjs`: browser ESM bundle with PDF.js included.
|
|
270
|
+
- `dist/browser-shims/*`: browser alias targets for Node built-ins.
|
|
271
|
+
- `dist/cli.js`: CLI entrypoint.
|
|
272
|
+
|
|
273
|
+
## Development Checks
|
|
274
|
+
|
|
275
|
+
```bash
|
|
276
|
+
npm test
|
|
277
|
+
npm run typecheck
|
|
278
|
+
npm run build
|
|
279
|
+
npm pack --dry-run
|
|
280
|
+
npm run eval
|
|
281
|
+
npm run eval:holdout
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
`npm run eval` and `npm run eval:holdout` are development-only quality checks. They read local test PDFs and answer keys to calculate accuracy.
|
|
285
|
+
|
|
286
|
+
The runtime package API does not read eval files, split files, answer keys, or test fixtures during inference.
|
|
287
|
+
|
|
288
|
+
## Limitations
|
|
289
|
+
|
|
290
|
+
- This package does not provide medical advice and does not replace expert review.
|
|
291
|
+
- Quality depends on how well PDF.js extracts text from a specific PDF.
|
|
292
|
+
- Scanned PDFs without a text layer may require OCR before being passed to the package.
|
|
293
|
+
- The algorithm selects likely answers from PDF evidence, but it cannot guarantee absolute correctness.
|
|
294
|
+
- Runtime inference is non-LLM and does not call external intelligent services.
|
|
295
|
+
|
|
296
|
+
## License
|
|
297
|
+
|
|
298
|
+
MIT. See [LICENSE](./LICENSE).
|
package/README.ru.md
ADDED
|
@@ -0,0 +1,298 @@
|
|
|
1
|
+
# med-pdf-nmo
|
|
2
|
+
|
|
3
|
+
[English README](./README.md)
|
|
4
|
+
|
|
5
|
+
`med-pdf-nmo` - browser-first JavaScript/Node.js пакет, который выбирает наиболее вероятный ответ или набор ответов на НМО-вопрос по PDF-файлу с медицинскими или клиническими рекомендациями.
|
|
6
|
+
|
|
7
|
+
Runtime работает локально и не использует LLM. В inference нет ChatGPT, OpenAI API, Anthropic, Gemini, HuggingFace inference, transformer-моделей или внешних интеллектуальных сервисов. Алгоритм основан на извлечении текста PDF, нормализации, поиске, структурных эвристиках, скоринге и evidence-фрагментах из PDF.
|
|
8
|
+
|
|
9
|
+
## Что делает пакет
|
|
10
|
+
|
|
11
|
+
- Принимает PDF медицинских рекомендаций, вопрос и варианты ответа.
|
|
12
|
+
- Извлекает текст из PDF через `pdfjs-dist`.
|
|
13
|
+
- Нормализует русский медицинский текст, PDF-артефакты, греческие буквы, числовые ссылки, дозировки и частые OCR-искажения.
|
|
14
|
+
- Считает score для каждого варианта ответа.
|
|
15
|
+
- Поддерживает `single` и `multi` вопросы.
|
|
16
|
+
- Возвращает выбранные ответы, confidence, score по вариантам, raw score, evidence из PDF и метаданные.
|
|
17
|
+
- Работает в Node.js, browser bundle и Chrome-extension окружениях.
|
|
18
|
+
|
|
19
|
+
## Текущие метрики
|
|
20
|
+
|
|
21
|
+
Цифры получены на локальном корпусе PDF-групп с answer key. Это не гарантия качества на любом новом PDF, а текущий ориентир после финального прогона.
|
|
22
|
+
|
|
23
|
+
| Набор | Exact accuracy | Single-answer | Multi-answer exact set |
|
|
24
|
+
| --- | ---: | ---: | ---: |
|
|
25
|
+
| Все keyed cases | `73.53%` (`2069/2814`) | `81.17%` (`1573/1938`) | `56.62%` (`496/876`) |
|
|
26
|
+
| Holdout split | `83.79%` (`486/580`) | `87.39%` | `72.92%` |
|
|
27
|
+
| Dev split | `77.14%` (`388/503`) | `83.09%` | `63.64%` |
|
|
28
|
+
|
|
29
|
+
Для `single` правильным считается только точный выбор одного ответа. Для `multi` правильным считается только полное совпадение множества ответов, поэтому multi-метрика строже и обычно ниже.
|
|
30
|
+
|
|
31
|
+
## Установка
|
|
32
|
+
|
|
33
|
+
Из npm, когда пакет опубликован:
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
npm install med-pdf-nmo
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Напрямую из Git HTTPS URL:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
npm install git+https://github.com/lKolabrodl/med-pdf-nmo.git#main
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Или в `package.json`:
|
|
46
|
+
|
|
47
|
+
```json
|
|
48
|
+
{
|
|
49
|
+
"dependencies": {
|
|
50
|
+
"med-pdf-nmo": "git+https://github.com/lKolabrodl/med-pdf-nmo.git#main"
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
При установке из Git npm выполнит `prepare`, поэтому пакет сам соберет `dist`.
|
|
56
|
+
|
|
57
|
+
## Browser / React / Chrome Extension
|
|
58
|
+
|
|
59
|
+
Для браузерного окружения используй browser entrypoint:
|
|
60
|
+
|
|
61
|
+
```ts
|
|
62
|
+
import { answerQuestion } from "med-pdf-nmo/browser";
|
|
63
|
+
|
|
64
|
+
const result = await answerQuestion(new Uint8Array(pdfData.slice(0)), {
|
|
65
|
+
question,
|
|
66
|
+
variants,
|
|
67
|
+
type: isSingle ? "single" : "multi",
|
|
68
|
+
});
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Browser entrypoint уже содержит и регистрирует PDF.js внутри пакета. В обычном React, Vite, Webpack или Chrome-extension коде не нужно отдельно импортировать `pdfjs-dist`, настраивать `GlobalWorkerOptions.workerSrc` или передавать `pdfjsLib` в каждый вызов.
|
|
72
|
+
|
|
73
|
+
## Подключение через script tag
|
|
74
|
+
|
|
75
|
+
Для прямого подключения в браузере:
|
|
76
|
+
|
|
77
|
+
```html
|
|
78
|
+
<script src="./dist/med-pdf-nmo.browser.js"></script>
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Глобальный объект:
|
|
82
|
+
|
|
83
|
+
```html
|
|
84
|
+
<input id="pdf" type="file" accept="application/pdf" />
|
|
85
|
+
|
|
86
|
+
<script>
|
|
87
|
+
document.querySelector("#pdf").addEventListener("change", async (event) => {
|
|
88
|
+
const file = event.target.files[0];
|
|
89
|
+
|
|
90
|
+
const result = await MedPdfNmo.answerQuestion(file, {
|
|
91
|
+
question: "Текст вопроса",
|
|
92
|
+
variants: ["Ответ A", "Ответ B", "Ответ C"],
|
|
93
|
+
type: "single"
|
|
94
|
+
});
|
|
95
|
+
|
|
96
|
+
console.log(result.selectedIds, result.selected, result.confidence);
|
|
97
|
+
});
|
|
98
|
+
</script>
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
Для публичного GitHub-репозитория можно использовать CDN:
|
|
102
|
+
|
|
103
|
+
```html
|
|
104
|
+
<script src="https://cdn.jsdelivr.net/gh/lKolabrodl/med-pdf-nmo@main/dist/med-pdf-nmo.browser.js"></script>
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## Node.js
|
|
108
|
+
|
|
109
|
+
```js
|
|
110
|
+
import fs from "node:fs/promises";
|
|
111
|
+
import { answerQuestion } from "med-pdf-nmo";
|
|
112
|
+
|
|
113
|
+
const pdfBuffer = await fs.readFile("./doc.pdf");
|
|
114
|
+
|
|
115
|
+
const result = await answerQuestion(pdfBuffer, {
|
|
116
|
+
question: "Какой препарат показан пациенту?",
|
|
117
|
+
variants: ["Ответ A", "Ответ B", "Ответ C", "Ответ D"],
|
|
118
|
+
type: "single"
|
|
119
|
+
});
|
|
120
|
+
|
|
121
|
+
console.log(result.selectedIds);
|
|
122
|
+
console.log(result.selected);
|
|
123
|
+
console.log(result.confidence);
|
|
124
|
+
console.log(result.evidence);
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
В Node.js PDF можно передавать как `Buffer`, `Uint8Array`, `ArrayBuffer` или URL-строку.
|
|
128
|
+
|
|
129
|
+
## API
|
|
130
|
+
|
|
131
|
+
### `answerQuestion(pdf, options)`
|
|
132
|
+
|
|
133
|
+
```ts
|
|
134
|
+
const result = await answerQuestion(pdf, {
|
|
135
|
+
question: "Текст вопроса",
|
|
136
|
+
variants: ["Ответ A", "Ответ B", "Ответ C"],
|
|
137
|
+
type: "single"
|
|
138
|
+
});
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
`pdf` может быть:
|
|
142
|
+
|
|
143
|
+
- `File`
|
|
144
|
+
- `Blob`
|
|
145
|
+
- `Buffer`
|
|
146
|
+
- `ArrayBuffer`
|
|
147
|
+
- `Uint8Array`
|
|
148
|
+
- URL-строка
|
|
149
|
+
- объект с методом `arrayBuffer()`
|
|
150
|
+
|
|
151
|
+
`options`:
|
|
152
|
+
|
|
153
|
+
- `question`: текст вопроса.
|
|
154
|
+
- `variants`: варианты ответа.
|
|
155
|
+
- `answers`: алиас для `variants`.
|
|
156
|
+
- `type`: `"single"` или `"multi"`.
|
|
157
|
+
- `mode`: алиас для `type`.
|
|
158
|
+
- `cacheKey`: необязательный ключ кеша для текста PDF.
|
|
159
|
+
- `pdfjsLib`: необязательная явная передача PDF.js модуля.
|
|
160
|
+
|
|
161
|
+
Варианты можно передавать строками:
|
|
162
|
+
|
|
163
|
+
```js
|
|
164
|
+
variants: ["Ответ A", "Ответ B", "Ответ C"]
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
Или объектами со стабильными ID:
|
|
168
|
+
|
|
169
|
+
```js
|
|
170
|
+
variants: [
|
|
171
|
+
{ id: "A", text: "Ответ A" },
|
|
172
|
+
{ id: "B", text: "Ответ B" },
|
|
173
|
+
{ id: "C", text: "Ответ C" }
|
|
174
|
+
]
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
### Результат
|
|
178
|
+
|
|
179
|
+
```js
|
|
180
|
+
{
|
|
181
|
+
selected: ["Ответ B"],
|
|
182
|
+
selectedIds: ["B"],
|
|
183
|
+
mode: "single",
|
|
184
|
+
confidence: 0.73,
|
|
185
|
+
scores: [
|
|
186
|
+
{ id: "A", variant: "Ответ A", score: 0.12, raw: 0.41 },
|
|
187
|
+
{ id: "B", variant: "Ответ B", score: 0.73, raw: 1.92 }
|
|
188
|
+
],
|
|
189
|
+
evidence: [],
|
|
190
|
+
meta: {},
|
|
191
|
+
raw: {}
|
|
192
|
+
}
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
Главные поля:
|
|
196
|
+
|
|
197
|
+
- `selected`: выбранные тексты ответов.
|
|
198
|
+
- `selectedIds`: ID выбранных ответов.
|
|
199
|
+
- `confidence`: относительная уверенность.
|
|
200
|
+
- `scores`: score по всем вариантам.
|
|
201
|
+
- `evidence`: найденные фрагменты PDF.
|
|
202
|
+
- `raw`: низкоуровневый результат predictor.
|
|
203
|
+
|
|
204
|
+
## Multi-answer вопросы
|
|
205
|
+
|
|
206
|
+
```js
|
|
207
|
+
const result = await answerQuestion(pdfBuffer, {
|
|
208
|
+
question: "Какие утверждения верны?",
|
|
209
|
+
variants: [
|
|
210
|
+
{ id: "A", text: "Утверждение A" },
|
|
211
|
+
{ id: "B", text: "Утверждение B" },
|
|
212
|
+
{ id: "C", text: "Утверждение C" },
|
|
213
|
+
{ id: "D", text: "Утверждение D" }
|
|
214
|
+
],
|
|
215
|
+
type: "multi"
|
|
216
|
+
});
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
В `selectedIds` будет массив выбранных ID.
|
|
220
|
+
|
|
221
|
+
## Низкоуровневые exports
|
|
222
|
+
|
|
223
|
+
```js
|
|
224
|
+
import {
|
|
225
|
+
predict,
|
|
226
|
+
answerQuestion,
|
|
227
|
+
setPdfJsLib,
|
|
228
|
+
clearPredictorCache
|
|
229
|
+
} from "med-pdf-nmo";
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
- `answerQuestion`: удобный высокоуровневый API.
|
|
233
|
+
- `predict`: низкоуровневый predictor API.
|
|
234
|
+
- `setPdfJsLib`: ручная настройка PDF.js.
|
|
235
|
+
- `clearPredictorCache`: очистка runtime-кеша predictor.
|
|
236
|
+
|
|
237
|
+
## CLI
|
|
238
|
+
|
|
239
|
+
После установки пакет добавляет команду:
|
|
240
|
+
|
|
241
|
+
```bash
|
|
242
|
+
med-pdf-nmo --help
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
Пример:
|
|
246
|
+
|
|
247
|
+
```bash
|
|
248
|
+
med-pdf-nmo --pdf doc.pdf --question "Текст вопроса" --mode single --answer A="Ответ A" --answer B="Ответ B"
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
Локально в репозитории:
|
|
252
|
+
|
|
253
|
+
```bash
|
|
254
|
+
npm run predict -- --pdf doc.pdf --question "Текст вопроса" --mode single --answer A="Ответ A" --answer B="Ответ B"
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
## Сборка
|
|
258
|
+
|
|
259
|
+
```bash
|
|
260
|
+
npm install
|
|
261
|
+
npm run build
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
Сборка создает:
|
|
265
|
+
|
|
266
|
+
- `dist/index.js`: основной ESM entrypoint.
|
|
267
|
+
- `dist/index.d.ts`: TypeScript-типы.
|
|
268
|
+
- `dist/med-pdf-nmo.browser.js`: браузерный global bundle `MedPdfNmo`.
|
|
269
|
+
- `dist/med-pdf-nmo.browser.mjs`: браузерный ESM bundle с PDF.js внутри.
|
|
270
|
+
- `dist/browser-shims/*`: browser alias targets для Node built-ins.
|
|
271
|
+
- `dist/cli.js`: CLI entrypoint.
|
|
272
|
+
|
|
273
|
+
## Проверки разработки
|
|
274
|
+
|
|
275
|
+
```bash
|
|
276
|
+
npm test
|
|
277
|
+
npm run typecheck
|
|
278
|
+
npm run build
|
|
279
|
+
npm pack --dry-run
|
|
280
|
+
npm run eval
|
|
281
|
+
npm run eval:holdout
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
`npm run eval` и `npm run eval:holdout` - developer tooling. Они читают локальные тестовые PDF и answer key, чтобы посчитать accuracy.
|
|
285
|
+
|
|
286
|
+
Runtime API пакета во время inference не читает eval-файлы, split-файлы, правильные ответы или тестовые fixtures.
|
|
287
|
+
|
|
288
|
+
## Ограничения
|
|
289
|
+
|
|
290
|
+
- Пакет не является медицинским советником и не заменяет эксперта.
|
|
291
|
+
- Качество зависит от того, насколько хорошо PDF.js извлек текст из конкретного PDF.
|
|
292
|
+
- Сканированные PDF без текстового слоя могут потребовать OCR до передачи в пакет.
|
|
293
|
+
- Алгоритм выбирает вероятные ответы по PDF evidence, но не гарантирует абсолютную правильность.
|
|
294
|
+
- Runtime inference не использует LLM и не обращается к внешним интеллектуальным сервисам.
|
|
295
|
+
|
|
296
|
+
## Лицензия
|
|
297
|
+
|
|
298
|
+
MIT. Подробнее см. [LICENSE](./LICENSE).
|
package/dist/bm25.d.ts
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* A document stored in the BM25 search index.
|
|
3
|
+
*
|
|
4
|
+
* Predictor chunks add extra metadata: page, type, text, and id.
|
|
5
|
+
*/
|
|
6
|
+
export type BM25Document = {
|
|
7
|
+
tokens?: string[];
|
|
8
|
+
[key: string]: unknown;
|
|
9
|
+
};
|
|
10
|
+
/**
|
|
11
|
+
* A small BM25 implementation for local search over PDF chunks.
|
|
12
|
+
*/
|
|
13
|
+
export declare class BM25Index {
|
|
14
|
+
documents: BM25Document[];
|
|
15
|
+
k1: number;
|
|
16
|
+
b: number;
|
|
17
|
+
docFreq: Map<string, number>;
|
|
18
|
+
termFreqs: Map<string, number>[];
|
|
19
|
+
lengths: number[];
|
|
20
|
+
avgdl: number;
|
|
21
|
+
/**
|
|
22
|
+
* Builds an index over already-tokenized documents.
|
|
23
|
+
*
|
|
24
|
+
* @param documents Documents with optional `tokens` arrays.
|
|
25
|
+
* @param options BM25 tuning parameters (`k1`, `b`).
|
|
26
|
+
*/
|
|
27
|
+
constructor(documents: BM25Document[], { k1, b }?: {
|
|
28
|
+
k1?: number;
|
|
29
|
+
b?: number;
|
|
30
|
+
});
|
|
31
|
+
/**
|
|
32
|
+
* Computes the inverse document frequency for a single normalized token.
|
|
33
|
+
*/
|
|
34
|
+
idf(token: string): number;
|
|
35
|
+
/**
|
|
36
|
+
* Computes the score of a tokenized query against one document in the index.
|
|
37
|
+
*/
|
|
38
|
+
scoreTokens(queryTokens: string[], docIndex: number): number;
|
|
39
|
+
/**
|
|
40
|
+
* Searches the index with raw text or a pre-tokenized query.
|
|
41
|
+
*
|
|
42
|
+
* @returns The best-matching chunks with a positive BM25 score.
|
|
43
|
+
*/
|
|
44
|
+
search(query: string | string[], { limit }?: {
|
|
45
|
+
limit?: number;
|
|
46
|
+
}): any[];
|
|
47
|
+
}
|