@precisa-saude/fhir-ocr-utils 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +85 -0
- package/dist/index.cjs +126 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +35 -0
- package/dist/index.d.ts +35 -0
- package/dist/index.js +126 -0
- package/dist/index.js.map +1 -0
- package/package.json +48 -0
package/README.md
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# @precisa-saude/fhir-ocr-utils
|
|
2
|
+
|
|
3
|
+
Utilitários de ancoragem OCR para extração de biomarcadores de PDFs de resultados laboratoriais.
|
|
4
|
+
|
|
5
|
+
## Instalação
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
npm install @precisa-saude/fhir-ocr-utils
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
> **Nota:** Requer `@precisa-saude/fhir` como peer dependency.
|
|
12
|
+
|
|
13
|
+
## Uso rápido
|
|
14
|
+
|
|
15
|
+
### Encontrar biomarcadores em texto OCR
|
|
16
|
+
|
|
17
|
+
```ts
|
|
18
|
+
import { findBiomarkersInText, getMatchedCodes } from '@precisa-saude/fhir-ocr-utils';
|
|
19
|
+
|
|
20
|
+
const ocrText = `
|
|
21
|
+
HEMOGRAMA COMPLETO
|
|
22
|
+
Hemoglobina: 14.2 g/dL
|
|
23
|
+
Glicose Jejum: 95 mg/dL
|
|
24
|
+
Colesterol Total: 195 mg/dL
|
|
25
|
+
HDL: 55 mg/dL
|
|
26
|
+
Triglicerídeos: 120 mg/dL
|
|
27
|
+
`;
|
|
28
|
+
|
|
29
|
+
const result = findBiomarkersInText(ocrText);
|
|
30
|
+
|
|
31
|
+
console.log(result.matches);
|
|
32
|
+
// [
|
|
33
|
+
// { code: 'Hgb', loinc: '718-7', matchedName: 'Hemoglobina', ... },
|
|
34
|
+
// { code: 'Glucose', loinc: '2345-7', matchedName: 'Glicose', ... },
|
|
35
|
+
// { code: 'Cholesterol', loinc: '2093-3', matchedName: 'Colesterol Total', ... },
|
|
36
|
+
// ...
|
|
37
|
+
// ]
|
|
38
|
+
|
|
39
|
+
const codes = getMatchedCodes(result);
|
|
40
|
+
// ['Hgb', 'Glucose', 'Cholesterol', 'HDL', 'Triglycerides']
|
|
41
|
+
|
|
42
|
+
console.log(result.filteredReference);
|
|
43
|
+
// Referência LLM filtrada apenas com os biomarcadores encontrados
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Padrão anti-alucinação
|
|
47
|
+
|
|
48
|
+
Este pacote implementa um padrão de **ancoragem antes do LLM** para prevenir alucinações na extração de dados laboratoriais:
|
|
49
|
+
|
|
50
|
+
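1. **Ancoragem (este pacote):** Escaneia o texto OCR bruto procurando nomes de biomarcadores conhecidos, usando correspondência exata de strings sobre o texto normalizado (sem acentos, em minúsculas e com espaços colapsados) contra as 183+ definições de `@precisa-saude/fhir`. Nomes curtos (até 4 caracteres) só são aceitos quando aparecem como palavras inteiras.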
|
|
51
|
+
|
|
52
|
+
2. **Filtragem:** Gera uma referência LLM filtrada (`filteredReference`) contendo apenas os biomarcadores que foram realmente encontrados no texto. O LLM só pode extrair valores para biomarcadores presentes nesta lista.
|
|
53
|
+
|
|
54
|
+
3. **Extração (LLM):** O modelo de linguagem recebe o texto OCR junto com a referência filtrada, restringindo sua saída apenas aos biomarcadores ancorados.
|
|
55
|
+
|
|
56
|
+
Este fluxo em dois estágios garante que o LLM não invente biomarcadores que não estão presentes no documento original.
|
|
57
|
+
|
|
58
|
+
```
|
|
59
|
+
PDF → OCR → findBiomarkersInText() → filteredReference → LLM → valores extraídos
|
|
60
|
+
| |
|
|
61
|
+
+── restringe quais biomarcadores ─────────────────────+
|
|
62
|
+
o LLM pode extrair
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## API
|
|
66
|
+
|
|
67
|
+
### `findBiomarkersInText(ocrText: string): AnchorResult`
|
|
68
|
+
|
|
69
|
+
Escaneia texto OCR e retorna todos os biomarcadores encontrados.
|
|
70
|
+
|
|
71
|
+
- `result.matches` — Lista de biomarcadores encontrados com código, LOINC (quando disponível), nome correspondido, confiança e posição no texto normalizado
|
|
72
|
+
- `result.filteredReference` — Referência formatada para enviar ao LLM
|
|
73
|
+
- `result.stats` — Estatísticas de execução (total de padrões, encontrados, tempo)
|
|
74
|
+
|
|
75
|
+
### `getMatchedCodes(result: AnchorResult): string[]`
|
|
76
|
+
|
|
77
|
+
Extrai a lista de códigos de biomarcadores de um resultado de ancoragem.
|
|
78
|
+
|
|
79
|
+
## Aviso médico
|
|
80
|
+
|
|
81
|
+
Este pacote fornece utilitários de processamento de texto para extração de dados laboratoriais. **Não substitui orientação médica profissional.** Consulte o [DISCLAIMER.md](../../DISCLAIMER.md) na raiz do repositório para detalhes completos.
|
|
82
|
+
|
|
83
|
+
## Licença
|
|
84
|
+
|
|
85
|
+
[Apache-2.0](../../LICENSE)
|
package/dist/index.cjs
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"use strict";Object.defineProperty(exports, "__esModule", {value: true});// src/anchor.ts
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
var _fhir = require('@precisa-saude/fhir');
|
|
6
|
+
function normalize(text) {
|
|
7
|
+
return text.normalize("NFD").replace(/[\u0300-\u036f]/g, "").toLowerCase().replace(/\s+/g, " ");
|
|
8
|
+
}
|
|
9
|
+
var UNAMBIGUOUS_SHORT_NAMES = /* @__PURE__ */ new Set([
|
|
10
|
+
"hdl",
|
|
11
|
+
"ldl",
|
|
12
|
+
"lh",
|
|
13
|
+
"tsh",
|
|
14
|
+
"crp",
|
|
15
|
+
"pcr",
|
|
16
|
+
"ggt",
|
|
17
|
+
"alt",
|
|
18
|
+
"ast",
|
|
19
|
+
"bun",
|
|
20
|
+
"wbc",
|
|
21
|
+
"rbc",
|
|
22
|
+
"mcv",
|
|
23
|
+
"mch",
|
|
24
|
+
"rdw",
|
|
25
|
+
"mpv",
|
|
26
|
+
"psa",
|
|
27
|
+
"fsh",
|
|
28
|
+
"hba1c",
|
|
29
|
+
"egfr",
|
|
30
|
+
"acr",
|
|
31
|
+
"esr",
|
|
32
|
+
"vhs",
|
|
33
|
+
"bmc",
|
|
34
|
+
"bmd",
|
|
35
|
+
"vat",
|
|
36
|
+
"dxa",
|
|
37
|
+
"dmo",
|
|
38
|
+
"cmo",
|
|
39
|
+
"ffm",
|
|
40
|
+
"lbm",
|
|
41
|
+
"mlg",
|
|
42
|
+
"tav"
|
|
43
|
+
]);
|
|
44
|
+
var cachedPatterns = null;
|
|
45
|
+
var cachedNormalized = null;
|
|
46
|
+
function getPatterns() {
|
|
47
|
+
if (!cachedPatterns) {
|
|
48
|
+
cachedPatterns = _fhir.getAllSearchPatterns.call(void 0, );
|
|
49
|
+
}
|
|
50
|
+
return cachedPatterns;
|
|
51
|
+
}
|
|
52
|
+
function getNormalizedPatterns() {
|
|
53
|
+
if (!cachedNormalized) {
|
|
54
|
+
const patterns = getPatterns();
|
|
55
|
+
const map = /* @__PURE__ */ new Map();
|
|
56
|
+
for (const pattern of patterns) {
|
|
57
|
+
for (const name of pattern.names) {
|
|
58
|
+
const normalized = normalize(name);
|
|
59
|
+
const existing = map.get(normalized) || [];
|
|
60
|
+
existing.push({
|
|
61
|
+
code: pattern.code,
|
|
62
|
+
...pattern.loinc && { loinc: pattern.loinc },
|
|
63
|
+
original: name
|
|
64
|
+
});
|
|
65
|
+
map.set(normalized, existing);
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
cachedNormalized = map;
|
|
69
|
+
}
|
|
70
|
+
return cachedNormalized;
|
|
71
|
+
}
|
|
72
|
+
function findBiomarkersInText(ocrText) {
|
|
73
|
+
const startTime = Date.now();
|
|
74
|
+
const normalizedText = normalize(ocrText);
|
|
75
|
+
const matchedCodes = /* @__PURE__ */ new Set();
|
|
76
|
+
const matches = [];
|
|
77
|
+
const normalizedPatterns = getNormalizedPatterns();
|
|
78
|
+
for (const [normalizedName, entries] of normalizedPatterns) {
|
|
79
|
+
if (normalizedName.length < 3 && !UNAMBIGUOUS_SHORT_NAMES.has(normalizedName)) {
|
|
80
|
+
continue;
|
|
81
|
+
}
|
|
82
|
+
let position = -1;
|
|
83
|
+
if (normalizedName.length <= 4) {
|
|
84
|
+
const regex = new RegExp(`\\b${normalizedName}\\b`);
|
|
85
|
+
const match = regex.exec(normalizedText);
|
|
86
|
+
if (match) {
|
|
87
|
+
position = match.index;
|
|
88
|
+
}
|
|
89
|
+
} else {
|
|
90
|
+
position = normalizedText.indexOf(normalizedName);
|
|
91
|
+
}
|
|
92
|
+
if (position !== -1) {
|
|
93
|
+
for (const entry of entries) {
|
|
94
|
+
if (!matchedCodes.has(entry.code)) {
|
|
95
|
+
matchedCodes.add(entry.code);
|
|
96
|
+
matches.push({
|
|
97
|
+
code: entry.code,
|
|
98
|
+
confidence: 1,
|
|
99
|
+
loinc: entry.loinc,
|
|
100
|
+
matchedName: entry.original,
|
|
101
|
+
position
|
|
102
|
+
});
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
const scanTimeMs = Date.now() - startTime;
|
|
108
|
+
const matchedCodesArray = Array.from(matchedCodes);
|
|
109
|
+
return {
|
|
110
|
+
filteredReference: _fhir.generateFilteredLLMReference.call(void 0, matchedCodesArray),
|
|
111
|
+
matches,
|
|
112
|
+
stats: {
|
|
113
|
+
matchedCount: matches.length,
|
|
114
|
+
scanTimeMs,
|
|
115
|
+
totalPatterns: getPatterns().length
|
|
116
|
+
}
|
|
117
|
+
};
|
|
118
|
+
}
|
|
119
|
+
function getMatchedCodes(result) {
|
|
120
|
+
return result.matches.map((m) => m.code);
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
exports.findBiomarkersInText = findBiomarkersInText; exports.getMatchedCodes = getMatchedCodes;
|
|
126
|
+
//# sourceMappingURL=index.cjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["/Users/rafael/Github/precisa-saude/fhir-brasil/packages/ocr-utils/dist/index.cjs","../src/anchor.ts"],"names":[],"mappings":"AAAA;ACQA;AAEE;AACA;AAAA,2CACK;AA0BP,SAAS,SAAA,CAAU,IAAA,EAAsB;AACvC,EAAA,OAAO,IAAA,CACJ,SAAA,CAAU,KAAK,CAAA,CACf,OAAA,CAAQ,kBAAA,EAAoB,EAAE,CAAA,CAC9B,WAAA,CAAY,CAAA,CACZ,OAAA,CAAQ,MAAA,EAAQ,GAAG,CAAA;AACxB;AAEA,IAAM,wBAAA,kBAA0B,IAAI,GAAA,CAAI;AAAA,EACtC,KAAA;AAAA,EAAO,KAAA;AAAA,EAAO,IAAA;AAAA,EAAM,KAAA;AAAA,EAAO,KAAA;AAAA,EAAO,KAAA;AAAA,EAAO,KAAA;AAAA,EAAO,KAAA;AAAA,EAAO,KAAA;AAAA,EAAO,KAAA;AAAA,EAC9D,KAAA;AAAA,EAAO,KAAA;AAAA,EAAO,KAAA;AAAA,EAAO,KAAA;AAAA,EAAO,KAAA;AAAA,EAAO,KAAA;AAAA,EAAO,KAAA;AAAA,EAAO,KAAA;AAAA,EAAO,OAAA;AAAA,EAAS,MAAA;AAAA,EACjE,KAAA;AAAA,EAAO,KAAA;AAAA,EAAO,KAAA;AAAA,EAAO,KAAA;AAAA,EAAO,KAAA;AAAA,EAAO,KAAA;AAAA,EAAO,KAAA;AAAA,EAAO,KAAA;AAAA,EAAO,KAAA;AAAA,EAAO,KAAA;AAAA,EAC/D,KAAA;AAAA,EAAO,KAAA;AAAA,EAAO;AAChB,CAAC,CAAA;AAID,IAAI,eAAA,EAAkD,IAAA;AACtD,IAAI,iBAAA,EAAuD,IAAA;AAE3D,SAAS,WAAA,CAAA,EAAwC;AAC/C,EAAA,GAAA,CAAI,CAAC,cAAA,EAAgB;AACnB,IAAA,eAAA,EAAiB,wCAAA,CAAqB;AAAA,EACxC;AACA,EAAA,OAAO,cAAA;AACT;AAEA,SAAS,qBAAA,CAAA,EAAqD;AAC5D,EAAA,GAAA,CAAI,CAAC,gBAAA,EAAkB;AACrB,IAAA,MAAM,SAAA,EAAW,WAAA,CAAY,CAAA;AAC7B,IAAA,MAAM,IAAA,kBAAM,IAAI,GAAA,CAA4B,CAAA;AAC5C,IAAA,IAAA,CAAA,MAAW,QAAA,GAAW,QAAA,EAAU;AAC9B,MAAA,IAAA,CAAA,MAAW,KAAA,GAAQ,OAAA,CAAQ,KAAA,EAAO;AAChC,QAAA,MAAM,WAAA,EAAa,SAAA,CAAU,IAAI,CAAA;AACjC,QAAA,MAAM,SAAA,EAAW,GAAA,CAAI,GAAA,CAAI,UAAU,EAAA,GAAK,CAAC,CAAA;AACzC,QAAA,QAAA,CAAS,IAAA,CAAK;AAAA,UACZ,IAAA,EAAM,OAAA,CAAQ,IAAA;AAAA,UACd,GAAI,OAAA,CAAQ,MAAA,GAAS,EAAE,KAAA,EAAO,OAAA,CAAQ,MAAM,CAAA;AAAA,UAC5C,QAAA,EAAU;AAAA,QACZ,CAAC,CAAA;AACD,QAAA,GAAA,CAAI,GAAA,CAAI,UAAA,EAAY,QAAQ,CAAA;AAAA,MAC9B;AAAA,IACF;AACA,IAAA,iBAAA,EAAmB,GAAA;AAAA,EACrB;AACA,EAAA,OAAO,gBAAA;AACT;AAOO,SAAS,oBAAA,CAAqB,OAAA,EAA+B;AAClE,EAAA,MAAM,UAAA,EAAY,IAAA,CAAK,GAAA,CAAI,CAAA;AAC3B,EAAA,MAAM,eAAA,EAAiB,SAAA,CAAU,OAAO,CAAA;AACxC,EAAA,MAAM,aAAA,kBAAe,IAAI,GAAA,CAAY,CAAA;AACrC,EAAA,MAAM,QAAA,EAAyB,CAAC,CAAA;AAChC,
EAAA,MAAM,mBAAA,EAAqB,qBAAA,CAAsB,CAAA;AAEjD,EAAA,IAAA,CAAA,MAAW,CAAC,cAAA,EAAgB,OAAO,EAAA,GAAK,kBAAA,EAAoB;AAC1D,IAAA,GAAA,CAAI,cAAA,CAAe,OAAA,EAAS,EAAA,GAAK,CAAC,uBAAA,CAAwB,GAAA,CAAI,cAAc,CAAA,EAAG;AAC7E,MAAA,QAAA;AAAA,IACF;AAEA,IAAA,IAAI,SAAA,EAAW,CAAA,CAAA;AACf,IAAA,GAAA,CAAI,cAAA,CAAe,OAAA,GAAU,CAAA,EAAG;AAC9B,MAAA,MAAM,MAAA,EAAQ,IAAI,MAAA,CAAO,CAAA,GAAA,EAAM,cAAc,CAAA,GAAA,CAAK,CAAA;AAClD,MAAA,MAAM,MAAA,EAAQ,KAAA,CAAM,IAAA,CAAK,cAAc,CAAA;AACvC,MAAA,GAAA,CAAI,KAAA,EAAO;AACT,QAAA,SAAA,EAAW,KAAA,CAAM,KAAA;AAAA,MACnB;AAAA,IACF,EAAA,KAAO;AACL,MAAA,SAAA,EAAW,cAAA,CAAe,OAAA,CAAQ,cAAc,CAAA;AAAA,IAClD;AAEA,IAAA,GAAA,CAAI,SAAA,IAAa,CAAA,CAAA,EAAI;AACnB,MAAA,IAAA,CAAA,MAAW,MAAA,GAAS,OAAA,EAAS;AAC3B,QAAA,GAAA,CAAI,CAAC,YAAA,CAAa,GAAA,CAAI,KAAA,CAAM,IAAI,CAAA,EAAG;AACjC,UAAA,YAAA,CAAa,GAAA,CAAI,KAAA,CAAM,IAAI,CAAA;AAC3B,UAAA,OAAA,CAAQ,IAAA,CAAK;AAAA,YACX,IAAA,EAAM,KAAA,CAAM,IAAA;AAAA,YACZ,UAAA,EAAY,CAAA;AAAA,YACZ,KAAA,EAAO,KAAA,CAAM,KAAA;AAAA,YACb,WAAA,EAAa,KAAA,CAAM,QAAA;AAAA,YACnB;AAAA,UACF,CAAC,CAAA;AAAA,QACH;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,EAAA,MAAM,WAAA,EAAa,IAAA,CAAK,GAAA,CAAI,EAAA,EAAI,SAAA;AAChC,EAAA,MAAM,kBAAA,EAAoB,KAAA,CAAM,IAAA,CAAK,YAAY,CAAA;AAEjD,EAAA,OAAO;AAAA,IACL,iBAAA,EAAmB,gDAAA,iBAA8C,CAAA;AAAA,IACjE,OAAA;AAAA,IACA,KAAA,EAAO;AAAA,MACL,YAAA,EAAc,OAAA,CAAQ,MAAA;AAAA,MACtB,UAAA;AAAA,MACA,aAAA,EAAe,WAAA,CAAY,CAAA,CAAE;AAAA,IAC/B;AAAA,EACF,CAAA;AACF;AAKO,SAAS,eAAA,CAAgB,MAAA,EAAgC;AAC9D,EAAA,OAAO,MAAA,CAAO,OAAA,CAAQ,GAAA,CAAI,CAAC,CAAA,EAAA,GAAM,CAAA,CAAE,IAAI,CAAA;AACzC;AD5BA;AACE;AACA;AACF,+FAAC","file":"/Users/rafael/Github/precisa-saude/fhir-brasil/packages/ocr-utils/dist/index.cjs","sourcesContent":[null,"/**\n * OCR Anchor — Biomarker text anchoring\n *\n * Scans OCR text for biomarker names BEFORE sending to LLM.\n * This prevents hallucination by constraining what biomarkers\n * the LLM is allowed to extract.\n */\n\nimport {\n type BiomarkerSearchPattern,\n generateFilteredLLMReference,\n getAllSearchPatterns,\n} from 
'@precisa-saude/fhir';\n\nexport interface AnchorMatch {\n code: string;\n confidence: number;\n loinc?: string;\n matchedName: string;\n position: number;\n}\n\nexport interface AnchorResult {\n filteredReference: string;\n matches: AnchorMatch[];\n stats: {\n totalPatterns: number;\n matchedCount: number;\n scanTimeMs: number;\n };\n}\n\n/**\n * Normalize text for comparison:\n * - Removes diacritics (ã→a, ç→c, é→e)\n * - Converts to lowercase\n * - Normalizes whitespace\n */\nfunction normalize(text: string): string {\n return text\n .normalize('NFD')\n .replace(/[\\u0300-\\u036f]/g, '')\n .toLowerCase()\n .replace(/\\s+/g, ' ');\n}\n\nconst UNAMBIGUOUS_SHORT_NAMES = new Set([\n 'hdl', 'ldl', 'lh', 'tsh', 'crp', 'pcr', 'ggt', 'alt', 'ast', 'bun',\n 'wbc', 'rbc', 'mcv', 'mch', 'rdw', 'mpv', 'psa', 'fsh', 'hba1c', 'egfr',\n 'acr', 'esr', 'vhs', 'bmc', 'bmd', 'vat', 'dxa', 'dmo', 'cmo', 'ffm',\n 'lbm', 'mlg', 'tav',\n]);\n\ntype PatternEntry = { original: string; code: string; loinc?: string };\n\nlet cachedPatterns: BiomarkerSearchPattern[] | null = null;\nlet cachedNormalized: Map<string, PatternEntry[]> | null = null;\n\nfunction getPatterns(): BiomarkerSearchPattern[] {\n if (!cachedPatterns) {\n cachedPatterns = getAllSearchPatterns();\n }\n return cachedPatterns;\n}\n\nfunction getNormalizedPatterns(): Map<string, PatternEntry[]> {\n if (!cachedNormalized) {\n const patterns = getPatterns();\n const map = new Map<string, PatternEntry[]>();\n for (const pattern of patterns) {\n for (const name of pattern.names) {\n const normalized = normalize(name);\n const existing = map.get(normalized) || [];\n existing.push({\n code: pattern.code,\n ...(pattern.loinc && { loinc: pattern.loinc }),\n original: name,\n });\n map.set(normalized, existing);\n }\n }\n cachedNormalized = map;\n }\n return cachedNormalized;\n}\n\n/**\n * Find all biomarker names present in OCR text.\n * Uses exact string matching on normalized text.\n * Returns unique matches (same biomarker won't 
be matched twice).\n */\nexport function findBiomarkersInText(ocrText: string): AnchorResult {\n const startTime = Date.now();\n const normalizedText = normalize(ocrText);\n const matchedCodes = new Set<string>();\n const matches: AnchorMatch[] = [];\n const normalizedPatterns = getNormalizedPatterns();\n\n for (const [normalizedName, entries] of normalizedPatterns) {\n if (normalizedName.length < 3 && !UNAMBIGUOUS_SHORT_NAMES.has(normalizedName)) {\n continue;\n }\n\n let position = -1;\n if (normalizedName.length <= 4) {\n const regex = new RegExp(`\\\\b${normalizedName}\\\\b`);\n const match = regex.exec(normalizedText);\n if (match) {\n position = match.index;\n }\n } else {\n position = normalizedText.indexOf(normalizedName);\n }\n\n if (position !== -1) {\n for (const entry of entries) {\n if (!matchedCodes.has(entry.code)) {\n matchedCodes.add(entry.code);\n matches.push({\n code: entry.code,\n confidence: 1.0,\n loinc: entry.loinc,\n matchedName: entry.original,\n position,\n });\n }\n }\n }\n }\n\n const scanTimeMs = Date.now() - startTime;\n const matchedCodesArray = Array.from(matchedCodes);\n\n return {\n filteredReference: generateFilteredLLMReference(matchedCodesArray),\n matches,\n stats: {\n matchedCount: matches.length,\n scanTimeMs,\n totalPatterns: getPatterns().length,\n },\n };\n}\n\n/**\n * Get the list of matched biomarker codes from an anchor result.\n */\nexport function getMatchedCodes(result: AnchorResult): string[] {\n return result.matches.map((m) => m.code);\n}\n"]}
|
package/dist/index.d.cts
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OCR Anchor — Biomarker text anchoring
|
|
3
|
+
*
|
|
4
|
+
* Scans OCR text for biomarker names BEFORE sending to LLM.
|
|
5
|
+
* This prevents hallucination by constraining what biomarkers
|
|
6
|
+
* the LLM is allowed to extract.
|
|
7
|
+
*/
|
|
8
|
+
interface AnchorMatch {
|
|
9
|
+
code: string;
|
|
10
|
+
confidence: number;
|
|
11
|
+
loinc?: string;
|
|
12
|
+
matchedName: string;
|
|
13
|
+
position: number;
|
|
14
|
+
}
|
|
15
|
+
interface AnchorResult {
|
|
16
|
+
filteredReference: string;
|
|
17
|
+
matches: AnchorMatch[];
|
|
18
|
+
stats: {
|
|
19
|
+
totalPatterns: number;
|
|
20
|
+
matchedCount: number;
|
|
21
|
+
scanTimeMs: number;
|
|
22
|
+
};
|
|
23
|
+
}
|
|
24
|
+
/**
|
|
25
|
+
* Find all biomarker names present in OCR text.
|
|
26
|
+
* Uses exact string matching on normalized text.
|
|
27
|
+
* Returns unique matches (same biomarker won't be matched twice).
|
|
28
|
+
*/
|
|
29
|
+
declare function findBiomarkersInText(ocrText: string): AnchorResult;
|
|
30
|
+
/**
|
|
31
|
+
* Get the list of matched biomarker codes from an anchor result.
|
|
32
|
+
*/
|
|
33
|
+
declare function getMatchedCodes(result: AnchorResult): string[];
|
|
34
|
+
|
|
35
|
+
export { type AnchorMatch, type AnchorResult, findBiomarkersInText, getMatchedCodes };
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OCR Anchor — Biomarker text anchoring
|
|
3
|
+
*
|
|
4
|
+
* Scans OCR text for biomarker names BEFORE sending to LLM.
|
|
5
|
+
* This prevents hallucination by constraining what biomarkers
|
|
6
|
+
* the LLM is allowed to extract.
|
|
7
|
+
*/
|
|
8
|
+
interface AnchorMatch {
|
|
9
|
+
code: string;
|
|
10
|
+
confidence: number;
|
|
11
|
+
loinc?: string;
|
|
12
|
+
matchedName: string;
|
|
13
|
+
position: number;
|
|
14
|
+
}
|
|
15
|
+
interface AnchorResult {
|
|
16
|
+
filteredReference: string;
|
|
17
|
+
matches: AnchorMatch[];
|
|
18
|
+
stats: {
|
|
19
|
+
totalPatterns: number;
|
|
20
|
+
matchedCount: number;
|
|
21
|
+
scanTimeMs: number;
|
|
22
|
+
};
|
|
23
|
+
}
|
|
24
|
+
/**
|
|
25
|
+
* Find all biomarker names present in OCR text.
|
|
26
|
+
* Uses exact string matching on normalized text.
|
|
27
|
+
* Returns unique matches (same biomarker won't be matched twice).
|
|
28
|
+
*/
|
|
29
|
+
declare function findBiomarkersInText(ocrText: string): AnchorResult;
|
|
30
|
+
/**
|
|
31
|
+
* Get the list of matched biomarker codes from an anchor result.
|
|
32
|
+
*/
|
|
33
|
+
declare function getMatchedCodes(result: AnchorResult): string[];
|
|
34
|
+
|
|
35
|
+
export { type AnchorMatch, type AnchorResult, findBiomarkersInText, getMatchedCodes };
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
// src/anchor.ts
|
|
2
|
+
import {
|
|
3
|
+
generateFilteredLLMReference,
|
|
4
|
+
getAllSearchPatterns
|
|
5
|
+
} from "@precisa-saude/fhir";
|
|
6
|
+
function normalize(text) {
|
|
7
|
+
return text.normalize("NFD").replace(/[\u0300-\u036f]/g, "").toLowerCase().replace(/\s+/g, " ");
|
|
8
|
+
}
|
|
9
|
+
var UNAMBIGUOUS_SHORT_NAMES = /* @__PURE__ */ new Set([
|
|
10
|
+
"hdl",
|
|
11
|
+
"ldl",
|
|
12
|
+
"lh",
|
|
13
|
+
"tsh",
|
|
14
|
+
"crp",
|
|
15
|
+
"pcr",
|
|
16
|
+
"ggt",
|
|
17
|
+
"alt",
|
|
18
|
+
"ast",
|
|
19
|
+
"bun",
|
|
20
|
+
"wbc",
|
|
21
|
+
"rbc",
|
|
22
|
+
"mcv",
|
|
23
|
+
"mch",
|
|
24
|
+
"rdw",
|
|
25
|
+
"mpv",
|
|
26
|
+
"psa",
|
|
27
|
+
"fsh",
|
|
28
|
+
"hba1c",
|
|
29
|
+
"egfr",
|
|
30
|
+
"acr",
|
|
31
|
+
"esr",
|
|
32
|
+
"vhs",
|
|
33
|
+
"bmc",
|
|
34
|
+
"bmd",
|
|
35
|
+
"vat",
|
|
36
|
+
"dxa",
|
|
37
|
+
"dmo",
|
|
38
|
+
"cmo",
|
|
39
|
+
"ffm",
|
|
40
|
+
"lbm",
|
|
41
|
+
"mlg",
|
|
42
|
+
"tav"
|
|
43
|
+
]);
|
|
44
|
+
var cachedPatterns = null;
|
|
45
|
+
var cachedNormalized = null;
|
|
46
|
+
function getPatterns() {
|
|
47
|
+
if (!cachedPatterns) {
|
|
48
|
+
cachedPatterns = getAllSearchPatterns();
|
|
49
|
+
}
|
|
50
|
+
return cachedPatterns;
|
|
51
|
+
}
|
|
52
|
+
function getNormalizedPatterns() {
|
|
53
|
+
if (!cachedNormalized) {
|
|
54
|
+
const patterns = getPatterns();
|
|
55
|
+
const map = /* @__PURE__ */ new Map();
|
|
56
|
+
for (const pattern of patterns) {
|
|
57
|
+
for (const name of pattern.names) {
|
|
58
|
+
const normalized = normalize(name);
|
|
59
|
+
const existing = map.get(normalized) || [];
|
|
60
|
+
existing.push({
|
|
61
|
+
code: pattern.code,
|
|
62
|
+
...pattern.loinc && { loinc: pattern.loinc },
|
|
63
|
+
original: name
|
|
64
|
+
});
|
|
65
|
+
map.set(normalized, existing);
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
cachedNormalized = map;
|
|
69
|
+
}
|
|
70
|
+
return cachedNormalized;
|
|
71
|
+
}
|
|
72
|
+
function findBiomarkersInText(ocrText) {
|
|
73
|
+
const startTime = Date.now();
|
|
74
|
+
const normalizedText = normalize(ocrText);
|
|
75
|
+
const matchedCodes = /* @__PURE__ */ new Set();
|
|
76
|
+
const matches = [];
|
|
77
|
+
const normalizedPatterns = getNormalizedPatterns();
|
|
78
|
+
for (const [normalizedName, entries] of normalizedPatterns) {
|
|
79
|
+
if (normalizedName.length < 3 && !UNAMBIGUOUS_SHORT_NAMES.has(normalizedName)) {
|
|
80
|
+
continue;
|
|
81
|
+
}
|
|
82
|
+
let position = -1;
|
|
83
|
+
if (normalizedName.length <= 4) {
|
|
84
|
+
const regex = new RegExp(`\\b${normalizedName}\\b`);
|
|
85
|
+
const match = regex.exec(normalizedText);
|
|
86
|
+
if (match) {
|
|
87
|
+
position = match.index;
|
|
88
|
+
}
|
|
89
|
+
} else {
|
|
90
|
+
position = normalizedText.indexOf(normalizedName);
|
|
91
|
+
}
|
|
92
|
+
if (position !== -1) {
|
|
93
|
+
for (const entry of entries) {
|
|
94
|
+
if (!matchedCodes.has(entry.code)) {
|
|
95
|
+
matchedCodes.add(entry.code);
|
|
96
|
+
matches.push({
|
|
97
|
+
code: entry.code,
|
|
98
|
+
confidence: 1,
|
|
99
|
+
loinc: entry.loinc,
|
|
100
|
+
matchedName: entry.original,
|
|
101
|
+
position
|
|
102
|
+
});
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
const scanTimeMs = Date.now() - startTime;
|
|
108
|
+
const matchedCodesArray = Array.from(matchedCodes);
|
|
109
|
+
return {
|
|
110
|
+
filteredReference: generateFilteredLLMReference(matchedCodesArray),
|
|
111
|
+
matches,
|
|
112
|
+
stats: {
|
|
113
|
+
matchedCount: matches.length,
|
|
114
|
+
scanTimeMs,
|
|
115
|
+
totalPatterns: getPatterns().length
|
|
116
|
+
}
|
|
117
|
+
};
|
|
118
|
+
}
|
|
119
|
+
function getMatchedCodes(result) {
|
|
120
|
+
return result.matches.map((m) => m.code);
|
|
121
|
+
}
|
|
122
|
+
export {
|
|
123
|
+
findBiomarkersInText,
|
|
124
|
+
getMatchedCodes
|
|
125
|
+
};
|
|
126
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/anchor.ts"],"sourcesContent":["/**\n * OCR Anchor — Biomarker text anchoring\n *\n * Scans OCR text for biomarker names BEFORE sending to LLM.\n * This prevents hallucination by constraining what biomarkers\n * the LLM is allowed to extract.\n */\n\nimport {\n type BiomarkerSearchPattern,\n generateFilteredLLMReference,\n getAllSearchPatterns,\n} from '@precisa-saude/fhir';\n\nexport interface AnchorMatch {\n code: string;\n confidence: number;\n loinc?: string;\n matchedName: string;\n position: number;\n}\n\nexport interface AnchorResult {\n filteredReference: string;\n matches: AnchorMatch[];\n stats: {\n totalPatterns: number;\n matchedCount: number;\n scanTimeMs: number;\n };\n}\n\n/**\n * Normalize text for comparison:\n * - Removes diacritics (ã→a, ç→c, é→e)\n * - Converts to lowercase\n * - Normalizes whitespace\n */\nfunction normalize(text: string): string {\n return text\n .normalize('NFD')\n .replace(/[\\u0300-\\u036f]/g, '')\n .toLowerCase()\n .replace(/\\s+/g, ' ');\n}\n\nconst UNAMBIGUOUS_SHORT_NAMES = new Set([\n 'hdl', 'ldl', 'lh', 'tsh', 'crp', 'pcr', 'ggt', 'alt', 'ast', 'bun',\n 'wbc', 'rbc', 'mcv', 'mch', 'rdw', 'mpv', 'psa', 'fsh', 'hba1c', 'egfr',\n 'acr', 'esr', 'vhs', 'bmc', 'bmd', 'vat', 'dxa', 'dmo', 'cmo', 'ffm',\n 'lbm', 'mlg', 'tav',\n]);\n\ntype PatternEntry = { original: string; code: string; loinc?: string };\n\nlet cachedPatterns: BiomarkerSearchPattern[] | null = null;\nlet cachedNormalized: Map<string, PatternEntry[]> | null = null;\n\nfunction getPatterns(): BiomarkerSearchPattern[] {\n if (!cachedPatterns) {\n cachedPatterns = getAllSearchPatterns();\n }\n return cachedPatterns;\n}\n\nfunction getNormalizedPatterns(): Map<string, PatternEntry[]> {\n if (!cachedNormalized) {\n const patterns = getPatterns();\n const map = new Map<string, PatternEntry[]>();\n for (const pattern of patterns) {\n for (const name of pattern.names) {\n const normalized = normalize(name);\n const existing = 
map.get(normalized) || [];\n existing.push({\n code: pattern.code,\n ...(pattern.loinc && { loinc: pattern.loinc }),\n original: name,\n });\n map.set(normalized, existing);\n }\n }\n cachedNormalized = map;\n }\n return cachedNormalized;\n}\n\n/**\n * Find all biomarker names present in OCR text.\n * Uses exact string matching on normalized text.\n * Returns unique matches (same biomarker won't be matched twice).\n */\nexport function findBiomarkersInText(ocrText: string): AnchorResult {\n const startTime = Date.now();\n const normalizedText = normalize(ocrText);\n const matchedCodes = new Set<string>();\n const matches: AnchorMatch[] = [];\n const normalizedPatterns = getNormalizedPatterns();\n\n for (const [normalizedName, entries] of normalizedPatterns) {\n if (normalizedName.length < 3 && !UNAMBIGUOUS_SHORT_NAMES.has(normalizedName)) {\n continue;\n }\n\n let position = -1;\n if (normalizedName.length <= 4) {\n const regex = new RegExp(`\\\\b${normalizedName}\\\\b`);\n const match = regex.exec(normalizedText);\n if (match) {\n position = match.index;\n }\n } else {\n position = normalizedText.indexOf(normalizedName);\n }\n\n if (position !== -1) {\n for (const entry of entries) {\n if (!matchedCodes.has(entry.code)) {\n matchedCodes.add(entry.code);\n matches.push({\n code: entry.code,\n confidence: 1.0,\n loinc: entry.loinc,\n matchedName: entry.original,\n position,\n });\n }\n }\n }\n }\n\n const scanTimeMs = Date.now() - startTime;\n const matchedCodesArray = Array.from(matchedCodes);\n\n return {\n filteredReference: generateFilteredLLMReference(matchedCodesArray),\n matches,\n stats: {\n matchedCount: matches.length,\n scanTimeMs,\n totalPatterns: getPatterns().length,\n },\n };\n}\n\n/**\n * Get the list of matched biomarker codes from an anchor result.\n */\nexport function getMatchedCodes(result: AnchorResult): string[] {\n return result.matches.map((m) => 
m.code);\n}\n"],"mappings":";AAQA;AAAA,EAEE;AAAA,EACA;AAAA,OACK;AA0BP,SAAS,UAAU,MAAsB;AACvC,SAAO,KACJ,UAAU,KAAK,EACf,QAAQ,oBAAoB,EAAE,EAC9B,YAAY,EACZ,QAAQ,QAAQ,GAAG;AACxB;AAEA,IAAM,0BAA0B,oBAAI,IAAI;AAAA,EACtC;AAAA,EAAO;AAAA,EAAO;AAAA,EAAM;AAAA,EAAO;AAAA,EAAO;AAAA,EAAO;AAAA,EAAO;AAAA,EAAO;AAAA,EAAO;AAAA,EAC9D;AAAA,EAAO;AAAA,EAAO;AAAA,EAAO;AAAA,EAAO;AAAA,EAAO;AAAA,EAAO;AAAA,EAAO;AAAA,EAAO;AAAA,EAAS;AAAA,EACjE;AAAA,EAAO;AAAA,EAAO;AAAA,EAAO;AAAA,EAAO;AAAA,EAAO;AAAA,EAAO;AAAA,EAAO;AAAA,EAAO;AAAA,EAAO;AAAA,EAC/D;AAAA,EAAO;AAAA,EAAO;AAChB,CAAC;AAID,IAAI,iBAAkD;AACtD,IAAI,mBAAuD;AAE3D,SAAS,cAAwC;AAC/C,MAAI,CAAC,gBAAgB;AACnB,qBAAiB,qBAAqB;AAAA,EACxC;AACA,SAAO;AACT;AAEA,SAAS,wBAAqD;AAC5D,MAAI,CAAC,kBAAkB;AACrB,UAAM,WAAW,YAAY;AAC7B,UAAM,MAAM,oBAAI,IAA4B;AAC5C,eAAW,WAAW,UAAU;AAC9B,iBAAW,QAAQ,QAAQ,OAAO;AAChC,cAAM,aAAa,UAAU,IAAI;AACjC,cAAM,WAAW,IAAI,IAAI,UAAU,KAAK,CAAC;AACzC,iBAAS,KAAK;AAAA,UACZ,MAAM,QAAQ;AAAA,UACd,GAAI,QAAQ,SAAS,EAAE,OAAO,QAAQ,MAAM;AAAA,UAC5C,UAAU;AAAA,QACZ,CAAC;AACD,YAAI,IAAI,YAAY,QAAQ;AAAA,MAC9B;AAAA,IACF;AACA,uBAAmB;AAAA,EACrB;AACA,SAAO;AACT;AAOO,SAAS,qBAAqB,SAA+B;AAClE,QAAM,YAAY,KAAK,IAAI;AAC3B,QAAM,iBAAiB,UAAU,OAAO;AACxC,QAAM,eAAe,oBAAI,IAAY;AACrC,QAAM,UAAyB,CAAC;AAChC,QAAM,qBAAqB,sBAAsB;AAEjD,aAAW,CAAC,gBAAgB,OAAO,KAAK,oBAAoB;AAC1D,QAAI,eAAe,SAAS,KAAK,CAAC,wBAAwB,IAAI,cAAc,GAAG;AAC7E;AAAA,IACF;AAEA,QAAI,WAAW;AACf,QAAI,eAAe,UAAU,GAAG;AAC9B,YAAM,QAAQ,IAAI,OAAO,MAAM,cAAc,KAAK;AAClD,YAAM,QAAQ,MAAM,KAAK,cAAc;AACvC,UAAI,OAAO;AACT,mBAAW,MAAM;AAAA,MACnB;AAAA,IACF,OAAO;AACL,iBAAW,eAAe,QAAQ,cAAc;AAAA,IAClD;AAEA,QAAI,aAAa,IAAI;AACnB,iBAAW,SAAS,SAAS;AAC3B,YAAI,CAAC,aAAa,IAAI,MAAM,IAAI,GAAG;AACjC,uBAAa,IAAI,MAAM,IAAI;AAC3B,kBAAQ,KAAK;AAAA,YACX,MAAM,MAAM;AAAA,YACZ,YAAY;AAAA,YACZ,OAAO,MAAM;AAAA,YACb,aAAa,MAAM;AAAA,YACnB;AAAA,UACF,CAAC;AAAA,QACH;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,QAAM,aAAa,KAAK,IAAI,IAAI;AAChC,QAAM,oBAAoB,MAAM,KAAK,YAAY;AAEjD,SAAO;AAAA,IACL,mBAAmB,6BAA6B,iBAAiB;AAAA,IACjE;AAAA,IACA,OAAO;AAAA,MACL,cAAc,QAAQ;AAAA,MACtB;AAAA,MACA,eAAe,YAAY,EAAE;AAAA,IAC/B;A
AAA,EACF;AACF;AAKO,SAAS,gBAAgB,QAAgC;AAC9D,SAAO,OAAO,QAAQ,IAAI,CAAC,MAAM,EAAE,IAAI;AACzC;","names":[]}
|
package/package.json
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@precisa-saude/fhir-ocr-utils",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "OCR text anchoring utilities for biomarker extraction from lab result PDFs",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "./dist/index.cjs",
|
|
7
|
+
"module": "./dist/index.js",
|
|
8
|
+
"types": "./dist/index.d.ts",
|
|
9
|
+
"exports": {
|
|
10
|
+
".": {
|
|
11
|
+
"import": {
|
|
12
|
+
"types": "./dist/index.d.ts",
|
|
13
|
+
"default": "./dist/index.js"
|
|
14
|
+
},
|
|
15
|
+
"require": {
|
|
16
|
+
"types": "./dist/index.d.cts",
|
|
17
|
+
"default": "./dist/index.cjs"
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
"files": ["dist"],
|
|
22
|
+
"scripts": {
|
|
23
|
+
"build": "tsup",
|
|
24
|
+
"lint": "tsc --noEmit",
|
|
25
|
+
"typecheck": "tsc --noEmit",
|
|
26
|
+
"test": "vitest run --passWithNoTests",
|
|
27
|
+
"test:coverage": "vitest run --coverage",
|
|
28
|
+
"clean": "rm -rf dist .turbo"
|
|
29
|
+
},
|
|
30
|
+
"keywords": ["fhir", "ocr", "biomarker", "lab-results", "text-anchoring"],
|
|
31
|
+
"license": "Apache-2.0",
|
|
32
|
+
"repository": {
|
|
33
|
+
"type": "git",
|
|
34
|
+
"url": "https://github.com/precisa-saude/fhir-brasil.git",
|
|
35
|
+
"directory": "packages/ocr-utils"
|
|
36
|
+
},
|
|
37
|
+
"publishConfig": {
|
|
38
|
+
"access": "public"
|
|
39
|
+
},
|
|
40
|
+
"dependencies": {
|
|
41
|
+
"@precisa-saude/fhir": "workspace:*"
|
|
42
|
+
},
|
|
43
|
+
"devDependencies": {
|
|
44
|
+
"tsup": "^8.3.5",
|
|
45
|
+
"typescript": "~5.7.3",
|
|
46
|
+
"vitest": "^2.1.8"
|
|
47
|
+
}
|
|
48
|
+
}
|